Professional Documents
Culture Documents
Name: Reg. No.: Lab Exercise:: Shivam Batra 19BPS1131
Name: Reg. No.: Lab Exercise:: Shivam Batra 19BPS1131
Methods -
2. Data preparation
4. ROC Curve
STEPS:
#Descriptive statistics
summary(data)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 0:1382
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1: 217
## Median :3.310 Median :0.6200 Median :10.20
## Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :4.010 Max. :2.0000 Max. :14.90
Univariate analysis
#Dependent variable
#Frequency plot
par(mfrow=c(1,1))
barplot(table(data[[12]]),
main = sprintf('Frequency plot of the variable: %s',
colnames(data[12])),
xlab = colnames(data[12]),
ylab = 'Frequency')
#Check class BIAS
table(data$quality)
##
## 0 1
## 1382 217
round(prop.table((table(data$quality))),2)
##
## 0 1
## 0.86 0.14
#Independent variable
#Boxplots
par(mfrow=c(3,4))
for (i in 1:(length(data)-1)){
boxplot(x = data[i],
horizontal = TRUE,
main = sprintf('Boxplot of the variable: %s',
colnames(data[i])),
xlab = colnames(data[i]))
}
#Histograms
par(mfrow=c(3,4))
for (i in 1:(length(data)-1)){
hist(x = data[[i]],
main = sprintf('Histogram of the variable: %s',
colnames(data[i])),
xlab = colnames(data[i]))
}
Bivariate analysis
#Correlation matrix
library(ggcorrplot)
ggcorrplot(round(cor(data[-12]), 2),
type = "lower",
lab = TRUE,
title =
'Correlation matrix of the red wine quality dataset')
Data preparation
#Missing values
sum(is.na(data))
## [1] 0
#Outliers
#Identifing outliers
is_outlier <- function(x) {
return(x < quantile(x, 0.25) - 1.5 * IQR(x) |
x > quantile(x, 0.75) + 1.5 * IQR(x))
}
outlier <- data.frame(variable = character(),
sum_outliers = integer(),
stringsAsFactors=FALSE)
for (j in 1:(length(data)-1)){
variable <- colnames(data[j])
for (i in data[j]){
sum_outliers <- sum(is_outlier(i))
}
row <- data.frame(variable,sum_outliers)
outlier <- rbind(outlier, row)
}
outlier
## variable sum_outliers
## 1 fixed.acidity 49
## 2 volatile.acidity 19
## 3 citric.acid 1
## 4 residual.sugar 155
## 5 chlorides 112
## 6 free.sulfur.dioxide 30
## 7 total.sulfur.dioxide 55
## 8 density 45
## 9 pH 35
## 10 sulphates 59
## 11 alcohol 13
Modeling
#Splitting the dataset into the Training set and Test set
#Stratified sample
data_ones <- data[which(data$quality == 1), ]
data_zeros <- data[which(data$quality == 0), ]
#Train data
set.seed(123)
train_ones_rows <- sample(1:nrow(data_ones), 0.8*nrow(data_ones))
train_zeros_rows <- sample(1:nrow(data_zeros), 0.8*nrow(data_ones))
train_ones <- data_ones[train_ones_rows, ]
train_zeros <- data_zeros[train_zeros_rows, ]
train_set <- rbind(train_ones, train_zeros)
table(train_set$quality)
##
## 0 1
## 173 173
#Test Data
test_ones <- data_ones[-train_ones_rows, ]
test_zeros <- data_zeros[-train_zeros_rows, ]
test_set <- rbind(test_ones, test_zeros)
table(test_set$quality)
##
## 0 1
## 1209 44
Logistic regression
#Logistic Regression
lr = glm(formula = quality ~.,
data = train_set,
family = binomial)
#Predictions
prob_pred = predict(lr,
type = 'response',
newdata = test_set[-12])
library(InformationValue)
optCutOff <- optimalCutoff(test_set$quality, prob_pred)[1]
y_pred = ifelse(prob_pred > optCutOff, 1, 0)
## y_pred
## 0 1
## 0 1207 2
## 1 43 1
#Accuracy
accuracy_lr = (cm_lr[1,1] + cm_lr[1,1])/
(cm_lr[1,1] + cm_lr[1,1] + cm_lr[2,1] + cm_lr[1,2])
accuracy_lr
## [1] 0.9816999
#ROC curve
library(ROSE)