Code Analysis

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 6

1 Vietnam National University – Ho Chi Minh City

2 Ho Chi Minh City University of Technology


3 Department of Applied Science
4 ----------------------------------------------------------------
5 PROBABILITY & STATISTICS
6 Semester: 231
7 Class: CC01
8 Group: 9
9 ----------------------------------------------------------------
10 R CODE:
11
#>>> INSTALL NECESSARY LIBRARIES

# Install a package only when it is not already available, so re-running the
# script does not re-download packages (or require network access) each time.
# All three attached packages are checked, not only ggplot2.
required_packages <- c("ggplot2", "corrplot", "caret")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach the libraries used by the rest of the script
library(ggplot2)
library(corrplot)
library(caret)
19
20
#>>> DATA READING & PRE-PROCESSING

# Read data
# NOTE(review): the path is machine-specific; adjust it to where the CSV lives.
data_path <- "E:/XSTK - BTL/water_potability.csv"
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}
wp <- read.csv(data_path)
head(wp)

# Categorize potability data as factor
wp$Potability <- as.factor(wp$Potability)

# Data summary: number of observations per potability class
table(wp$Potability)

# Counting missing values per column
colSums(is.na(wp))

# Replace missing values in ph, Sulfate and Trihalomethanes with the mean of
# the observed values of the same column.
# NOTE(review): the means are computed on the full data set before the
# train/test split further down, so test rows influence the imputed values.
for (column in c("ph", "Sulfate", "Trihalomethanes")) {
  is_missing <- is.na(wp[[column]])
  wp[[column]][is_missing] <- mean(wp[[column]], na.rm = TRUE)
}
40
41
#>>> DESCRIPTIVE STATISTICS

# Graphical parameters
par(family = "serif")

# Draw a horizontal box-plot of one column of `data`, grouped by Potability.
#   column      name of the column to plot (character string)
#   title_name  name used inside the plot title
#   axis_label  label of the value axis (horizontal, since horizontal = TRUE)
#   data        data frame holding `column` and `Potability` (defaults to wp)
# Returns the value of boxplot(), i.e. the statistics behind the plot.
potability_boxplot <- function(column, title_name, axis_label, data = wp) {
  boxplot(
    as.formula(paste(column, "~ Potability")),
    data = data,
    main = paste("Box-plot of", title_name, "by potability"),
    ylab = "Potability",
    xlab = axis_label,
    horizontal = TRUE
  )
}

# Box-plots of each measured variable with respect to potability
p1 <- potability_boxplot("ph", "pH", "pH")
p2 <- potability_boxplot("Hardness", "hardness", "Hardness")
p3 <- potability_boxplot("Solids", "solids", "Solids")
p4 <- potability_boxplot("Chloramines", "chloramines", "Chloramines")
p5 <- potability_boxplot("Sulfate", "sulfate", "Sulfate")
p6 <- potability_boxplot("Conductivity", "conductivity", "Conductivity")
p7 <- potability_boxplot("Organic_carbon", "organic carbon", "Organic Carbon")
p8 <- potability_boxplot("Trihalomethanes", "trihalomethanes",
                         "Trihalomethanes")
p9 <- potability_boxplot("Turbidity", "turbidity", "Turbidity")
126
# Correlation matrix of the first nine columns, computed once and reused
# NOTE(review): assumes columns 1-9 are the numeric predictors and that
# Potability is stored in a later column - confirm against the CSV header.
cor_matrix <- cor(wp[, 1:9])
round(cor_matrix, 3)

# Correlation plot
# The result is stored under a name that does not mask corrplot() itself.
corr_plot_result <- corrplot(cor_matrix)
132
133
#>>> SHUFFLE, SPLIT & FIT INTO LOGISTIC REGRESSION

# Set the seed BEFORE the first random operation. Seeding after sample() left
# the shuffle (and therefore the split and every fitted model) different on
# each run.
set.seed(123)

# Shuffle the rows of the data set
shuffled_wp <- wp[sample(nrow(wp)), ]

# Name of the response variable column
r_var <- "Potability"

# Use createDataPartition to split the data: p = 0.8 puts 80% of the rows in
# the training set; list = FALSE returns the row indices as a matrix
index <- createDataPartition(shuffled_wp[[r_var]], p = 0.8, list = FALSE)

# Training set: the sampled rows; testing set: all remaining rows
training_set <- shuffled_wp[index, ]
testing_set <- shuffled_wp[-index, ]

# Fit a logistic regression model of Potability on all other columns
logistic_model <- glm(Potability ~ ., data = training_set, family = "binomial")

# Display the summary of the model
summary(logistic_model)
159
160
#>>> LOGISTIC REGRESSION - PREDICTION

# Use the model to predict probabilities on the testing set
predicted_probs <- predict(logistic_model, newdata = testing_set,
                           type = "response")

# Convert probabilities to binary predictions (0 or 1) with a 0.5 cut-off
predicted_labels <- ifelse(predicted_probs > 0.5, 1, 0)

# Actual labels from the "Potability" column of the testing set
actual_labels <- as.factor(testing_set$Potability)

# Give the predictions the same factor levels as the actual labels. With
# as.factor() alone, a model that predicts a single class for every row
# produces a one-level factor that does not match the reference.
predicted_factor <- factor(predicted_labels, levels = levels(actual_labels))

# Create a confusion matrix to evaluate model performance
confusion_matrix <- table(Actual = actual_labels, Predicted = predicted_labels)

# Calculate accuracy, 95% CI, no-information rate, and p-value
# NOTE(review): no `positive` class is passed, so caret's default applies -
# confirm that the reported sensitivity refers to the intended class.
model_performance <- confusionMatrix(data = predicted_factor,
                                     reference = actual_labels)

# Display the model performance summary
print(model_performance)
182
#>>> ROC CURVE - HOSMER & LEMESHOW TEST

# Packages for the ROC analysis and the goodness-of-fit test
library(pROC)
library(ResourceSelection)

# ROC object: true test-set classes against the predicted probabilities
roc_obj <- roc(
  response = testing_set$Potability,
  predictor = predicted_probs
)

# Draw the ROC curve
plot(roc_obj)

# Area under the ROC curve
auc(roc_obj)

# Hosmer-Lemeshow test on the data the model was fitted to: the observed
# responses stored in the model versus its fitted probabilities
hoslem.test(
  x = logistic_model$y,
  y = fitted(logistic_model)
)
200
201
#>>> FIT INTO RANDOM FOREST MODEL

# Import Random Forest library, installing it only when it is not available
# yet (an unconditional install re-downloads the package on every run)
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Fit a random forest of Potability on the nine listed predictors, using the
# training set and the package's default settings
rf_model <- randomForest(
  Potability ~ ph + Conductivity + Trihalomethanes +
    Hardness + Solids + Chloramines + Sulfate +
    Turbidity + Organic_carbon,
  data = training_set
)

# Print the model
print(rf_model)
215
216
#>>> RANDOM FOREST - PREDICTION

# Predict the class of every row of the testing set
predicted_rf <- predict(rf_model, newdata = testing_set)

# Cross-tabulate predictions (rows) against the true classes (columns)
confusion_matrix2 <- table(predicted_rf, testing_set$Potability)
print(confusion_matrix2)

# Performance summary of the random forest on the testing set
model_performance_rf <- confusionMatrix(
  data = as.factor(predicted_rf),
  reference = as.factor(testing_set$Potability)
)
print(model_performance_rf)

You might also like