Code Analysis

1 Vietnam National University – Ho Chi Minh City
2 Ho Chi Minh City University of Technology

3 Department of Applied Science
4 ----------------------------------------------------------------
5 PROBABILITY & STATISTICS
6 Semester: 231
7 Class: CC01
8 Group: 9
9 ----------------------------------------------------------------
10 R CODE:
11
#>>> INSTALL NECESSARY LIBRARIES

# Install ggplot2 only when it is not already available, so that re-running
# the script does not reinstall the package (or need network access) each time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

# Load the packages used by the rest of the script:
#   ggplot2  - loaded by the original script (not referenced in visible code)
#   corrplot - corrplot() for the correlation plot
#   caret    - createDataPartition() and confusionMatrix()
library(ggplot2)
library(corrplot)
library(caret)
19
20
#>>> DATA READING & PRE-PROCESSING

# Read the data set from its CSV file and preview the first rows
wp <- read.csv("E:/XSTK - BTL/water_potability.csv")
head(wp)

# Treat the Potability column as a categorical (factor) variable
wp$Potability <- as.factor(wp$Potability)

# Number of observations in each potability class
table(wp$Potability)

# Number of missing values in each column
colSums(is.na(wp))

# Replace the missing values of ph, Sulfate and Trihalomethanes with the mean
# of the observed values of the same column (mean imputation).
for (impute_col in c("ph", "Sulfate", "Trihalomethanes")) {
  na_rows <- is.na(wp[[impute_col]])
  wp[[impute_col]][na_rows] <- mean(wp[[impute_col]], na.rm = TRUE)
}
40
41
#>>> DESCRIPTIVE STATISTICS

# Graphical parameters: use a serif font family for the plots
par(family = "serif")

# Draw a horizontal box-plot of one column, grouped by Potability.
#
#   column      - name (string) of the column of `data` to plot
#   title_label - text inserted into the title "Box-plot of ... by potability"
#   axis_label  - label of the value axis
#   data        - data frame holding `column` and `Potability` (default: wp)
#
# Returns the value of boxplot().
boxplot_by_potability <- function(column, title_label, axis_label, data = wp) {
  # Build the formula `<column> ~ Potability` from the column name
  plot_formula <- reformulate("Potability", response = column)
  boxplot(
    plot_formula,
    data = data,
    main = paste0("Box-plot of ", title_label, " by potability"),
    ylab = "Potability",
    xlab = axis_label,
    horizontal = TRUE
  )
}

# One box-plot per predictor. Every plot uses the same labelling as the pH
# plot (explicit ylab, horizontal layout), including the trihalomethanes and
# turbidity plots whose calls were left unterminated.
p1 <- boxplot_by_potability("ph", "pH", "pH")
p2 <- boxplot_by_potability("Hardness", "hardness", "Hardness")
p3 <- boxplot_by_potability("Solids", "solids", "Solids")
p4 <- boxplot_by_potability("Chloramines", "chloramines", "Chloramines")
p5 <- boxplot_by_potability("Sulfate", "sulfate", "Sulfate")
p6 <- boxplot_by_potability("Conductivity", "conductivity", "Conductivity")
p7 <- boxplot_by_potability(
  "Organic_carbon", "organic carbon", "Organic Carbon"
)
p8 <- boxplot_by_potability(
  "Trihalomethanes", "trihalomethanes", "Trihalomethanes"
)
p9 <- boxplot_by_potability("Turbidity", "turbidity", "Turbidity")
126
# Correlation matrix of the first nine columns, computed once and reused
cor_matrix <- cor(wp[, 1:9])
round(cor_matrix, 3)

# Correlation plot. The result is stored under a name that does not mask the
# corrplot() function itself.
corr_plot <- corrplot(cor_matrix)
132
133
#>>> SHUFFLE, SPLIT & FIT INTO LOGISTIC REGRESSION

# Set the seed BEFORE the first random operation: both the row shuffle and the
# train/test partition below draw random numbers, so seeding afterwards would
# leave the shuffle different on every run.
set.seed(123)

# Shuffle the rows of the data set
shuffled_wp <- wp[sample(nrow(wp)), ]

# Name of the response variable column
r_var <- "Potability"

# Split the data into training (p = 0.8) and testing sets
index <- createDataPartition(shuffled_wp[[r_var]], p = 0.8, list = FALSE)
training_set <- shuffled_wp[index, ]
testing_set <- shuffled_wp[-index, ]

# Fit a logistic regression of Potability on all other columns
logistic_model <- glm(Potability ~ ., data = training_set, family = "binomial")

# Display the summary of the model
summary(logistic_model)
159
160
#>>> LOGISTIC REGRESSION - PREDICTION

# Predicted probabilities for the rows of the testing set
predicted_probs <- predict(
  logistic_model,
  newdata = testing_set,
  type = "response"
)

# Convert probabilities to binary predictions (0 or 1) with a 0.5 threshold
predicted_labels <- ifelse(predicted_probs > 0.5, 1, 0)

# Actual labels come from the "Potability" column of the testing set
actual_labels <- testing_set$Potability

# Put predictions and actual labels on the same factor levels, so the
# comparison stays well-formed even when one class is never predicted.
reference_labels <- as.factor(actual_labels)
predicted_factor <- factor(predicted_labels, levels = levels(reference_labels))

# Confusion matrix (rows: actual, columns: predicted)
confusion_matrix <- table(Actual = actual_labels, Predicted = predicted_factor)
print(confusion_matrix)

# Model performance statistics computed by caret.
# NOTE(review): no `positive` argument is passed, so caret's default positive
# class applies - confirm that this is the intended class.
model_performance <- confusionMatrix(
  data = predicted_factor,
  reference = reference_labels
)

# Display the model performance summary
print(model_performance)
182
#>>> ROC CURVE - HOSMER & LEMESHOW TEST

# Load necessary libraries
library(pROC)
library(ResourceSelection)

# Create the ROC object from the test-set labels and predicted probabilities.
# Levels and direction are fixed explicitly instead of being auto-detected:
# the first factor level is the control and the second the case, and cases
# are expected to have the higher predicted probability ("<"). This keeps
# roc() from silently reversing the comparison.
roc_obj <- roc(
  testing_set$Potability,
  predicted_probs,
  levels = levels(testing_set$Potability),
  direction = "<"
)

# Plot the ROC curve
plot(roc_obj)

# Calculate the AUC
auc(roc_obj)

# Hosmer-Lemeshow test on the observed responses and fitted values of the
# logistic model (i.e. on the data the model was fitted to)
hoslem.test(logistic_model$y, fitted(logistic_model))
200
201
#>>> FIT INTO RANDOM FOREST MODEL

# Install randomForest only when it is missing, then load it
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Fit a random forest of Potability on the nine predictors, using the
# training set
rf_model <- randomForest(
  Potability ~ ph + Conductivity + Trihalomethanes + Hardness + Solids +
    Chloramines + Sulfate + Turbidity + Organic_carbon,
  data = training_set
)

# Print the model
print(rf_model)
215
216
#>>> RANDOM FOREST - PREDICTION

# Predict the class of every row of the testing set with the fitted forest
predicted_rf <- predict(object = rf_model, newdata = testing_set)

# Cross-tabulate predictions (rows) against the actual labels (columns)
confusion_matrix2 <- table(predicted_rf, testing_set$Potability)
print(confusion_matrix2)

# Model performance statistics computed by caret, then displayed
model_performance_rf <- confusionMatrix(
  data = as.factor(predicted_rf),
  reference = as.factor(testing_set$Potability)
)
print(model_performance_rf)

Code Analysis

Uploaded by

Copyright:

Available Formats

You might also like

Code Analysis

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Code Analysis

Uploaded by

Copyright:

Available Formats

1 Vietnam National University – Ho Chi Minh City

2 Ho Chi Minh City University of Technology

You might also like