Assignment 1


COMP5318 Assignment 1: Classification

Group number: A1 Part1 Group 43, SID1: 530242293, SID2: 520597749

In [1]: # Import all libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score

from warnings import simplefilter
# ignore warnings related to some packages
simplefilter(action='ignore', category=FutureWarning)

In [2]: # Load dataset

df= pd.read_csv("breast-cancer-wisconsin.csv")

Pre-processing dataset
In [3]: # checking all the rows which have missing values "?"

df[df.values=='?']

Out[3]:
    Clump Thickness  Uniformity of Cell Size  Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  class
23  8 4 5 1 2 ? 7  3 1 class2
40  6 6 6 9 6 ? 7  8 1 class1
139 1 1 1 1 1 ? 2  1 1 class1
145 1 1 3 1 2 ? 2  1 1 class1
158 1 1 2 1 3 ? 1  1 1 class1
164 5 1 1 1 2 ? 3  1 1 class1
235 3 1 4 1 2 ? 3  1 1 class1
249 3 1 1 1 2 ? 3  1 1 class1
275 3 1 3 1 2 ? 2  1 1 class1
292 8 8 8 1 2 ? 6 10 1 class2
294 1 1 1 1 2 ? 2  1 1 class1
297 5 4 3 1 2 ? 2  3 1 class1
315 4 6 5 6 7 ? 4  9 1 class1
321 3 1 1 1 2 ? 3  1 1 class1
411 1 1 1 1 1 ? 2  1 1 class1
617 1 1 1 1 1 ? 1  1 1 class1
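An equivalent way to select the complete rows that contain at least one '?' is a row-wise mask (a minimal sketch, assuming the same df loaded above; the rows_with_missing name is illustrative):

# Sketch: row-wise mask selecting every row with at least one '?'
rows_with_missing = df[(df == '?').any(axis=1)]
print(rows_with_missing)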

In [4]: # converting all ? to NaNs

df=df.replace(['?'],np.nan)
In [5]: # checking if ? have been correctly changed to NaNs in DataFrame

df[df.isna().any(axis=1)]

Out[5]:
    Clump Thickness  Uniformity of Cell Size  Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  class
23  8 4 5 1 2 NaN 7  3 1 class2
40  6 6 6 9 6 NaN 7  8 1 class1
139 1 1 1 1 1 NaN 2  1 1 class1
145 1 1 3 1 2 NaN 2  1 1 class1
158 1 1 2 1 3 NaN 1  1 1 class1
164 5 1 1 1 2 NaN 3  1 1 class1
235 3 1 4 1 2 NaN 3  1 1 class1
249 3 1 1 1 2 NaN 3  1 1 class1
275 3 1 3 1 2 NaN 2  1 1 class1
292 8 8 8 1 2 NaN 6 10 1 class2
294 1 1 1 1 2 NaN 2  1 1 class1
297 5 4 3 1 2 NaN 2  3 1 class1
315 4 6 5 6 7 NaN 4  9 1 class1
321 3 1 1 1 2 NaN 3  1 1 class1
411 1 1 1 1 1 NaN 2  1 1 class1
617 1 1 1 1 1 NaN 1  1 1 class1

In [6]: # changing class1 and class2 to 0 and 1 respectively in the last column of dataframe

LastColumn= df.columns[-1] #extracting the name of the last column in dataframe


df[LastColumn]= df[LastColumn].replace(['class1'],0)
df[LastColumn]= df[LastColumn].replace(['class2'],1)

In [7]: # Splitting the dataset into X (all columns except class) and y (the class column).

X= df.drop(LastColumn, axis=1)
y= df[LastColumn]

In [8]: # filling missing Values using SimpleImputer

imputer= SimpleImputer(strategy='mean', missing_values=np.nan)

imputer= imputer.fit(X)
Xfilled= imputer.transform(X)
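As a quick sanity check (a sketch only, not one of the original cells), one can confirm that the imputed array no longer contains any NaN values:

# Sketch: verify that imputation removed all missing values
print(np.isnan(Xfilled).sum()) # 0 means no missing values remain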
In [9]: # Normalisation

scaler= MinMaxScaler()
XNormalised= scaler.fit_transform(Xfilled)

print(XNormalised)

[[0.44444444 0.         0.         ... 0.22222222 0.         0.        ]
 [0.44444444 0.33333333 0.33333333 ... 0.22222222 0.11111111 0.        ]
 [0.22222222 0.         0.         ... 0.22222222 0.         0.        ]
 ...
 [0.44444444 1.         1.         ... 0.77777778 1.         0.11111111]
 [0.33333333 0.77777778 0.55555556 ... 1.         0.55555556 0.        ]
 [0.33333333 0.77777778 0.77777778 ... 1.         0.33333333 0.        ]]

In [10]: # Defining a function to print the first ten rows of the pre-processed dataset to 4 decimal places:

def print_data(X, y, n_rows):
    for i in range(n_rows):
        for feature in X[i]:
            print("{:.4f}".format(feature), end=",")
        if i == len(X)-1:
            print(y[i], end="")
        else:
            print(y[i])

In [11]: #printing the first ten rows of pre-processed dataset to 4 decimal places using the above function:

print_data(XNormalised,y,10)

0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.4444,0.3333,0.3333,0.4444,0.6667,1.0000,0.2222,0.1111,0.0000,0
0.2222,0.0000,0.0000,0.0000,0.1111,0.1111,0.2222,0.0000,0.0000,0
0.5556,0.7778,0.7778,0.0000,0.2222,0.3333,0.2222,0.6667,0.0000,0
0.3333,0.0000,0.0000,0.2222,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.7778,1.0000,1.0000,0.7778,0.6667,1.0000,0.8889,0.6667,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,1.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.1111,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.4444,0
0.3333,0.1111,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0

Part 1: Cross-validation without parameter tuning


In [12]: ## Setting the 10 fold stratified cross-validation
cvKFold= StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
In [13]: # creating the training and test data splits

X_train, X_test, y_train, y_test = train_test_split(
    XNormalised, y, stratify=y, random_state=0)

# XNormalised passed to train_test_split here has already been normalised,
# so X_train and X_test are already scaled to [0, 1] and the training data
# does not need to be normalised separately.
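If one preferred to avoid fitting the scaler on any test-set values, an alternative (a sketch only; it reuses Xfilled and y from the cells above, and names such as X_tr_raw and scaler_alt are illustrative) is to split first and fit MinMaxScaler on the training split alone:

# Sketch: split first, then fit the scaler on the training split only
X_tr_raw, X_te_raw, y_tr, y_te = train_test_split(
    Xfilled, y, stratify=y, random_state=0)

scaler_alt = MinMaxScaler()
X_tr = scaler_alt.fit_transform(X_tr_raw)  # fit and transform the training split
X_te = scaler_alt.transform(X_te_raw)      # transform the test split with the same scaler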

In [14]: # creating and applying KNN Classifier to dataset

knn= KNeighborsClassifier() #load an instance of the classifier.


knn.fit(X_train, y_train) #creating a model
PredictionKNN = knn.predict(X_test) #making prediction using the model

print("Test set predictions:\n", PredictionKNN)


print("Accuracy on test set: {:.4f}".format(knn.score(X_test, y_test)))

Test set predictions:
[1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1
1 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 1 0 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0
0 1 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0
0 0 0 1 1 1 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 0]
Accuracy on test set: 0.9600

In [15]: # Defining function for KNN cross-validation score

def KNNClassifier(X, y):
    knn = KNeighborsClassifier() # load an instance of the classifier.
    scores = cross_val_score(knn, X, y, cv=cvKFold) # perform cross validation
    return round(scores.mean(), 4) # return mean score of cross validation

In [16]: #Running the above function for KNN cross-validation score

print(KNNClassifier(XNormalised, y))

0.9671

In [17]: # creating and applying Logistic regression Classifier to dataset

logreg= LogisticRegression() #load an instance of the classifier.


logreg.fit(X_train, y_train) #creating a model
PredictionLR= logreg.predict(X_test) #making prediction using the model

print("Test set predictions:\n", PredictionLR)


print("Accuracy on test set: {:.4f}".format(logreg.score(X_test, y_test)))

Test set predictions:
[1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1
1 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0
0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0
0 0 0 1 1 1 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 0]
Accuracy on test set: 0.9600

In [18]: # Defining function for Logistic regression cross-validation score

def logregClassifier(X, y):
    logreg = LogisticRegression()
    scores = cross_val_score(logreg, X, y, cv=cvKFold)
    return round(scores.mean(), 4)

In [19]: # Running the above function for Logistic regression cross-validation score
logregClassifier(XNormalised, y)

Out[19]: 0.9642


In [20]: # creating and applying Naive Bayes Classifier to dataset

nb= GaussianNB() #load an instance of the classifier.


nb.fit(X_train, y_train) #creating a model
PredictionNB = nb.predict(X_test) #making prediction using the model

print("Test set predictions:\n", PredictionNB)


print("Accuracy on test set: {:.4f}".format(nb.score(X_test, y_test)))

Test set predictions:
[1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1
1 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0
0 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0
0 0 0 1 1 1 0 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 0]
Accuracy on test set: 0.9657
In [21]: # Defining function for Naive Bayes cross-validation score

def nbClassifier(X, y):
    nb = GaussianNB() # load an instance of the classifier.
    scores = cross_val_score(nb, X, y, cv=cvKFold) # perform cross validation
    return round(scores.mean(), 4)

In [22]: # Running the above function for Naive Bayes cross-validation score

nbClassifier(XNormalised, y)

Out[22]: 0.9585


In [23]: # creating and applying Decision Tree Classifier to dataset

tree= DecisionTreeClassifier(criterion='entropy', random_state=0) #load an instance of the classifier.


tree.fit(X_train, y_train) #creating a model

print("Accuracy on test set: {:.4f}".format(tree.score(X_test, y_test)))

Accuracy on test set: 0.9600

In [24]: # Defining function for Decision Tree cross-validation score

def dtClassifier(X, y):
    tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    scores = cross_val_score(tree, X, y, cv=cvKFold) # perform cross validation
    return round(scores.mean(), 4)

In [25]: # Running the above function for Decision Tree cross-validation score

dtClassifier(XNormalised, y)

Out[25]: 0.9385

In [26]: # creating and applying Bagging Classifier to dataset

bagC = BaggingClassifier(
    DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=0),
    n_estimators=60, max_samples=100, bootstrap=True, random_state=0) # load an instance of the classifier.
bagC.fit(X_train, y_train) # creating a model

print("Accuracy on test set: {:.4f}".format(bagC.score(X_test, y_test)))

Accuracy on test set: 0.9543

In [27]: # Defining function for Bagging cross-validation score

def bagClassifier(X, y, n_estimators, max_samples, max_depth):
    bagC = BaggingClassifier(
        DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=0),
        n_estimators=n_estimators, max_samples=max_samples, bootstrap=True, random_state=0) # load an instance of the classifier.
    scores = cross_val_score(bagC, X, y, cv=cvKFold) # perform cross validation
    return round(scores.mean(), 4)

In [28]: # Running the above function for Bagging cross-validation score

bagClassifier(XNormalised, y, 60, 100, 6)

Out[28]: 0.9571


In [29]: # creating and applying AdaBoost Classifier to dataset

adaB = AdaBoostClassifier(
    DecisionTreeClassifier(criterion='entropy', max_depth=6),
    n_estimators=60, learning_rate=0.5, random_state=0)
adaB.fit(X_train, y_train) # creating a model

print("Accuracy on test set: {:.4f}".format(adaB.score(X_test, y_test)))

Accuracy on test set: 0.9429


In [30]: # Defining function for AdaBoost cross-validation score

def adaBClassifier(X, y, n_estimators, learning_rate, max_depth):
    adaB = AdaBoostClassifier(
        DecisionTreeClassifier(criterion='entropy', max_depth=max_depth),
        n_estimators=n_estimators, learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(adaB, X, y, cv=cvKFold) # perform cross validation
    return round(scores.mean(), 4)

In [31]: # Running the above function for AdaBoost cross-validation score

adaBClassifier(XNormalised, y, 60, 0.5, 6)

Out[31]: 0.9542


In [32]: # creating and applying Gradient Boosting Classifier to dataset

GB= GradientBoostingClassifier(n_estimators=60, learning_rate=0.5, random_state=0) #load an instance of the classifier.

GB.fit(X_train, y_train) #creating a model

print("Accuracy on test set: {:.4f}".format(GB.score(X_test, y_test)))

Accuracy on test set: 0.9600

In [33]: # Defining function for Gradient Boosting cross-validation score

def gbClassifier(X, y, n_estimators, learning_rate):
    GB = GradientBoostingClassifier(max_depth=1, n_estimators=n_estimators,
                                    learning_rate=learning_rate, random_state=0)
    scores = cross_val_score(GB, X, y, cv=cvKFold) # perform cross validation
    return round(scores.mean(), 4)

In [34]: # Running the above function for Gradient Boosting cross-validation score


print(gbClassifier(XNormalised, y, 60, 0.5))

0.9571


Part 1 Results
In [35]: # Parameters for Part 1:

#Bagging
bag_n_estimators = 60
bag_max_samples = 100
bag_max_depth = 6

#AdaBoost
ada_n_estimators = 60
ada_learning_rate = 0.5
ada_bag_max_depth = 6

#GB
gb_n_estimators = 60
gb_learning_rate = 0.5

# Print results for each classifier in part 1 to 4 decimal places:


print("LR average cross-validation accuracy: ","{:.4f}".format(logregClassifier(XNormalised, y))) # using function created above
print("NB average cross-validation accuracy: ","{:.4f}".format(nbClassifier(XNormalised, y))) # using function created above
print("DT average cross-validation accuracy: ","{:.4f}".format(dtClassifier(XNormalised, y))) # using function created above
print("Bagging average cross-validation accuracy: ","{:.4f}".format(bagClassifier(XNormalised, y, 60, 100, 6))) # using function created above
print("AdaBoost average cross-validation accuracy: ","{:.4f}".format(adaBClassifier(XNormalised, y, 60, 0.5, 6))) # using function created above
print("GB average cross-validation accuracy: ","{:.4f}".format(gbClassifier(XNormalised, y, 60, 0.5))) # using function created above

LR average cross-validation accuracy: 0.9642
NB average cross-validation accuracy: 0.9585
DT average cross-validation accuracy: 0.9385
Bagging average cross-validation accuracy: 0.9571
AdaBoost average cross-validation accuracy: 0.9542
GB average cross-validation accuracy: 0.9571
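For compactness, the same Part 1 summary could also be produced with a loop over the classifier functions defined above; the sketch below only reuses those functions and the parameter variables from this cell (the part1_models list itself is illustrative):

# Sketch: print the Part 1 summary with a loop over (label, callable) pairs
part1_models = [
    ("LR", lambda: logregClassifier(XNormalised, y)),
    ("NB", lambda: nbClassifier(XNormalised, y)),
    ("DT", lambda: dtClassifier(XNormalised, y)),
    ("Bagging", lambda: bagClassifier(XNormalised, y, bag_n_estimators, bag_max_samples, bag_max_depth)),
    ("AdaBoost", lambda: adaBClassifier(XNormalised, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)),
    ("GB", lambda: gbClassifier(XNormalised, y, gb_n_estimators, gb_learning_rate)),
]
for name, fn in part1_models:
    print("{} average cross-validation accuracy: {:.4f}".format(name, fn()))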

Part 2: Cross-validation with parameter tuning


In [36]: # defining KNN function with parameter tuning and cross-validation

parameters = {'n_neighbors': [1, 3, 5, 7, 9], 'p': [1, 2]}

def bestKNNClassifier(X, y, S):

    # X for normalised data set and y for class

    # S=1 for calling KNN best k from function
    # S=2 for calling KNN best p from function
    # S=3 for calling KNN cross-validation from function
    # S=4 for calling KNN test accuracy from function

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    grid = GridSearchCV(KNeighborsClassifier(), parameters, cv=cvKFold, return_train_score=True)
    grid.fit(X_train, y_train)
    BestParams = grid.best_params_

    if S == 1:
        return print(BestParams['n_neighbors']) # for KNN best k
    elif S == 2:
        return print(BestParams['p']) # for KNN best p
    elif S == 3:
        return print("{:.4f}".format(grid.best_score_)) # for KNN cross-validation accuracy
    elif S == 4:
        return print("{:.4f}".format(grid.score(X_test, y_test))) # for KNN test set accuracy
    else:
        return print("please input S=1-4 and try again")
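Since each call to bestKNNClassifier repeats the grid search, a single-pass alternative is sketched below; the helper name knn_grid_summary and its tuple return value are illustrative, and it reuses the parameters grid and cvKFold defined above:

# Sketch: run the KNN grid search once and return all four quantities together
def knn_grid_summary(X, y):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)
    grid = GridSearchCV(KNeighborsClassifier(), parameters, cv=cvKFold, return_train_score=True)
    grid.fit(X_tr, y_tr)
    return (grid.best_params_['n_neighbors'],   # best k
            grid.best_params_['p'],             # best p
            round(grid.best_score_, 4),         # cross-validation accuracy
            round(grid.score(X_te, y_te), 4))   # test set accuracy

# Example usage: k, p, cv_acc, test_acc = knn_grid_summary(XNormalised, y)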

In [37]: # finding best k using function

bestKNNClassifier(XNormalised,y,1)

3

In [38]: # finding best p using function

bestKNNClassifier(XNormalised,y,2)

1

In [39]: # finding KNN cross-validation accuracy using function


bestKNNClassifier(XNormalised,y,3)

0.9695

In [40]: # finding KNN Test set accuracy using function


bestKNNClassifier(XNormalised,y,4)

0.9543

In [41]: # creating and applying SVM Classifier to dataset

svmR = SVC(kernel="rbf") #using rbf kernel to create SVM classifier


svmR.fit(X_train, y_train)
PredictionSVMR= svmR.predict(X_test)
print("SVM(rbf) - test set accuracy:", "{:.4f}".format(accuracy_score(y_test, PredictionSVMR)))

svmL = SVC(kernel="linear") #using linear kernel to create SVM classifier


svmL.fit(X_train, y_train)
PredictionSVML= svmL.predict(X_test)
print("SVM(linear) - test set accuracy: ","{:.4f}".format(accuracy_score(y_test, PredictionSVML)))

SVM(rbf) - test set accuracy: 0.9657
SVM(linear) - test set accuracy: 0.9714

In [42]: # defining SVM function with parameter tuning and cross-validation

SVMgrid = {'C': [0.01, 0.1, 1, 5, 15], 'gamma': [0.01, 0.1, 1, 10, 50]}

def bestSVMClassifier(X, y, S):

    # X for normalised data set and y for class

    # S=1 for calling SVM best parameter C
    # S=2 for calling SVM best parameter gamma
    # S=3 for calling SVM cross-validation accuracy
    # S=4 for calling SVM test set accuracy

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    svm = SVC(kernel='rbf') # kernel set to rbf

    SVM = GridSearchCV(svm, SVMgrid, cv=cvKFold, return_train_score=True)
    SVM.fit(X_train, y_train)
    BestParams = SVM.best_params_

    if S == 1:
        return print(BestParams['C']) # SVM best C
    elif S == 2:
        return print(BestParams['gamma']) # SVM best gamma
    elif S == 3:
        return print("{:.4f}".format(SVM.best_score_)) # SVM cross-validation accuracy
    elif S == 4:
        return print("{:.4f}".format(SVM.score(X_test, y_test))) # SVM test set accuracy
    else:
        return print("please input S=1-4 and try again")
In [43]: # Finding SVM best C using function
bestSVMClassifier(XNormalised, y,1)

5

In [44]: # Finding SVM best Gamma using function


bestSVMClassifier(XNormalised, y,2)

0.1

In [45]: # Finding SVM cross-validation accuracy using function


bestSVMClassifier(XNormalised, y,3)

0.9676

In [46]: # Finding SVM test set accuracy using function


bestSVMClassifier(XNormalised, y,4)

0.9714


In [47]: # creating and applying Random Forest Classifier to dataset

RF= RandomForestClassifier(criterion='entropy', n_estimators=50, max_leaf_nodes=12, random_state=0)

RF.fit(X_train, y_train)
PredictionRF = RF.predict(X_test)

print("Random forest test set accuracy: ","{:.4f}".format(accuracy_score(y_test, PredictionRF)))

Random forest test set accuracy: 0.9714


In [48]: # Defining Random Forest function with parameter tuning and cross-validation

RFparameters = {'n_estimators': [10, 30, 60, 100, 150], 'max_leaf_nodes': [6, 12, 18]}

def bestRFClassifier(X, y, S):

    # X for normalised data set and y for class

    # S=1 for calling RF best n_estimators
    # S=2 for calling RF best max_leaf_nodes
    # S=3 for calling RF cross-validation accuracy
    # S=4 for calling RF test set accuracy
    # S=5 for calling RF test set macro average F1
    # S=6 for calling RF test set weighted average F1

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

    rf = RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=0) # information gain and max_features set to 'sqrt'

    RF = GridSearchCV(rf, RFparameters, cv=cvKFold, return_train_score=True)
    RF.fit(X_train, y_train)
    BestParams = RF.best_params_

    actual = y_test
    predicted = RF.predict(X_test)

    if S == 1:
        return print(BestParams['n_estimators']) # RF best n_estimators
    elif S == 2:
        return print(BestParams['max_leaf_nodes']) # RF best max_leaf_nodes
    elif S == 3:
        return print("{:.4f}".format(RF.best_score_)) # RF cross-validation accuracy
    elif S == 4:
        return print("{:.4f}".format(RF.score(X_test, y_test))) # RF test set accuracy
    elif S == 5:
        return print("{:.4f}".format(f1_score(actual, predicted, average='macro'))) # RF test set macro average F1
    elif S == 6:
        return print("{:.4f}".format(f1_score(actual, predicted, average='weighted'))) # RF test set weighted average F1
    else:
        return print("please input S=1-6 and try again")

In [49]: # finding RF best n_estimators using function


bestRFClassifier(XNormalised, y,1)

150

In [50]: # finding RF best max_leaf_nodes using function


bestRFClassifier(XNormalised, y,2)

6
In [51]: # finding RF cross-validation accuracy using function
bestRFClassifier(XNormalised, y,3)

0.9675

In [52]: # finding RF test set accuracy using function


bestRFClassifier(XNormalised, y,4)

0.9657

In [53]: # finding RF test set macro average F1 score using function


bestRFClassifier(XNormalised, y,5)

0.9628

In [54]: # finding RF test set weighted average F1 score using function


bestRFClassifier(XNormalised, y,6)

0.9661

Part 2: Results
In [55]:
# printing the results using functions defined above

print("KNN best k: ", end="")


bestKNNClassifier(XNormalised,y,1) # calling KNN best k from the function

print("KNN best p: ", end="")


bestKNNClassifier(XNormalised,y,2) # calling KNN best p from the function

print("KNN cross-validation accuracy: ", end='')


bestKNNClassifier(XNormalised,y,3) # calling KNN cross-validation the from function

print("KNN test set accuracy: ",end='')


bestKNNClassifier(XNormalised,y,4) # calling KNN test accuracy from the function
print()

print("SVM best C: ",end='')


bestSVMClassifier(XNormalised, y,1) # calling SVM best C from the function

print("SVM best gamma: ",end='')


bestSVMClassifier(XNormalised, y,2) # calling SVM best gamma from the function

print("SVM cross-validation accuracy: ",end='')


bestSVMClassifier(XNormalised, y,3) # calling SVM cross-validation accuracy from the function

print("SVM test set accuracy: ",end='')


bestSVMClassifier(XNormalised, y,4) # calling SVM test set accuracy from the function

print()

print("RF best n_estimators: ",end='')


bestRFClassifier(XNormalised, y,1) # calling RF best n_estimators from the function

print("RF best max_leaf_nodes: ",end='')


bestRFClassifier(XNormalised, y,2) # calling RF best max leaf nodes from the function

print("RF cross-validation accuracy: ",end='')


bestRFClassifier(XNormalised, y,3) # calling RF cross-validation accuracy from the function

print("RF test set accuracy: ",end='')


bestRFClassifier(XNormalised, y,4) # calling RF test accuracy from the function

print("RF test set macro average F1: ",end='')


bestRFClassifier(XNormalised, y,5) # calling macro F1 score from the function

print("RF test set weighted average F1: ",end='')


bestRFClassifier(XNormalised, y,6) # calling weighted F1 score from the function

KNN best k: 3
KNN best p: 1
KNN cross-validation accuracy: 0.9695
KNN test set accuracy: 0.9543

SVM best C: 5
SVM best gamma: 0.1
SVM cross-validation accuracy: 0.9676
SVM test set accuracy: 0.9714

RF best n_estimators: 150
RF best max_leaf_nodes: 6
RF cross-validation accuracy: 0.9675
RF test set accuracy: 0.9657
RF test set macro average F1: 0.9628
RF test set weighted average F1: 0.9661
