QUIZ Week 2 CART Practice PDF


In [60]:

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
In [61]:
df=pd.read_csv('heart.csv')
In [62]:
df.head()

Out[62]:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal  target
0   63    1   3       145   233    1        0      150      0      2.3      0   0     1       1
1   37    1   2       130   250    0        1      187      0      3.5      0   0     2       1
2   41    0   1       130   204    0        0      172      0      1.4      2   0     2       1
3   56    1   1       120   236    0        1      178      0      0.8      2   0     2       1
4   57    0   0       120   354    0        1      163      1      0.6      2   0     2       1

In [63]:
df.shape

Out[63]:
(303, 14)
In [64]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 303 non-null int64
1 sex 303 non-null int64
2 cp 303 non-null int64
3 trestbps 303 non-null int64
4 chol 303 non-null int64
5 fbs 303 non-null int64
6 restecg 303 non-null int64
7 thalach 303 non-null int64
8 exang 303 non-null int64
9 oldpeak 303 non-null float64
10 slope 303 non-null int64
11 ca 303 non-null int64
12 thal 303 non-null int64
13 target 303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
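df.info() reports 303 non-null entries in every column, so there are no missing values and all predictors are already numeric. A quick explicit check (a small sketch, not part of the original notebook):

print(df.isnull().sum().sum())   # expected: 0 missing values across all 14 columns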
In [65]:
print('age \n',df.age.value_counts())
print('\n')
print('sex \n',df.sex.value_counts())
print('\n')
print('cp \n',df.cp.value_counts())
print('\n')
print('trestbps \n',df.trestbps.value_counts())
print('\n')
print('chol \n',df.chol.value_counts())
print('\n')
print('fbs \n',df.fbs.value_counts())
print('restecg \n',df.restecg.value_counts())
print('thalach \n',df.thalach.value_counts())
print('exang \n',df.exang.value_counts())
print('oldpeak \n',df.oldpeak.value_counts())
print('slope \n',df.slope.value_counts())
print('ca \n',df.ca.value_counts())
print('thal \n',df.thal.value_counts())
print('target \n',df.target.value_counts())
age
58 19
57 17
54 16
59 14
52 13
51 12
62 11
44 11
60 11
56 11
64 10
41 10
63 9
67 9
55 8
45 8
42 8
53 8
61 8
65 8
43 8
66 7
50 7
48 7
46 7
49 5
47 5
39 4
35 4
68 4
70 4
40 3
71 3
69 3
38 3
34 2
37 2
77 1
76 1
74 1
29 1
Name: age, dtype: int64

sex
1 207
0 96
Name: sex, dtype: int64

cp
0 143
2 87
1 50
3 23
Name: cp, dtype: int64

trestbps
120 37
130 36
140 32
110 19
150 17
138 13
128 12
125 11
160 11
112 9
132 8
118 7
135 6
108 6
124 6
145 5
134 5
152 5
122 4
170 4
100 4
142 3
115 3
136 3
105 3
180 3
126 3
102 2
94 2
144 2
178 2
146 2
148 2
129 1
165 1
101 1
174 1
104 1
172 1
106 1
156 1
164 1
192 1
114 1
155 1
117 1
154 1
123 1
200 1
Name: trestbps, dtype: int64

chol
234 6
204 6
197 6
269 5
212 5
..
278 1
281 1
284 1
290 1
564 1
Name: chol, Length: 152, dtype: int64

fbs
0 258
1 45
Name: fbs, dtype: int64
restecg
1 152
0 147
2 4
Name: restecg, dtype: int64
thalach
162 11
160 9
163 9
173 8
152 8
..
129 1
128 1
127 1
124 1
71 1
Name: thalach, Length: 91, dtype: int64
exang
0 204
1 99
Name: exang, dtype: int64
oldpeak
0.0 99
1.2 17
0.6 14
1.0 14
0.8 13
1.4 13
0.2 12
1.6 11
1.8 10
0.4 9
2.0 9
0.1 7
2.6 6
2.8 6
1.9 5
1.5 5
0.5 5
3.0 5
2.2 4
3.6 4
2.4 3
3.4 3
4.0 3
0.9 3
0.3 3
2.3 2
4.2 2
1.1 2
2.5 2
3.2 2
6.2 1
2.1 1
4.4 1
1.3 1
3.1 1
0.7 1
5.6 1
3.8 1
2.9 1
3.5 1
Name: oldpeak, dtype: int64
slope
2 142
1 140
0 21
Name: slope, dtype: int64
ca
0 175
1 65
2 38
3 20
4 5
Name: ca, dtype: int64
thal
2 166
3 117
1 18
0 2
Name: thal, dtype: int64
target
1 165
0 138
Name: target, dtype: int64
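The per-column frequency tables above could also be produced more compactly with a loop over the columns; a minimal sketch of the same idea:

for col in df.columns:
    print(col, '\n', df[col].value_counts(), '\n')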
In [66]:
df.target.value_counts(normalize=True)

Out[66]:
1 0.544554
0 0.455446
Name: target, dtype: float64
In [67]:
print(df.target.value_counts())
print('%1s',165/(165+138))
print('%0s',138/(165+138))
1 165
0 138
Name: target, dtype: int64
%1s 0.5445544554455446
%0s 0.45544554455445546
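The classes are reasonably balanced (about 54.5% vs 45.5%). The majority-class share is also the accuracy a trivial "always predict 1" model would achieve, i.e. the floor any useful classifier should beat; a short sketch of that baseline (not in the original notebook):

baseline_accuracy = df.target.value_counts(normalize=True).max()   # majority-class proportion
print(round(baseline_accuracy, 4))                                  # ~0.5446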
In [68]:
x= df.drop("target",axis=1)
y= df.pop("target")
x.head()

Out[68]:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal
0   63    1   3       145   233    1        0      150      0      2.3      0   0     1
1   37    1   2       130   250    0        1      187      0      3.5      0   0     2
2   41    0   1       130   204    0        0      172      0      1.4      2   0     2
3   56    1   1       120   236    0        1      178      0      0.8      2   0     2
4   57    0   0       120   354    0        1      163      1      0.6      2   0     2

In [69]:
x['age'].shape

Out[69]:
(303,)

Splitting Data
In [70]:
from sklearn.model_selection import train_test_split
x_train,x_test,train_labels,test_labels= train_test_split(x,y,test_size=30,random_state=0)
In [71]:
print('x_train',x_train.shape)
print('x_test',x_test.shape)
print('train_labels',train_labels.shape)
print('test_labels',test_labels.shape)
print('Total obs', 273+30)
x_train (273, 13)
x_test (30, 13)
train_labels (273,)
test_labels (30,)
Total obs 303
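Note that test_size=30 is an integer, so train_test_split holds out exactly 30 rows rather than 30% of the data. A fractional split of roughly the same size would look like this (a sketch with hypothetical variable names, not from the notebook):

x_train_f, x_test_f, ytrain_f, ytest_f = train_test_split(x, y, test_size=0.1, random_state=0)
print(x_test_f.shape)   # (31, 13): sklearn rounds the 10% test fraction up, so about 30 rows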

Building Decision Tree Classifier


In [72]:
dt_model= DecisionTreeClassifier(criterion='gini',max_depth=7,random_state=0)
In [73]:
dt_model.fit(x_train,train_labels)

Out[73]:
DecisionTreeClassifier(max_depth=7, random_state=0)
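criterion='gini' means each split is chosen to minimise the Gini impurity G = 1 - sum_k p_k^2. As a hedged illustration (not part of the notebook), the impurity of the root node follows directly from the class counts 165/138:

p1, p0 = 165 / 303, 138 / 303          # class proportions at the root node
gini_root = 1 - (p1 ** 2 + p0 ** 2)    # Gini impurity before any split
print(round(gini_root, 4))             # ~0.496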
In [74]:
from sklearn import tree
train_char_label = ['No', 'Yes']
Tree_file = open(r'H:\Heart_tree.dot', 'w')    # raw string so the backslash is not treated as an escape
dot_data = tree.export_graphviz(dt_model,
                                out_file=Tree_file,
                                feature_names=list(x_train),
                                class_names=list(train_char_label))
Tree_file.close()   # note the parentheses: .close without them only returns the method and never closes the file
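The exported .dot file can be rendered with the Graphviz command line (for example dot -Tpng H:\Heart_tree.dot -o Heart_tree.png, assuming Graphviz is installed). Alternatively, the tree can be drawn directly in the notebook without Graphviz; a sketch using sklearn's plot_tree:

import matplotlib.pyplot as plt
plt.figure(figsize=(20, 10))
tree.plot_tree(dt_model,
               feature_names=list(x_train.columns),
               class_names=train_char_label,
               filled=True)     # colour nodes by majority class
plt.show()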
In [75]:
print(pd.DataFrame(dt_model.feature_importances_, columns=["Imp"],
                   index=x_train.columns).sort_values('Imp', ascending=False))
Imp
cp 0.270042
oldpeak 0.148047
ca 0.143606
age 0.087232
thalach 0.079384
exang 0.061583
trestbps 0.059199
slope 0.055828
chol 0.041199
sex 0.036754
thal 0.017125
fbs 0.000000
restecg 0.000000
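feature_importances_ is the impurity-based (mean decrease in Gini) importance, normalised so the values sum to 1; here cp, oldpeak and ca account for over half of the total impurity reduction. A quick sanity check (a sketch):

print(dt_model.feature_importances_.sum())   # expected: 1.0, up to floating-point rounding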
In [76]:
y_predict=dt_model.predict(x_test)
In [77]:
y_predict.shape

Out[77]:
(30,)

Regularising Decision Tree


In [22]:
reg_dt_model = DecisionTreeClassifier(criterion='gini', max_depth=7, min_samples_leaf=10,
                                      min_samples_split=15, random_state=0)
reg_dt_model.fit(x_train, train_labels)

Out[22]:
DecisionTreeClassifier(max_depth=7, min_samples_leaf=10, min_samples_split=15,
random_state=0)
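The pruning parameters (max_depth=7, min_samples_leaf=10, min_samples_split=15) are fixed by hand here. As a hedged alternative, not part of the original notebook, they could be tuned with a cross-validated grid search:

from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [3, 5, 7, 10],
              'min_samples_leaf': [5, 10, 20],
              'min_samples_split': [10, 15, 30]}
grid = GridSearchCV(DecisionTreeClassifier(criterion='gini', random_state=0),
                    param_grid, cv=5, scoring='roc_auc')
grid.fit(x_train, train_labels)
print(grid.best_params_)   # parameter combination with the best cross-validated AUC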
In [23]:
# Generating new Tree
Heart_Tree_regularize = open(r'H:\Heart_Tree_regularize.dot', 'w')
dot_data = tree.export_graphviz(reg_dt_model, out_file=Heart_Tree_regularize,
                                feature_names=list(x_train),
                                class_names=list(train_char_label))
Heart_Tree_regularize.close()
dot_data
In [24]:
print(pd.DataFrame(reg_dt_model.feature_importances_, columns=['Imp'],
                   index=x_train.columns).sort_values('Imp', ascending=False))
Imp
cp 0.422485
ca 0.187972
oldpeak 0.133907
exang 0.096348
sex 0.057503
chol 0.037751
age 0.035525
thal 0.015018
thalach 0.013491
trestbps 0.000000
fbs 0.000000
restecg 0.000000
slope 0.000000

Predicting on Training and Test Dataset


In [25]:
ytrain_predict = reg_dt_model.predict(x_train)
ytest_predict = reg_dt_model.predict(x_test)
In [26]:
print('ytrain_predict',ytrain_predict.shape)
print('ytest_predict',ytest_predict.shape)
ytrain_predict (273,)
ytest_predict (30,)
In [27]:
ytest_predict

Out[27]:
array([0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
0, 0, 1, 1, 0, 1, 1, 1], dtype=int64)
In [28]:
ytrain_predict

Out[28]:
array([0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 1, 0], dtype=int64)
In [29]:
ytest_predict_prob=reg_dt_model.predict_proba(x_test)
ytest_predict_prob
Out[29]:
array([[0.89473684, 0.10526316],
[0.28571429, 0.71428571],
[0.8 , 0.2 ],
[1. , 0. ],
[0.28571429, 0.71428571],
[0.11111111, 0.88888889],
[0.89473684, 0.10526316],
[1. , 0. ],
[0.78571429, 0.21428571],
[1. , 0. ],
[0.28571429, 0.71428571],
[0.3 , 0.7 ],
[1. , 0. ],
[0. , 1. ],
[0. , 1. ],
[0.2 , 0.8 ],
[1. , 0. ],
[0. , 1. ],
[0.9 , 0.1 ],
[0.26666667, 0.73333333],
[0.3 , 0.7 ],
[0.11111111, 0.88888889],
[0.8 , 0.2 ],
[0.9 , 0.1 ],
[0. , 1. ],
[0. , 1. ],
[1. , 0. ],
[0.11111111, 0.88888889],
[0.2 , 0.8 ],
[0. , 1. ]])
In [30]:
pd.DataFrame(ytest_predict_prob).head()

Out[30]:
          0         1
0  0.894737  0.105263
1  0.285714  0.714286
2  0.800000  0.200000
3  1.000000  0.000000
4  0.285714  0.714286
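predict() effectively applies a 0.5 cut-off to the class-1 probabilities above. If a different operating point were wanted, the threshold could be applied by hand; a sketch with a hypothetical cut-off of 0.7:

custom_threshold = 0.7                                                   # hypothetical value
ytest_predict_07 = (ytest_predict_prob[:, 1] >= custom_threshold).astype(int)
print(ytest_predict_07)   # stricter labelling of the positive class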

Model Evaluation

Measuring AUC & ROC Curve


In [31]:
import matplotlib.pyplot as plt

AUC and ROC for the training data


In [32]:
# predict probabilities
probs = reg_dt_model.predict_proba(x_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(train_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(train_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.938

AUC and ROC for the test data


In [33]:
# predict probabilities
probs = reg_dt_model.predict_proba(x_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.786

Confusion matrix for training data


In [34]:
from sklearn.metrics import classification_report,confusion_matrix
In [35]:
confusion_matrix(train_labels,ytrain_predict)

Out[35]:
array([[100, 22],
[ 14, 137]], dtype=int64)
In [36]:
#Train Data Accuracy
reg_dt_model.score(x_train,train_labels)

Out[36]:
0.8681318681318682
In [37]:
print((100+137)/(100+137+14+22))
0.8681318681318682
In [38]:
print(classification_report(train_labels, ytrain_predict))
              precision    recall  f1-score   support

           0       0.88      0.82      0.85       122
           1       0.86      0.91      0.88       151

    accuracy                           0.87       273
   macro avg       0.87      0.86      0.87       273
weighted avg       0.87      0.87      0.87       273

Confusion matrix for test data


In [39]:
confusion_matrix(test_labels, ytest_predict)

Out[39]:
array([[12, 4],
[ 1, 13]], dtype=int64)
In [40]:
#Test Data Accuracy
reg_dt_model.score(x_test,test_labels)

Out[40]:
0.8333333333333334
In [41]:
print((12+13)/(12+13+1+4))
0.8333333333333334
In [42]:
print(classification_report(test_labels, ytest_predict))
              precision    recall  f1-score   support

           0       0.92      0.75      0.83        16
           1       0.76      0.93      0.84        14

    accuracy                           0.83        30
   macro avg       0.84      0.84      0.83        30
weighted avg       0.85      0.83      0.83        30
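These figures can be verified by hand from the test confusion matrix [[12, 4], [1, 13]] (rows = actual 0/1, columns = predicted 0/1); a short sketch:

tn, fp, fn, tp = 12, 4, 1, 13
precision_1 = tp / (tp + fp)                    # 13/17 ≈ 0.76
recall_1 = tp / (tp + fn)                       # 13/14 ≈ 0.93
accuracy = (tp + tn) / (tp + tn + fp + fn)      # 25/30 ≈ 0.83
print(round(precision_1, 2), round(recall_1, 2), round(accuracy, 2))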

Conclusion
Accuracy on the training data: 87%
Accuracy on the test data: 83%
AUC on the training data: 93.8%
AUC on the test data: 78.6%
The drop from training to test performance, particularly the AUC falling from 93.8% to 78.6%, suggests the regularised tree generalises reasonably well but still overfits the training data to some degree.