Professional Documents
Culture Documents
QUIZ Week 2 CART Practice PDF
QUIZ Week 2 CART Practice PDF
QUIZ Week 2 CART Practice PDF
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
In [61]:
# Read heart.csv (path is relative to the kernel's working directory) into a DataFrame.
df=pd.read_csv('heart.csv')
In [62]:
# Preview the first rows to sanity-check the column names and value formats.
df.head()
Out[62]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
In [63]:
# (rows, columns) of the loaded frame.
df.shape
Out[63]:
(303, 14)
In [64]:
# Per-column dtype and non-null count; used here to check for missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 303 non-null int64
1 sex 303 non-null int64
2 cp 303 non-null int64
3 trestbps 303 non-null int64
4 chol 303 non-null int64
5 fbs 303 non-null int64
6 restecg 303 non-null int64
7 thalach 303 non-null int64
8 exang 303 non-null int64
9 oldpeak 303 non-null float64
10 slope 303 non-null int64
11 ca 303 non-null int64
12 thal 303 non-null int64
13 target 303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
In [65]:
# Frequency table for every column of df (features and target).
# One loop over df.columns replaces fourteen copy-pasted print calls, so a
# renamed or added column cannot be forgotten, and every table is followed by
# the same blank-line separator (the original only separated the first five).
for col in df.columns:
    print(col, '\n', df[col].value_counts())
    print('\n')
age
58 19
57 17
54 16
59 14
52 13
51 12
62 11
44 11
60 11
56 11
64 10
41 10
63 9
67 9
55 8
45 8
42 8
53 8
61 8
65 8
43 8
66 7
50 7
48 7
46 7
49 5
47 5
39 4
35 4
68 4
70 4
40 3
71 3
69 3
38 3
34 2
37 2
77 1
76 1
74 1
29 1
Name: age, dtype: int64
sex
1 207
0 96
Name: sex, dtype: int64
cp
0 143
2 87
1 50
3 23
Name: cp, dtype: int64
trestbps
120 37
130 36
140 32
110 19
150 17
138 13
128 12
125 11
160 11
112 9
132 8
118 7
135 6
108 6
124 6
145 5
134 5
152 5
122 4
170 4
100 4
142 3
115 3
136 3
105 3
180 3
126 3
102 2
94 2
144 2
178 2
146 2
148 2
129 1
165 1
101 1
174 1
104 1
172 1
106 1
156 1
164 1
192 1
114 1
155 1
117 1
154 1
123 1
200 1
Name: trestbps, dtype: int64
chol
234 6
204 6
197 6
269 5
212 5
..
278 1
281 1
284 1
290 1
564 1
Name: chol, Length: 152, dtype: int64
fbs
0 258
1 45
Name: fbs, dtype: int64
restecg
1 152
0 147
2 4
Name: restecg, dtype: int64
thalach
162 11
160 9
163 9
173 8
152 8
..
129 1
128 1
127 1
124 1
71 1
Name: thalach, Length: 91, dtype: int64
exang
0 204
1 99
Name: exang, dtype: int64
oldpeak
0.0 99
1.2 17
0.6 14
1.0 14
0.8 13
1.4 13
0.2 12
1.6 11
1.8 10
0.4 9
2.0 9
0.1 7
2.6 6
2.8 6
1.9 5
1.5 5
0.5 5
3.0 5
2.2 4
3.6 4
2.4 3
3.4 3
4.0 3
0.9 3
0.3 3
2.3 2
4.2 2
1.1 2
2.5 2
3.2 2
6.2 1
2.1 1
4.4 1
1.3 1
3.1 1
0.7 1
5.6 1
3.8 1
2.9 1
3.5 1
Name: oldpeak, dtype: int64
slope
2 142
1 140
0 21
Name: slope, dtype: int64
ca
0 175
1 65
2 38
3 20
4 5
Name: ca, dtype: int64
thal
2 166
3 117
1 18
0 2
Name: thal, dtype: int64
target
1 165
0 138
Name: target, dtype: int64
In [66]:
# Class balance of the target expressed as proportions instead of raw counts.
df.target.value_counts(normalize=True)
Out[66]:
1 0.544554
0 0.455446
Name: target, dtype: float64
In [67]:
# Class counts, then each class's share of all rows.
# The shares are derived from the data instead of the hand-typed counts 165
# and 138, so they stay correct if the dataset changes. The label is built with
# real %-formatting ('%%' is a literal percent sign), which yields the same
# "%1s" / "%0s" tags ("percentage of 1s / 0s") the original printed as raw text.
target_counts = df.target.value_counts()
print(target_counts)
for label, share in (target_counts / target_counts.sum()).items():
    print('%%%ds' % label, share)
1 165
0 138
Name: target, dtype: int64
%1s 0.5445544554455446
%0s 0.45544554455445546
In [68]:
# Separate the feature matrix (x) from the label vector (y).
# y is read with plain column indexing rather than df.pop("target"): pop
# removes the column from df in place, so re-running this cell raised KeyError
# on the drop() call and any later use of df silently lacked its target column.
x = df.drop("target", axis=1)
y = df["target"]
x.head()
Out[68]:
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
In [69]:
# A single feature column is a 1-D Series with one entry per row.
x['age'].shape
Out[69]:
(303,)
splitting data
In [70]:
from sklearn.model_selection import train_test_split
# Hold out a test set; random_state=0 fixes the shuffle so the split is reproducible.
# NOTE(review): an integer test_size is an absolute number of rows, so this
# holds out 30 rows (roughly 10% of the data), not 30% — confirm that is intended
# (test_size=0.30 would request 30%).
x_train,x_test,train_labels,test_labels= train_test_split(x,y,test_size=30,random_state=0)
In [71]:
# Shapes of the four split pieces, followed by a row-count check.
# The total is computed from the splits themselves; the original printed the
# hand-typed sum 273+30, which would keep showing 303 even if the split changed.
print('x_train',x_train.shape)
print('x_test',x_test.shape)
print('train_labels',train_labels.shape)
print('test_labels',test_labels.shape)
print('Total obs', len(x_train)+len(x_test))
x_train (273, 13)
x_test (30, 13)
train_labels (273,)
test_labels (30,)
Total obs 303
Out[73]:
DecisionTreeClassifier(max_depth=7, random_state=0)
In [74]:
from sklearn import tree

# Display names for the two target classes, in ascending class order (0, 1).
train_char_label=['No','Yes']

# Write dt_model to a Graphviz .dot file.
# - The with-block guarantees the file is flushed and closed, even on error.
#   The original ended with `Tree_file.close` (no parentheses), which only
#   references the method and never closes the handle.
# - The raw string keeps the Windows path character-for-character while
#   avoiding the invalid "\H" escape sequence.
with open(r'H:\Heart_tree.dot','w') as Tree_file:
    # export_graphviz returns None when out_file is supplied; dot_data is kept
    # only because the original cell assigned it.
    dot_data=tree.export_graphviz(dt_model,
                                  out_file=Tree_file,
                                  feature_names=list(x_train),
                                  class_names=list(train_char_label))
Out[74]:
<function TextIOWrapper.close()>
In [75]:
# Feature importances of dt_model, one row per feature, largest first.
# The end of this statement was cut off in the source (unbalanced parentheses);
# the `'Imp', ascending=False` arguments are reconstructed from the printed
# table, which is ordered by descending Imp.
print(pd.DataFrame(dt_model.feature_importances_, columns=["Imp"], index=x_train.columns).sort_values('Imp', ascending=False))
Imp
cp 0.270042
oldpeak 0.148047
ca 0.143606
age 0.087232
thalach 0.079384
exang 0.061583
trestbps 0.059199
slope 0.055828
chol 0.041199
sex 0.036754
thal 0.017125
fbs 0.000000
restecg 0.000000
In [76]:
# Predicted class labels for the held-out rows, from dt_model.
y_predict=dt_model.predict(x_test)
In [77]:
# Sanity check: one prediction per test row.
y_predict.shape
Out[77]:
(30,)
In [78]:
from sklearn.metrics import classification_report,confusion_matrix
In [79]:
# Confusion matrix on the training split (rows = true class, columns = predicted class).
# NOTE(review): ytrain_predict is not assigned in any visible cell before this
# one — it relies on kernel state from a cell that is missing or runs later.
confusion_matrix(train_labels,ytrain_predict)
Out[79]:
array([[100, 22],
[ 14, 137]], dtype=int64)
In [80]:
# Mean accuracy of reg_dt_model on the training split.
# NOTE(review): reg_dt_model is not defined in any visible cell before this
# one (its repr only appears in a later output) — confirm the cell order
# survives Restart & Run All.
reg_dt_model.score(x_train,train_labels)
Out[80]:
0.8681318681318682
In [81]:
# Per-class precision, recall, F1 and support on the training split.
print(classification_report(train_labels, ytrain_predict))
precision recall f1-score support
Out[22]:
DecisionTreeClassifier(max_depth=7, min_samples_leaf=10, min_samples_split=15,
random_state=0)
In [23]:
# Generating new Tree
# Write reg_dt_model to a Graphviz .dot file.
# - The export call was cut off mid-keyword in the source; its tail is
#   reconstructed to mirror the earlier dt_model export
#   (class_names=list(train_char_label)).
# - The with-block closes the file even if the export raises, and the raw
#   string keeps the Windows path unchanged while avoiding the invalid "\H"
#   escape sequence.
with open(r'H:\Heart_Tree_regularize.dot','w') as Heart_Tree_regularize:
    dot_data= tree.export_graphviz(reg_dt_model,
                                   out_file=Heart_Tree_regularize,
                                   feature_names=list(x_train),
                                   class_names=list(train_char_label))
# export_graphviz returns None when out_file is supplied, so this displays nothing.
dot_data
In [24]:
# Feature importances of reg_dt_model, one row per feature, largest first.
# The end of this statement was cut off in the source after `.sort_values('Imp'`;
# `ascending=False` is reconstructed from the printed table, which is ordered
# by descending Imp.
print(pd.DataFrame(reg_dt_model.feature_importances_,columns=['Imp'],index=x_train.columns).sort_values('Imp',ascending=False))
Imp
cp 0.422485
ca 0.187972
oldpeak 0.133907
exang 0.096348
sex 0.057503
chol 0.037751
age 0.035525
thal 0.015018
thalach 0.013491
trestbps 0.000000
fbs 0.000000
restecg 0.000000
slope 0.000000
Out[27]:
array([0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
0, 0, 1, 1, 0, 1, 1, 1], dtype=int64)
In [28]:
# Display the stored training-split predictions.
# NOTE(review): the assignment of ytrain_predict is not in the visible cells.
ytrain_predict
Out[28]:
array([0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 1, 0], dtype=int64)
In [29]:
# Class-membership probabilities for every test row: one row per sample,
# one column per class.
ytest_predict_prob=reg_dt_model.predict_proba(x_test)
ytest_predict_prob
Out[29]:
array([[0.89473684, 0.10526316],
[0.28571429, 0.71428571],
[0.8 , 0.2 ],
[1. , 0. ],
[0.28571429, 0.71428571],
[0.11111111, 0.88888889],
[0.89473684, 0.10526316],
[1. , 0. ],
[0.78571429, 0.21428571],
[1. , 0. ],
[0.28571429, 0.71428571],
[0.3 , 0.7 ],
[1. , 0. ],
[0. , 1. ],
[0. , 1. ],
[0.2 , 0.8 ],
[1. , 0. ],
[0. , 1. ],
[0.9 , 0.1 ],
[0.26666667, 0.73333333],
[0.3 , 0.7 ],
[0.11111111, 0.88888889],
[0.8 , 0.2 ],
[0.9 , 0.1 ],
[0. , 1. ],
[0. , 1. ],
[1. , 0. ],
[0.11111111, 0.88888889],
[0.2 , 0.8 ],
[0. , 1. ]])
In [30]:
# Same probabilities as a DataFrame for a readable preview; the column labels
# 0 and 1 are positional, not named.
pd.DataFrame(ytest_predict_prob).head()
Out[30]:
0 1
0 0.894737 0.105263
1 0.285714 0.714286
2 0.800000 0.200000
3 1.000000 0.000000
4 0.285714 0.714286
Model Evaluation
# calculate AUC
from sklearn.metrics import roc_auc_score
# NOTE(review): `probs` is not assigned in any visible cell — presumably the
# positive-class column of a predict_proba result; confirm which model and
# which split it came from, since it is scored against test_labels here.
auc=roc_auc_score(test_labels,probs)
print('AUC:%.3f'%auc)
# calculate the ROC curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_labels, probs)
# Dashed diagonal from (0, 0) to (1, 1).
# NOTE(review): `plt` is not imported in the visible cells, and no call that
# draws (fpr, tpr) is visible here — the cell appears to be cut off.
plt.plot([0, 1], [0, 1], linestyle='--')
Out[35]:
array([[100, 22],
[ 14, 137]], dtype=int64)
In [36]:
# Train data accuracy: mean accuracy of reg_dt_model on the training split.
reg_dt_model.score(x_train,train_labels)
Out[36]:
0.8681318681318682
In [37]:
# Cross-check score() against the confusion matrix:
# accuracy = correct predictions (the diagonal) / all predictions.
# The counts are read from the matrix instead of being retyped by hand, so the
# check cannot drift out of sync with the model.
train_cm = confusion_matrix(train_labels, ytrain_predict)
print(np.trace(train_cm)/train_cm.sum())
0.8681318681318682
In [38]:
# Per-class precision, recall, F1 and support on the training split.
print(classification_report(train_labels, ytrain_predict))
precision recall f1-score support
Out[39]:
array([[12, 4],
[ 1, 13]], dtype=int64)
In [40]:
# Test data accuracy: mean accuracy of reg_dt_model on the held-out split.
reg_dt_model.score(x_test,test_labels)
Out[40]:
0.8333333333333334
In [41]:
# Cross-check score() against the confusion matrix:
# accuracy = correct predictions (the diagonal) / all predictions.
# The counts are read from the matrix instead of being retyped by hand, so the
# check cannot drift out of sync with the model.
test_cm = confusion_matrix(test_labels, ytest_predict)
print(np.trace(test_cm)/test_cm.sum())
0.8333333333333334
In [42]:
# Per-class precision, recall, F1 and support on the held-out split.
# NOTE(review): the assignment of ytest_predict is not in the visible cells.
print(classification_report(test_labels, ytest_predict))
precision recall f1-score support
accuracy 0.83 30
macro avg 0.84 0.84 0.83 30
weighted avg 0.85 0.83 0.83 30
Conclusion
Accuracy on the training data: 87%
Accuracy on the test data: 83%
AUC on the Training Data: 93.8%
AUC on the Test: 78.6%
In [ ]:
In [ ]:
In [ ]: