Download as pdf or txt
Download as pdf or txt
You are on page 1of 8

#Vince Jayson Lahica 01 - DA - STM

#import python libraries


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import seaborn as sns

# Load the dataset from a CSV file; the path is relative to the current
# working directory.
data = pd.read_csv("Copy of Diabetes2.csv")

# Preview the first rows of the loaded frame.
data.head()

pregnant glucose bp skin insulin bmi pedigree age label


0 6 148 72 35 0 34 1 50 1
1 1 85 66 29 0 27 0 31 0
2 8 183 64 0 0 23 1 32 1
3 1 89 66 23 94 28 0 21 0
4 0 137 40 35 168 43 2 33 1

#displays the rows and columns of data


data.shape

(768, 9)

#displays the data types of the attributes


data.dtypes

pregnant int64
glucose int64
bp int64
skin int64
insulin int64
bmi int64
pedigree int64
age int64
label int64
dtype: object

#display the statistical information of dataset


data.describe().transpose()

count mean std min 25% 50% 75%


max
pregnant 768.0 3.845052 3.369578 0.0 1.0 3.0 6.00
17.0
glucose 768.0 120.894531 31.972618 0.0 99.0 117.0 140.25
199.0
bp 768.0 69.105469 19.355807 0.0 62.0 72.0 80.00
122.0
skin 768.0 20.536458 15.952218 0.0 0.0 23.0 32.00
99.0
insulin 768.0 79.799479 115.244002 0.0 0.0 30.5 127.25
846.0
bmi 768.0 32.046875 7.888095 0.0 27.0 32.0 37.00
67.0
pedigree 768.0 0.373698 0.510322 0.0 0.0 0.0 1.00
2.0
age 768.0 33.240885 11.760232 21.0 24.0 29.0 41.00
81.0
label 768.0 0.348958 0.476951 0.0 0.0 0.0 1.00
1.0

#storing the predictor variables (X): pregnant, insulin, bmi, age, glucose, bp, pedigree
#('skin' is the only non-label column of data that is not selected)

#storing the target variable (y): label

target_variable = data.label
predictor_variables = data[['pregnant', 'insulin', 'bmi',
'age','glucose','bp','pedigree']]
X = predictor_variables # Features
y =target_variable # Target variable

#displaying the first 5 records of predictor_variables


X.head()

pregnant insulin bmi age glucose bp pedigree


0 6 0 34 50 148 72 1
1 1 0 27 31 85 66 0
2 8 0 23 32 183 64 1
3 1 94 28 21 89 66 0
4 0 168 43 33 137 40 2

#displaying the first 5 records of target_variable


y.head()

0 1
1 0
2 1
3 0
4 1
Name: label, dtype: int64

# Split the dataset into train (75%) and test (25%) subsets.
# random_state is fixed so the same rows land in each subset on every run.
X_train, X_test, y_train, y_test = train_test_split(
    predictor_variables,
    target_variable,
    test_size=0.25,
    random_state=0,
)

# Build the model, then fit it on the training subset.
# max_iter is raised above the default because the lbfgs solver stopped at
# its iteration limit on these unscaled features (ConvergenceWarning).
# NOTE: outputs recorded with the default limit may differ slightly from
# what the converged model produces.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

C:\Users\samsung\anaconda3\Lib\site-packages\sklearn\linear_model\
_logistic.py:458: ConvergenceWarning: lbfgs failed to converge
(status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as


shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:

https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(

LogisticRegression()

#displaying the number of columns in X_train dataset


X_train.shape[1]

#displaying the names of columns in X_train dataset


X_train.columns

Index(['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp',


'pedigree'], dtype='object')

#displaying the number of rows in X_train - 75% of the dataset: 768 x 0.75 = 576


X_train.shape[0]

576

#displaying the values of the 75% training split (576 rows) of X_train


X_train

pregnant insulin bmi age glucose bp pedigree


762 9 0 23 33 89 62 0
127 1 94 33 23 118 58 0
564 0 0 32 27 91 80 1
375 12 325 39 58 140 82 1
663 9 130 38 40 145 80 1
.. ... ... ... ... ... .. ...
763 10 180 33 63 101 76 0
192 7 0 30 36 159 66 0
629 4 0 25 21 94 65 0
559 11 0 30 35 85 74 0
684 5 0 0 69 136 82 1
[576 rows x 7 columns]

#displaying the number of rows in y_train dataset


y_train.shape[0]

576

#displaying the values of the 75% training split (576 rows) of y_train


y_train

762 0
127 0
564 0
375 1
663 1
..
763 0
192 1
629 0
559 0
684 0
Name: label, Length: 576, dtype: int64

#displaying the number of rows in X_test - 25% of the dataset: 768 x 0.25 = 192


X_test.shape[0]

192

#displaying the first 5 rows of the 25% test split (192 rows) of X_test


X_test.head()

pregnant insulin bmi age glucose bp pedigree


661 1 0 43 22 199 76 1
122 2 100 34 23 107 74 0
113 4 0 34 25 76 62 0
14 5 175 26 51 166 72 1
529 0 0 25 31 111 65 1

#displaying the number of rows in y_test - 25% of the dataset: 768 x 0.25 = 192


y_test.shape[0]

192

#displaying the values of the 25% test split (192 rows) of y_test


y_test

661 1
122 0
113 0
14 1
529 0
..
366 1
301 1
382 0
140 0
463 0
Name: label, Length: 192, dtype: int64

#make a prediction
y_pred=logreg.predict(X_test)

#predicted labels for the rows of X_test


y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1,
1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
1,
1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1,
0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0,
1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

# Interactive prediction loop: read one value per predictor from the user,
# in the same column order the model was fitted with (pregnant, insulin,
# bmi, age, glucose, bp, pedigree), and print the predicted label.
# Typing "quit" at the first prompt ends the loop.
while True:
    # input() is the Python 3 name; raw_input() exists only in Python 2 and
    # raises NameError here.
    Pregnancy = input('Enter Num of Pregnancy (or "quit" to quit): ')
    if Pregnancy == 'quit':
        break
    Insulin = input('Enter insulin value')
    Bmi = input('Enter BMI ')
    Age = input('Enter Age ')
    Glucose = input('Enter Glucose ')
    Bp = input('Enter BP value')
    Pedigree = input('Enter Pedigree value')

    try:
        features = [
            float(Pregnancy),
            float(Insulin),
            float(Bmi),
            float(Age),
            float(Glucose),
            float(Bp),
            float(Pedigree),
        ]
    except ValueError:
        # A non-numeric entry should not abort the session; ask again.
        print("Please enter numeric values only.")
        continue

    # The model was fitted on a DataFrame, so pass the row with the same
    # column names to avoid scikit-learn's missing-feature-names warning.
    sample = pd.DataFrame([features], columns=logreg.feature_names_in_)
    Sdata = logreg.predict(sample)
    print("The Predicted Diabetes Value is : ", Sdata)

----------------------------------------------------------------------
-----
NameError Traceback (most recent call
last)
Cell In[24], line 2
1 while True:
----> 2 Pregnancy = raw_input('Enter Num of Pregnancy (or "quit"
to quit): ')
3 if Pregnancy=='quit': break
4 Insulin = raw_input('Enter insulin value')

NameError: name 'raw_input' is not defined

# Confusion matrix: cross-tabulate actual vs. predicted labels and draw it
# as an annotated heatmap.
# The table is stored as confusion_df so that it does not overwrite the
# confusion_matrix function imported from sklearn.metrics.
confusion_df = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual Values'],
    colnames=['Predicted Values'],
)
# fmt='d' prints the cell counts as plain integers rather than in the
# default general float format.
sns.heatmap(confusion_df, annot=True, fmt='d')

<Axes: xlabel='Predicted Values', ylabel='Actual Values'>

# Score the test-set predictions, then report every metric in one pass.

# accuracy score
accuracy = accuracy_score(y_test, y_pred)
# precision: tp / (tp + fp)
precision = precision_score(y_test, y_pred)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
# f1: 2*R*P / (R+P), equivalently 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)

# Each pair is (format string, value); printed in the order computed above.
for template, score in (
    ('Accuracy: %f', accuracy),
    ('Precision: %f', precision),
    ('Recall: %f', recall),
    ('F1 score: %f', f1),
):
    print(template % score)

Accuracy: 0.791667
Precision: 0.729167
Recall: 0.564516
F1 score: 0.636364

# Print a text report of precision, recall, f1-score and support for each
# class, followed by the overall accuracy and the macro/weighted averages.
from sklearn.metrics import classification_report


print(classification_report(y_test, y_pred))

precision recall f1-score support

0 0.81 0.90 0.85 130


1 0.73 0.56 0.64 62

accuracy 0.79 192


macro avg 0.77 0.73 0.75 192
weighted avg 0.79 0.79 0.78 192

# Plot the ROC curve, another common tool used with binary classifiers.
# The dashed red diagonal represents the ROC curve of a purely random
# classifier; a good classifier stays as far away from that line as
# possible (toward the top-left corner).
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Probability of the positive class (second column of predict_proba).
positive_proba = logreg.predict_proba(X_test)[:, 1]

# Compute the AUC from the same probability scores that draw the curve.
# Scoring the hard 0/1 output of predict() instead would give the area of a
# single-threshold curve, which does not match the plotted line.
logit_roc_auc = roc_auc_score(y_test, positive_proba)
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show() so the written file contains the drawn figure.
plt.savefig('Log_ROC')
plt.show()

You might also like