Import As From Import From Import From Import From Import From Import From Import From Import From Import From Import From Import Import As

#Vince Jayson Lahica 01 - DA - STM
#import python libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import seaborn as sns
data = pd.read_csv("Copy of Diabetes2.csv")
data.head()
pregnant glucose bp skin insulin bmi pedigree age label

0 6 148 72 35 0 34 1 50 1
1 1 85 66 29 0 27 0 31 0
2 8 183 64 0 0 23 1 32 1
3 1 89 66 23 94 28 0 21 0
4 0 137 40 35 168 43 2 33 1
#displays the rows and columns of data

data.shape
(768, 9)
#displays the types of attributes

data.dtypes
pregnant int64
glucose int64
bp int64
skin int64
insulin int64
bmi int64
pedigree int64
age int64
label int64
dtype: object
#display the statistical information of dataset

data.describe().transpose()
count mean std min 25% 50% 75%

max
pregnant 768.0 3.845052 3.369578 0.0 1.0 3.0 6.00
17.0
glucose 768.0 120.894531 31.972618 0.0 99.0 117.0 140.25
199.0
bp 768.0 69.105469 19.355807 0.0 62.0 72.0 80.00
122.0
skin 768.0 20.536458 15.952218 0.0 0.0 23.0 32.00
99.0
insulin 768.0 79.799479 115.244002 0.0 0.0 30.5 127.25
846.0
bmi 768.0 32.046875 7.888095 0.0 27.0 32.0 37.00
67.0
pedigree 768.0 0.373698 0.510322 0.0 0.0 0.0 1.00
2.0
age 768.0 33.240885 11.760232 21.0 24.0 29.0 41.00
81.0
label 768.0 0.348958 0.476951 0.0 0.0 0.0 1.00
1.0
#storing the predictor variables (X): pregnant, insulin, bmi, age, glucose, bp, pedigree
#(the skin column is not used as a feature)

#storing the target variable (y): label
feature_columns = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']
predictor_variables = data[feature_columns]
target_variable = data['label']
X = predictor_variables # Features
y = target_variable # Target variable
#displaying the first 5 records of predictor_variables

X.head()
pregnant insulin bmi age glucose bp pedigree

0 6 0 34 50 148 72 1
1 1 0 27 31 85 66 0
2 8 0 23 32 183 64 1
3 1 94 28 21 89 66 0
4 0 168 43 33 137 40 2
#displaying the first 5 records of target_variable

y.head()
0 1
1 0
2 1
3 0
4 1
Name: label, dtype: int64
#splitting the dataset in the train and test data

# test_size=0.25 holds out a quarter of the rows (192 of 768) for testing;
# random_state=0 fixes the shuffle seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    predictor_variables, target_variable, test_size=0.25, random_state=0)
#building the model then fitting it on the training data
# max_iter is raised above the library default because lbfgs stopped at its
# iteration limit on these unscaled features (ConvergenceWarning).
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
C:\Users\samsung\anaconda3\Lib\site-packages\sklearn\linear_model\
_logistic.py:458: ConvergenceWarning: lbfgs failed to converge
(status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as

shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-
regression
n_iter_i = _check_optimize_result(
LogisticRegression()
#displaying the number of columns in X_train dataset

X_train.shape[1]
#displaying the names of columns in X_train dataset

X_train.columns
Index(['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp',

'pedigree'], dtype='object')
#displaying the number of rows in the 75% training split - 768 x 0.75 = 576 X_train rows

X_train.shape[0]
576
#displaying the values of the 75% training split (576 rows) of X_train

X_train

762 9 0 23 33 89 62 0
127 1 94 33 23 118 58 0
564 0 0 32 27 91 80 1
375 12 325 39 58 140 82 1
663 9 130 38 40 145 80 1
.. ... ... ... ... ... .. ...
763 10 180 33 63 101 76 0
192 7 0 30 36 159 66 0
629 4 0 25 21 94 65 0
559 11 0 30 35 85 74 0
684 5 0 0 69 136 82 1
[576 rows x 7 columns]
#displaying the number of rows in y_train dataset

y_train.shape[0]
576
#displaying the values of the 75% training split (576 rows) of y_train

y_train
762 0
127 0
564 0
375 1
663 1
..
763 0
192 1
629 0
559 0
684 0
Name: label, Length: 576, dtype: int64
#displaying the number of rows in the 25% test split - 768 x 0.25 = 192 X_test rows

X_test.shape[0]
192
#displaying the first 5 rows of the 25% test split (192 rows) of X_test

X_test.head()

661 1 0 43 22 199 76 1
122 2 100 34 23 107 74 0
113 4 0 34 25 76 62 0
14 5 175 26 51 166 72 1
529 0 0 25 31 111 65 1
#displaying the number of rows in the 25% test split - 768 x 0.25 = 192 y_test rows

y_test.shape[0]
192
#displaying the values of the 25% test split (192 rows) of y_test

y_test
661 1
122 0
113 0
14 1
529 0
..
366 1
301 1
382 0
140 0
463 0
Name: label, Length: 192, dtype: int64
#make a prediction
y_pred=logreg.predict(X_test)
#predicted values of y_test

y_pred
array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1,
1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
1,
1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1,
0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0,
1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0], dtype=int64)
# Interactive prediction loop: prompt for one patient's measurements, print the
# model's predicted label, and repeat. Typing "quit" at the first prompt exits.
while True:
    # input() replaces Python 2's raw_input(), which raises NameError on Python 3.
    Pregnancy = input('Enter Num of Pregnancy (or "quit" to quit): ')
    if Pregnancy == 'quit':
        break
    Insulin = input('Enter insulin value')
    Bmi = input('Enter BMI ')
    Age = input('Enter Age ')
    Glucose = input('Enter Glucose ')
    Bp = input('Enter BP value')
    Pedigree = input('Enter Pedigree value')
    try:
        # Values are listed in the same order as the training columns:
        # pregnant, insulin, bmi, age, glucose, bp, pedigree.
        row = [float(Pregnancy), float(Insulin), float(Bmi), float(Age),
               float(Glucose), float(Bp), float(Pedigree)]
    except ValueError:
        # A non-numeric entry should not end the session; ask again instead.
        print("Please enter numeric values only.")
        continue
    # Wrap the row in a DataFrame carrying the training column names so the
    # model receives the same feature names it was fitted with.
    Sdata = logreg.predict(pd.DataFrame([row], columns=X_train.columns))
    print("The Predicted Diabetes Value is : ", Sdata)
----------------------------------------------------------------------
-----
NameError Traceback (most recent call
last)
Cell In[24], line 2
1 while True:
----> 2 Pregnancy = raw_input('Enter Num of Pregnancy (or "quit"
to quit): ')
3 if Pregnancy=='quit': break
4 Insulin = raw_input('Enter insulin value')
NameError: name 'raw_input' is not defined
#confusion matrix
# Cross-tabulate actual vs. predicted labels and draw the counts as a heatmap.
# NOTE(review): this assignment rebinds the name of the confusion_matrix
# function imported from sklearn.metrics; the name is kept here so later
# references to the table keep working.
confusion_matrix = pd.crosstab(y_test, y_pred,
                               rownames=['Actual Values'],
                               colnames=['Predicted Values'])
# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation once a count reaches three digits.
sns.heatmap(confusion_matrix, annot=True, fmt='d')
<Axes: xlabel='Predicted Values', ylabel='Actual Values'>
# Score the test-set predictions with four binary-classification metrics.
#accuracy score
accuracy = accuracy_score(y_test, y_pred)
# precision: tp / (tp + fp)
precision = precision_score(y_test, y_pred)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
# f1: 2*R*P / (R+P) or 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)

# Report each score on its own line.
print('Accuracy: %f' % accuracy)
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 score: %f' % f1)
Accuracy: 0.791667
Precision: 0.729167
Recall: 0.564516
F1 score: 0.636364
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.81 0.90 0.85 130

1 0.73 0.56 0.64 62
accuracy 0.79 192

macro avg 0.77 0.73 0.75 192
weighted avg 0.79 0.79 0.78 192
#plotting the ROC curve from the predicted probabilities

#the ROC curve is another common tool used with binary classifiers
#the dotted line represents the ROC curve of a purely random classifier
#a good classifier stays as far away from that line as possible (toward
#the top-left corner)
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Score the positive class (column 1) with predicted probabilities and use the
# same scores for both the AUC and the curve, so the area shown in the legend
# describes the curve that is actually drawn. Hard 0/1 predictions would give
# the area of a single-threshold curve instead.
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Dashed red diagonal: reference line for a purely random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show() so the written file contains the finished figure.
plt.savefig('Log_ROC')
plt.show()

Import As From Import From Import From Import From Import From Import From Import From Import From Import From Import From Import Import As

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Import As From Import From Import From Import From Import From Import From Import From Import From Import From Import From Import Import As

Uploaded by

Copyright:

Available Formats

#Vince Jayson Lahica 01 - DA - STM

#import python libraries

data = pd.read_csv("Copy of Diabetes2.csv")

pregnant glucose bp skin insulin bmi pedigree age label

#displays the rows and columns of data

#displays tye types of attributes

#display the statistical information of dataset

count mean std min 25% 50% 75%

#storing in predictor variables,(X) are gmat, gpa, work experience

#displaing the 5 records of predictor_variables

pregnant insulin bmi age glucose bp pedigree

#displaing the 5 records of target_variable

#splitting the dataset in the train and test data

Increase the number of iterations (max_iter) or scale the data as

#displaying the number of columns in X_train dataset

#displaying the names of columns in X_train dataset

Index(['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp',

#displaying the 75% of the dataset - 50 x 0.75 = 37 X_train data

#displaying the values of 75% dataset (37) X_train data

pregnant insulin bmi age glucose bp pedigree

#displaying the number of columns in y_train dataset

#displaying the values of 75% dataset (37) y_train data

#displaying the 25% of the dataset - 50 x 0.25 = 13 X_test data

#displaying the values of 25% dataset (13) X_test data

pregnant insulin bmi age glucose bp pedigree

#displaying the 25% of the dataset - 50 x 0.25 = 13 y_test data

#displaying the values of 25% dataset (13) y_test data

#predicted values of y_test

NameError: name 'raw_input' is not defined

<Axes: xlabel='Predicted Values', ylabel='Actual Values'>

# precision: tp / (tp + fp)

# f1: 2*R*P / R+P or 2 tp / (2 tp + fp + fn)

from sklearn.metrics import classification_report

precision recall f1-score support

0 0.81 0.90 0.85 130

accuracy 0.79 192

#plotting the probability of the graph curve

You might also like

# f1: 2RP / R+P or 2 tp / (2 tp + fp + fn)