
Name – JINESH PRAJAPAT    Class – B. Tech, III Year    Branch – AI & DS    Sem – V


Subject – Introduction to Data Science & Machine Learning (AI 354)

Index

Sr. No.    Topic    Date of Done    Date of Checked    Page No.    Sign / Remarks

Experiment 9
Aim – Write a Python program to implement Logistic Regression in Data Science and Machine Learning.
Code –

In [0]: print("Jinesh Prajapat")

Jinesh Prajapat

In [1]: import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]: # import dataset


data = pd.read_csv("Logistic_data.csv")
data.drop(['Unnamed: 32',"id"], axis=1, inplace=True)
data.diagnosis = [1 if each == "M" else 0 for each in data.diagnosis]
y = data.diagnosis.values
x_data = data.drop(['diagnosis'], axis=1)

In [3]: # normalization (min-max scaling of every feature to the [0, 1] range)
x = (x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data)).values
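The same min-max scaling can also be expressed with scikit-learn; a brief optional sketch for comparison (not used by the rest of the notebook):

from sklearn.preprocessing import MinMaxScaler
x_alt = pd.DataFrame(MinMaxScaler().fit_transform(x_data), columns=x_data.columns)
# x_alt should match x above up to floating-point error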

In [4]: # train test split


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

x_train = x_train.T
x_test = x_test.T
y_train = y_train.T
y_test = y_test.T

print("x train: ",x_train.shape)


print("x test: ",x_test.shape)
print("y train: ",y_train.shape)
print("y test: ",y_test.shape)

x train: (30, 483)


x test: (30, 86)
y train: (483,)
y test: (86,)

In [5]: # let's initialize parameters


# the dimension is the number of features (30 for this dataset), one weight per feature
def initialize_weights_and_bias(dimension):
    w = np.full((dimension, 1), 0.01)
    b = 0.0
    return w, b

In [6]: # sigmoid
# calculation of z
# z = np.dot(w.T, x_train) + b
# y_head = sigmoid(5)

def sigmoid(z):
    y_head = 1/(1 + np.exp(-z))
    return y_head

In [7]: #%% forward and backward


# In backward propagation we will use the y_head found in forward propagation
# Therefore, instead of writing a separate backward propagation method, let's combine forward and backward propagation
def forward_backward_propagation(w, b, x_train, y_train):
    # forward propagation
    z = np.dot(w.T, x_train) + b
    y_head = sigmoid(z)
    loss = -y_train*np.log(y_head) - (1-y_train)*np.log(1-y_head)
    # x_train.shape[1] is for scaling
    cost = (np.sum(loss))/x_train.shape[1]
    # backward propagation
    # x_train.shape[1] is for scaling
    derivative_weight = (np.dot(x_train, ((y_head - y_train).T)))/x_train.shape[1]
    # x_train.shape[1] is for scaling
    derivative_bias = np.sum(y_head - y_train)/x_train.shape[1]
    gradients = {"derivative_weight": derivative_weight, "derivative_bias": derivative_bias}
    return cost, gradients
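As an optional sanity check on the implementation above, the analytic gradient can be compared against a one-sided finite difference (a small illustrative sketch using only the functions and data already defined):

eps = 1e-5
w_chk, b_chk = initialize_weights_and_bias(x_train.shape[0])
cost0, grads = forward_backward_propagation(w_chk, b_chk, x_train, y_train)
w_pert = w_chk.copy()
w_pert[0, 0] += eps   # perturb a single weight
cost1, _ = forward_backward_propagation(w_pert, b_chk, x_train, y_train)
print("analytic gradient:", grads["derivative_weight"][0, 0])
print("numeric gradient :", (cost1 - cost0) / eps)   # the two values should be close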

In [8]: #%% Updating (learning) parameters


def update(w, b, x_train, y_train, learning_rate, number_of_iteration):
    cost_list = []
    cost_list2 = []
    index = []
    # updating (learning) parameters number_of_iteration times
    for i in range(number_of_iteration):
        # make forward and backward propagation and find cost and gradients
        cost, gradients = forward_backward_propagation(w, b, x_train, y_train)
        cost_list.append(cost)
        # let's update
        w = w - learning_rate * gradients["derivative_weight"]
        b = b - learning_rate * gradients["derivative_bias"]
        if i % 10 == 0:
            cost_list2.append(cost)
            index.append(i)
            print("Cost after iteration %i: %f" % (i, cost))
    # we have updated (learned) the parameters: weights and bias
    parameters = {"weight": w, "bias": b}
    plt.plot(index, cost_list2)
    plt.xticks(index, rotation='vertical')
    plt.xlabel("Number of Iterations")
    plt.ylabel("Cost")
    plt.show()
    return parameters, gradients, cost_list

In [9]: #%% prediction


def predict(w, b, x_test):
    # x_test is the input for forward propagation
    z = sigmoid(np.dot(w.T, x_test) + b)
    Y_prediction = np.zeros((1, x_test.shape[1]))
    # if z is bigger than 0.5, our prediction is class one (y_head = 1)
    # if z is smaller than 0.5, our prediction is class zero (y_head = 0)
    for i in range(z.shape[1]):
        if z[0, i] <= 0.5:
            Y_prediction[0, i] = 0
        else:
            Y_prediction[0, i] = 1

    return Y_prediction
# predict(parameters["weight"], parameters["bias"], x_test)

In [10]: # %%
def logistic_regression(x_train, y_train, x_test, y_test, learning_rate, num_iterations):
    # initialize
    dimension = x_train.shape[0]  # the number of features (30 for this dataset)
    w, b = initialize_weights_and_bias(dimension)
    # do not change learning rate
    parameters, gradients, cost_list = update(w, b, x_train, y_train, learning_rate, num_iterations)

    y_prediction_test = predict(parameters["weight"], parameters["bias"], x_test)
    y_prediction_train = predict(parameters["weight"], parameters["bias"], x_train)

    # Print train/test errors
    print("train accuracy: {} %".format(100 - np.mean(np.abs(y_prediction_train - y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(y_prediction_test - y_test)) * 100))

logistic_regression(x_train, y_train, x_test, y_test, learning_rate=1, num_iterations=100)

Cost after iteration 0: 0.692836


Cost after iteration 10: 0.498576
Cost after iteration 20: 0.404996
Cost after iteration 30: 0.350059
Cost after iteration 40: 0.313747
Cost after iteration 50: 0.287767
Cost after iteration 60: 0.268114
Cost after iteration 70: 0.252627
Cost after iteration 80: 0.240036
Cost after iteration 90: 0.229543

train accuracy: 94.40993788819875 %


test accuracy: 94.18604651162791 %

In [11]: # sklearn
from sklearn import linear_model
logreg = linear_model.LogisticRegression(random_state=42, max_iter=150)
print("test accuracy: {} ".format(logreg.fit(x_train.T, y_train.T).score(x_test.T, y_test.T)))
print("train accuracy: {} ".format(logreg.fit(x_train.T, y_train.T).score(x_train.T, y_train.T)))

test accuracy: 0.9767441860465116


train accuracy: 0.968944099378882
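For a fuller picture than accuracy alone, the fitted sklearn model can also be evaluated with a confusion matrix and a classification report (an optional sketch; both metrics come from sklearn.metrics):

from sklearn.metrics import confusion_matrix, classification_report
y_pred_lr = logreg.predict(x_test.T)
print(confusion_matrix(y_test.T, y_pred_lr))
print(classification_report(y_test.T, y_pred_lr))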

In [ ]:

Experiment 10
Aim – Create a Machine Learning Model using the Support Vector Machine algorithm.

Code –
In [45]: print("Jinesh Prajapat")

Jinesh Prajapat

In [46]: import warnings


warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn import svm


from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report

In [47]: #loading data


data_df = pd.read_csv("heart_failure_clinical_records_dataset.csv")
data_df.head()

Out[47]:    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  high_blood_pressure  platelets  serum...
         0  75.0       0                        582         0                 20                    1  265000.00
         1  55.0       0                       7861         0                 38                    0  263358.03
         2  65.0       0                        146         0                 20                    0  162000.00
         3  50.0       1                        111         0                 20                    0  210000.00
         4  65.0       1                        160         1                 20                    0  327000.00

In [48]: # Checking for any missing values across the dataset


data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype

 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64
 2   creatinine_phosphokinase  299 non-null    int64
 3   diabetes                  299 non-null    int64
 4   ejection_fraction         299 non-null    int64
 5   high_blood_pressure       299 non-null    int64
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64
 9   sex                       299 non-null    int64
 10  smoking                   299 non-null    int64
 11  time                      299 non-null    int64
 12  DEATH_EVENT               299 non-null    int64
dtypes: float64(3), int64(10)
memory usage: 30.5 KB

In [49]: #Evaluating the target and finding out the potential skewness in the data
cols= ["#CD5C5C","#FF0000"]
ax = sns.countplot(x= data_df["DEATH_EVENT"], palette= cols)
ax.bar_label(ax.containers[0])

Out[49]: [Text(0, 0, '203'), Text(0, 0, '96')]

In [50]: # Doing Univariate Analysis for statistical description and understanding of dispersion of data
data_df.describe().T

count mean std min 25% 50% 75% max

age 299.0 60.833893 11.894809 40.0 51.0 60.0 70.0 95.0

anaemia 299.0 0.431438 0.496107 0.0 0.0 0.0 1.0 1.0

creatinine_phosphokinase 299.0 581.839465 970.287881 23.0 116.5 250.0 582.0 7861.0

diabetes 299.0 0.418060 0.494067 0.0 0.0 0.0 1.0 1.0

ejection_fraction 299.0 38.083612 11.834841 14.0 30.0 38.0 45.0 80.0

high_blood_pressure 299.0 0.351171 0.478136 0.0 0.0 0.0 1.0 1.0

platelets 299.0 263358.029264 97804.236869 25100.0 212500.0 262000.0 303500.0 850000.0

serum_creatinine 299.0 1.393880 1.034510 0.5 0.9 1.1 1.4 9.4

serum_sodium 299.0 136.625418 4.412477 113.0 134.0 137.0 140.0 148.0

sex 299.0 0.648829 0.478136 0.0 0.0 1.0 1.0 1.0

smoking 299.0 0.321070 0.467670 0.0 0.0 0.0 1.0 1.0

time 299.0 130.260870 77.614208 4.0 73.0 115.0 203.0 285.0

DEATH_EVENT 299.0 0.321070 0.467670 0.0 0.0 0.0 1.0 1.0



In [51]: # Doing Bivariate Analysis by examining a correlation matrix of all the features using a heatmap
cmap = sns.diverging_palette(2, 165, s=80, l=55, n=9)
corrmat = data_df.corr()
plt.subplots(figsize=(12,12))
sns.heatmap(corrmat,cmap= cmap,annot=True, square=True)

Out[51]: <Axes: >

In [52]: # Evaluating the age distribution as per the deaths that happened


plt.figure(figsize=(20,10))
Days_of_week=sns.countplot(x=data_df['age'],data=data_df, hue ="DEATH_EVENT",palette = cols)
Days_of_week.set_title("Distribution Of Age", color="#774571")

Out[52]: Text(0.5, 1.0, 'Distribution Of Age')



In [53]: # Checking for potential outliers using the "Boxen and Swarm plots" of the non-binary features.
feature = ["age", "creatinine_phosphokinase", "ejection_fraction", "platelets", "serum_creatinine", "serum_sodium", "time"]
for i in feature:
    plt.figure(figsize=(10,7))
    sns.swarmplot(x=data_df["DEATH_EVENT"], y=data_df[i], color="black", alpha=0.7)
    sns.boxenplot(x=data_df["DEATH_EVENT"], y=data_df[i], palette=cols)
    plt.show()

In [54]: # Plotting a "Kernel Density Estimation (kde plot)" of the time and age features - both of which are continuous
sns.kdeplot(x=data_df["time"], y=data_df["age"], hue=data_df["DEATH_EVENT"], palette=cols)

Out[54]: <Axes: xlabel='time', ylabel='age'>

In [13]: # Defining independent and dependent attributes in training and test sets
X=data_df.drop(["DEATH_EVENT"],axis=1)
y=data_df["DEATH_EVENT"]

In [14]: # Setting up a standard scaler for the features and analyzing it thereafter
col_names = list(X.columns)
s_scaler = preprocessing.StandardScaler()
X_scaled= s_scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=col_names)
X_scaled.describe().T

Out[14]:                           count          mean       std       min       25%       50%       75%       max

         age                       299.0  5.703353e-16  1.001676 -1.754448 -0.828124 -0.070223  0.771889  2.877170
         anaemia                   299.0  1.009969e-16  1.001676 -0.871105 -0.871105 -0.871105  1.147968  1.147968
         creatinine_phosphokinase  299.0  0.000000e+00  1.001676 -0.576918 -0.480393 -0.342574  0.000166  7.514640
         diabetes                  299.0  9.060014e-17  1.001676 -0.847579 -0.847579 -0.847579  1.179830  1.179830
         ejection_fraction         299.0 -3.267546e-17  1.001676 -2.038387 -0.684180 -0.007077  0.585389  3.547716
         high_blood_pressure       299.0  0.000000e+00  1.001676 -0.735688 -0.735688 -0.735688  1.359272  1.359272
         platelets                 299.0  7.723291e-17  1.001676 -2.440155 -0.520870 -0.013908  0.411120  6.008180
         serum_creatinine          299.0  1.425838e-16  1.001676 -0.865509 -0.478205 -0.284552  0.005926  7.752020
         serum_sodium              299.0 -8.673849e-16  1.001676 -5.363206 -0.595996  0.085034  0.766064  2.582144
         sex                       299.0 -8.911489e-18  1.001676 -1.359272 -1.359272  0.735688  0.735688  0.735688
         smoking                   299.0 -1.188199e-17  1.001676 -0.687682 -0.687682 -0.687682  1.454161  1.454161
         time                      299.0 -1.901118e-16  1.001676 -1.629502 -0.739000 -0.196954  0.938759  1.997038

In [15]: #Plotting the scaled features using boxen plots


colors =["#CD5C5C","#F08080","#FA8072","#E9967A","#FFA07A"]
plt.figure(figsize=(20,10))
sns.boxenplot(data = X_scaled,palette = colors)
plt.xticks(rotation=60)
plt.show()

In [55]: #spliting variables into training and test sets


X_train, X_test, y_train,y_test = train_test_split(X_scaled,y,test_size=0.30,random_state=25)

In [56]: # Instantiating the SVM algorithm


model1=svm.SVC()

# Fitting the model


model1.fit(X_train, y_train)

# Predicting the test variables


y_pred = model1.predict(X_test)

# Getting the score


model1.score(X_test, y_test)

Out[56]: 0.7888888888888889
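The SVC above runs with its default hyperparameters (RBF kernel, C=1). A small grid search can be used to check whether tuning improves on this baseline; the grid values below are illustrative choices, not part of the original run:

from sklearn.model_selection import GridSearchCV

param_grid = {"C": [0.1, 1, 10, 100], "gamma": ["scale", 0.1, 0.01], "kernel": ["rbf", "linear"]}
grid = GridSearchCV(svm.SVC(), param_grid, cv=5)   # 5-fold cross-validation over the grid
grid.fit(X_train, y_train)
print("best params  :", grid.best_params_)
print("cv accuracy  :", grid.best_score_)
print("test accuracy:", grid.score(X_test, y_test))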

In [57]: # Printing the classification report (since there was imbalance in the target labels)
print(classification_report(y_test, y_pred))

precision recall f1-score support

0 0.84 0.85 0.84 60


1 0.69 0.67 0.68 30

accuracy 0.79 90

macro avg 0.76 0.76 0.76 90


weighted avg 0.79 0.79 0.79 90

In [58]: # Getting the confusion matrix


cmap1 = sns.diverging_palette(2, 165, s=80, l=55, n=9)
plt.subplots(figsize=(5,3))
cf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix/np.sum(cf_matrix), cmap = cmap1, annot = True, annot_kws = {'size':10})

Out[58]: <Axes: >



Experiment 11
Aim – Create a Machine Learning Model using the Decision Tree algorithm.

Code –
In [0]: print("Jinesh Prajapat")

Jinesh Prajapat

In [1]: import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]: data = 'car_evaluation.csv'


df = pd.read_csv(data, header=None)

In [3]: # view dimensions of dataset


df.shape

Out[3]: (1728, 7)

In [4]: # preview Dataset


df.head()

Out[4]:        0      1  2  3      4     5      6
         0  vhigh  vhigh  2  2  small   low  unacc
         1  vhigh  vhigh  2  2  small   med  unacc
         2  vhigh  vhigh  2  2  small  high  unacc
         3  vhigh  vhigh  2  2    med   low  unacc
         4  vhigh  vhigh  2  2    med   med  unacc

In [5]: col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']


df.columns = col_names
col_names

Out[5]: ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [6]: # let's again preview the dataset


df.head()

Out[6]:

In [7]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype

 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB
In [8]: col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
for col in col_names:
    print(df[col].value_counts())

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64
2        432
3        432
4        432
5more    432
Name: doors, dtype: int64
2       576
4       576
more    576

Name: persons, dtype: int64
small    576
med      576
big      576
Name: lug_boot, dtype: int64
low     576
med     576
high    576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [9]: df['class'].value_counts()

Out[9]: unacc    1210
        acc       384
        good       69
        vgood      65
        Name: class, dtype: int64

In [10]: # check missing values in variables


df.isnull().sum()

Out[10]: buying 0
maint 0
doors 0
persons 0
lug_boot 0
safety 0
class 0
dtype: int64

X = df.drop(['class'], axis=1)
y = df['class']

# split X and y into training and testing sets


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [13]: # check the shape of X_train and X_test


X_train.shape, X_test.shape

Out[13]: ((1157, 6), (571, 6))

In [14]: # check data types in X_train


X_train.dtypes

Out[14]: buying      object
         maint       object
         doors       object
         persons     object
         lug_boot    object
         safety      object
         dtype: object

In [15]: X_train.head()

Out[15]: buying maint doors persons lug_boot safety

48 vhigh vhigh 3 more med low

468 high vhigh 3 4 small low

155 vhigh high 3 more small high

1721 low low 5more more small high

1208 med low 2 more small high

In [16]: # import category encoders


import category_encoders as ce

In [17]: # encode variables with ordinal encoding


encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [18]: X_train.head()

Out[18]: buying maint doors persons lug_boot safety

48 1 1 1 1 1 1

468 2 1 1 2 2 1

155 1 2 1 1 2 2

1721 3 3 2 1 2 2

1208 4 3 3 1 2 2

In [19]: X_test.head()

Out[19]: buying maint doors persons lug_boot safety

599 2 2 4 3 1 2

1201 4 3 3 2 1 3

628 2 2 2 3 3 3

1498 3 2 2 2 1 3

1263 4 3 4 1 1 1

In [20]: # import DecisionTreeClassifier


from sklearn.tree import DecisionTreeClassifier

In [21]: # instantiate the DecisionTreeClassifier model with criterion gini index


clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)

# fit the model


clf_gini.fit(X_train, y_train)

Out[21]: DecisionTreeClassifier(max_depth=3, random_state=0)

In [22]: y_pred_gini = clf_gini.predict(X_test)

In [23]: from sklearn.metrics import accuracy_score


print('Model accuracy score with criterion gini index: {0:0.4f}'.format(accuracy_score(y_test, y_pred_gini)))

Model accuracy score with criterion gini index: 0.8021

In [24]: y_pred_train_gini = clf_gini.predict(X_train)


y_pred_train_gini

Out[24]: array(['unacc', 'unacc', 'unacc', ..., 'unacc', 'unacc', 'acc'],
               dtype=object)

In [25]: print('Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train_gini)))

Training-set accuracy score: 0.7865

In [26]: # print the scores on training and test set


print('Training set score: {:.4f}'.format(clf_gini.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(clf_gini.score(X_test, y_test)))

Training set score: 0.7865


Test set score: 0.8021

In [27]: plt.figure(figsize=(8,6))
from sklearn import tree
tree.plot_tree(clf_gini.fit(X_train, y_train))

Out[27]: [Text(0.4, 0.875, 'x[5] <= 1.5\ngini = 0.455\nsamples = 1157\nvalue = [255, 49, 813, 40]'),
          Text(0.2, 0.625, 'gini = 0.0\nsamples = 386\nvalue = [0, 0, 386, 0]'),
          Text(0.6, 0.625, 'x[3] <= 2.5\ngini = 0.577\nsamples = 771\nvalue = [255, 49, 427, 40]'),
          Text(0.4, 0.375, 'x[0] <= 2.5\ngini = 0.631\nsamples = 525\nvalue = [255, 49, 181, 40]'),
          Text(0.2, 0.125, 'gini = 0.496\nsamples = 271\nvalue = [124, 0, 147, 0]'),
          Text(0.6, 0.125, 'gini = 0.654\nsamples = 254\nvalue = [131, 49, 34, 40]'),
          Text(0.8, 0.375, 'gini = 0.0\nsamples = 246\nvalue = [0, 0, 246, 0]')]
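The default plot above labels nodes only by feature index (x[0], x[3], x[5]). Passing the feature and class names makes the tree easier to read (an optional sketch using the already-fitted clf_gini):

plt.figure(figsize=(12, 8))
tree.plot_tree(clf_gini, feature_names=list(X_train.columns),
               class_names=list(clf_gini.classes_), filled=True)
plt.show()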

In [28]: # instantiate the DecisionTreeClassifier model with criterion entropy


clf_en = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

# fit the model


clf_en.fit(X_train, y_train)

Out[28]: DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

In [29]: y_pred_en = clf_en.predict(X_test)

In [30]: from sklearn.metrics import accuracy_score


print('Model accuracy score with criterion entropy: {0:0.4f}'.format(accuracy_score(y_test, y_pred_en)))

Model accuracy score with criterion entropy: 0.8021

In [31]: y_pred_train_en = clf_en.predict(X_train)


y_pred_train_en

Out[31]: array(['unacc', 'unacc', 'unacc', ..., 'unacc', 'unacc', 'acc'],
               dtype=object)

In [32]: print('Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train_en)))

Training-set accuracy score: 0.7865

In [33]: # print the scores on training and test set


print('Training set score: {:.4f}'.format(clf_en.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(clf_en.score(X_test, y_test)))

Training set score: 0.7865


Test set score: 0.8021

In [34]: plt.figure(figsize=(8,6))
from sklearn import tree
tree.plot_tree(clf_en.fit(X_train, y_train))

Out[34]: [Text(0.4, 0.875, 'x[5] <= 1.5\nentropy = 1.2\nsamples = 1157\nvalue = [255, 49, 813, 40]'),
          Text(0.2, 0.625, 'entropy = 0.0\nsamples = 386\nvalue = [0, 0, 386, 0]'),
          Text(0.6, 0.625, 'x[3] <= 2.5\nentropy = 1.474\nsamples = 771\nvalue = [255, 49, 427, 40]'),
          Text(0.4, 0.375, 'x[0] <= 2.5\nentropy = 1.638\nsamples = 525\nvalue = [255, 49, 181, 40]'),
          Text(0.2, 0.125, 'entropy = 0.995\nsamples = 271\nvalue = [124, 0, 147, 0]'),
          Text(0.6, 0.125, 'entropy = 1.759\nsamples = 254\nvalue = [131, 49, 34, 40]'),
          Text(0.8, 0.375, 'entropy = 0.0\nsamples = 246\nvalue = [0, 0, 246, 0]')]

In [35]: # Print the Confusion Matrix and slice it into four pieces
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_en)
print('Confusion matrix\n\n', cm)

Confusion matrix

[[ 73 0 56 0]
[ 20 0 0 0]
[ 12 0 385 0]
[ 25 0 0 0]]

In [36]: from sklearn.metrics import classification_report


print(classification_report(y_test, y_pred_en))

precision recall f1-score support

acc 0.56 0.57 0.56 129


good 0.00 0.00 0.00 20
unacc 0.87 0.97 0.92 397
vgood 0.00 0.00 0.00 25

accuracy 0.80 571


macro avg 0.36 0.38 0.37 571
weighted avg 0.73 0.80 0.77 571
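The report shows that the depth-3 tree never predicts the minority classes 'good' and 'vgood'. An optional way to probe this (an illustrative sketch, with settings chosen only for demonstration) is a fully grown, class-weighted tree:

clf_bal = DecisionTreeClassifier(criterion='entropy', class_weight='balanced', random_state=0)
clf_bal.fit(X_train, y_train)
print(classification_report(y_test, clf_bal.predict(X_test)))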

In [ ]:

Experiment 12
Aim – Create a Machine Learning Model using the Random Forest algorithm.

Code –
In [0]: print("Jinesh Prajapat")

Jinesh Prajapat

In [1]: import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]: data = 'car_evaluation.csv'


df = pd.read_csv(data, header=None)

In [3]: # view dimensions of dataset


df.shape

Out[3]: (1728, 7)

In [4]: # preview Dataset


df.head()

Out[4]:        0      1  2  3      4     5      6
         0  vhigh  vhigh  2  2  small   low  unacc
         1  vhigh  vhigh  2  2  small   med  unacc
         2  vhigh  vhigh  2  2  small  high  unacc
         3  vhigh  vhigh  2  2    med   low  unacc
         4  vhigh  vhigh  2  2    med   med  unacc

In [5]: col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']


df.columns = col_names
col_names

Out[5]: ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [6]: # let's again preview the dataset


df.head()

Out[6]:

In [7]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype

 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB
In [8]: col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
for col in col_names:
    print(df[col].value_counts())

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64
2        432
3        432
4        432
5more    432
Name: doors, dtype: int64
2       576
4       576
more    576

Name: persons, dtype: int64
small    576
med      576
big      576
Name: lug_boot, dtype: int64
low     576
med     576
high    576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [9]: df['class'].value_counts()

Out[9]: unacc    1210
        acc       384
        good       69
        vgood      65
        Name: class, dtype: int64

In [10]: # check missing values in variables


df.isnull().sum()

Out[10]: buying 0
maint 0
doors 0
persons 0
lug_boot 0
safety 0
class 0
dtype: int64

X = df.drop(['class'], axis=1)
y = df['class']

# split X and y into training and testing sets


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [13]: # check the shape of X_train and X_test


X_train.shape, X_test.shape

Out[13]: ((1157, 6), (571, 6))

In [14]: # check data types in X_train


X_train.dtypes

Out[14]: buying      object
         maint       object
         doors       object
         persons     object
         lug_boot    object
         safety      object
         dtype: object

In [15]: X_train.head()

Out[15]: buying maint doors persons lug_boot safety

48 vhigh vhigh 3 more med low

468 high vhigh 3 4 small low

155 vhigh high 3 more small high

1721 low low 5more more small high

1208 med low 2 more small high

In [16]: # import category encoders


import category_encoders as ce

In [17]: # encode variables with ordinal encoding


encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [18]: X_train.head()

Out[18]: buying maint doors persons lug_boot safety

48 1 1 1 1 1 1

468 2 1 1 2 2 1

155 1 2 1 1 2 2

1721 3 3 2 1 2 2

1208 4 3 3 1 2 2

In [19]: X_test.head()

Out[19]: buying maint doors persons lug_boot safety

599 2 2 4 3 1 2

1201 4 3 3 2 1 3

628 2 2 2 3 3 3

1498 3 2 2 2 1 3

1263 4 3 4 1 1 1

In [21]: # import Random Forest classifier


from sklearn.ensemble import RandomForestClassifier

# instantiate the classifier (older scikit-learn versions default to n_estimators=10, hence the "10 decision-trees" message below)


rfc = RandomForestClassifier(random_state=0)

# fit the model


rfc.fit(X_train, y_train)

# Predict the Test set results


y_pred = rfc.predict(X_test)

# Check accuracy score


from sklearn.metrics import accuracy_score

print('Model accuracy score with 10 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

Model accuracy score with 10 decision-trees : 0.9457

In [22]: # instantiate the classifier with n_estimators = 100


rfc_100 = RandomForestClassifier(n_estimators=100, random_state=0)

# fit the model to the training set


rfc_100.fit(X_train, y_train)

# Predict on the test set results


y_pred_100 = rfc_100.predict(X_test)

# Check accuracy score


print('Model accuracy score with 100 decision-trees : {0:0.4f}'.format(accuracy_score(y_test, y_pred_100)))

Model accuracy score with 100 decision-trees : 0.9457

In [23]: # create the classifier with n_estimators = 100


clf = RandomForestClassifier(n_estimators=100, random_state=0)

# fit the model to the training set


clf.fit(X_train, y_train)

Out[23]: RandomForestClassifier(random_state=0)

In [24]: # view the feature scores


feature_scores = pd.Series(clf.feature_importances_, index=X_train.columns).sort_values(ascen
feature_scores

Out[24]: safety      0.295319
         persons     0.233856
         buying      0.151734
         maint       0.146653
         lug_boot    0.100048
         doors       0.072389
         dtype: float64

In [25]: # Creating a seaborn bar plot
sns.barplot(x=feature_scores, y=feature_scores.index)

# Add labels to the graph


plt.xlabel('Feature Importance Score')
plt.ylabel('Features')

# Add title to the graph


plt.title("Visualizing Important Features")

# Visualize the graph


plt.show()

In [26]: # declare feature vector and target variable


X = df.drop(['class', 'doors'], axis=1)
y = df['class']

In [27]: # split data into training and testing sets


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42

In [28]: # encode categorical variables with ordinal encoding


encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'persons', 'lug_boot', 'safety'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [29]: # instantiate the classifier with n_estimators = 100


clf = RandomForestClassifier(random_state=0)

# fit the model to the training set


clf.fit(X_train, y_train)

# Predict on the test set results


y_pred = clf.predict(X_test)

# Check accuracy score


print('Model accuracy score with doors variable removed : {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

Model accuracy score with doors variable removed : 0.9264

In [30]: # Print the Confusion Matrix and slice it into four pieces
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
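The classification report printed below was presumably produced by a classification_report call; a sketch of such a cell, reusing the same y_pred as above, would be:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))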

precision recall f1-score support

acc 0.89 0.81 0.85 129


good 0.56 0.90 0.69 20
unacc 0.97 0.97 0.97 397
vgood 0.80 0.80 0.80 25

accuracy 0.93 571


macro avg 0.81 0.87 0.83 571
weighted avg 0.93 0.93 0.93 571

In [ ]:

Experiment 13
Aim – Create a Machine Learning Model using the K-means Clustering algorithm.

Code –
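A minimal K-means clustering sketch is given below for reference; the dataset file, its column names, and the choice of k are illustrative assumptions only.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Assumed example dataset: mall customers clustered by income and spending score
df = pd.read_csv("Mall_Customers.csv")                                # hypothetical file name
X = df[["Annual Income (k$)", "Spending Score (1-100)"]].values      # assumed column names

# Elbow method: plot the within-cluster sum of squares (inertia) for k = 1..10
wcss = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_ for k in range(1, 11)]
plt.plot(range(1, 11), wcss, marker="o")
plt.xlabel("Number of clusters k")
plt.ylabel("WCSS")
plt.show()

# Fit K-means with the chosen k (k=5 is a common choice for this example dataset)
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
labels = kmeans.fit_predict(X)

# Scatter plot of the clusters and their centroids
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap="viridis", s=30)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c="red", marker="X", s=200)
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.show()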
