
L.J. Institute of Engineering & Technology, Ahmedabad.

COMPUTER ENGINEERING DEPARTMENT

Index

SUBJECT NAME & CODE: Machine Learning (3170724)
BRANCH & SEMESTER: CE - VII
TEACHING SCHEME (HOURS): THEORY 03, TUTORIAL 00, PRACTICAL 02
CREDITS: 04

Sr. No  AIM OF PRACTICAL                          DATE    FACULTY SIGN
1       Hypothesis Testing
2       Statistics - Exploratory Data Analysis
3       Working with Time Series - Date Data
4       Linear Regression from Scratch!
5       Multivariate Linear Regression
6       Logistic Regression
7       Factors affecting data quality
8       kNN Algorithm
9       Decision Trees
10      Random Forest Classifier

Practical 1: Hypothesis Testing


Creating a Sample Distribution from our Control and Experimental groups
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

file_name = "https://raw.githubusercontent.com/rajeevratan84/
datascienceforbusiness/master/drug_tria df = pd.read_csv(file_name)
df.head()
     group  mean_duration
0  Control              7
1  Control              5
2  Control              4
3  Control              5
4  Control              8

df['group'].unique()
array(['Control', 'Experimental'], dtype=object)

# Show means of Control and Experimental groups


control_mean = df[df['group'] == 'Control'].mean()
experiment_mean = df[df['group'] == 'Experimental'].mean()
print("Control Mean = " +str(control_mean))
print("Experimental Mean = " + str(experiment_mean))

Control Mean = mean_duration    5.4
dtype: float64
Experimental Mean = mean_duration    4.8
dtype: float64
Let's run 10,000 Permutations

mean_difference = control_mean - experiment_mean
data_points = list(df['mean_duration'])
mean_differences = []
number_of_iterations = 10000

for i in range(number_of_iterations):
    group_1 = []
    group_2 = []
    for data_point in data_points:
        random_assignment = np.random.randint(0, 2) == True
        if random_assignment:
            group_1.append(data_point)
        else:
            group_2.append(data_point)
    trial_mean_difference = np.mean(group_2) - np.mean(group_1)
    mean_differences.append(trial_mean_difference)

print("Mean Differences Mean - " + str(np.array(mean_differences).mean()))


# Generate Frequency or Histogram Plot
sns.distplot(mean_differences)

Mean Differences Mean - -0.000839655770485615


/usr/local/lib/python3.6/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version.
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d24a95d30>

How many values exceed the mean difference of 0.6?

# Convert our mean_differences to a data frame for easy manipulation


mean_differences = pd.DataFrame(mean_differences)

# Get the length of this series to count the number of values exceeding 0.6
exceeds_mean_diff = len(mean_differences[mean_differences[0] >= 0.6])
print(exceeds_mean_diff)

372

Calculating the P-Value

# Number of trials we performed was 10,000
# The P-Value is the fraction of trials whose mean difference
# was at least as extreme as the observed difference of 0.6

p_value = exceeds_mean_diff / number_of_iterations


print(p_value)
0.0372
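Since 0.0372 is below the conventional 0.05 threshold, the observed difference of 0.6 is unlikely to arise from random assignment alone. The permutation loop above can also be tightened with NumPy; a minimal sketch, assuming data_points and number_of_iterations are defined as above (np.random.default_rng is the modern NumPy generator):

import numpy as np

data = np.array(data_points)
rng = np.random.default_rng()

diffs = []
for _ in range(number_of_iterations):
    # Randomly reassign every point to one of two groups
    mask = rng.integers(0, 2, size=len(data)).astype(bool)
    if mask.all() or not mask.any():
        continue  # skip the rare degenerate split with an empty group
    diffs.append(data[~mask].mean() - data[mask].mean())

print(np.mean(np.array(diffs) >= 0.6))  # should land near 0.037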

Practical 2: Statistics - Exploratory Data Analysis

• Histogram plots
• Scatter plots
• Percentiles
• Boxplots
• Violin plots
• Heatmaps
• Barplots
• Factor plots
• Density plots
• Joint Distribution plots

import pandas as pd
import numpy as np
file_name = "https://raw.githubusercontent.com/rajeevratan84/
datascienceforbusiness/master/winequali df = pd.read_csv(file_name)
df.head()

Simple Descriptive Analysis

df.describe()

df.info()

df["alcohol"].mean()
10.491800831149511
df.hist(column='alcohol', bins=15)

array([[<matplotlib.axes._subplots.AxesSubplot object at
0x7f7f2ecc5fd0>]], dtype=object)

# Let's get a bit more advanced and learn to customize your plots
df.hist(column='alcohol', bins=10, grid=False, figsize=(10,6),
color='green')
array([[<matplotlib.axes._subplots.AxesSubplot object at
0x7f7f2ec5e828>]], dtype=object)

Using the library Seaborn, we can quickly produce even nicer plots
import seaborn as sns

sns.distplot(df['alcohol'], bins=25, kde=False)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7f2ebd85c0>

sns.distplot(df.alcohol, bins=10, kde=True, color="red")


<matplotlib.axes._subplots.AxesSubplot at 0x7f7f2eb5e390>

df["alcohol"].value_counts().head()
9.5 367
9.4 332
9.2 271
10.0 229
10.5 227
Name: alcohol, dtype: int64

# Let's start with labeling our axis


import matplotlib.pyplot as plt
_ = sns.distplot(df.alcohol)
_ = plt.xlabel('Alcohol Percentage')
_ = plt.ylabel('Count')
_ = plt.title("Alcohol Content")
plt.show()

Scatter Plots
sns.lmplot(x='alcohol', y='fixed acidity', data=df)
<seaborn.axisgrid.FacetGrid at 0x7f7f2ed06908>

sns.lmplot(x='alcohol', y='density', data=df, fit_reg=False)


<seaborn.axisgrid.FacetGrid at 0x7f7f2f247cc0>

# Scatterplot arguments
sns.lmplot(x='alcohol', y='chlorides', data=df,
fit_reg=False, # Remove the regression line
hue='quality') # Color by quality
<seaborn.axisgrid.FacetGrid at 0x7f7f2ec224e0>

Understanding Percentiles

# The quantile function


print(df["alcohol"].quantile(0.1))
print(df["alcohol"].quantile(0.5))
print(df["alcohol"].quantile(0.9))
print(df["alcohol"].quantile(0.99))
9.1
10.3
12.3
13.4

# What's the max alcohol value though?


df["alcohol"].max()
14.9

# Quantile ranges
df["alcohol"].quantile(([0.05, 0.95]))
0.05 9.0
0.95 12.7
Name: alcohol, dtype: float64

Boxplots & Finding Outliers

print(df["alcohol"].quantile(([0.25, 0.75])))
sns.boxplot(data=df['alcohol'])

0.25 9.5
0.75 11.3
Name: alcohol, dtype: float64
<matplotlib.axes._subplots.AxesSubplot at 0x7f7f2e8309e8>
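By default, a boxplot's whiskers extend to 1.5 × IQR beyond the quartiles; a quick sketch of the implied outlier bounds for alcohol, using the quartiles printed above:

q1 = df['alcohol'].quantile(0.25)
q3 = df['alcohol'].quantile(0.75)
iqr = q3 - q1
print("Lower bound:", q1 - 1.5 * iqr)  # points below this are drawn as outliers
print("Upper bound:", q3 + 1.5 * iqr)  # points above this are drawn as outliers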
sns.boxplot(data=df, palette="Set3")
<matplotlib.axes._subplots.AxesSubplot at 0x7f7f2e89dda0>

from matplotlib import pyplot as plt


# Enlarge the plot
plt.figure(figsize=(12,8))
summary = df.drop(['free sulfur dioxide', 'total sulfur dioxide', 'quality',
                   'residual sugar', 'fixed acidity', 'alcohol'], axis=1)
sns.boxplot(data=summary, palette="Set3")

Practical 3: Working with Time Series - Date Data

import pandas as pd
from datetime import datetime
import numpy as np
# Let's create a pandas series that logs time every hour from 1st Nov'19 to 7th Nov'19
df = pd.date_range(start='11/01/2019', end='11/07/2019', freq='H')
df

NOTE: ISO 8601 format is yyyy-mm-dd hh:mm:ss

len(df)
145

# Now let's turn our series into a dataframe


df = pd.DataFrame(df, columns=['date'])
# And add a 'made up' column for sales data
df['sales'] = np.random.randint(0,1000,size=(len(df)))

df.head()

# Set your date as the index


df = df.set_index('date')
df.head()

# Selecting using date - getting exact value for cell


df.loc['2019-11-01 03:00:00', 'sales']
938
# Selecting using date to return the row corresponding to that date
df.loc['2019-11-01 03:00:00']
sales 938
Name: 2019-11-01 03:00:00, dtype: int64
# Selecting an entire day
df.loc['2019-11-01']
# Similarly you can use df.loc['2019-11'] to select an entire month

# Selecting a range of dates


df.loc['2019-11-01':'2019-11-02']

Summary Stats - we can use statistical methods over different time intervals:
mean(), sum(), count(), min(), max()

Down-sampling: reduce datetime rows to a longer frequency
Up-sampling: increase datetime rows to a shorter frequency (sketched below)
# Using resample to get the average for each day per hour
df.resample('D').mean()
#df.resample('D').sum()
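Down-sampling is shown above; up-sampling works the same way but needs a fill strategy for the new rows. A minimal sketch, assuming df still has the hourly DatetimeIndex from above:

# Up-sample to 30-minute rows; forward-fill copies each hour's sales into the gap
df_upsampled = df.resample('30T').ffill()
df_upsampled.head()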

Parsing dates

df = pd.DataFrame({'year': [2015, 2016],
                   'month': [2, 3],
                   'day': [4, 5]})
df

df.info()

pd.to_datetime(df)
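pd.to_datetime also parses individual strings, optionally with an explicit format string; a small sketch:

# Parse a single ISO 8601 string (the format argument is optional here)
pd.to_datetime("2019-11-01 03:00:00", format="%Y-%m-%d %H:%M:%S")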

Practical 4: Linear Regression from Scratch!

# Our cost function


def cost_function(m, b, x, y):
    totalError = 0
    for i in range(0, len(x)):
        totalError += (y[i] - (m * x[i] + b))**2
    return totalError / float(len(x))

# Implementation of the gradient descent algorithm
def gradient_descent(b, m, x, y, learning_rate, num_iterations):
    N = float(len(x))
    # repeat for num_iterations
    for j in range(num_iterations):
        b_gradient = 0
        m_gradient = 0
        for i in range(0, len(x)):
            b_gradient += -(2/N) * (y[i] - ((m * x[i]) + b))
            m_gradient += -(2/N) * x[i] * (y[i] - ((m * x[i]) + b))
        b -= (learning_rate * b_gradient)
        m -= (learning_rate * m_gradient)
        # Every 100 iterations we print our error
        if j % 100 == 0:
            print("MSE after " + str(j) + " iterations: " + str(cost_function(m, b, x, y)))
    return [b, m]
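As a sanity check on gradient descent, simple linear regression also has a closed-form least-squares solution; a sketch (the helper name is ours, not part of the practical):

import numpy as np

def least_squares_fit(x, y):
    # m = cov(x, y) / var(x); b = mean(y) - m * mean(x)
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    m = ((x - x.mean()) * (y - y.mean())).sum() / ((x - x.mean())**2).sum()
    b = y.mean() - m * x.mean()
    return b, m

Gradient descent should converge toward these values as num_iterations grows.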

Linear Regression algorithm on some test data

import numpy as np
import matplotlib.pyplot as plt
# Let's create some random data using linspace
x = np.linspace(0, 100, 50)  # creates a range of 50 numbers evenly spaced between 0 and 100
delta = np.random.uniform(-10, 10, x.size)
y = 0.5*x + 3 + delta
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x7f0fc11b2080>

# Defining our learning rate

learning_rate = 0.0001
# defining our initial values of b and m
initial_b = 0
initial_m = 0
# Setting how many iterations we wish to run
num_iterations= 1000
print('Initial MSE:', cost_function(initial_m, initial_b, x, y))
[b, m] = gradient_descent(initial_b, initial_m, x, y, learning_rate,
num_iterations)
print('b:', b)
print('m:', m)
print('MSE:', cost_function(m, b, x, y))

predictions = [(m * x[i]) + b for i in range(len(x))]


plt.scatter(x, y)
plt.plot(x, predictions, color='r')
[<matplotlib.lines.Line2D at 0x7f0fc3f16198>]

scikit-learn's Linear Regression model

import numpy as np
from sklearn.linear_model import LinearRegression
# change the shape of x array to the format expected by sklearn
print(x.shape)
x = x.reshape((-1, 1))
print(x.shape)

(50,)
(50, 1)
# create the model object using LinearRegression
model = LinearRegression()

# Fit our model to our input data x and y


model.fit(x, y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Get our model paramters


print('Y Intercept:', model.intercept_) #b
print('Gradient:', model.coef_) #m
Y Intercept: 5.916909681624745
Gradient: [0.45837012]

print('Y Intercept:', b)
print('Gradient:', m)

Y Intercept: 0.3047385205747066
Gradient: 0.5417087284350158

# Store all predictions in y_pred


y_pred = model.predict(x)

plt.scatter(x, y)
plt.plot(x, y_pred, color='r')
[<matplotlib.lines.Line2D at 0x7f0fc1132400>]


from sklearn.metrics import mean_squared_error


print("Mean squared error: %.2f" % mean_squared_error(y, y_pred))

Mean squared error: 31.96

Linear Regression on Olympic 100m Gold Times

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
file_name = "https://raw.githubusercontent.com/Jovita7/Data-Analysis-and-
Visualization/main/olympic1”
df = pd.read_csv(file_name)

# Format data into correct shape


x = df['year']
x_train = np.array(x).reshape((-1, 1))
x_train.shape
(29, 1)
y_train = np.array(df['time'])
y_train.shape
(29,)

import numpy as np
from sklearn.linear_model import LinearRegression

# Let's create the model object using LinearRegression


model = LinearRegression()

# Fit our model to our input data x and y


model.fit(x_train, y_train)
y_pred = model.predict(x_train)
plt.scatter(x_train, y_train)
plt.plot(x, y_pred, color='r')
[<matplotlib.lines.Line2D at 0x7fbe17346b10>]

# Predict for the 2020 Olympics

x_2020 = np.array([2020]).reshape(-1, 1)
x_2020.shape
(1, 1)

model.predict(x_2020)
array([9.52679525])

Practical 5: Multivariate Linear Regression


# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset


file_name = "https://raw.githubusercontent.com/Data-Analysis-and-
Visualization/main/auto-mpg”
auto_df = pd.read_csv(file_name)
auto_df.head()

# Get the indexes that have "?" instead of numbers


indexNames = auto_df[auto_df['horsepower'] == "?" ].index

# Delete these row indexes from dataFrame


auto_df.drop(indexNames , inplace=True)

# Just checking to see if they've been removed


auto_df[auto_df['horsepower'] == '?']
auto_df['horsepower'] = auto_df['horsepower'].astype(float)
auto_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null float64
weight          392 non-null int64
acceleration    392 non-null float64
model year      392 non-null int64
origin          392 non-null int64
car name        392 non-null object
dtypes: float64(4), int64(4), object(1)
memory usage: 30.6+ KB

x = auto_df.iloc[:, 1:8].values
y = auto_df.iloc[:, 0].values
x.shape
(392, 7)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=0)

regressor = LinearRegression()
regressor.fit(X_train,Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Column order: cylinders, displacement, horsepower, weight, acceleration, model year, origin
# Data for a Honda Prelude; actual mpg is 33.7


Trial_Data = np.array([4,120,79,2625,18.6,82,1])
Trial_Data = Trial_Data.reshape((-1, 7))
Trial_Data = Trial_Data.astype(float)

regressor.predict(Trial_Data)[0]

29.31873696265787
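mean_squared_error was imported above but not yet used; a sketch of evaluating the fitted model on the held-out test set:

# Evaluate on the 30% test split created earlier
Y_pred = regressor.predict(X_test)
print("Test MSE: %.2f" % mean_squared_error(Y_test, Y_pred))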


Practical 6: Logistic Regression


import numpy as np
import pandas as pd
from sklearn import linear_model

# Load data
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/heights_w"
df = pd.read_csv(file_name)

df.head()

df.shape
(10000, 3)

# Plotting our data


import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
ax = sns.scatterplot(x="Height", y="Weight", hue="Male", data=df)

# Extract the columns we'll use for our data


x = df.iloc[:,0:2].values
y = df.iloc[:,2].values

# Split data into our test and training datasets


from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Fit (train) the Logistic Regression classifier

#clf = linear_model.LogisticRegression(C=1e40, solver='newton-cg')


clf = linear_model.LogisticRegression()
fitted_model = clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)

#Metric -Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y_test, prediction)
accuracy
0.9186666666666666
error_rate = 1 - accuracy
error_rate
0.08133333333333337

#Metric Confusion Matrix


from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, prediction)
cm
array([[1385, 104],
[ 140, 1371]])
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TP = cm[1][1]

print(TP, FN, TN, FP)


1371 140 1385 104
Y_test.shape
(3000,)
# Sensitivity (true positive rate)
sensitivity = TP / (TP + FN)
sensitivity
0.9073461283917935
# Specificity (true negative rate)
specificity = TN / (TN + FP)
specificity
0.9301544660846206

# Predict based on height (inches) and weight (lbs)


height = 64
weight = 155

# Get prediction
prediction_individual = clf.predict([(height, weight)])
if prediction_individual[0]:
    result = "Male"
else:
    result = "Female"
print("Person is " + result)

Person is Female
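Logistic regression also exposes class probabilities through the standard predict_proba method; a sketch for the same individual:

# Columns follow the class order: [P(class 0 = Female), P(class 1 = Male)]
proba = clf.predict_proba([(height, weight)])
print("P(Female), P(Male):", proba[0])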


Practical 7: Factors affecting data quality


Data Remediation
1. Remove outliers: If only a few records are outliers, a simple approach may be to remove them.
2. Imputation: Another way is to impute the value with the mean, median, or mode. The value of the most similar data element may also be used for imputation.
3. Capping: For values that lie outside the 1.5 × IQR limits, we can cap them by replacing observations below the lower limit with the value of the 5th percentile, and those above the upper limit with the value of the 95th percentile. (Imputation and capping are sketched in code after the removal example below.)
Removing outliers

import pandas as pd
#read csv
data = pd.read_csv('https://raw.githubusercontent.com/Jovita7/Data-Analysis-and-Visualization/main/auto_mpg')

#finding outliers in 'mpg'


def find_outliers(ds, col):
    quart1 = ds[col].quantile(0.25)
    quart3 = ds[col].quantile(0.75)
    IQR = quart3 - quart1  # Inter-quartile range
    low_val = quart1 - 1.5*IQR
    high_val = quart3 + 1.5*IQR
    ds = ds.loc[(ds[col] < low_val) | (ds[col] > high_val)]
    return ds

find_outliers(data, 'mpg')

def remove_outliers(ds, col):
    quart1 = ds[col].quantile(0.25)
    quart3 = ds[col].quantile(0.75)
    IQR = quart3 - quart1  # Inter-quartile range
    low_val = quart1 - 1.5*IQR
    high_val = quart3 + 1.5*IQR
    df_out = ds.loc[(ds[col] > low_val) & (ds[col] < high_val)]
    return df_out

new_data = remove_outliers(data, 'mpg')

find_outliers(new_data, 'mpg')
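The imputation and capping strategies from the list above can be sketched in the same style (assuming the same data frame and the find_outliers helper):

# 2. Imputation: replace outlying mpg values with the column median
outlier_idx = find_outliers(data, 'mpg').index
imputed = data.copy()
imputed.loc[outlier_idx, 'mpg'] = data['mpg'].median()

# 3. Capping: clip values outside the 1.5 * IQR limits to the 5th/95th percentiles
def cap_outliers(ds, col):
    quart1 = ds[col].quantile(0.25)
    quart3 = ds[col].quantile(0.75)
    IQR = quart3 - quart1
    low_val = quart1 - 1.5*IQR
    high_val = quart3 + 1.5*IQR
    capped = ds.copy()
    capped.loc[capped[col] < low_val, col] = ds[col].quantile(0.05)
    capped.loc[capped[col] > high_val, col] = ds[col].quantile(0.95)
    return capped

capped_data = cap_outliers(data, 'mpg')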

Practical 8: kNN Algorithm


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Assign column names to the dataset


names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

# Read dataset to pandas dataframe


dataset = pd.read_csv(url, names=names)
dataset.head()

# Pair Plot Visualization


sns.pairplot(data=dataset, kind='scatter')


# Splitting our dataset


from sklearn.model_selection import train_test_split
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Preprocessing our dataset by scaling it


from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Using KNN to train our dataset


from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
y_pred = classifier.predict(X_test)

# Evaluating our predictions


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy {0:.2f}%".format(100*accuracy_score(y_pred, y_test)))

error = []

# Calculating error for K values between 1 and 40


for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))

# Plot our results


plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
Text(0, 0.5, 'Mean Error')
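The elbow plot above suggests simply reading off the k with the lowest mean error; a short sketch:

# error[i] corresponds to k = i + 1 because the sweep started at k=1
best_k = int(np.argmin(error)) + 1
print("Best k:", best_k)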


Practical 9: Decision Trees


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
data = pd.read_csv('https://raw.githubusercontent.com/Jovita7/Data-Analysis-and-Visualization/main/auto_mpg')
data.head()

#Segregating predictor variables


predictors = data.iloc[:, 0:7]

#Segregating the target/class variable


target = data.iloc[:, 7]

#split into training and test datasets


predictors_train, predictors_test, target_train, target_test = train_test_split(predictors, target, test_size=0.20)

#Entropy Calculation
dtree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=100,
                                       max_depth=3, min_samples_leaf=5)

#Train the model on training data


model = dtree_entropy.fit(predictors_train, target_train)

#Predictions
prediction = dtree_entropy.predict(predictors_test)

#Calculate accuracy
accuracy_score(target_test, prediction, normalize= True)

0.84375
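For comparison, the same tree can be trained with the Gini impurity criterion instead of entropy; a sketch with the same hyperparameters:

dtree_gini = DecisionTreeClassifier(criterion='gini', random_state=100,
                                    max_depth=3, min_samples_leaf=5)
dtree_gini.fit(predictors_train, target_train)
accuracy_score(target_test, dtree_gini.predict(predictors_test))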

#Text Visualization
from sklearn import tree
text_representation = tree.export_text(dtree_entropy)
print(text_representation)

#Graphical Visualization
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(10,15))
#Getting column names
col_names = list(data.columns)
_ = tree.plot_tree(dtree_entropy, feature_names=col_names[0:7],
class_names=['0','1'], filled=True)


Practical 10: Random Forest Classifier

#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('https://raw.githubusercontent.com/Jovita7/Data-Analysis-and-Visualization/main/auto_mpg')
data.head()

#segregating the predictor variables


predictors = data.iloc[:, 0:7]

#segregating the target variable


target = data.iloc[:, 7]

#splitting the data into training and test data set


predictors_train, predictors_test, target_train, target_test = train_test_split(predictors, target, test_size=0.20)

rf = RandomForestClassifier()

#Train the model


model = rf.fit(predictors_train, target_train)
prediction = rf.predict(predictors_test)

#Checking the accuracy


accuracy_score(target_test, prediction, normalize=True)
0.90625
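Random forests also expose per-feature importance scores via the standard feature_importances_ attribute; a sketch:

# Higher scores mean the feature contributed more to the forest's splits
for name, importance in zip(predictors.columns, rf.feature_importances_):
    print(name, round(importance, 3))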