Professional Documents
Culture Documents
ML Lab Manual - Index
ML Lab Manual - Index
Index
6 Logistic Regression
Factors affecting data quality
7
kNN Algorithm
8
Decision Trees
9
Random Forest Classifier
10
Machine Learning (3170724)
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/drug_trial.csv"
df = pd.read_csv(file_name)
df.head()
group  mean_duration
0 Control 7
1 Control 5
2 Control 4
3 Control 5
4 Control 8
df['group'].unique()
array(['Control', 'Experimental'], dtype=object)
for i in range(number_of_iterations):
    group_1 = []
    group_2 = []
    for data_point in data_points:
        random_assignment = np.random.randint(0, 2) == True
        if random_assignment:
            group_1.append(data_point)
        else:
            group_2.append(data_point)
# Get the length of this series to count the number of value exceeding 0.6
exceeds_mean_diff = len(mean_differences[mean_differences[0] >= 0.6])
print(exceeds_mean_diff)
372
Histogram plots
Scatter plots
Percentiles
Boxplots
Violin
Heatmaps
Barplots
Factor plots
Density plots
Joint Distribution plots
import pandas as pd
import numpy as np
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/winequality.csv"
df = pd.read_csv(file_name)
df.head()
df.describe()
df.info()
3
Page
df["alcohol"].mean()
10.491800831149511
df.hist(column='alcohol', bins=15)
array([[<matplotlib.axes._subplots.AxesSubplot object at
0x7f7f2ecc5fd0>]], dtype=object)
# Let's get a bit more advanced and learn to customize your plots
df.hist(column='alcohol', bins=10, grid=False, figsize=(10,6),
color='green')
array([[<matplotlib.axes._subplots.AxesSubplot object at
0x7f7f2ec5e828>]], dtype=object)
4
Page
Using the library Seaborn, we can quickly produce even nicer plots
import seaborn as sns
<matplotlib.axes._subplots.AxesSubplot at 0x7f7f2ebd85c0>
df["alcohol"].value_counts().head()
9.5 367
9.4 332
9.2 271
10.0 229
10.5 227
Name: alcohol, dtype: int64
Scatter Plots
sns.lmplot(x='alcohol', y='fixed acidity', data=df)
<seaborn.axisgrid.FacetGrid at 0x7f7f2ed06908>
6
Page
# Scatterplot arguments
sns.lmplot(x='alcohol', y='chlorides', data=df,
fit_reg=False, # Remove the regression line
hue='quality') # Color by quality
<seaborn.axisgrid.FacetGrid at 0x7f7f2ec224e0>
7
Page
Understanding Percentiles
# Quantile ranges
df["alcohol"].quantile(([0.05, 0.95]))
0.05 9.0
0.95 12.7
Name: alcohol, dtype: float64
print(df["alcohol"].quantile(([0.25, 0.75])))
sns.boxplot(data=df['alcohol'])
0.25 9.5
0.75 11.3
Name: alcohol, dtype: float64
<matplotlib.axes._subplots.AxesSubplot at 0x7f7f2e8309e8>
sns.boxplot(data=df, palette="Set3")
<matplotlib.axes._subplots.AxesSubplot at 0x7f7f2e89dda0>
sns.boxplot(data=summary, palette="Set3")
import pandas as pd
from datetime import datetime
import numpy as np
# Let's create a pandas series that logs time every hour from 1st Nov'19 to 7th Nov'19
df = pd.date_range(start='11/01/2019', end='11/07/2019', freq='H')
df
df.head()
10
11
Page
Summary Stats - we can use statistical methods over different time intervals
mean(), sum(), count(), min(), max()
Down-sampling
reduce datetime rows to longer frequency
Up-sampling
increase datetime rows to shorter frequency
# Using resample to get the average for each day per hour
df.resample('D').mean()
#df.resample('D').sum()
Parsing dates
df.info()
pd.to_datetime(df)
12
Page
import numpy as np
import matplotlib.pyplot as plt
# Let's create some random data using linspace
x = np.linspace(0, 100, 50)  # creates a range of 50 numbers evenly spaced between 0 and 100
delta = np.random.uniform(-10, 10, x.size)
y = 0.5*x + 3 + delta
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x7f0fc11b2080>
13
Page
learning_rate = 0.0001
# defining our initial values of b and m
initial_b = 0
initial_m = 0
# Setting how many iterations we wish to run
num_iterations= 1000
print('Initial MSE:', cost_function(initial_m, initial_b, x, y))
[b, m] = gradient_descent(initial_b, initial_m, x, y, learning_rate,
num_iterations)
print('b:', b)
print('m:', m)
print('MSE:', cost_function(m, b, x, y))
import numpy as np
from sklearn.linear_model import LinearRegression
# change the shape of x array to the format expected by sklearn
print(x.shape)
x = x.reshape((-1, 1))
14
print(x.shape)
(50,)
Page
(50, 1)
# create the model object using LinearRegression
model = LinearRegression()
print('Y Intercept:', b)
print('Gradient:', m)
Y Intercept: 0.3047385205747066
Gradient: 0.5417087284350158
plt.scatter(x, y)
plt.plot(x, y_pred, color='r')
[<matplotlib.lines.Line2D at 0x7f0fc1132400>]
15
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
file_name = "https://raw.githubusercontent.com/Jovita7/Data-Analysis-and-Visualization/main/olympic1"
df = pd.read_csv(file_name)
import numpy as np
from sklearn.linear_model import LinearRegression
model.predict(x_2030)
Page
array([9.52679525])
x = auto_df.iloc[:, 1:8].values
y = auto_df.iloc[:, 0].values
x.shape
(392, 7)
17
Page
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train,Y_train)
regressor.predict(Trial_Data)[0]
29.31873696265787
18
Page
# Load data
file_name = "https://raw.githubusercontent.com/rajeevratan84/datascienceforbusiness/master/heights_weights_genders.csv"
df = pd.read_csv(file_name)
df.head()
df.shape
(10000, 3)
#Metric -Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y_test, prediction)
accuracy
0.9186666666666666
error_rate = 1 - accuracy
error_rate
0.08133333333333337
# Get prediction
20
prediction_individual = clf.predict([(height,weight)])
if prediction_individual[0]:
Page
result = "Male"
else:
result = "Female"
print("Person is " + result)
Person is Female
21
Page
import pandas as pd
#read csv
data = pd.read_csv('https://raw.githubusercontent.com/Jovita7/Data-Analysis-and-Visualization/main/auto_mpg')
find_outliers(data, 'mpg')
find_outliers(new_data, 'mpg')
22
Page
23
Page
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy {0:.2f}%".format(100*accuracy_score(y_pred, y_test)))
error = []
pred_i = knn.predict(X_test)
error.append(np.mean(pred_i != y_test))
Page
25
Page
#Entropy Calculation
dtree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=100, max_depth=3, min_samples_leaf=5)
#Predictions
prediction = dtree_entropy.predict(predictors_test)
#Calculate accuracy
accuracy_score(target_test, prediction, normalize= True)
0.84375
#Text Visualization
from sklearn import tree
text_representation = tree.export_text(dtree_entropy)
print(text_representation)
26
Page
#Graphical Visualization
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(10,15))
#Getting column names
col_names = list(data.columns)
_ = tree.plot_tree(dtree_entropy, feature_names=col_names[0:7],
class_names=['0','1'], filled=True)
27
Page
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
data = pd.read_csv('https://raw.githubusercontent.com/Jovita7/Data-Analysis-and-Visualization/main/auto_mpg')
data.head()