Lab File
Ques 1- Fill with a Constant Value.
Pseudo-code:
import pandas as pd
df = pd.DataFrame({
    'date': ['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
             '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04'],
    'fruit': ['apple', 'apple', 'apple', 'apple',
              'mango', 'mango', 'mango', 'mango'],
    'price': [0.80, None, None, 1.20, None, 2.10, 2.00, 1.80]})
df['date'] = pd.to_datetime(df['date'])
print("ORIGINAL DATA\n")
print(df)
# replace every NaN with the constant 0.85
df['price'] = df['price'].fillna(0.85)
print("\n\nDATA FILLED WITH CONSTANT VALUE IN PLACE OF NaN\n")
print(df)
Result-
Ques 2- Fill with the Mean of Column.
Pseudo-code:
# column-wide mean
df['price'] = df['price'].fillna(df['price'].mean())
Result-
Ques 3- Fill with Median of Column.
Pseudo-code:
df['price'] = df['price'].fillna(df['price'].median())
Result-
Ques 4- Fill with Mean of Group.
Pseudo-code:
# mean of each fruit group
df['price'] = df['price'].fillna(df.groupby('fruit')['price'].transform('mean'))
Result-
Ques 5- Fill with Median of Group.
Pseudo-code:
# median of each fruit group
df['price'] = df['price'].fillna(df.groupby('fruit')['price'].transform('median'))
Result-
Ques 6- Fill using Forward Fill.
Pseudo-code:
df['price'] = df['price'].ffill()
Result-
Ques 7- Fill using Forward Fill with Limit=1.
Pseudo-code:
df['price'] = df['price'].ffill(limit=1)
Result-
Ques 8- Fill using Forward Fill within Group.
Pseudo-code:
df['price'] = df.groupby('fruit')['price'].ffill()
Result-
Ques 9- Fill using Forward Fill within Group with Limit=1.
Pseudo-code:
df['price'] = df.groupby('fruit')['price'].ffill(limit = 1)
Result-
Ques 10- Fill using Back Fill.
Pseudo-code:
df['price'] = df['price'].bfill()
Result-
Ques 11- Fill using Back Fill with Limit=1.
Pseudo-code:
df['price'] = df['price'].bfill(limit=1)
Result-
Ques 12- Fill using Back Fill within Group.
Pseudo-code:
# backfill without propagation limit
df['price'] = df.groupby('fruit')['price'].bfill()
Result-
Ques 13- Fill using Back Fill within Group with Limit=1.
Pseudo-code:
df['price'] = df.groupby('fruit')['price'].bfill(limit = 1)
Result-
Ques 14- Fill by Combining both Forward Fill and Back Fill.
Pseudo-code:
df['price'] = df.groupby('fruit')['price'].ffill().bfill()
Result-
Ques 15- Fill by Combining both Back Fill and Forward Fill, applying
Back Fill first.
Pseudo-code:
df['price'] = df.groupby('fruit')['price'].bfill().ffill()
Result-
Ques 16- Fill using Interpolation.
Pseudo-code:
df['price'] = df['price'].interpolate(method='linear')
Result-
Ques 17- Fill using Interpolation within Group.
Pseudo-code:
# transform keeps the result aligned with df's original index
df['price'] = df.groupby('fruit')['price'].transform(
    lambda x: x.interpolate(method='linear'))
Result-
Ques 18- Fill using both Interpolation and Back Fill.
Pseudo-code:
df['price'] = df.groupby('fruit')['price'].transform(
    lambda x: x.interpolate(method='linear')).bfill()
Result-
Ques 19- Fill value based on Conditions.
Pseudo-code:
# transform returns a Series of group means aligned row-by-row with df
mean_price = df.groupby('fruit')['price'].transform('mean')
print("\nMEAN PRICE\n")
print(mean_price)
# fill conditionally: use the group mean only where price is missing
df['price'] = df['price'].where(df['price'].notna(), mean_price)
Result-
Conclusion
In this lab we filled missing values with a constant, with the column
mean/median, with the group mean/median, and with:
● Forward Fill
● Back Fill
● Interpolation
The choice of filling method depends on the assumptions and the
context of the problem. For example, filling the missing prices of
mangoes with the mean price of apples and mangoes together may not be
a good idea, as apples and mangoes have rather different prices in our
toy dataset; the small sketch below makes this concrete.
We also saw how to use each of these methods in conjunction with the
pandas groupby() method to fill missing values for each group
separately.
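A minimal sketch contrasting a global-mean fill with a group-mean fill
(the reduced toy frame here is our own, chosen for illustration):
import pandas as pd
toy = pd.DataFrame({'fruit': ['apple', 'apple', 'mango', 'mango'],
                    'price': [0.80, None, None, 2.00]})
# the global mean (1.40) ignores the fruit and overprices the apple gap
print(toy['price'].fillna(toy['price'].mean()))
# group means (0.80 and 2.00) respect each fruit's own price level
print(toy['price'].fillna(toy.groupby('fruit')['price'].transform('mean')))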
df.columns
df.columns = ['color', 'size', 'price', 'classlabel']
print(df)
# ordinal mapping for sizes (assumed, as in the class example)
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)
size_mapping.items()
# invert the mapping to recover the original string labels
inv_size_mapping = {v: k for k, v in size_mapping.items()}
inv_size_mapping
df['size'] = df['size'].map(inv_size_mapping)
df
Conclusion
Pseudo-code:
import pandas as pd
from io import StringIO
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
0.0,11.0,12.0,'''
# StringIO lets read_csv parse the string as if it were a file
df = pd.read_csv(StringIO(csv_data))
df
Result-
Ques 4- Show the Values in the DataFrame.
Pseudo-code:
df.values
Result-
Conclusion
12-January-2023
Aim- Encoding Categorical Data.
Ques 1- Create the DataFrame and Name the Columns.
Pseudo-code:
import pandas as pd
df=pd.DataFrame([
['green','M',10.1,'class1'],
['red','L',13.5,'class2'],
['blue','XL',15.3,'class1']
])
df
df.columns
df.columns=['color','size','price','classlabel']
df
Ques 2- Encode the Class Labels.
Pseudo-code with Result:
import numpy as np
np.unique(df['classlabel'])
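np.unique only lists the distinct labels; the encoding step itself was
not shown. A minimal sketch, assuming scikit-learn's LabelEncoder:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
# fit_transform maps 'class1'/'class2' to the integers 0/1
df['classlabel'] = class_le.fit_transform(df['classlabel'])
print(df)
# the integer codes can be reversed with inverse_transform
df['classlabel'] = class_le.inverse_transform(df['classlabel'])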
Aim- Standardizing Features with StandardScaler.
Pseudo-code with Result:
from sklearn import preprocessing
import numpy as np
# example training matrix (assumed; the original definition was not shown)
X_train = np.array([[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]])
scaler = preprocessing.StandardScaler().fit(X_train)
scaler.mean_
scaler.scale_
X_scaled = scaler.transform(X_train)
X_scaled
X_scaled.mean(axis=0)
X_scaled.std(axis=0)
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
X, y = make_classification(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y,
random_state=42)
pipe = make_pipeline(StandardScaler(),
LogisticRegression())
pipe.fit(X_train, y_train)   # scaling is fit on the training data only
pipe.score(X_test, y_test)   # the same scaling is applied to the test data
import numpy as np
import matplotlib.pyplot as plt
x=np.array([1,2,3,4,5])
y=np.array([7,14,15,18,19])
n=np.size(x)
x_mean=np.mean(x)
y_mean=np.mean(y)
x_mean,y_mean
Sxy=np.sum(x*y)-n*x_mean*y_mean
Sxx=np.sum(x*x)-n*x_mean*x_mean
b1=Sxy/Sxx
b0=y_mean-b1*x_mean
print('slope b1 is',b1)
print('intercept b0 is',b0)
# predicted values from the fitted line
y_predict = b0 + b1*x
plt.scatter(x, y, color='red')
plt.plot(x, y_predict, color='green')
plt.ylabel('Y')
plt.xlabel('X')
plt.show()
error = y - y_predict
se = np.sum(error**2)           # sum of squared errors
mse = se/n                      # mean squared error
rmse = np.sqrt(mse)             # root mean squared error
SSt = np.sum((y - y_mean)**2)   # total sum of squares
R2 = 1 - (se/SSt)
print('SE:', se, 'MSE:', mse, 'RMSE:', rmse, 'R2:', R2)
Conclusion
We can observe the points distributed in the graph fitting the
regression line. We got SE as 10.80, MSE as 2.16, RMSE as 1.4697 and
an R2 score of 0.8789. The high value of R-squared shows that linear
regression fits the data well.
MinMaxScaler, by contrast, transforms features by scaling each feature
to a given range: the estimator scales and translates each feature
individually so that it lies within the given range on the training
set, e.g. between zero and one.
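That description corresponds to scikit-learn's MinMaxScaler; a minimal
sketch on a small matrix of our own choosing:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
X = np.array([[1., 10.], [2., 20.], [4., 40.]])
scaler = MinMaxScaler(feature_range=(0, 1))
# each column is scaled and translated to span exactly [0, 1]
print(scaler.fit_transform(X))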
The result of the corr() method is a table of numbers that represents
how strong the relationship between two columns is. A value of 1 means
a one-to-one relationship (a perfect correlation): for such a data
set, each time a value went up in the first column, the other went up
as well. A value of 0.9 is also a strong relationship; if you increase
one value, the other will probably increase as well.
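A concrete illustration on the diabetes data used above (the column
selection here is ours, for illustration):
import pandas as pd
df = pd.read_csv('diabetes.csv')
# diagonal entries are 1.0: every column correlates perfectly with itself
print(df[['Glucose', 'Insulin', 'BMI']].corr())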
The adjusted R-squared is positive, not negative, and it is always
lower than the R-squared. The idea of SVR is to consider only the
points that lie within the decision boundary (the ε-tube around the
regression line); the best-fit line is the hyperplane that contains
the maximum number of points.
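A quick check of that claim with the numbers from the regression lab
above (n = 5 samples, k = 1 predictor, both taken from that lab):
# adjusted R-squared: 1 - (1 - R2) * (n - 1) / (n - k - 1)
n, k = 5, 1
R2 = 0.8789
R2_adj = 1 - (1 - R2) * (n - 1) / (n - k - 1)
print(R2_adj)   # ~0.8385: positive, and lower than R2, as stated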
Conclusion
25-January-2023
Aim- Correlation Coefficient.
Pseudo-code with Result:
import pandas as pd
import numpy as np
df=pd.read_csv('diabetes.csv')
df
x=df['DiabetesPedigreeFunction']
y=df['Glucose']
print(x,y)
corr_result=np.corrcoef(x,y)
corr_result
p=df['Insulin']
corr_result=np.corrcoef(p,y)
corr_result
q=df['Pregnancies']
corr_result=np.corrcoef(q,y)
print(corr_result)
from sklearn.linear_model import LinearRegression
# df2 is assumed to hold the predictor columns examined above
df2 = df[['DiabetesPedigreeFunction', 'Insulin', 'Pregnancies']]
regression_model = LinearRegression()
regression_model.fit(df2, y)
y_predicted = regression_model.predict(df2)
print(y_predicted)
08 and 09 February, 2023
Aim- Preprocessing: Normalization.
Pseudo-code with Result:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import pandas as pd
df = pd.read_csv('diabetes.csv')
print(df)
x = df[['Insulin', 'Age', 'BMI', 'DiabetesPedigreeFunction']].to_numpy()
y = df.Glucose.to_numpy()
# L2-normalize each row (sample) of the feature matrix
normalized_arr = preprocessing.normalize(x, axis=1)
regressor = SVR(kernel='linear')
regressor.fit(x, y)
y_pred = regressor.predict(x)
import matplotlib.pyplot as plt
plt.scatter(df['Age'], y, color='tab:blue')
plt.scatter(df['Age'], y_pred, color='tab:red')
plt.show()
15 February, 2023
Aim- Decision Tree Regression.
Pseudo-code with Result:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mtp
df=pd.read_csv('diabetes.csv')
print(df)
Result-
x = df.iloc[:, [3, 4, 5, 6, 7]].values
y = df.iloc[:, 1].values
print(x)
Result-
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,
    test_size=0.25, random_state=0)
# scale after splitting: fit on the training set, transform both sets
from sklearn.preprocessing import StandardScaler
st_x = StandardScaler()
x_train = st_x.fit_transform(x_train)
x_test = st_x.transform(x_test)
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(criterion='squared_error',
    max_depth=3, random_state=0)
model.fit(x_train, y_train)
Result-
y_pred=model.predict(x_test)
y_pred
model.score(x_test, y_test)
Result-
# 'classifier' is assumed to be a classification model fitted in an
# earlier step (a confusion matrix applies to class predictions, not
# to the regressor's continuous output)
y_pred = classifier.predict(x_test)
y_pred
Result-
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm
Result-
model.score(x_test, y_test)
Result-
Hierarchical Clustering
Importing the Libraries
Pseudocode:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
# the iris frame with the column names used below (loading it via
# seaborn is an assumption; the original loading step was not shown)
iris = sns.load_dataset('iris')
# single-linkage clustering on the four numeric features
dist_sin = linkage(iris.loc[:, ["sepal_length", "sepal_width",
    "petal_length", "petal_width"]], method="single")
iris_SM = iris.copy()
iris_SM['2_clust'] = fcluster(dist_sin, 2, criterion='maxclust')
iris_SM['3_clust'] = fcluster(dist_sin, 3, criterion='maxclust')
iris_SM.head()
Plotting Different Graphs:
Pseudocode:
plt.figure(figsize=(24,4))
plt.subplot(1,3,1)
plt.title("K = 2",fontsize=14)
sns.scatterplot(x="petal_length",y="petal_width", data=iris_SM,
hue="2_clust")
plt.subplot(1,3,2)
plt.title("K = 3",fontsize=14)
sns.scatterplot(x="petal_length",y="petal_width", data=iris_SM,
hue="3_clust")
plt.subplot(1,3,3)
plt.title("Species",fontsize=14)
sns.scatterplot(x="petal_length",y="petal_width", data=iris_SM,
hue="species")
Pseudocode:
plt.figure(figsize=(24,4))
plt.subplot(1,2,1)
plt.title("K = 2",fontsize=14)
sns.swarmplot(x="species",y="2_clust", data=iris_SM, hue="species")
plt.subplot(1,2,2)
plt.title("K = 3",fontsize=14)
sns.swarmplot(x="species",y="3_clust", data=iris_SM, hue="species")
Pseudocode:
dist_comp = linkage(iris.loc[:,["sepal_length", "sepal_width",
"petal_length", "petal_width"]],method="complete")
plt.figure(figsize=(18,6))
dendrogram(dist_comp, leaf_rotation=90)
plt.xlabel('Index')
plt.ylabel('Distance')
plt.suptitle("DENDROGRAM COMPLETE METHOD",fontsize=18)
plt.show()
Pseudocode:
iris_CM=iris.copy()
iris_CM['2_clust']=fcluster(dist_comp,2, criterion='maxclust')
iris_CM['3_clust']=fcluster(dist_comp,3, criterion='maxclust')
iris_CM.head()
Pseudocode:
plt.figure(figsize=(24,4))
plt.subplot(1,3,1)
plt.title("K = 2",fontsize=14)
sns.scatterplot(x="sepal_length",y="sepal_width", data=iris_CM,
hue="2_clust")
plt.subplot(1,3,2)
plt.title("K = 3",fontsize=14)
sns.scatterplot(x="sepal_length",y="sepal_width", data=iris_CM,
hue="3_clust")
plt.subplot(1,3,3)
plt.title("Species",fontsize=14)
sns.scatterplot(x="sepal_length",y="sepal_width", data=iris_CM,
hue="species")
Pseudocode:
plt.figure(figsize=(24,4))
plt.subplot(1,2,1)
plt.title("K = 2",fontsize=14)
sns.swarmplot(x="species",y="2_clust", data=iris_CM, hue="species")
plt.subplot(1,2,2)
plt.title("K = 3",fontsize=14)
sns.swarmplot(x="species",y="3_clust", data=iris_CM, hue="species")
Thompson Sampling
Importing the Essential Libraries
Pseudocode:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
Implementing UCB
Pseudocode:
import math
# 'dataset' is assumed to be the ad-clicks table used in class, e.g.:
# dataset = pd.read_csv('Ads_CTR_Optimisation.csv')
N = 10000                        # number of rounds
d = 10                           # number of ads (arms)
ads_selected = []
numbers_of_selections = [0] * d
sums_of_rewards = [0] * d
total_reward = 0
for n in range(0, N):
    ad = 0
    max_upper_bound = 0
    for i in range(0, d):
        if numbers_of_selections[i] > 0:
            average_reward = sums_of_rewards[i] / numbers_of_selections[i]
            # confidence width of arm i at round n
            delta_i = math.sqrt(3/2 * math.log(n + 1) / numbers_of_selections[i])
            upper_bound = average_reward + delta_i
        else:
            # an unselected arm gets an effectively infinite bound,
            # so every arm is tried at least once
            upper_bound = 1e400
        if upper_bound > max_upper_bound:
            max_upper_bound = upper_bound
            ad = i
    ads_selected.append(ad)
    numbers_of_selections[ad] += 1
    reward = dataset.values[n, ad]
    sums_of_rewards[ad] += reward
    total_reward += reward
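The aim above names Thompson Sampling, while the loop shown implements
UCB. A minimal sketch of Thompson Sampling on the same assumed dataset,
keeping Beta posteriors over the binary click rewards:
import random
N = 10000
d = 10
ads_selected = []
numbers_of_rewards_1 = [0] * d   # rounds in which each ad returned reward 1
numbers_of_rewards_0 = [0] * d   # rounds in which each ad returned reward 0
total_reward = 0
for n in range(0, N):
    ad = 0
    max_random = 0
    for i in range(0, d):
        # draw from the Beta posterior of arm i
        random_beta = random.betavariate(numbers_of_rewards_1[i] + 1,
                                         numbers_of_rewards_0[i] + 1)
        if random_beta > max_random:
            max_random = random_beta
            ad = i
    ads_selected.append(ad)
    reward = dataset.values[n, ad]  # same assumed dataset as the UCB loop
    if reward == 1:
        numbers_of_rewards_1[ad] += 1
    else:
        numbers_of_rewards_0[ad] += 1
    total_reward += reward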