Professional Documents
Culture Documents
ML Manual final
ML Manual final
EX.NO: 1
LINEAR REGRESSION
DATE:
AIM
To explore the linear regression model on the USA Housing and the UCI Pima Indians
Diabetes data sets.
ALGORITHM
STEP 2: To download any kind of data set, such as the housing dataset, from Kaggle.
STEP 4: To find the linear and logistic regression model using the given data set.
PROGRAM
# Exploratory analysis of the Pima Indians Diabetes dataset: load the CSV
# and inspect its basic structure before plotting.
import pandas as pd
import numpy as np
# Fixed: the import was split across two broken lines in the original
# ("matplotlib.ticker" / "import FormatStrFormatter").
from matplotlib.ticker import FormatStrFormatter
import warnings

warnings.filterwarnings('ignore')  # silence library deprecation noise in output

df = pd.read_csv('C:/Users/diabetes.csv')
df.head()
df.shape
df.dtypes
# Outcome is a 0/1 diabetic flag; storing it as bool makes later
# True/False filtering and hue-grouped plots clearer.
df['Outcome'] = df['Outcome'].astype('bool')
plot00=sns.countplot('Pregnancies',data=df,ax=axes[0][0],color='green')
axes[0][0].set_title('Count',fontdict={'fontsize':8})
axes[0][0].set_xlabel('Month of Preg.',fontdict={'fontsize':7})
axes[0][0].set_ylabel('Count',fontdict={'fontsize':7})
plt.tight_layout()
plot01=sns.countplot('Pregnancies',data=df,hue='Outcome',ax=axes[0][1])
axes[0][1].set_title('Diab. VS Non-Diab.',fontdict={'fontsize':8})
axes[0][1].set_xlabel('Month of Preg.',fontdict={'fontsize':7})
axes[0][1].set_ylabel('Count',fontdict={'fontsize':7}) plot01.axes.legend(loc=1)
plt.setp(axes[0][1].get_legend().get_texts(), fontsize='6')
plt.setp(axes[0][1].get_legend().get_title(), fontsize='6')
plt.tight_layout()
plot10 = sns.distplot(df['Pregnancies'],ax=axes[1][0])
axes[1][0].set_title('Pregnancies Distribution',fontdict={'fontsize':8})
axes[1][0].set_xlabel('Pregnancy Class',fontdict={'fontsize':7})
axes[1][0].set_ylabel('Freq/Dist',fontdict={'fontsize':7}) plt.tight_layout()
plot11=f[df['Outcome']==False]['Pregnancies'].plot.hist(ax=axes[1][1],label='N
on- Diab.')
`2
plot11_2=df[df['Outcome']==True]['Pregnancies'].plot.hist(ax=axes[1][1],label=
'Diab.')
axes[1][1].set_title('Diab. VS Non-Diab.',fontdict={'fontsize':8})
axes[1][1].set_xlabel('Pregnancy Class',fontdict={'fontsize':7})
axes[1][1].set_ylabel('Freq/Dist',fontdict={'fontsize':7})
plot11.axes.legend(loc=1)
plt.tight_layout()
plot20 = sns.boxplot(df['Pregnancies'],ax=axes[2][0],orient='v')
axes[2][0].set_title('Pregnancies',fontdict={'fontsize':8})
axes[2][0].set_xlabel('Pregnancy',fontdict={'fontsize':7})
plt.tight_layout()
plot21=sns.boxplot(x='Outcome',y='Pregnancies',data=df,ax=axes[2][1])
axes[2][1].set_title('Diab. VS Non-Diab.',fontdict={'fontsize':8})
axes[2][1].set_xlabel('Pregnancy',fontdict={'fontsize':7})
axes[2][1].set_ylabel('Five Point
Summary',fontdict={'fontsize':7})
plt.xticks(ticks=[0,1],labels=['Non-
Diab.','Diab.'],fontsize=7) plt.tight_layout()
plt.show()
`3
OUTPUT
plot00=sns.distplot(df['BloodPressure'],ax=axes[0][0],color='green')
axes[0][0].yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
axes[0][0].set_title('Distribution of BP',fontdict={'fontsize':8})
axes[0][0].set_xlabel('BP Class',fontdict={'fontsize':7})
axes[0][0].set_ylabel('Count/Dist.',fontdict={'fontsize':7}) plt.tight_layout()
plot01=sns.distplot(df[df['Outcome']==False]['BloodPressure'],ax=axes[0][1],color='gr
een',label='NonDiab.')
sns.distplot(df[df.Outcome==True]['BloodPressure'],ax=axes[0][1],color='red',label='
Diab')
axes[0][1].set_title('Distribution of BP',fontdict={'fontsize':8})
`4
axes[0][1].set_xlabel('BP Class',fontdict={'fontsize':7})
axes[0][1].set_ylabel('Count/Dist.',fontdict={'fontsize':7})
axes[0][1].yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
plot01.axes.legend(loc=1)
plt.setp(axes[0][1].get_legend().get_texts(), fontsize='6')
plt.setp(axes[0][1].get_legend().get_title(), fontsize='6')
plt.tight_layout()
plot10=sns.boxplot(df['BloodPressure'],ax=axes[1][0],orient='v')
axes[1][0].set_title('Numerical Summary',fontdict={'fontsize':8})
axes[1][0].set_xlabel('BP',fontdict={'fontsize':7})
plt.tight_layout()
plot11=sns.boxplot(x='Outcome',y='BloodPressure',data=df,ax=axes[1][1])
plt.xticks(ticks=[0,1],labels=['Non-Diab.','Diab.'],fontsize=7)
axes[1][1].set_xlabel('Category',fontdict={'fontsize':7})
plt.tight_layou
t()plt.show()
plot0=sns.distplot(df[df['BloodPressure']!=0]['BloodPressure'],ax=axes[0],co
lor='green') axes[0].yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
`5
axes[0].set_title('Distribution of BP',fontdict={'fontsize':8})
axes[0].set_xlabel('BP
Class',fontdict={'fontsize':7})
axes[0].set_ylabel('Count/Dist.',fontdict={'font
size':7})plt.tight_layout()
plot1=sns.boxplot(df[df['BloodPressure']!=0]['BloodPressure'],ax=axes[1],orien
t='v')axes[1].set_title('Numerical Summary',fontdict={'fontsize':8})
axes[1].set_xlabel('BloodPressure',fontdict={'fontsize':7})
plt.tight_layout()
OUTPUT
`6
LINEAR REGRESSION MODELLING ON HOUSING DATASET
# Load the USA Housing dataset and run quick exploratory visualisations.
# NOTE(review): `sns` (seaborn) and the figure setup are imported/created
# elsewhere in the manual — confirm `import seaborn as sns` precedes this.
import pandas as pd
import numpy as np

USAhousing = pd.read_csv('USA_Housing.csv')
USAhousing.head()
USAhousing.info()
USAhousing.describe()
USAhousing.columns
# Fixed: the three plotting calls below were fused onto single lines
# in the original text.
sns.pairplot(USAhousing)            # pairwise feature relationships
sns.distplot(USAhousing['Price'])   # distribution of the target (Price)
sns.heatmap(USAhousing.corr())      # feature correlation matrix
`7
X = USAhousing[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area
random_state=101)from sklearn.linear_model
import LinearRegression
lm = LinearRegression()lm.fit(X_train,y_train)
coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])coeff_df
predictions = lm.predict(X_test)plt.scatter(y_test,predictions)
sns.distplot((y_test-predictions),bins=50);from sklearn
import metrics
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test,
predictions)))
`8
OUTPUT
`9
RESULT
Exploring various commands for doing bivariate analytics on the USA Housing
dataset was successfully executed.
`10
EX.NO: 2
BINARY CLASSIFICATION MODEL
DATE:
AIM
ALGORITHM:
`11
PROGRAM
import pandas as pd
roc_auc_score
data = pd.read_csv('house_data.csv')
X = data.drop('above_price_threshold', axis=1)
y = data['above_price_threshold']
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
`12
f1 = f1_score(y_test, y_pred)
probabilities = model.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
threshold = 0.3
print("Accuracy:", accuracy_threshold)
print("Precision:", precision_threshold)
print("Recall:", recall_threshold)
`13
print("F1-Score:", f1_threshold)
OUTPUT
Accuracy: 0.85
Precision: 0.75
Recall: 0.9
F1-Score: 0.82
RESULT
Thus the Binary Classification model with different classification metrics was executed successfully.
`14
EX.NO: 3 CLASSIFICATION WITH NEAREST NEIGHBORS WITH
KNN CLASSIFIER
DATE:
AIM
ALGORITHM
train_test_split from sklearn.model_selection to split the data into training and test
sets.
Use pd.read_csv to load the fake news detection dataset into a pandas DataFrame
named data.
`15
Use vectorizer.fit_transform to convert the text data (X) into numerical features
using TF-IDF vectorization. This step transforms the text data into a matrix of TF-
IDF features.
Use train_test_split to split the TF-IDF features (X) and labels (y) into training and
Create a KNN classifier object, knn, with the desired number of neighbors (k).
Train the KNN classifier on the training data (X_train, y_train) using the fit
method.
Use the trained KNN classifier to make predictions (y_pred) on the test set (X_test).
Calculate the accuracy of the classifier by comparing the predicted labels with the
Output the calculated accuracy to evaluate the performance of the KNN classifier
PROGRAM
import pandas as pd
`16
from sklearn.metrics import accuracy_score
data = pd.read_csv('fake_news_dataset.csv')
X = data['text']
y = data['label']
# Convert the text data into numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
k=5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:", accuracy)
import pandas as pd
`17
from pandas import DataFrame
dataset
iris_obj =
load_iris()#
Dataset preview
iris_obj.data
for i in range(iris_obj.data.shape[0])])).join(DataFrame(iris_obj.target,
columns=pd.Index(["species"]),index=pd.Index([i for i in
range(iris_obj.target.shape[0])])))
data Commands
iris_obj.feature_na
mesiris.count()
iris.mean()
iris.median()
iris.var()
iris.std()
iris.max()
`18
iris.min()
iris.describe()
OUTPUT
RESULT
Thus the implementation of nearest neighbors classification with the KNN classifier was executed successfully.
`19
Ex.NO: 4
VALIDATION SET&TEST SET
DATE:
AIM
To implement a validation set and test set with a different regression model.
ALGORITHM
Take the original dataset and randomly split it into three subsets: a training set, a
Decide on the proportions of the splits (e.g., 70% training, 15% validation, 15%
test) based on the size of your dataset and the desired evaluation strategy.
Use the training set to train the classifier or model of your choice. This involves
fitting the model to the training data and adjusting its parameters or weights.
STEP 3:
Apply the trained model to the validation set and obtain predictions for each
Compare the predicted labels with the true labels in the validation set to measure
Once you have selected the best model based on its performance on the validation
Obtain predictions for each instance in the test set using the selected model.
Compare the predicted labels with the true labels in the test set to evaluate the final
Calculate evaluation metrics on the test set to assess the generalization capability
of the model.
Examine the evaluation metrics obtained on the validation and test sets to assess
Compare the results to make informed decisions about the model's suitability for
domain-specific requirements
PROGRAM
# Data manipulation libraries
import numpy as np
import pandas as pd
`21
Regressionfrom sklearn.linear_model
import LogisticRegression
from sklearn.model_selection
import LabelEncoder,MinMaxScaler,OneHotEncoder,StandardScaler
from sklearn.metrics
import ColumnTransformer
#for plotting
df=pd.read_csv('C:/Users/diabetes.csv')
df.head()
df.tail()
df.isnull().sum()
df.describe(include='all')
df.corr()
sns.heatmap(df.corr(),annot=True)plt.show()
df.hist() plt.show()
sns.countplot(x=df['Outcome'])
`22
scaler=StandardScaler()
'DiabetesPedigreeFunction', 'Age']]=scaler.fit_transform(df[['Pregnancies',
'DiabetesPedigreeFunction', 'Age']])df_new = df
'Age']],df_new['Outcome'],test_size=0.20,random_state=21)
# Build Model
model.predict(x_test)
score=model.score(x_test,y_test);
print(score)
`23
0.7337662337662337
#Confusion Matrix
np.set_printoptions(precision=2)cnf_matrix
OUTPUT
`24
`25
RESULT
Thus the validation set and test set with a regression model on the dataset were executed
successfully.
`26
EX.NO: 5
K-MEANS ALGORITHM
DATE:
AIM
ALGORITHM
max_iters.
Repeat the following steps until convergence or until reaching the maximum
number of iterations:
Assign each data point to the nearest centroid by calculating the Euclidean
distance.
Update the centroids based on the mean of the data points assigned to
each cluster.
Return the cluster labels assigned to each data point and the final
centroids.
`27
PROGRAM
import numpy as np
centroids[:, np.newaxis])**2).sum(axis=2))
return cluster_labels
"""Update the centroids based on the mean of the data points in each
cluster."""
return centroids
centroids = initialize_centroids(X, k)
for _ in range(max_iters):
prev_centroids = centroids.copy()
`28
# Assign data points to clusters
cluster_labels = assign_clusters(X, centroids)
# Update centroids
if np.allclose(centroids, prev_centroids):
break
# Example usage
X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
k=2
print("Centroids:", centroids)
`29
OUTPUT
Cluster Labels: [0 0 0 1 1 1]
Centroids: [[1. 2. ]
[3. 2. ]]
RESULT
`30
EX.NO: 6
NAÏVE BAYES CLASSIFIER
DATE:
AIM
ALGORITHM
Determine the unique class labels and store them in the classes array.
Calculate the prior probabilities for each class by dividing the count of
Create a nested dictionary entry for the feature and class if it doesn't
already exist.
`31
Calculate the conditional probability of each feature value given the class
by dividing the count of samples with the specific feature value and class
Initialize an empty array to store the posterior probabilities for each class.
Append the class label with the maximum posterior probability to the
`32
PROGRAM
import numpy as np
class NaiveBayesClassifier:
    """Naive Bayes classifier for discrete (categorical) features.

    Class priors and per-feature conditional probabilities are estimated
    from frequency counts in the training data; prediction returns the
    class with the maximum posterior probability. No smoothing is applied,
    so a feature value never seen with a class yields a zero posterior.
    """

    def __init__(self):
        # Fixed: the original declared ``def init (self)`` instead of
        # ``__init__``, so the constructor was never invoked.
        self.classes = None        # unique class labels, set by fit()
        self.class_priors = None   # P(c) for each label in self.classes
        self.feature_probs = None  # {(feature_idx, class): {value: P(value | class)}}

    def fit(self, X, y):
        """Estimate priors and conditional probabilities from (X, y).

        X: 2-D array of discrete feature values, one row per sample.
        y: 1-D array of class labels aligned with the rows of X.
        """
        self.classes = np.unique(y)
        self.class_priors = np.zeros(len(self.classes))
        self.feature_probs = {}
        for i, c in enumerate(self.classes):
            X_c = X[y == c]  # rows belonging to class c
            self.class_priors[i] = len(X_c) / len(X)
            for j in range(X.shape[1]):
                # Feature values are enumerated over the FULL dataset, so a
                # value absent within this class gets probability 0.
                feature_values = np.unique(X[:, j])
                self.feature_probs[(j, c)] = {}
                for value in feature_values:
                    count = len(X_c[X_c[:, j] == value])
                    self.feature_probs[(j, c)][value] = count / len(X_c)

    def predict(self, X):
        """Return the most probable class label for each row of X."""
        predictions = []
        for x in X:
            posterior_probs = []
            for i, c in enumerate(self.classes):
                posterior_prob = self.class_priors[i]
                for j, value in enumerate(x):
                    if (j, c) in self.feature_probs and value in self.feature_probs[(j, c)]:
                        posterior_prob *= self.feature_probs[(j, c)][value]
                    else:
                        # Unseen (feature, value) pair: zero posterior for
                        # this class; stop multiplying early.
                        posterior_prob = 0
                        break
                posterior_probs.append(posterior_prob)
            predictions.append(self.classes[np.argmax(posterior_probs)])
        return np.array(predictions)
# Sample dataset
classifier = NaiveBayesClassifier()
classifier.fit(X_train, y_train)
# Make predictions
predictions = classifier.predict(X_test)
print("Predictions:", predictions)
`34
OUTPUT
Predictions: [1 0]
RESULT
Thus the Naïve Bayes classifier was implemented and verified successfully.
`35