Professional Documents
Culture Documents
ML Manual final
ML Manual final
EX.NO: 1
LINEAR REGRESSION
DATE:
AIM
To explore the linear regression model on the USA Housing and the UCI Pima Indians
Diabetes data sets.
ALGORITHM
STEP 2: To download any kind of data set, such as the housing dataset, from Kaggle.
STEP 4: To find the linear and logistic regression model using the given data set.
PROGRAM
# Exploratory analysis of the Pima Indians Diabetes dataset: load the CSV
# and inspect its basic structure before plotting.
import pandas as pd
import numpy as np
# Fixed: the import was split across two broken lines in the original
# ("matplotlib.ticker" / "import FormatStrFormatter").
from matplotlib.ticker import FormatStrFormatter
import warnings

warnings.filterwarnings('ignore')  # silence library deprecation noise in output

df = pd.read_csv('C:/Users/diabetes.csv')
df.head()
df.shape
df.dtypes
# Outcome is a 0/1 diabetic flag; storing it as bool makes later
# True/False filtering and hue-grouped plots clearer.
df['Outcome'] = df['Outcome'].astype('bool')
plot00=sns.countplot('Pregnancies',data=df,ax=axes[0][0],color='green')
axes[0][0].set_title('Count',fontdict={'fontsize':8})
axes[0][0].set_xlabel('Month of Preg.',fontdict={'fontsize':7})
axes[0][0].set_ylabel('Count',fontdict={'fontsize':7})
plt.tight_layout()
plot01=sns.countplot('Pregnancies',data=df,hue='Outcome',ax=axes[0][1])
axes[0][1].set_title('Diab. VS Non-Diab.',fontdict={'fontsize':8})
axes[0][1].set_xlabel('Month of Preg.',fontdict={'fontsize':7})
axes[0][1].set_ylabel('Count',fontdict={'fontsize':7}) plot01.axes.legend(loc=1)
plt.setp(axes[0][1].get_legend().get_texts(), fontsize='6')
plt.setp(axes[0][1].get_legend().get_title(), fontsize='6')
plt.tight_layout()
plot10 = sns.distplot(df['Pregnancies'],ax=axes[1][0])
axes[1][0].set_title('Pregnancies Distribution',fontdict={'fontsize':8})
axes[1][0].set_xlabel('Pregnancy Class',fontdict={'fontsize':7})
axes[1][0].set_ylabel('Freq/Dist',fontdict={'fontsize':7}) plt.tight_layout()
plot11=f[df['Outcome']==False]['Pregnancies'].plot.hist(ax=axes[1][1],label='N
on- Diab.')
`2
plot11_2=df[df['Outcome']==True]['Pregnancies'].plot.hist(ax=axes[1][1],label=
'Diab.')
axes[1][1].set_title('Diab. VS Non-Diab.',fontdict={'fontsize':8})
axes[1][1].set_xlabel('Pregnancy Class',fontdict={'fontsize':7})
axes[1][1].set_ylabel('Freq/Dist',fontdict={'fontsize':7})
plot11.axes.legend(loc=1)
plt.tight_layout()
plot20 = sns.boxplot(df['Pregnancies'],ax=axes[2][0],orient='v')
axes[2][0].set_title('Pregnancies',fontdict={'fontsize':8})
axes[2][0].set_xlabel('Pregnancy',fontdict={'fontsize':7})
plt.tight_layout()
plot21=sns.boxplot(x='Outcome',y='Pregnancies',data=df,ax=axes[2][1])
axes[2][1].set_title('Diab. VS Non-Diab.',fontdict={'fontsize':8})
axes[2][1].set_xlabel('Pregnancy',fontdict={'fontsize':7})
axes[2][1].set_ylabel('Five Point
Summary',fontdict={'fontsize':7})
plt.xticks(ticks=[0,1],labels=['Non-
Diab.','Diab.'],fontsize=7) plt.tight_layout()
plt.show()
`3
OUTPUT
plot00=sns.distplot(df['BloodPressure'],ax=axes[0][0],color='green')
axes[0][0].yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
axes[0][0].set_title('Distribution of BP',fontdict={'fontsize':8})
axes[0][0].set_xlabel('BP Class',fontdict={'fontsize':7})
axes[0][0].set_ylabel('Count/Dist.',fontdict={'fontsize':7}) plt.tight_layout()
plot01=sns.distplot(df[df['Outcome']==False]['BloodPressure'],ax=axes[0][1],color='gr
een',label='NonDiab.')
sns.distplot(df[df.Outcome==True]['BloodPressure'],ax=axes[0][1],color='red',label='
Diab')
axes[0][1].set_title('Distribution of BP',fontdict={'fontsize':8})
`4
axes[0][1].set_xlabel('BP Class',fontdict={'fontsize':7})
axes[0][1].set_ylabel('Count/Dist.',fontdict={'fontsize':7})
axes[0][1].yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
plot01.axes.legend(loc=1)
plt.setp(axes[0][1].get_legend().get_texts(), fontsize='6')
plt.setp(axes[0][1].get_legend().get_title(), fontsize='6')
plt.tight_layout()
plot10=sns.boxplot(df['BloodPressure'],ax=axes[1][0],orient='v')
axes[1][0].set_title('Numerical Summary',fontdict={'fontsize':8})
axes[1][0].set_xlabel('BP',fontdict={'fontsize':7})
plt.tight_layout()
plot11=sns.boxplot(x='Outcome',y='BloodPressure',data=df,ax=axes[1][1])
plt.xticks(ticks=[0,1],labels=['Non-Diab.','Diab.'],fontsize=7)
axes[1][1].set_xlabel('Category',fontdict={'fontsize':7})
plt.tight_layou
t()plt.show()
plot0=sns.distplot(df[df['BloodPressure']!=0]['BloodPressure'],ax=axes[0],co
lor='green') axes[0].yaxis.set_major_formatter(FormatStrFormatter('%.3f'))
`5
axes[0].set_title('Distribution of BP',fontdict={'fontsize':8})
axes[0].set_xlabel('BP
Class',fontdict={'fontsize':7})
axes[0].set_ylabel('Count/Dist.',fontdict={'font
size':7})plt.tight_layout()
plot1=sns.boxplot(df[df['BloodPressure']!=0]['BloodPressure'],ax=axes[1],orien
t='v')axes[1].set_title('Numerical Summary',fontdict={'fontsize':8})
axes[1].set_xlabel('BloodPressure',fontdict={'fontsize':7})
plt.tight_layout()
OUTPUT
`6
LINEAR REGRESSION MODELLING ON HOUSING DATASET
# Load the USA Housing dataset and run quick exploratory visualisations.
# NOTE(review): `sns` (seaborn) and the figure setup are imported/created
# elsewhere in the manual — confirm `import seaborn as sns` precedes this.
import pandas as pd
import numpy as np

USAhousing = pd.read_csv('USA_Housing.csv')
USAhousing.head()
USAhousing.info()
USAhousing.describe()
USAhousing.columns
# Fixed: the three plotting calls below were fused onto single lines
# in the original text.
sns.pairplot(USAhousing)            # pairwise feature relationships
sns.distplot(USAhousing['Price'])   # distribution of the target (Price)
sns.heatmap(USAhousing.corr())      # feature correlation matrix
`7
X = USAhousing[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area
random_state=101)from sklearn.linear_model
import LinearRegression
lm = LinearRegression()lm.fit(X_train,y_train)
coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])coeff_df
predictions = lm.predict(X_test)plt.scatter(y_test,predictions)
sns.distplot((y_test-predictions),bins=50);from sklearn
import metrics
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test,
predictions)))
`8
OUTPUT
`9
RESULT
Exploring various commands for doing bivariate analytics on the USA Housing
dataset was successfully executed.
`10
EX.NO: 2
BINARY CLASSIFICATION MODEL
DATE:
AIM
ALGORITHM:
`11
PROGRAM
import pandas as pd
roc_auc_score
data = pd.read_csv('house_data.csv')
X = data.drop('above_price_threshold', axis=1)
y = data['above_price_threshold']
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
`12
f1 = f1_score(y_test, y_pred)
probabilities = model.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
threshold = 0.3
print("Accuracy:", accuracy_threshold)
print("Precision:", precision_threshold)
print("Recall:", recall_threshold)
`13
print("F1-Score:", f1_threshold)
OUTPUT
Accuracy: 0.85
Precision: 0.75
Recall: 0.9
F1-Score: 0.82
RESULT
Thus the Binary Classification model with different classification metrics was executed successfully.
`14
EX.NO: 3 CLASSIFICATION WITH NEAREST NEIGHBORS WITH
KNN CLASSIFIER
DATE:
AIM
ALGORITHM
train_test_split from sklearn.model_selection to split the data into training and test
sets.
Use pd.read_csv to load the fake news detection dataset into a pandas DataFrame
named data.
`15
Use vectorizer.fit_transform to convert the text data (X) into numerical features
using TF-IDF vectorization. This step transforms the text data into a matrix of TF-
IDF features.
Use train_test_split to split the TF-IDF features (X) and labels (y) into training and
Create a KNN classifier object, knn, with the desired number of neighbors (k).
Train the KNN classifier on the training data (X_train, y_train) using the fit
method.
Use the trained KNN classifier to make predictions (y_pred) on the test set (X_test).
Calculate the accuracy of the classifier by comparing the predicted labels with the
Output the calculated accuracy to evaluate the performance of the KNN classifier
PROGRAM
import pandas as pd
`16
from sklearn.metrics import accuracy_score
data = pd.read_csv('fake_news_dataset.csv')
X = data['text']
y = data['label']
# Convert the text data into numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
k=5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:", accuracy)
import pandas as pd
`17
from pandas import DataFrame
dataset
iris_obj =
load_iris()#
Dataset preview
iris_obj.data
for i in range(iris_obj.data.shape[0])])).join(DataFrame(iris_obj.target,
columns=pd.Index(["species"]),index=pd.Index([i for i in
range(iris_obj.target.shape[0])])))
data Commands
iris_obj.feature_na
mesiris.count()
iris.mean()
iris.median()
iris.var()
iris.std()
iris.max()
`18
iris.min()
iris.describe()
OUTPUT
RESULT
Thus the implementation of nearest neighbors classification with the KNN classifier was executed successfully.
`19
Ex.NO: 4
VALIDATION SET&TEST SET
DATE:
AIM
To implement a validation set and test set with a different regression model.
ALGORITHM
Take the original dataset and randomly split it into three subsets: a training set, a
Decide on the proportions of the splits (e.g., 70% training, 15% validation, 15%
test) based on the size of your dataset and the desired evaluation strategy.
Use the training set to train the classifier or model of your choice. This involves
fitting the model to the training data and adjusting its parameters or weights.
STEP 3:
Apply the trained model to the validation set and obtain predictions for each
Compare the predicted labels with the true labels in the validation set to measure
Once you have selected the best model based on its performance on the validation
Obtain predictions for each instance in the test set using the selected model.
Compare the predicted labels with the true labels in the test set to evaluate the final
Calculate evaluation metrics on the test set to assess the generalization capability
of the model.
Examine the evaluation metrics obtained on the validation and test sets to assess
Compare the results to make informed decisions about the model's suitability for
domain-specific requirements
PROGRAM
# Data manipulation libraries
import numpy as np
import pandas as pd
`21
Regressionfrom sklearn.linear_model
import LogisticRegression
from sklearn.model_selection
import LabelEncoder,MinMaxScaler,OneHotEncoder,StandardScaler
from sklearn.metrics
import ColumnTransformer
#for plotting
df=pd.read_csv('C:/Users/diabetes.csv')
df.head()
df.tail()
df.isnull().sum()
df.describe(include='all')
df.corr()
sns.heatmap(df.corr(),annot=True)plt.show()
df.hist() plt.show()
sns.countplot(x=df['Outcome'])
`22
scaler=StandardScaler()
'DiabetesPedigreeFunction', 'Age']]=scaler.fit_transform(df[['Pregnancies',
'DiabetesPedigreeFunction', 'Age']])df_new = df
'Age']],df_new['Outcome'],test_size=0.20,random_state=21)
# Build Model
model.predict(x_test)
score=model.score(x_test,y_test);
print(score)
`23
0.7337662337662337
#Confusion Matrix
np.set_printoptions(precision=2)cnf_matrix
OUTPUT
`24
`25
RESULT
Thus the validation set and test set with a regression model on the dataset were executed
successfully.
`26
EX.NO: 5
K-MEANS ALGORITHM
DATE:
AIM
ALGORITHM
max_iters.
Repeat the following steps until convergence or until reaching the maximum
number of iterations:
Assign each data point to the nearest centroid by calculating the Euclidean
distance.
Update the centroids based on the mean of the data points assigned to
each cluster.
Return the cluster labels assigned to each data point and the final
centroids.
`27
PROGRAM
import numpy as np
centroids[:, np.newaxis])**2).sum(axis=2))
return cluster_labels
"""Update the centroids based on the mean of the data points in each
cluster."""
return centroids
centroids = initialize_centroids(X, k)
for _ in range(max_iters):
prev_centroids = centroids.copy()
`28
# Assign data points to clusters
cluster_labels = assign_clusters(X, centroids)
# Update centroids
if np.allclose(centroids, prev_centroids):
break
# Example usage
X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
k=2
print("Centroids:", centroids)
`29
OUTPUT
Cluster Labels: [0 0 0 1 1 1]
Centroids: [[1. 2. ]
[3. 2. ]]
RESULT
`30
EX.NO: 6
NAÏVE BAYES CLASSIFIER
DATE:
AIM
ALGORITHM
Determine the unique class labels and store them in the classes array.
Calculate the prior probabilities for each class by dividing the count of
Create a nested dictionary entry for the feature and class if it doesn't
already exist.
`31
Calculate the conditional probability of each feature value given the class
by dividing the count of samples with the specific feature value and class
Initialize an empty array to store the posterior probabilities for each class.
Append the class label with the maximum posterior probability to the
`32
PROGRAM
import numpy as np
class NaiveBayesClassifier:
    """Naive Bayes classifier for discrete (categorical) features.

    Class priors and per-feature conditional probabilities are estimated
    from frequency counts in the training data; prediction returns the
    class with the maximum posterior probability. No smoothing is applied,
    so a feature value never seen with a class yields a zero posterior.
    """

    def __init__(self):
        # Fixed: the original declared ``def init (self)`` instead of
        # ``__init__``, so the constructor was never invoked.
        self.classes = None        # unique class labels, set by fit()
        self.class_priors = None   # P(c) for each label in self.classes
        self.feature_probs = None  # {(feature_idx, class): {value: P(value | class)}}

    def fit(self, X, y):
        """Estimate priors and conditional probabilities from (X, y).

        X: 2-D array of discrete feature values, one row per sample.
        y: 1-D array of class labels aligned with the rows of X.
        """
        self.classes = np.unique(y)
        self.class_priors = np.zeros(len(self.classes))
        self.feature_probs = {}
        for i, c in enumerate(self.classes):
            X_c = X[y == c]  # rows belonging to class c
            self.class_priors[i] = len(X_c) / len(X)
            for j in range(X.shape[1]):
                # Feature values are enumerated over the FULL dataset, so a
                # value absent within this class gets probability 0.
                feature_values = np.unique(X[:, j])
                self.feature_probs[(j, c)] = {}
                for value in feature_values:
                    count = len(X_c[X_c[:, j] == value])
                    self.feature_probs[(j, c)][value] = count / len(X_c)

    def predict(self, X):
        """Return the most probable class label for each row of X."""
        predictions = []
        for x in X:
            posterior_probs = []
            for i, c in enumerate(self.classes):
                posterior_prob = self.class_priors[i]
                for j, value in enumerate(x):
                    if (j, c) in self.feature_probs and value in self.feature_probs[(j, c)]:
                        posterior_prob *= self.feature_probs[(j, c)][value]
                    else:
                        # Unseen (feature, value) pair: zero posterior for
                        # this class; stop multiplying early.
                        posterior_prob = 0
                        break
                posterior_probs.append(posterior_prob)
            predictions.append(self.classes[np.argmax(posterior_probs)])
        return np.array(predictions)
# Sample dataset
classifier = NaiveBayesClassifier()
classifier.fit(X_train, y_train)
# Make predictions
predictions = classifier.predict(X_test)
print("Predictions:", predictions)
`34
OUTPUT
Predictions: [1 0]
RESULT
Thus the Naïve Bayes classifier was implemented and verified successfully.
`35