Download as pdf or txt
Download as pdf or txt
You are on page 1 of 9

1.

Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on
a given set of training data samples. Read the training data from a .csv file.

import csv
# FIND-S: start from the most specific hypothesis and generalise it just
# enough to cover every positive ("Yes") training example.
with open('1.csv', 'r', newline='') as f:
    your_list = list(csv.reader(f))

# The last column is the class label; every other column is an attribute.
# Size the hypothesis from the data instead of assuming six attributes.
num_attributes = len(your_list[0]) - 1 if your_list else 0
h = [['0'] * num_attributes]

for row in your_list:
    print(row)
    # Blank lines and negative examples never change the hypothesis.
    if not row or row[-1] != "Yes":
        continue
    # Slice the label off by position so that an attribute whose value
    # happens to be "Yes" is still taken into account.
    for j, x in enumerate(row[:-1]):
        if x == h[0][j]:
            continue
        if h[0][j] == '0':
            # First positive example seen for this attribute: adopt its value.
            h[0][j] = x
        else:
            # Conflicting values across positive examples: generalise.
            h[0][j] = '?'

print("A Maximally Specific hypothesis is")
print(h)

2. For a given set of training data examples stored in a .CSV file, implement and demonstrate the
Candidate-Elimination algorithm to output a description of the set of all hypotheses consistent with
the training examples.

import csv
# Candidate-Elimination: maintain the specific boundary S and a list of
# general hypotheses (temp) while walking through the training examples.
a = []
print("\n The Given Training Data Set \n")

with open('enjoysport.csv', 'r', newline='') as csvFile:
    for row in csv.reader(csvFile):
        if not row:
            # Skip blank lines instead of failing later on a missing label.
            continue
        a.append(row)
        print(row)

if not a:
    raise SystemExit("enjoysport.csv contains no training examples")

# The last column is the class label.
num_attributes = len(a[0]) - 1

print("\n The initial value of hypothesis: ")

S = ['0'] * num_attributes
G = ['?'] * num_attributes
# Build the banners from the actual attribute count; for six attributes the
# text is identical to the previously hard-coded strings.
print("\n The most specific hypothesis S0 : [" + ",".join(S) + "]\n")
print(" \n The most general hypothesis G0 : [" + ",".join(G) + "]\n")

# Comparing with First Training Example
# NOTE(review): this seeds S from the first row regardless of its label, so
# the first training example is assumed to be a positive one.
for j in range(num_attributes):
    S[j] = a[0][j]

# Comparing with Remaining Training Examples of Given Data Set

print("\n Candidate Elimination algorithm Hypotheses Version Space Computation\n")

# General-boundary hypotheses collected from the negative examples.
temp = []

for example_no, example in enumerate(a, start=1):
    label = example[num_attributes]

    if label == 'Yes':
        # Generalise S wherever it disagrees with the positive example.
        for j in range(num_attributes):
            if example[j] != S[j]:
                S[j] = '?'

        # Keep only the general hypotheses that are still consistent with S.
        # Rebuilding the list avoids deleting while indexing (which skipped
        # entries and could raise IndexError) and also checks temp[0].
        temp = [
            g for g in temp
            if all(gv == '?' or gv == sv for gv, sv in zip(g, S))
        ]

        print(" For Training Example No :{0} the hypothesis is S{0} ".format(example_no), S)

        if len(temp) == 0:
            print(" For Training Example No :{0} the hypothesis is G{0} ".format(example_no), G)
        else:
            print(" For Training Example No :{0} the hypothesis is G{0}".format(example_no), temp)

    if label == 'No':
        # Specialise G minimally: one new hypothesis per attribute on which
        # S excludes the negative example.
        for j in range(num_attributes):
            if S[j] != example[j] and S[j] != '?':
                G[j] = S[j]
                if G not in temp:
                    # Several negative examples can yield the same hypothesis.
                    temp.append(G)
                G = ['?'] * num_attributes

        print(" For Training Example No :{0} the hypothesis is S{0} ".format(example_no), S)

        print(" For Training Example No :{0} the hypothesis is G{0}".format(example_no), temp)

3. Write a program to demonstrate the working of the decision tree based ID3 algorithm. Use an
appropriate data set for building the decision tree and apply this knowledge to classify a new
sample.

import pandas as pd
import numpy as np
# Load the training examples; the ID3 call further down passes every column
# except the last one as a candidate feature.
dataset= pd.read_csv('P3_Tennis.csv')

def entropy(target_col):
    """Return the Shannon entropy (in bits) of the values in *target_col*.

    Args:
        target_col: array-like of class labels.

    Returns:
        The entropy as a NumPy float; 0 for an empty or single-class column.
    """
    _, counts = np.unique(target_col, return_counts=True)
    # Relative frequency of every distinct label; np.unique never reports a
    # zero count, so log2 is always defined here.
    probabilities = counts / counts.sum()
    return -np.sum(probabilities * np.log2(probabilities))

def InfoGain(data,split_attribute_name,target_name="PlayTennis"):
    """Return the information gain of splitting *data* on one attribute.

    Args:
        data: pandas DataFrame holding the examples.
        split_attribute_name: column to split on.
        target_name: column holding the class label.

    Returns:
        Entropy of the target column minus the size-weighted entropy of the
        target within each distinct value of the split attribute.
    """
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    total = counts.sum()

    # Select each partition with a boolean mask. The previous
    # where(...).dropna() form also discarded rows that had a NaN in any
    # unrelated column, so the partition no longer matched its count.
    weighted_entropy = sum(
        (count / total)
        * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    )

    return total_entropy - weighted_entropy

def ID3(data,originaldata,features,target_attribute_name="PlayTennis",parent_node_class=None):
    """Recursively build an ID3 decision tree.

    Args:
        data: DataFrame with the examples that reached this node.
        originaldata: the full training DataFrame, used for the majority
            class when *data* is empty.
        features: sequence of column names still available for splitting.
        target_attribute_name: column holding the class label.
        parent_node_class: majority class of the parent node, returned when
            no features are left.

    Returns:
        A class label for a leaf, or a nested dict of the form
        ``{feature: {value: subtree_or_label}}``.
    """
    # Test for an empty partition first: the single-class test below would
    # index into an empty array and raise IndexError otherwise.
    if len(data) == 0:
        values, counts = np.unique(originaldata[target_attribute_name], return_counts=True)
        return values[np.argmax(counts)]

    values, counts = np.unique(data[target_attribute_name], return_counts=True)

    # Pure node: every remaining example has the same label.
    if len(values) <= 1:
        return values[0]

    # Nothing left to split on: fall back to the parent's majority class.
    if len(features) == 0:
        return parent_node_class

    # Majority class of this node, handed to the children as their fallback.
    parent_node_class = values[np.argmax(counts)]

    # Information gain of every remaining feature; split on the best one.
    item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(item_values)]

    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]

    for value in np.unique(data[best_feature]):
        # A boolean mask keeps column dtypes and does not drop rows that
        # have NaN in unrelated columns (unlike where(...).dropna()).
        sub_data = data[data[best_feature] == value]
        # Pass the caller's originaldata down rather than a module-level
        # global so the function works on any DataFrame.
        tree[best_feature][value] = ID3(
            sub_data,
            originaldata,
            remaining_features,
            target_attribute_name,
            parent_node_class,
        )

    return tree

# Build the tree using every column except the last as a candidate feature;
# the target column name is left at ID3's default, "PlayTennis".
# NOTE(review): the task statement also asks to classify a new sample; no
# prediction step is present here.
tree = ID3(dataset,dataset,dataset.columns[:-1])
print(dataset.head())
print(' \nDisplay Tree\n',tree)

4. Build an Artificial Neural Network by implementing the Back propagation algorithm and test the
same using appropriate data sets.

import math

def sigmoid(x):
    """Return the logistic function 1 / (1 + e**-x) for a real number *x*.

    The result is computed in a numerically stable way so that large
    negative inputs return 0.0 instead of raising OverflowError.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) can overflow; exp(x) only underflows to 0.0.
    e = math.exp(x)
    return e / (1 + e)
## define inputs and target for the XOR gate
x1 = [0, 0, 1, 1]  # input1
x2 = [0, 1, 0, 1]  # input2
t = [0, 1, 1, 0]   # target

## Initial weights and biases (fixed values, so every run is reproducible)
# Naming: w<input><unit>. w11/w21 feed hidden unit 1, w12/w22 feed hidden
# unit 2, and w13/w23 connect hidden units 1/2 to the output unit.

# Hidden layer first Perceptron
b1 = -0.3
w11 = 0.21
w21 = 0.15
# Hidden Layer Second Perceptron
b2 = 0.25
w12 = -0.4
w22 = 0.1
# Output layer Perceptron
b3 = -0.4
w13 = -0.2
w23 = 0.3
error = 0
iteration = 0
train = True
print("weight are:")
print("w11 : %4.2f w12: %4.2f w21: %4.2f w22: %4.2f w13: %4.2f w23: %4.2f \n"
      % (w11, w12, w21, w22, w13, w23))

## Training Starts
while train:

    for in1, in2, target in zip(x1, x2, t):

        ## input for each perceptron of hidden layer
        z_in1 = b1 + in1 * w11 + in2 * w21
        z_in2 = b2 + in1 * w12 + in2 * w22
        ## computing activation function output
        z1 = round(sigmoid(z_in1), 4)
        z2 = round(sigmoid(z_in2), 4)

        # Output layer forward pass
        y_in = b3 + z1 * w13 + z2 * w23
        y = round(sigmoid(y_in), 4)

        ## error computation
        del_k = round((target - y) * y * (1 - y), 4)
        error = del_k

        ## error computation for hidden layer
        # Back-propagate through the output weights used in the forward
        # pass, i.e. before they are updated below.
        del_1 = del_k * w13 * z1 * (1 - z1)
        del_2 = del_k * w23 * z2 * (1 - z2)

        ## Back pass
        # weight update for output layer
        w13 = round(w13 + del_k * z1, 4)
        w23 = round(w23 + del_k * z2, 4)
        b3 = round(b3 + del_k, 4)

        ## update weight and biases
        # Each weight is corrected with the delta of the hidden unit it
        # feeds: w11 and w21 feed unit 1, w12 and w22 feed unit 2.
        b1 = round(b1 + del_1, 4)
        w11 = round(w11 + del_1 * in1, 4)
        w21 = round(w21 + del_1 * in2, 4)

        b2 = round(b2 + del_2, 4)
        w12 = round(w12 + del_2 * in1, 4)
        w22 = round(w22 + del_2 * in2, 4)

    # Report once per pass over the four patterns; the error shown is the
    # output delta of the last pattern in the pass.
    print("Iteration: ", iteration)
    print("w11 : %5.4f w12: %5.4f w21: %5.4f w22: %5.4f w13: %5.4f w23: %5.4f "
          % (w11, w12, w21, w22, w13, w23))
    print("Error: %5.3f" % del_k)
    iteration = iteration + 1

    if iteration == 1000:
        train = False

5. Write a program to implement the naïve Bayesian classifier for a sample training data set stored as a
.csv file. Compute the accuracy of the classifier, considering a few test data sets.

import csv
import math
import random
import statistics

def calculate_probability(x, mean, stdev):
    """Return the Gaussian probability density of *x* for N(mean, stdev**2).

    Args:
        x: value to evaluate.
        mean: mean of the distribution.
        stdev: standard deviation of the distribution.

    Returns:
        The density at *x*. When *stdev* is 0 the distribution is treated as
        a point mass: 1.0 if *x* equals *mean*, otherwise 0.0. This avoids
        the ZeroDivisionError a constant-valued attribute would cause.
    """
    if stdev == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

# Load the numeric data set; every field, including the class label in the
# last column, is parsed as a float.
dataset = []
with open('lab5.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        if row:  # ignore blank lines
            dataset.append([float(attr) for attr in row])
dataset_size = len(dataset)
print('Size of dataset is : ', dataset_size)

train_size = int(0.7 * dataset_size)  # 70 % of the rows are used for training

print(train_size)

# Split Data
# Pick the training rows at random; every row that was not picked becomes
# test data. Membership is tested by index, which avoids repeated O(n)
# list.remove() calls.
training_indexes = random.sample(range(dataset_size), train_size)
training_index_set = set(training_indexes)
X_train = [dataset[i] for i in training_indexes]
X_test = [row for index, row in enumerate(dataset) if index not in training_index_set]

# Separate Data based on class value
classes = {}
for samples in X_train:
    classes.setdefault(int(samples[-1]), []).append(samples)

# Find the mean and standard deviation of each attribute, per class
# NOTE: statistics.stdev needs at least two samples, so every class must
# occur at least twice in the training split.
summaries = {}
for classValue, training_data in classes.items():
    summary = [(statistics.mean(attribute), statistics.stdev(attribute))
               for attribute in zip(*training_data)]
    del summary[-1]  # the last column is the class label, not an attribute
    summaries[classValue] = summary

X_prediction = []

# Predict the output of test data
for sample in X_test:
    # Naive Bayes score: product of the per-attribute Gaussian densities.
    probabilities = {}
    for classValue, classSummary in summaries.items():
        probabilities[classValue] = 1
        for index, (mean, stdev) in enumerate(classSummary):
            probabilities[classValue] *= calculate_probability(sample[index], mean, stdev)

    # Highest-scoring class; the first one wins on ties.
    best_label = max(probabilities, key=probabilities.get, default=None)
    X_prediction.append(best_label)

# Find Accuracy
correct = 0
for sample, predicted in zip(X_test, X_prediction):
    if sample[-1] == predicted:
        correct += 1

if X_test:
    print("Accuracy : ", correct / (float(len(X_test))) * 100)
else:
    # Tiny data sets can leave nothing for testing; avoid dividing by zero.
    print("Accuracy : undefined (no test samples)")

6. Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier model to
perform this task. Built-in Java classes/API can be used to write the program. Calculate the accuracy,
precision, and recall for your data set.

import pandas as pd
# Each line of the input file is "<text>,<tag>".
dataset = pd.read_csv('naivetext1.txt', names=['text', 'tag'])
#encoding
# Map the string tags to integer class ids.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
dataset['tag'] = encoder.fit_transform(dataset['tag'])
#splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['tag'],
                                                    test_size=0.2)
#vectorization
# Learn the vocabulary on the training documents only and reuse it for the
# test documents.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(X_train)
xtest_dtm = count_vect.transform(X_test)
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
dataset = pd.DataFrame(xtrain_dtm.toarray(), columns=feature_names)
#prediction
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm, y_train)
y_pred = clf.predict(xtest_dtm)
#output
from sklearn.metrics import precision_score,accuracy_score,recall_score
print('Precision',precision_score(y_test, y_pred))
print('Accuracy',accuracy_score(y_test, y_pred))
print('Recall',recall_score(y_test, y_pred))

7. Write a program to construct a Bayesian network considering medical data. Use this model to
demonstrate the diagnosis of heart patients using standard Heart Disease Data Set. You can use
Java/Python ML library classes/API.

import pandas as pd
# NOTE(review): the task statement asks for a Bayesian network, but this
# script fits a Gaussian naive Bayes classifier.
col = ['Age', 'Gender', 'FamilyHist', 'Diet', 'LifeStyle', 'Cholesterol', 'HeartDisease']
data = pd.read_csv('heart_disease_data.csv', names=col)
print(data)

#encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for name in col:
    # Assign by column name so the column is replaced with the integer
    # codes. Writing through .iloc can keep the old object dtype, which
    # scikit-learn then rejects as an unknown label type.
    data[name] = encoder.fit_transform(data[name])

#spliting data
# Every column except the last is a feature; the last one is the label.
X = data[col[:-1]]
y = data[col[-1]]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#prediction
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

#confusion mtx output
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set for clustering
using k-Means algorithm. Compare the results of these two algorithms and comment on the quality
of clustering. You can add Java/Python ML library classes/API in the program.

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
# Read the points to be clustered.
data = pd.read_csv('xclara.csv')
data.head()

# Assemble the V1 and V2 columns into one (n_samples, 2) array.
f1 = data['V1'].values
f2 = data['V2'].values
X = np.column_stack((f1, f2))

# --- k-Means with three clusters -------------------------------------------
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)
labels = kmeans.predict(X)
centroids = kmeans.cluster_centers_
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')
print('Graph using Kmeans Algorithm')
# Mark the cluster centres on top of the points.
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505')
plt.show()

# --- EM (Gaussian mixture) with three components ---------------------------
gmm = GaussianMixture(n_components=3)
gmm.fit(X)
labels = gmm.predict(X)

# Marker size grows with the probability of each point's most likely
# component.
probs = gmm.predict_proba(X)
size = 10 * probs.max(axis=1) ** 3
print('Graph using EM Algorithm')
plt.scatter(X[:, 0], X[:, 1], c=labels, s=size, cmap='viridis')
plt.show()

9. Write a program to implement k-Nearest Neighbour algorithm to classify the iris data set. Print both
correct and wrong predictions. Java/Python ML library classes can be used for this problem.

from sklearn.model_selection import train_test_split


from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import datasets
iris = datasets.load_iris()
iris_data = iris.data
iris_labels = iris.target
print(iris_labels)
# Hold out 20 % of the samples for testing. The unpacking is kept in one
# statement so that all four names are bound.
x_train, x_test, y_train, y_test = train_test_split(
    iris_data, iris_labels, test_size=0.20)

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# The task statement asks for both the correct and the wrong predictions,
# so report every test sample individually.
for sample, actual, predicted in zip(x_test, y_test, y_pred):
    outcome = 'Correct' if actual == predicted else 'Wrong'
    print('{0} prediction: sample={1} actual={2} predicted={3}'.format(
        outcome, sample, iris.target_names[actual], iris.target_names[predicted]))

print('confusion matrix')
print(confusion_matrix(y_test, y_pred))
print('Accuracy metrics')
print(classification_report(y_test, y_pred))

10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points.
Select an appropriate data set for your experiment and draw graphs.

import matplotlib.pyplot as plt


import pandas as pd
import numpy as np1

def kernel(point,xmat, k):
    """Return the diagonal Gaussian weight matrix for one query point.

    Args:
        point: query row, shape (1, n) or (n,).
        xmat: design matrix, shape (m, n).
        k: kernel bandwidth.

    Returns:
        An (m, m) numpy matrix whose j-th diagonal entry is
        exp(-||point - xmat[j]||**2 / (2 * k**2)); all other entries are 0.
    """
    # Use the xmat argument, not a module-level X, so the function works on
    # whatever design matrix the caller passes.
    diffs = np1.asarray(xmat) - np1.asarray(point)
    sq_dist = np1.sum(diffs ** 2, axis=1)
    # np.mat was removed in NumPy 2.0; asmatrix exists in old and new releases.
    return np1.asmatrix(np1.diag(np1.exp(sq_dist / (-2.0 * k ** 2))))

def localWeight(point,xmat,ymat,k):
    """Solve the weighted least-squares problem around one query point.

    Args:
        point: query row passed on to ``kernel``.
        xmat: design matrix, shape (m, n).
        ymat: targets as a row, shape (1, m).
        k: kernel bandwidth passed on to ``kernel``.

    Returns:
        The (n, 1) coefficient matrix (X^T W X)^-1 X^T W y^T.
    """
    wei = kernel(point, xmat, k)
    # Work on the arguments rather than a module-level X; asmatrix keeps the
    # `*` operator meaning matrix multiplication for plain arrays too.
    design = np1.asmatrix(xmat)
    targets = np1.asmatrix(ymat)
    W = (design.T * (wei * design)).I * (design.T * (wei * targets.T))
    return W

def localWeightRegression(xmat,ymat,k):
    """Predict the target at every row of *xmat* with locally weighted regression.

    Args:
        xmat: design matrix, shape (m, n).
        ymat: targets as a row, shape (1, m).
        k: kernel bandwidth.

    Returns:
        A 1-D float array of length m with one prediction per row.
    """
    design = np1.asmatrix(xmat)
    m, n = np1.shape(design)
    ypred = np1.zeros(m)
    for i in range(m):
        # The product is a 1x1 matrix. Extract the scalar explicitly, since
        # storing a size-1 array in a scalar slot is deprecated in
        # NumPy >= 1.25.
        ypred[i] = (design[i] * localWeight(design[i], design, ymat, k)).item()
    return ypred

# load data points
data = pd.read_csv('data10.csv')
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)

# Prepare the design matrix: a column of ones (intercept) next to the bill.
# np.mat was removed in NumPy 2.0; asmatrix is available in every release.
mbill = np1.asmatrix(bill)
mtip = np1.asmatrix(tip)
m = np1.shape(mbill)[1]
one = np1.asmatrix(np1.ones(m))
X = np1.hstack((one.T, mbill.T))

#set k here
ypred = localWeightRegression(X, mtip, 2)

# Order the points by bill amount so the fitted curve is drawn left to
# right. Sorting the 1-D bill array gives the same ordering as sorting the
# second column of X, without relying on 3-D indexing of a numpy matrix.
SortIndex = np1.argsort(bill)
xsort = bill[SortIndex]

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(bill, tip, color='green')
ax.plot(xsort, ypred[SortIndex], color='red', linewidth=5)
plt.xlabel('Total bill')
plt.ylabel('Tip')
plt.show()

You might also like