Download as pdf or txt
Download as pdf or txt
You are on page 1 of 6

2_9_kNN

January 31, 2023

[1]: import numpy as np


import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# In [2]:
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x: Sequence of numeric coordinates (anything supporting ``len`` and
            iteration, e.g. a list, tuple or 1-D array).
        y: Sequence of numeric coordinates with the same length as ``x``.

    Returns:
        The square root of the sum of squared coordinate differences
        (``0.0`` when both points are empty).

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same number of
            coordinates.
    """
    # Fail loudly instead of silently ignoring the extra coordinates of a
    # longer ``y`` (or hitting an opaque IndexError on a shorter one).
    if len(x) != len(y):
        raise ValueError(
            "euclidean_distance() requires points of equal dimension, "
            "got %d and %d" % (len(x), len(y))
        )

    distance = 0.0
    for a, b in zip(x, y):
        distance += (a - b) ** 2
    return distance ** 0.5

[3]: euclidean_distance([1,1], [1,3])  # sanity check: the points differ by 2 on one axis only, so the distance is 2.0

[3]: 2.0

# In [4]:
def plus_frequent(L):
    """Return the most frequent element of ``L``.

    Elements are counted in a single left-to-right pass. When several
    elements share the highest count, the one that reached that count first
    during the pass is returned.

    Args:
        L: Non-empty iterable of hashable elements (list, tuple, array,
            generator, ...).

    Returns:
        The element with the highest number of occurrences.

    Raises:
        ValueError: If ``L`` is empty.
    """
    counts = {}
    missing = object()  # sentinel: distinguishes "nothing seen yet" from any real element
    winner = missing

    for item in L:
        counts[item] = counts.get(item, 0) + 1
        # Strict '>' keeps the current winner on ties, so the first element
        # to reach the top count wins.
        if winner is missing or counts[item] > counts[winner]:
            winner = item

    if winner is missing:
        raise ValueError("plus_frequent() needs at least one element")
    return winner

[5]: lst = [1,2,5,1,6,2,1,2,2,2]  # 2 occurs five times, 1 three times, 5 and 6 once each


plus_frequent(lst)  # expected result: 2

[5]: 2

# In [6]:
def knn(points, classes, x, k):
    """Predict the class of ``x`` by majority vote among its k nearest points.

    Args:
        points: Sequence of reference points (each a sequence of coordinates).
        classes: Sequence of labels; ``classes[i]`` is the label of
            ``points[i]``.
        x: The query point, with the same number of coordinates as the
            reference points.
        k: Number of neighbours that vote, with ``1 <= k <= len(points)``.

    Returns:
        The most frequent label among the ``k`` reference points closest to
        ``x`` (Euclidean distance), as chosen by ``plus_frequent``.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            reference points.
    """
    nb_pts = len(points)
    if not 1 <= k <= nb_pts:
        raise ValueError(
            "k must be between 1 and the number of points (%d), got %r"
            % (nb_pts, k)
        )

    # Build the table of (index, distance-to-x) pairs.
    distances = [
        (i, euclidean_distance(x, point)) for i, point in enumerate(points)
    ]

    # Sort by distance. list.sort is stable, so points at equal distance keep
    # their original index order.
    distances.sort(key=lambda pair: pair[1])

    # Labels of the k nearest neighbours.
    classes_voisins = [classes[indice] for indice, _ in distances[:k]]

    # The most frequent label wins the vote.
    return plus_frequent(classes_voisins)

# In [7]:
def evaluation(points, classes, k):
    """Measure kNN accuracy with a fixed 80/20 hold-out split.

    The first ``(4 * len(points)) // 5`` samples form the training set and
    the remaining ones the test set. The data is NOT shuffled: if the input
    is ordered (for example grouped by class), the test set may not be
    representative, so shuffle beforehand when that matters.

    Args:
        points: Sliceable sequence of points (list or array).
        classes: Sliceable sequence of labels aligned with ``points``.
        k: Number of neighbours passed on to ``knn``.

    Returns:
        The fraction of test points whose predicted class equals their
        actual class, a value between 0 and 1.

    Raises:
        ValueError: If there are fewer than 2 points, since either the
            training or the test split would be empty.
    """
    nb_points = len(points)
    seuil = (4 * nb_points) // 5  # index where the test set starts

    # With 0 points the test set is empty (division by zero below); with a
    # single point the training set is empty (no neighbour to vote).
    if nb_points < 2:
        raise ValueError(
            "evaluation() needs at least 2 points to build non-empty "
            "train and test sets, got %d" % nb_points
        )

    points_train, points_test = points[:seuil], points[seuil:]
    classes_train, classes_test = classes[:seuil], classes[seuil:]

    nb_test = len(points_test)
    succes = sum(
        1
        for i in range(nb_test)
        if knn(points_train, classes_train, points_test[i], k) == classes_test[i]
    )

    return succes / nb_test

1 Generic Dataset
# In [8]:
# Synthetic dataset: 500 samples with 2 features, spread around 3 centers.
# random_state is fixed so the same points are generated on every run.
points, classes = make_blobs(
    n_samples=500, n_features=2, centers=3, cluster_std=1.5, random_state=6
)

2
# In [9]:
# Scatter plot of the two features, one colour per class label.
plt.figure(figsize=(10, 5))
plt.scatter(
    points[:, 0], points[:, 1],
    c=classes, marker='.', s=100, edgecolors='black',
)
plt.show()

[10]: points_train = points[:400]  # manual 80/20 split: 400 == (4*500)//5, the same threshold evaluation() computes for 500 samples


points_test = points[400:]  # the remaining 100 samples are held out as query points
classes_train = classes[:400]  # labels are sliced with the same bounds so they stay aligned with the points
classes_test = classes[400:]

[11]: print(points_train[:10])

[[ 7.80291838 -3.49667437]
[-6.2660849 1.92611179]
[-8.85654973 3.25691309]
[-5.84437689 4.59816109]
[ 6.55402995 -2.8281474 ]
[ 6.85441089 -9.26260683]
[ 7.66709846 -5.41332313]
[-7.72643879 -2.05980392]
[10.11138133 -4.25359347]
[ 6.15349088 -8.59446213]]

[12]: print(points_train[:10])
print(classes_train[:10])

[[ 7.80291838 -3.49667437]
[-6.2660849 1.92611179]
[-8.85654973 3.25691309]

3
[-5.84437689 4.59816109]
[ 6.55402995 -2.8281474 ]
[ 6.85441089 -9.26260683]
[ 7.66709846 -5.41332313]
[-7.72643879 -2.05980392]
[10.11138133 -4.25359347]
[ 6.15349088 -8.59446213]]
[0 2 2 2 0 1 0 2 0 1]

[13]: x = points_test[33]  # one held-out sample, used as a single query point
print(x)
knn(points_train, classes_train, x, 10)  # class voted by the 10 nearest training points

[-6.43194186 0.92589598]

[13]: 2

[14]: evaluation(points, classes, 10)  # hold-out accuracy on the blob data: last 20% of the samples as test set, k=10

[14]: 0.99

2 Iris Dataset
[15]: dataset_iris = pd.read_csv('iris.csv')

[16]: print(len(dataset_iris))
print(dataset_iris)

150
sepal.length sepal.width petal.length petal.width variety
0 5.1 3.5 1.4 0.2 Setosa
1 4.9 3.0 1.4 0.2 Setosa
2 4.7 3.2 1.3 0.2 Setosa
3 4.6 3.1 1.5 0.2 Setosa
4 5.0 3.6 1.4 0.2 Setosa
.. … … … … …
145 6.7 3.0 5.2 2.3 Virginica
146 6.3 2.5 5.0 1.9 Virginica
147 6.5 3.0 5.2 2.0 Virginica
148 6.2 3.4 5.4 2.3 Virginica
149 5.9 3.0 5.1 1.8 Virginica

[150 rows x 5 columns]

[17]: dataset_iris = np.array(dataset_iris)  # rebinds the name: from here on it is a 2-D NumPy array, no longer a DataFrame


points_iris = dataset_iris[:,:4]  # first 4 columns: the measurements (sepal/petal length and width in the printed frame)
classes_iris = dataset_iris[:,4:]  # column 4 ('variety' in the printed frame); this slice is still 2-D with a single column
classes_iris = classes_iris[:,0]  # take that single column to get a 1-D label array

4
[18]: knn(points_iris, classes_iris, [3.5,3.5,4.5,4.5], 5)  # ad-hoc query point with 4 coordinates, matching the 4 feature columns; k=5

[18]: 'Virginica'

[19]: print(points_iris[:10])

[[5.1 3.5 1.4 0.2]


[4.9 3.0 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5.0 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5.0 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]]

[20]: print(classes_iris[:10])

['Setosa' 'Setosa' 'Setosa' 'Setosa' 'Setosa' 'Setosa' 'Setosa' 'Setosa'


'Setosa' 'Setosa']

[21]: evaluation(points_iris, classes_iris, 6)  # NOTE(review): evaluation() does not shuffle; the printed frame suggests rows are grouped by variety, so the last-20% test set may hold a single class - confirm and shuffle first

[21]: 0.8

3 Diabetes Dataset
[22]: dataset_diabetes = pd.read_csv('diabetes.csv')

[23]: print(len(dataset_diabetes))
dataset_diabetes.head()

768

[23]: Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \


0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1

DiabetesPedigreeFunction Age Outcome


0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
3 0.167 21 0
4 2.288 33 1

5
[24]: dataset_diabetes = np.array(dataset_diabetes)  # rebinds the name: from here on it is a 2-D NumPy array, no longer a DataFrame
points_diabetes = dataset_diabetes[:,:8]  # first 8 columns: the features (Pregnancies .. Age in the printed head)
classes_diabetes = dataset_diabetes[:,8:]  # column 8 ('Outcome' in the printed head); this slice is still 2-D with a single column
classes_diabetes = classes_diabetes[:,0]  # take that single column to get a 1-D label array

[25]: print(points_diabetes[:10])

[[6.000e+00 1.480e+02 7.200e+01 3.500e+01 0.000e+00 3.360e+01 6.270e-01


5.000e+01]
[1.000e+00 8.500e+01 6.600e+01 2.900e+01 0.000e+00 2.660e+01 3.510e-01
3.100e+01]
[8.000e+00 1.830e+02 6.400e+01 0.000e+00 0.000e+00 2.330e+01 6.720e-01
3.200e+01]
[1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
2.100e+01]
[0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
3.300e+01]
[5.000e+00 1.160e+02 7.400e+01 0.000e+00 0.000e+00 2.560e+01 2.010e-01
3.000e+01]
[3.000e+00 7.800e+01 5.000e+01 3.200e+01 8.800e+01 3.100e+01 2.480e-01
2.600e+01]
[1.000e+01 1.150e+02 0.000e+00 0.000e+00 0.000e+00 3.530e+01 1.340e-01
2.900e+01]
[2.000e+00 1.970e+02 7.000e+01 4.500e+01 5.430e+02 3.050e+01 1.580e-01
5.300e+01]
[8.000e+00 1.250e+02 9.600e+01 0.000e+00 0.000e+00 0.000e+00 2.320e-01
5.400e+01]]

[26]: print(classes_diabetes[:10])

[1. 0. 1. 0. 1. 0. 1. 0. 1. 1.]

[27]: evaluation(points_diabetes, classes_diabetes, 8)  # NOTE(review): features are used unscaled, so large-magnitude columns (e.g. Glucose, Insulin in the printed rows) presumably dominate the distance - consider standardising

[27]: 0.7207792207792207

You might also like