Professional Documents
Culture Documents
DSM 2
DSM 2
def read_dataset(filename):
    """Load a CSV file and return its contents as a NumPy array.

    The species/label column (if present) is kept, so the result may be
    an object-dtype matrix mixing floats and strings.
    """
    frame = pd.read_csv(filename)
    return frame.to_numpy()
# Load the iris data into a NumPy matrix via the helper defined above.
# NOTE(review): assumes "iris.csv" sits in the working directory — confirm.
csv_file = "iris.csv"
dataset = read_dataset(csv_file)
1
[6.3 3.3 6.0 2.5 'Virginica']
[5.8 2.7 5.1 1.9 'Virginica']
[7.1 3.0 5.9 2.1 'Virginica']
[6.3 2.9 5.6 1.8 'Virginica']
[6.5 3.0 5.8 2.2 'Virginica']
[7.6 3.0 6.6 2.1 'Virginica']
[4.9 2.5 4.5 1.7 'Virginica']
[7.3 2.9 6.3 1.8 'Virginica']
[6.7 2.5 5.8 1.8 'Virginica']
[7.2 3.6 6.1 2.5 'Virginica']]
0.2 2.a Calculate the data mean for each attribute and represent it as a vector
def calculate_data_mean(filename):
    """Read a CSV file and return the per-attribute mean as a vector.

    Parameters
    ----------
    filename : str
        Path to a CSV file (e.g. "iris.csv").

    Returns
    -------
    pandas.Series
        Mean of each numeric column, indexed by column name. Non-numeric
        columns (such as the species label) are excluded.
    """
    # Read the CSV file using pandas
    df = pd.read_csv(filename)
    # Bug fix: the original returned `mean_vector` without ever computing it
    # (NameError). numeric_only=True skips the string label column, matching
    # the four-entry mean vector shown in the expected output.
    mean_vector = df.mean(numeric_only=True)
    return mean_vector
Mean Vector:
sepal.length 5.95
sepal.width 3.07
petal.length 3.86
petal.width 1.22
dtype: float64
2
0.4 2.c Calculate the Euclidean distance between two data objects
[205]: # calculating Euclidean distance using linalg.norm()
def euclidean_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors.

    Delegates to np.linalg.norm on the element-wise difference.
    """
    difference = vec1 - vec2
    return np.linalg.norm(difference)
mahalanobis_distance = np.sqrt(mahalanobis_sq)
return mahalanobis_distance
iris_data = pd.read_csv('iris.csv')
# ref https://www.machinelearningplus.com/statistics/mahalanobis-distance/
3
dist = manhattan_distance(row, query_point)
elif distance_metric == 'chebyshev':
dist = chebyshev(row, query_point)
elif distance_metric == 'euclidean':
dist = euclidean_distance(row, query_point)
elif distance_metric == 'mahalanobis':
dist = mahalanobis_distance(data, query_point)
else:
raise ValueError("Invalid distance metric. Supported options are␣
↪'manhattan', 'chebyshev', 'euclidean', and 'mahalanobis'.")
distances.append((dist, labels[i]))
distances.sort()
k_nearest = distances[:k]
k_nearest_labels = [label for (_, label) in k_nearest]
most_common = Counter(k_nearest_labels).most_common(1)
predicted_label = most_common[0][0]
return predicted_label
# Example usage: classify a random point using KNN with different distance␣
↪metrics
4
0.8 Write a separate function to implement the K-means clustering method
using all the functions implemented in question (2) above
def initialize_centroids(data, k):
    """Pick k distinct rows of `data` uniformly at random as starting centroids.

    Sampling is without replacement, so the k centroids are k different rows.
    """
    row_count = data.shape[0]
    chosen = np.random.choice(range(row_count), k, replace=False)
    return data[chosen]
5
def euclidean_distance(vec1, vec2):
    """L2 distance between vec1 and vec2 (redefined here for the k-means cell)."""
    return np.linalg.norm(vec1 - vec2)
# Run k-means with k = 3 (iris has three species) under each supported
# distance metric, printing the per-row cluster labels and final centroids.
# `kmeans` is defined elsewhere in the notebook; it appears to return a
# label array plus a (k, n_features) centroid matrix — confirm against its cell.
k = 3
distance_metrics = ['mahalanobis', 'manhattan', 'chebyshev', 'euclidean']
for metric in distance_metrics:
    cluster_labels, centroids = kmeans(iris_data, k, distance_metric=metric)
    print(f"Distance Metric: {metric}")
    print("Cluster Labels:")
    print(cluster_labels)
    print("Centroids:")
    print(centroids)
    print()
6
Distance Metric: chebyshev
Cluster Labels:
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 2 1 2 2 2 2 1 2 2 2]
Centroids:
[[5.09090909 3.36363636 1.57272727 0.3 ]
[6.13636364 2.80909091 4.58181818 1.5 ]
[6.875 3.025 6.0125 2.1 ]]