
1. Perform clustering (both hierarchical and K-means clustering) for the crime data to obtain the optimum number of clusters.

The data set contains crime statistics (Murder, Assault, UrbanPop, Rape) for US states.
=> Importing the libraries
import pandas as pd
import matplotlib.pylab as plt
=> Loading the data set
cdata = pd.read_csv("C:\\Users\\NEHAL RAJ\\Downloads\\crime_data.csv(5)\\crime_data.csv")
=> Describing the data set
cdata.describe()
Murder Assault UrbanPop Rape
count 50.00000 50.000000 50.000000 50.000000
mean 7.78800 170.760000 65.540000 21.232000
std 4.35551 83.337661 14.474763 9.366385
min 0.80000 45.000000 32.000000 7.300000
25% 4.07500 109.000000 54.500000 15.075000
50% 7.25000 159.000000 66.000000 20.100000
75% 11.25000 249.000000 77.750000 26.175000
max 17.40000 337.000000 91.000000 46.000000

data = cdata.drop(["Index"], axis=1)
=> Dropping the Index column because it holds the state names (categorical data)

=> Normalizing the data with a min-max function
def norm_func(i):
    x = (i - i.min()) / (i.max() - i.min())
    return x

df_norm = norm_func(data.iloc[:, :])
df_norm.describe()
Murder Assault UrbanPop Rape
count 50.000000 50.000000 50.000000 50.000000
mean 0.420964 0.430685 0.568475 0.360000
std 0.262380 0.285403 0.245335 0.242025
min 0.000000 0.000000 0.000000 0.000000
25% 0.197289 0.219178 0.381356 0.200904
50% 0.388554 0.390411 0.576271 0.330749
75% 0.629518 0.698630 0.775424 0.487726
max 1.000000 1.000000 1.000000 1.000000
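=> As an aside, the same min-max normalization can also be done with scikit-learn's MinMaxScaler; a minimal sketch, assuming data (the crime data without the Index column) from above, with df_norm_sk as an illustrative name
from sklearn.preprocessing import MinMaxScaler

# Equivalent min-max scaling with scikit-learn (same 0-1 range as norm_func)
scaler = MinMaxScaler()
df_norm_sk = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
df_norm_sk.describe()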
=> Importing linkage from scipy.cluster.hierarchy
from scipy.cluster.hierarchy import linkage
import scipy.cluster.hierarchy as sch
z = linkage(df_norm, method="complete", metric="euclidean")
plt.figure(figsize=(20, 5)); plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel('Index'); plt.ylabel('Distance')

sch.dendrogram(z,
               leaf_rotation=90,
               leaf_font_size=10
               )
plt.show()
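=> As an optional cross-check, the linkage tree can be cut directly into flat clusters with scipy's fcluster; a minimal sketch, assuming z from the linkage call above and the same choice of 5 clusters used in the agglomerative step below
from scipy.cluster.hierarchy import fcluster

# Cut the complete-linkage tree into at most 5 flat clusters
flat_clusters = fcluster(z, t=5, criterion="maxclust")
flat_clusters[:10]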
# Agglomerative clustering
from sklearn.cluster import AgglomerativeClustering

h_complete = AgglomerativeClustering(n_clusters=5, linkage='complete',
                                     affinity="euclidean").fit(df_norm)
h_complete.labels_

cluster_labels = pd.Series(h_complete.labels_)
=> Adding the new clust column to the data set
cdata['clust'] = cluster_labels  # creating new column
=> Moving the clust column to the front
data = cdata.iloc[:, [5, 0, 1, 2, 3, 4]]
data.head()
clust Index Murder Assault UrbanPop Rape
0 3 Alabama 13.2 236 58 21.2
1 4 Alaska 10.0 263 48 44.5
2 1 Arizona 8.1 294 80 31.0
3 0 Arkansas 8.8 190 50 19.5
4 1 California 9.0 276 91 40.6
=> Getting the cluster-wise means and saving the data
data.iloc[:, 1:].groupby(data.clust).mean()
data.to_csv("crimedata.csv", encoding="utf-8")

import os
os.getcwd()
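=> The question also asks for K-means clustering; a common way to pick the optimum number of clusters is the elbow method on the total within-cluster sum of squares. A minimal sketch, assuming df_norm from above; the final choice of k = 4 is only an illustrative value read off the elbow plot
from sklearn.cluster import KMeans

# Total within-cluster sum of squares (inertia) for k = 2..10
twss = []
k_values = list(range(2, 11))
for k in k_values:
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(df_norm)
    twss.append(model.inertia_)

# Elbow plot: pick the k where the curve stops dropping sharply
plt.plot(k_values, twss, "ro-")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Total within-cluster sum of squares")
plt.show()

# Refit with the chosen k (illustrative: k = 4) and attach the labels
kmeans = KMeans(n_clusters=4, random_state=0).fit(df_norm)
data["kclust"] = pd.Series(kmeans.labels_)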

Python libraries :
import pandas as pd
import matplotlib.pylab as plt
from scipy.cluster.hierarchy import linkage
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

2. Perform clustering (both hierarchical and K-means clustering) for the airlines data to obtain the optimum number of clusters.
=>Importing the libraries
import pandas as pd
import matplotlib.pylab as plt
=> Loading the data set
air = pd.read_excel("E:\\EastWestAirlines.xlsx",sheet_name="data")
air.describe()
ID# Balance ... Days_since_enroll Award?
count 3999.000000 3.999000e+03 ... 3999.00000 3999.000000
mean 2014.819455 7.360133e+04 ... 4118.55939 0.370343
std 1160.764358 1.007757e+05 ... 2065.13454 0.482957
min 1.000000 0.000000e+00 ... 2.00000 0.000000
25% 1010.500000 1.852750e+04 ... 2330.00000 0.000000
50% 2016.000000 4.309700e+04 ... 4096.00000 0.000000
75% 3020.500000 9.240400e+04 ... 5790.50000 1.000000
max 4021.000000 1.704838e+06 ... 8296.00000 1.000000
air1 = air.drop(["ID#"], axis=1)
=> Dropping the ID# column because it is only an identifier, not a feature
def norm_func(y):
    x = (y - y.min()) / (y.max() - y.min())
    return x

df_norm = norm_func(air1.iloc[:, :])
=> Normalizing the data so every feature lies between 0 and 1
df_norm.describe()
Balance Qual_miles ... Days_since_enroll Award?
count 3999.000000 3999.000000 ... 3999.000000 3999.000000
mean 0.043172 0.012927 ... 0.496330 0.370343
std 0.059112 0.069399 ... 0.248991 0.482957
min 0.000000 0.000000 ... 0.000000 0.000000
25% 0.010868 0.000000 ... 0.280685 0.000000
50% 0.025279 0.000000 ... 0.493610 0.000000
75% 0.054201 0.000000 ... 0.697914 1.000000
max 1.000000 1.000000 ... 1.000000 1.000000
from scipy.cluster.hierarchy import linkage

import scipy.cluster.hierarchy as sch

z = linkage(df_norm, method="complete", metric="euclidean")
=> Drawing the dendrogram to find the number of clusters

plt.figure(figsize=(20, 20)); plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel('Index'); plt.ylabel('Distance')

sch.dendrogram(z,
               leaf_rotation=90,
               leaf_font_size=10
               )
plt.show()
# Agglomerative clustering
from sklearn.cluster import AgglomerativeClustering

=> Creating the clusters based on the dendrogram tree

h_complete = AgglomerativeClustering(n_clusters=5, linkage='complete',
                                     affinity="euclidean").fit(df_norm)
h_complete.labels_
cluster_labels = pd.Series(h_complete.labels_)

=> Creating the clust column to hold the cluster numbers
air1['clust'] = cluster_labels  # creating new column
=> Moving the clust column to the front
air = air1.iloc[:, [12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]
air.head()
ID# Balance Qual_miles ... Flight_trans_12 Days_since_enroll Award?
0 1 28143 0 ... 0 7000 0
1 2 19244 0 ... 0 6968 0
2 3 41354 0 ... 0 7034 0
3 4 14776 0 ... 0 6952 0
4 5 97752 0 ... 4 6935 1
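=> As an optional check on the choice of 5 clusters, the average silhouette score can be computed for the agglomerative labels; a minimal sketch, assuming df_norm and h_complete from above
from sklearn.metrics import silhouette_score

# Average silhouette score (range -1 to 1; higher means better-separated clusters)
silhouette_score(df_norm, h_complete.labels_, metric="euclidean")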
air.iloc[: , 2:].groupby(air.clust).mean()
air.to_csv("airlines.csv", encoding = "utf-8")

import os
os.getcwd()
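=> The K-means half of this question can be handled the same way as for the crime data: run K-means over a range of k on the normalized airlines data and read the optimum off an elbow plot. A minimal sketch, assuming df_norm (the normalized airlines data) from above
from sklearn.cluster import KMeans

# Inertia (total within-cluster sum of squares) for k = 2..10 on the airlines data
twss = [KMeans(n_clusters=k, random_state=0).fit(df_norm).inertia_ for k in range(2, 11)]

plt.plot(range(2, 11), twss, "ro-")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Total within-cluster sum of squares")
plt.show()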

Python libraries :
import pandas as pd
import matplotlib.pylab as plt
from scipy.cluster.hierarchy import linkage
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
