Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 33

EAST WEST AIRLINES OUTPUT

import pandas as pd

import numpy as np

import matplotlib.pylab as plt

# Load the East-West Airlines workbook into a DataFrame.
# The path is an absolute, machine-specific Windows location.
East = pd.read_excel(
    "C:\\Users\\Santosh raj\\Desktop\\pyton study material\\hireachical clustering\\East_West.xlsx"
)

# Inspect the dtype pandas inferred for each column.
East.dtypes

Out[5]:

ID# int64

Balance int64

Qual_miles int64

cc1_miles int64

cc2_miles int64

cc3_miles int64

Bonus_miles int64

Bonus_trans int64

Flight_miles_12mo int64

Flight_trans_12 int64

Days_since_enroll int64

Award? int64

dtype: object
# Print the frame summary: index range, column names, non-null counts, dtypes and memory usage.
East.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 3999 entries, 0 to 3998

Data columns (total 12 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 ID# 3999 non-null int64

1 Balance 3999 non-null int64

2 Qual_miles 3999 non-null int64

3 cc1_miles 3999 non-null int64

4 cc2_miles 3999 non-null int64

5 cc3_miles 3999 non-null int64

6 Bonus_miles 3999 non-null int64

7 Bonus_trans 3999 non-null int64

8 Flight_miles_12mo 3999 non-null int64

9 Flight_trans_12 3999 non-null int64

10 Days_since_enroll 3999 non-null int64

11 Award? 3999 non-null int64

dtypes: int64(12)

memory usage: 375.0 KB

# Box plot of Balance to look for outliers
import seaborn as sns

# The series is passed as x= because seaborn warns that positional use
# stops being accepted from version 0.12.
sns.boxplot(x=East.Balance)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[9]: <AxesSubplot:xlabel='Balance'>

# IQR fences for Balance (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# These three values are informational only; nothing below reads them.
IQR = East['Balance'].quantile(0.75) - East['Balance'].quantile(0.25)
lower_limit = East['Balance'].quantile(0.25) - (IQR * 1.5)
upper_limit = East['Balance'].quantile(0.75) + (IQR * 1.5)

# Cap (winsorize) Balance rather than dropping rows
from feature_engine.outliers import Winsorizer

winsor = Winsorizer(
    capping_method='iqr',  # choose IQR rule boundaries or gaussian for mean and std
    tail='both',  # cap left, right or both tails
    fold=1.5,
    variables=['Balance'],
)

# The capped column is kept in its own one-column frame.
# NOTE(review): East_t is rebound later by the Bonus_trans step, and the
# clustering further down is built from East, not from this capped frame.
East_t = winsor.fit_transform(East[['Balance']])

# Re-plot to check the effect of the capping; x= avoids seaborn's
# positional-argument FutureWarning.
sns.boxplot(x=East_t.Balance)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[20]: <AxesSubplot:xlabel='Balance'>

# Box plot of Qual_miles to look for outliers (x= keyword: positional use is
# deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.Qual_miles)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[22]: <AxesSubplot:xlabel='Qual_miles'>

# IQR fences for Qual_miles (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Informational only; nothing below reads these three values.
IQR = East['Qual_miles'].quantile(0.75) - East['Qual_miles'].quantile(0.25)
lower_limit = East['Qual_miles'].quantile(0.25) - (IQR * 1.5)
upper_limit = East['Qual_miles'].quantile(0.75) + (IQR * 1.5)

# Cap (winsorize) Qual_miles rather than dropping rows
from feature_engine.outliers import Winsorizer

winsor = Winsorizer(
    capping_method='iqr',  # choose IQR rule boundaries or gaussian for mean and std
    tail='both',  # cap left, right or both tails
    fold=1.5,
    variables=['Qual_miles'],
)

# Capped column stored in its own one-column frame.
East_Q = winsor.fit_transform(East[['Qual_miles']])

# Re-plot after capping; x= avoids seaborn's positional-argument FutureWarning.
sns.boxplot(x=East_Q.Qual_miles)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[31]: <AxesSubplot:xlabel='Qual_miles'>

# Box plot of cc1_miles to look for outliers (x= keyword: positional use is
# deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.cc1_miles)
C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following
variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[33]: <AxesSubplot:xlabel='cc1_miles'>

#No outliers present in cc1_miles

# Box plot of cc2_miles to look for outliers (x= keyword: positional use is
# deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.cc2_miles)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[36]: <AxesSubplot:xlabel='cc2_miles'>

# IQR fences for cc2_miles (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Informational only; nothing below reads these three values.
IQR = East['cc2_miles'].quantile(0.75) - East['cc2_miles'].quantile(0.25)
lower_limit = East['cc2_miles'].quantile(0.25) - (IQR * 1.5)
upper_limit = East['cc2_miles'].quantile(0.75) + (IQR * 1.5)

# Cap (winsorize) cc2_miles rather than dropping rows
from feature_engine.outliers import Winsorizer

winsor = Winsorizer(
    capping_method='iqr',  # choose IQR rule boundaries or gaussian for mean and std
    tail='both',  # cap left, right or both tails
    fold=1.5,
    variables=['cc2_miles'],
)

# Capped column stored in its own one-column frame.
East_c = winsor.fit_transform(East[['cc2_miles']])

# Re-plot after capping; x= avoids seaborn's positional-argument FutureWarning.
sns.boxplot(x=East_c.cc2_miles)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[45]: <AxesSubplot:xlabel='cc2_miles'>

# Box plot of cc3_miles to look for outliers (x= keyword: positional use is
# deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.cc3_miles)
C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following
variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[47]: <AxesSubplot:xlabel='cc3_miles'>

# IQR fences for cc3_miles (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Informational only; nothing below reads these three values.
IQR = East['cc3_miles'].quantile(0.75) - East['cc3_miles'].quantile(0.25)
lower_limit = East['cc3_miles'].quantile(0.25) - (IQR * 1.5)
upper_limit = East['cc3_miles'].quantile(0.75) + (IQR * 1.5)

# Cap (winsorize) cc3_miles rather than dropping rows
from feature_engine.outliers import Winsorizer

winsor = Winsorizer(
    capping_method='iqr',  # choose IQR rule boundaries or gaussian for mean and std
    tail='both',  # cap left, right or both tails
    fold=1.5,
    variables=['cc3_miles'],
)

# Capped column stored in its own one-column frame.
East_c3 = winsor.fit_transform(East[['cc3_miles']])

# Re-plot after capping; x= avoids seaborn's positional-argument FutureWarning.
sns.boxplot(x=East_c3.cc3_miles)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[56]: <AxesSubplot:xlabel='cc3_miles'>

# Box plot of Bonus_miles to look for outliers (x= keyword: positional use is
# deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.Bonus_miles)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[58]: <AxesSubplot:xlabel='Bonus_miles'>

# IQR fences for Bonus_miles (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Informational only; nothing below reads these three values.
IQR = East['Bonus_miles'].quantile(0.75) - East['Bonus_miles'].quantile(0.25)
lower_limit = East['Bonus_miles'].quantile(0.25) - (IQR * 1.5)
upper_limit = East['Bonus_miles'].quantile(0.75) + (IQR * 1.5)

# Cap (winsorize) Bonus_miles rather than dropping rows
from feature_engine.outliers import Winsorizer

# Built once; constructing an identical Winsorizer a second time adds nothing.
winsor = Winsorizer(
    capping_method='iqr',  # choose IQR rule boundaries or gaussian for mean and std
    tail='both',  # cap left, right or both tails
    fold=1.5,
    variables=['Bonus_miles'],
)

# Capped column stored in its own one-column frame (plotted as a histogram later).
East_B = winsor.fit_transform(East[['Bonus_miles']])

# Re-plot after capping; x= avoids seaborn's positional-argument FutureWarning.
sns.boxplot(x=East_B.Bonus_miles)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[68]: <AxesSubplot:xlabel='Bonus_miles'>

# Box plot of Bonus_trans to look for outliers (x= keyword: positional use is
# deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.Bonus_trans)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[70]: <AxesSubplot:xlabel='Bonus_trans'>

# IQR fences for Bonus_trans (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Informational only; nothing below reads these three values.
IQR = East['Bonus_trans'].quantile(0.75) - East['Bonus_trans'].quantile(0.25)
lower_limit = East['Bonus_trans'].quantile(0.25) - (IQR * 1.5)
upper_limit = East['Bonus_trans'].quantile(0.75) + (IQR * 1.5)

# Cap (winsorize) Bonus_trans rather than dropping rows
from feature_engine.outliers import Winsorizer

winsor = Winsorizer(
    capping_method='iqr',  # choose IQR rule boundaries or gaussian for mean and std
    tail='both',  # cap left, right or both tails
    fold=1.5,
    variables=['Bonus_trans'],
)

# NOTE: this rebinds East_t, which earlier held the capped Balance frame;
# from here on East_t only has the Bonus_trans column.
East_t = winsor.fit_transform(East[['Bonus_trans']])

# Re-plot after capping; x= avoids seaborn's positional-argument FutureWarning.
sns.boxplot(x=East_t.Bonus_trans)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[79]: <AxesSubplot:xlabel='Bonus_trans'>

# Box plot of Flight_miles_12mo to look for outliers (x= keyword: positional
# use is deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.Flight_miles_12mo)
C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following
variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[81]: <AxesSubplot:xlabel='Flight_miles_12mo'>

# IQR fences for Flight_miles_12mo (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Informational only; nothing below reads these three values.
IQR = East['Flight_miles_12mo'].quantile(0.75) - East['Flight_miles_12mo'].quantile(0.25)
lower_limit = East['Flight_miles_12mo'].quantile(0.25) - (IQR * 1.5)
upper_limit = East['Flight_miles_12mo'].quantile(0.75) + (IQR * 1.5)

# Cap (winsorize) Flight_miles_12mo rather than dropping rows
from feature_engine.outliers import Winsorizer

winsor = Winsorizer(
    capping_method='iqr',  # choose IQR rule boundaries or gaussian for mean and std
    tail='both',  # cap left, right or both tails
    fold=1.5,
    variables=['Flight_miles_12mo'],
)

# Capped column stored in its own one-column frame (plotted as a histogram later).
East_F = winsor.fit_transform(East[['Flight_miles_12mo']])

# Re-plot after capping; x= avoids seaborn's positional-argument FutureWarning.
sns.boxplot(x=East_F.Flight_miles_12mo)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[90]: <AxesSubplot:xlabel='Flight_miles_12mo'>

# Box plot of Flight_trans_12 to look for outliers (x= keyword: positional use
# is deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.Flight_trans_12)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[92]: <AxesSubplot:xlabel='Flight_trans_12'>

# IQR fences for Flight_trans_12 (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# Informational only; nothing below reads these three values.
IQR = East['Flight_trans_12'].quantile(0.75) - East['Flight_trans_12'].quantile(0.25)
lower_limit = East['Flight_trans_12'].quantile(0.25) - (IQR * 1.5)
upper_limit = East['Flight_trans_12'].quantile(0.75) + (IQR * 1.5)

# Cap (winsorize) Flight_trans_12 rather than dropping rows
from feature_engine.outliers import Winsorizer

winsor = Winsorizer(
    capping_method='iqr',  # choose IQR rule boundaries or gaussian for mean and std
    tail='both',  # cap left, right or both tails
    fold=1.5,
    variables=['Flight_trans_12'],
)

# Capped column stored in its own one-column frame (plotted as a histogram later).
East_F2 = winsor.fit_transform(East[['Flight_trans_12']])

# Re-plot after capping; x= avoids seaborn's positional-argument FutureWarning.
sns.boxplot(x=East_F2.Flight_trans_12)

C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following


variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[101]: <AxesSubplot:xlabel='Flight_trans_12'>

# Box plot of Days_since_enroll to look for outliers (x= keyword: positional
# use is deprecated by seaborn and rejected from 0.12).
sns.boxplot(x=East.Days_since_enroll)
C:\Users\Public\conda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following
variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and
passing other arguments without an explicit keyword will result in an error or misinterpretation.

warnings.warn(

Out[103]: <AxesSubplot:xlabel='Days_since_enroll'>

#No outliers present in Days_since_enroll

# Count missing (NA) values per column: isna() flags each cell, sum() totals the flags column-wise.

East.isna().sum()

Out[106]:

ID# 0

Balance 0

Qual_miles 0
cc1_miles 0

cc2_miles 0

cc3_miles 0

Bonus_miles 0

Bonus_trans 0

Flight_miles_12mo 0

Flight_trans_12 0

Days_since_enroll 0

Award? 0

dtype: int64

# Histogram of Bonus_miles taken from East_B, the capped (winsorized) frame, with matplotlib's default bins.

plt.hist(East_B.Bonus_miles)

Out[108]:

(array([1855., 490., 340., 269., 209., 151., 142., 113., 84.,

346.]),

array([ 0. , 5762.625, 11525.25 , 17287.875, 23050.5 , 28813.125,

34575.75 , 40338.375, 46101. , 51863.625, 57626.25 ]),

<BarContainer object of 10 artists>)


# Histogram of the capped Bonus_trans column (East_t was last assigned by the Bonus_trans Winsorizer step).
plt.hist(East_t.Bonus_trans)

Out[109]:

(array([1041., 519., 396., 853., 454., 314., 151., 100., 70.,

101.]),

array([ 0. , 3.8, 7.6, 11.4, 15.2, 19. , 22.8, 26.6, 30.4, 34.2, 38. ]),

<BarContainer object of 10 artists>)


# Histogram of the capped Flight_miles_12mo column held in East_F.
plt.hist(East_F.Flight_miles_12mo)

Out[110]:

(array([2763., 105., 52., 79., 51., 51., 197., 53., 31.,

617.]),

array([ 0. , 77.75, 155.5 , 233.25, 311. , 388.75, 466.5 , 544.25,

622. , 699.75, 777.5 ]),

<BarContainer object of 10 artists>)


# Histogram of the capped Flight_trans_12 column held in East_F2.
plt.hist(East_F2.Flight_trans_12)

Out[111]:

(array([2723., 0., 0., 0., 469., 0., 0., 0., 242.,

565.]),

array([0. , 0.25, 0.5 , 0.75, 1. , 1.25, 1.5 , 1.75, 2. , 2.25, 2.5 ]),

<BarContainer object of 10 artists>)


# Working copy without the ID# identifier column; East itself is left as loaded.
East1 = East.drop(columns=["ID#"])

# Normalization function

def norm_func(i):
    """Min-max scale *i* into the range [0, 1].

    Each value is mapped to ``(value - min) / (max - min)``. For a DataFrame
    the minimum and maximum are taken per column, so every column is scaled
    independently; for a Series they are taken over the whole series.

    Args:
        i: A pandas DataFrame or Series of numeric values.

    Returns:
        An object of the same shape as *i* holding the scaled values. A
        constant column (max == min) is returned as all zeros instead of NaN.
    """
    lowest = i.min()
    span = i.max() - lowest
    # A constant column has zero range; dividing by it would give 0/0 = NaN,
    # which the distance computation downstream cannot handle. Dividing by 1
    # instead leaves the (already zero) numerator untouched.
    if hasattr(span, "replace"):
        span = span.replace(0, 1)
    elif span == 0:
        span = 1
    return (i - lowest) / span

# Normalized data frame (considering the numerical part of data)
# NOTE(review): ID# was already dropped when East1 was built, so iloc[:, 1:]
# additionally leaves out Balance (the first remaining column); the describe()
# output starts at Qual_miles with 10 columns. Confirm that excluding Balance
# from the clustering input is intended.

df_norm = norm_func(East1.iloc[:, 1:])

# Summary statistics of the scaled columns (each should span 0 to 1).
df_norm.describe()

Out[117]:
Qual_miles cc1_miles ... Days_since_enroll Award?

count 3999.000000 3999.000000 ... 3999.000000 3999.000000

mean 0.012927 0.264879 ... 0.496330 0.370343

std 0.069399 0.344230 ... 0.248991 0.482957

min 0.000000 0.000000 ... 0.000000 0.000000

25% 0.000000 0.000000 ... 0.280685 0.000000

50% 0.000000 0.000000 ... 0.493610 0.000000

75% 0.000000 0.500000 ... 0.697914 1.000000

max 1.000000 1.000000 ... 1.000000 1.000000

[8 rows x 10 columns]

# for creating dendrogram

from scipy.cluster.hierarchy import linkage, dendrogram

# import scipy.cluster.hierarchy as sch

# Linkage matrix for the normalized rows: complete linkage on Euclidean distance.
z = linkage(df_norm, method="complete", metric="euclidean")

# Dendrogram
plt.figure(figsize=(15, 8))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Index')
plt.ylabel('Distance')
dendrogram(
    z,
    leaf_rotation=0,  # rotates the x axis labels
    leaf_font_size=10,  # font size for the x axis labels
)
plt.show()

# Now applying AgglomerativeClustering with 3 clusters, chosen from the above dendrogram

from sklearn.cluster import AgglomerativeClustering

# Configure a 3-cluster agglomerative model (complete linkage, Euclidean
# distance), then fit it on the normalized frame. fit() returns the estimator
# itself, so h_complete is the fitted model either way.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
# check against the installed version before upgrading.
h_complete = AgglomerativeClustering(
    n_clusters=3,
    linkage='complete',
    affinity="euclidean",
)
h_complete.fit(df_norm)

# Cluster index assigned to each row of df_norm.
h_complete.labels_

Out[125]: array([0, 0, 0, ..., 2, 0, 0], dtype=int64)

# Wrap the fitted labels in a Series; its default 0..n-1 index lines up with
# the RangeIndex of East1, so the assignment below aligns row for row.
cluster_labels = pd.Series(h_complete.labels_)

East1['clust'] = cluster_labels # add the cluster label of each row as a new 'clust' column


# Preview the first rows with the new column attached.
East1.head()

Out[128]:

Balance Qual_miles cc1_miles ... Days_since_enroll Award? clust

0 28143 0 1 ... 7000 0 0

1 19244 0 1 ... 6968 0 0

2 41354 0 1 ... 7034 0 0

3 14776 0 1 ... 6952 0 0

4 97752 0 4 ... 6935 1 2

[5 rows x 12 columns]

# Aggregate mean of each cluster
# NOTE(review): iloc[:, 1:] leaves Balance out of the summary, and the slice
# still contains the clust column itself, so clust appears both as the group
# key and as an averaged column in the result.

East1.iloc[:, 1:].groupby(East1.clust).mean()

Out[130]:

Qual_miles cc1_miles cc2_miles ... Days_since_enroll Award? clust

clust ...

0 87.790412 1.703645 1.018621 ... 3822.941363 0.002377 0.0

1 347.000000 2.500000 1.000000 ... 2200.250000 1.000000 1.0

2 240.205982 2.668933 1.007478 ... 4631.008838 1.000000 2.0

[3 rows x 11 columns]
# Standard deviation of each column within each cluster.
# NOTE(review): this slice starts at column 2 (cc1_miles), whereas the mean
# above starts at column 1, so Qual_miles has a mean but no std here, and
# Balance has neither. Confirm whether the differing slices are intended.
East1.iloc[:, 2:].groupby(East1.clust).std()

Out[131]:

cc1_miles cc2_miles cc3_miles ... Days_since_enroll Award? clust

clust ...

0 1.152351 0.173702 0.156518 ... 2079.882670 0.048708 0.0

1 1.732051 0.000000 0.000000 ... 1171.167331 0.000000 0.0

2 1.510456 0.086180 0.248128 ... 1936.161303 0.000000 0.0

[3 rows x 10 columns]

# Write the labelled frame (including the clust column) to a CSV file.
# The path is relative, so the file is created in the current working directory.

East1.to_csv("East_West.csv", encoding = "utf-8")

import os

# Show the current working directory, i.e. where the CSV was written.
os.getcwd()

#In Cluster 1, flying activity, earning and use of frequent flyer rewards, and use of the airline credit card are high

#In Cluster 2, flying activity, earning and use of frequent flyer rewards, and use of the airline credit card are medium

#In Cluster 0, flying activity, earning and use of frequent flyer rewards, and use of the airline credit card are low

You might also like