Data Mining - Project
In [ ]:
#Case study 1
In [1]:
import numpy as np
import pandas as pd
In [2]:
od = pd.read_csv(r'C:\Users\SBHAGAT\Downloads\bank_marketing_part1_Data-1.csv')
In [3]:
od.head()
In [4]:
od.shape
Out[4]: (210, 7)
In [5]:
od.info()
<class 'pandas.core.frame.DataFrame'>
dtypes: float64(7)
In [6]:
od.isnull().sum()
Out[6]: spending 0
advance_payments 0
probability_of_full_payment 0
current_balance 0
credit_limit 0
min_payment_amt 0
max_spent_in_single_shopping 0
dtype: int64
In [7]:
od.describe().T
In [8]:
dups = od.duplicated()
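The duplicate flags are computed here but never inspected; a quick sketch (not part of the original run) to report the count:

# Number of duplicated rows in the data
print('duplicate rows:', dups.sum())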
In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
In [12]:
scaler = StandardScaler()
In [13]:
scaled_clust = pd.DataFrame(scaler.fit_transform(od), columns=od.columns)
In [14]:
scaled_clust.head()
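As a sanity check on the scaling (a sketch, not part of the original run), each standardized column should now have mean ≈ 0 and standard deviation ≈ 1:

# StandardScaler output: column means ~0, standard deviations ~1
print(scaled_clust.mean().round(3))
print(scaled_clust.std().round(3))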
In [15]:
#creating clusters using K-means
In [16]:
k_means = KMeans(n_clusters=2, random_state=1)  # model definition missing from the export; n_clusters=2 reconstructed from the binary labels below
k_means.fit(scaled_clust)
In [17]: k_means.labels_
Out[17]: array([1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1])
In [18]:
wss=[]
In [19]:
for i in range(1,11):
    KM = KMeans(n_clusters=i, random_state=1)
    KM.fit(scaled_clust)
    wss.append(KM.inertia_)
C:\Users\SBHAGAT\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:881: UserWarning:
KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than
available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
In [20]:
wss
Out[20]: [1469.9999999999995,
659.1717544870411,
430.65897315130064,
371.301721277542,
327.9608240079031,
290.5900305968219,
264.83153087478144,
240.6837259501598,
220.85285825594738,
206.3829103601579]
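The elbow in these WSS values is easier to read from a plot; a minimal sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt
# WSS drops sharply up to k = 3 and flattens afterwards, suggesting 3 clusters
plt.plot(range(1, 11), wss, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Within-cluster sum of squares (inertia)')
plt.show()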
In [21]:
k_means = KMeans(n_clusters = 3,random_state=1)
k_means.fit(scaled_clust)
labels = k_means.labels_
In [22]:
silhouette_score(scaled_clust,labels,random_state=1)
Out[22]: 0.40072705527512986
In [23]:
k_means = KMeans(n_clusters = 4,random_state=1)
k_means.fit(scaled_clust)
labels = k_means.labels_
In [24]:
silhouette_score(scaled_clust,labels,random_state=1)
Out[24]: 0.32757426605518075
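The silhouette score for k = 3 (0.401) beats k = 4 (0.328), agreeing with the elbow. A sketch that sweeps several values of k in one loop:

# Compare silhouette scores across candidate cluster counts; higher is better
for k in range(2, 6):
    km = KMeans(n_clusters=k, random_state=1)
    print(k, silhouette_score(scaled_clust, km.fit_predict(scaled_clust)))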
In [ ]:
In [1]:
from scipy.cluster.hierarchy import dendrogram, linkage
In [ ]:
link_method = linkage(scaled_clust, method='average')
In [2]:
dend = dendrogram(link_method)
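One way to check how faithfully the average-linkage tree preserves the original pairwise distances is the cophenetic correlation; a sketch (values near 1 indicate a good fit):

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
# Correlation between dendrogram (cophenetic) distances and raw pairwise distances
c, _ = cophenet(link_method, pdist(scaled_clust))
print(round(c, 4))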
In [25]:
#case study 2
import numpy as np
import pandas as pd
In [26]:
df_insured = pd.read_csv(r'C:\Users\SBHAGAT\Downloads\insurance_part2_data-2.csv')
In [27]:
df_insured.head()
Out[27]:    Age Agency_Code           Type Claimed  Commision Channel  Duration  Sales       Product Name Destination
        0    48         C2B       Airlines      No       0.70  Online         7   2.51    Customised Plan        ASIA
        1    36         EPX  Travel Agency      No       0.00  Online        34  20.00    Customised Plan        ASIA
        2    39         CWT  Travel Agency      No       5.94  Online         3   9.90    Customised Plan    Americas
        3    36         EPX  Travel Agency      No       0.00  Online         4  26.00  Cancellation Plan        ASIA
In [28]:
df_insured.info()
<class 'pandas.core.frame.DataFrame'>
In [29]:
df_insured.isnull().sum()
Out[29]: Age 0
Agency_Code 0
Type 0
Claimed 0
Commision 0
Channel 0
Duration 0
Sales 0
Product Name 0
Destination 0
dtype: int64
In [30]:
df_insured.describe(include="all").T
Out[30]:        count unique              top  freq       mean         std   min   25%   50%     75%  max
Age            3000.0    NaN              NaN   NaN     38.091   10.463518   8.0  32.0  36.0    42.0    …
Agency_Code      3000      4              EPX  1365        NaN         NaN   NaN   NaN   NaN     NaN  NaN
Type             3000      2    Travel Agency  1837        NaN         NaN   NaN   NaN   NaN     NaN  NaN
Commision      3000.0    NaN              NaN   NaN  14.529203   25.481455   0.0   0.0  4.63  17.235    …
Channel          3000      2           Online  2954        NaN         NaN   NaN   NaN   NaN     NaN  NaN
Duration       3000.0    NaN              NaN   NaN  70.001333  134.053313  -1.0  11.0  26.5    63.0    …
Sales          3000.0    NaN              NaN   NaN  60.249913   70.733954   0.0  20.0  33.0    69.0    …
Product Name     3000      5  Customised Plan  1136        NaN         NaN   NaN   NaN   NaN     NaN  NaN
Destination      3000      3             ASIA  2465        NaN         NaN   NaN   NaN   NaN     NaN  NaN
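The summary shows a minimum `Duration` of -1, which cannot be a valid trip length; a quick sketch (not in the original run) to count such records before deciding on a treatment:

# Negative durations are data errors; count them before deciding how to handle them
print((df_insured['Duration'] < 0).sum())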
In [31]:
dups = df_insured.duplicated()
In [32]:
df_insured.drop_duplicates(inplace=True)
In [33]:
dups = df_insured.duplicated()
print(df_insured.shape)
(2861, 10)
In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10,5))
Out[34]: <AxesSubplot:>
In [35]:
sns.pairplot(df_insured[['Age','Commision', 'Duration', 'Sales']])
In [36]:
#correlation heatmap
plt.figure(figsize=(10,8))
sns.set(font_scale=1.2)
sns.heatmap(df_insured.corr(), annot=True)  # heatmap call missing from the export; reconstructed from the comment above
Out[36]: <AxesSubplot:>
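Before running the encoding below, it can help to preserve the code-to-label mappings, since the labels are overwritten with integer codes; a sketch (the `mappings` dict is illustrative, not in the original):

# Capture code -> label mappings before the labels are replaced with codes
mappings = {}
for feature in df_insured.columns:
    if df_insured[feature].dtype == 'object':
        mappings[feature] = dict(enumerate(pd.Categorical(df_insured[feature]).categories))
print(mappings['Claimed'])  # e.g. {0: 'No', 1: 'Yes'}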
In [37]:
#converting all object columns to categorical codes
for feature in df_insured.columns:
    if df_insured[feature].dtype == 'object':
        print('\n')
        print('feature:', feature)
        print(pd.Categorical(df_insured[feature].unique()))
        print(pd.Categorical(df_insured[feature].unique()).codes)
        df_insured[feature] = pd.Categorical(df_insured[feature]).codes
feature: Agency_Code
[0 2 1 3]
feature: Type
[0 1]
feature: Claimed
['No', 'Yes']
[0 1]
feature: Channel
['Online', 'Offline']
[1 0]
feature: Product Name
['Customised Plan', 'Cancellation Plan', 'Bronze Plan', 'Silver Plan', 'Gold Plan']
Categories (5, object): ['Bronze Plan', 'Cancellation Plan', 'Customised Plan', 'Gold Plan', 'Silver Plan']
[2 1 0 4 3]
feature: Destination
[0 1 2]
In [38]:
df_insured.info()
<class 'pandas.core.frame.DataFrame'>
In [39]:
df_insured.head()
Out[39]:  Age  Agency_Code  Type  Claimed  Commision  Channel  Duration  Sales  Product Name  Destination
0 48 0 0 0 0.70 1 7 2.51 2 0
1 36 2 1 0 0.00 1 34 20.00 2 0
2 39 1 1 0 5.94 1 3 9.90 2 1
3 36 2 1 0 0.00 1 4 26.00 1 0
4 33 3 0 0 6.30 1 53 18.00 0 0
In [40]:
df_insured.Claimed.value_counts(normalize=True)
Out[40]: 0 0.680531
1 0.319469
In [41]:
#2.2 Data Split: Split the data into test and train
X = df_insured.drop("Claimed", axis=1)
y = df_insured.pop("Claimed")
X.head()
Out[41]: Age Agency_Code Type Commision Channel Duration Sales Product Name Destination
0 48 0 0 0.70 1 7 2.51 2 0
1 36 2 1 0.00 1 34 20.00 2 0
2 39 1 1 5.94 1 3 9.90 2 1
3 36 2 1 0.00 1 4 26.00 1 0
4 33 3 0 6.30 1 53 18.00 0 0
In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, train_labels, test_labels = train_test_split(X, y, test_size=.30, random_state=1)  # random_state value truncated in the export; 1 assumed, matching the rest of the notebook
In [43]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('train_labels',train_labels.shape)
print('test_labels',test_labels.shape)
X_train (2002, 9)
X_test (859, 9)
train_labels (2002,)
test_labels (859,)
In [44]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion='gini')
In [45]:
dt_model.fit(X_train, train_labels)
Out[45]: DecisionTreeClassifier()
In [46]:
print(pd.DataFrame(dt_model.feature_importances_, columns=["Imp"], index=X_train.columns))
Imp
Age 0.194224
Agency_Code 0.168120
Type 0.000000
Commision 0.077408
Channel 0.003908
Duration 0.274566
Sales 0.221966
Destination 0.027677
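Before pruning, it is worth seeing how the fully grown tree generalizes; a sketch using the split above (an unpruned tree usually scores near 1.0 on train and noticeably lower on test):

# Accuracy of the unpruned gini tree on train vs. test
print('train accuracy:', dt_model.score(X_train, train_labels))
print('test accuracy:', dt_model.score(X_test, test_labels))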
In [49]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [20, 40, 60, 70],
    'min_samples_split': [150, 200, 250, 300]
}
dt_model = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid)  # grid_search construction missing from the export; reconstructed with default cv
In [50]:
#FITTING THE OPTIMAL VALUES TO THE TRAINING DATASET
grid_search.fit(X_train, train_labels)
In [51]:
best_grid = grid_search.best_estimator_  # assignment missing from the export; reconstructed
best_grid
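With the search fitted, the winning hyperparameters and hold-out performance can be inspected; a minimal sketch:

from sklearn.metrics import classification_report
# Hyperparameters selected by the grid search
print(grid_search.best_params_)
# Performance of the tuned tree on the hold-out set
print(classification_report(test_labels, best_grid.predict(X_test)))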
In [52]:
#case study 1, 1.3: hierarchical clustering with Ward linkage
wardlink = linkage(scaled_clust, method='ward')
dend = dendrogram(wardlink)
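To compare the Ward solution with K-means, the tree can be cut into three flat clusters and profiled on the original scale; a sketch using `fcluster` (the `cluster` column name is illustrative):

from scipy.cluster.hierarchy import fcluster
# Cut the Ward tree into 3 clusters and summarize each on the unscaled data
ward_labels = fcluster(wardlink, 3, criterion='maxclust')
print(od.assign(cluster=ward_labels).groupby('cluster').mean())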
In [ ]: