
1. Cleanup: The data does not include any nulls or even zeros; the smallest value in any numeric column is 1.

2. Outliers: From graphical analysis of the scatter plots and interquartile-range outlier detection, we have shown that the data does not have any outliers.

3. Feature engineering: The feature engineering is minimal; we have removed unnecessary columns. We have checked for correlations between features using a heat map, but the features appear highly uncorrelated. The data set is balanced and not linearly separable.

4. We have tried a Random Forest classifier and optimized the hyperparameters using randomized grid search, trying to improve the evaluation metrics.

5. We have tried deep learning as well; however, the results are not good enough. The data is insufficient for deep learning, so generating more data may be appropriate (not all of those experiments are included here).

6. For normalization we have used a min-max scaler, and this has likely eliminated the need for outlier removal.

7. We have separated the data into train, test, and validation sets and report the results using the Random Forest classifier.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing

Removing unnecessary columns (CustomerID and Name)

df = pd.read_excel('/content/drive/MyDrive/Sunbase/customer_churn_large_dataset(2).xlsx')

df = df.drop(['Name'], axis=1)
df.head()

   CustomerID  Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0           1   63    Male  Los Angeles                          17         73.36             236      0
1           2   62  Female     New York                           1         48.76             172      0
2           3   24  Female  Los Angeles                           5         85.47             460      0
3           4   36  Female        Miami                           3         97.94             297      1
4           5   46  Female        Miami                          19         58.14             266      0

df=df.drop(['CustomerID'],axis=1)

df.head()

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63    Male  Los Angeles                          17         73.36             236      0
1   62  Female     New York                           1         48.76             172      0
2   24  Female  Los Angeles                           5         85.47             460      0
3   36  Female        Miami                           3         97.94             297      1
4   46  Female        Miami                          19         58.14             266      0

len(df['Gender'])

100000

df.head()

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63    Male  Los Angeles                          17         73.36             236      0
1   62  Female     New York                           1         48.76             172      0
2   24  Female  Los Angeles                           5         85.47             460      0
3   36  Female        Miami                           3         97.94             297      1
4   46  Female        Miami                          19         58.14             266      0


import numpy as np
from google.colab import autoviz

def value_plot(df, y, figscale=1):
    from matplotlib import pyplot as plt
    df[y].plot(kind='line', figsize=(8 * figscale, 4 * figscale), title=y)
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = value_plot(_df_58, *['Age'], **{})
chart

chart = value_plot(_df_59, *['Subscription_Length_Months'], **{})
chart

chart = value_plot(_df_60, *['Monthly_Bill'], **{})
chart

chart = value_plot(_df_61, *['Total_Usage_GB'], **{})
chart


import numpy as np
from google.colab import autoviz

def histogram(df, colname, num_bins=20, figscale=1):
    from matplotlib import pyplot as plt
    df[colname].plot(kind='hist', bins=num_bins, title=colname,
                     figsize=(8 * figscale, 4 * figscale))
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = histogram(_df_62, *['Age'], **{})
chart

chart = histogram(_df_63, *['Subscription_Length_Months'], **{})
chart

chart = histogram(_df_64, *['Monthly_Bill'], **{})
chart

chart = histogram(_df_65, *['Total_Usage_GB'], **{})
chart


import numpy as np
from google.colab import autoviz

def categorical_histogram(df, colname, figscale=1, mpl_palette_name='Dark2'):
    from matplotlib import pyplot as plt
    import seaborn as sns
    df.groupby(colname).size().plot(
        kind='barh', color=sns.palettes.mpl_palette(mpl_palette_name),
        figsize=(8 * figscale, 4.8 * figscale))
    plt.gca().spines[['top', 'right']].set_visible(False)
    return autoviz.MplChart.from_current_mpl_state()

chart = categorical_histogram(_df_66, *['Gender'], **{})
chart

chart = categorical_histogram(_df_67, *['Location'], **{})
chart

import numpy as np
from google.colab import autoviz

def scatter_plots(df, colname_pairs, figscale=1, alpha=.8):
    from matplotlib import pyplot as plt
    plt.figure(figsize=(len(colname_pairs) * 6 * figscale, 6 * figscale))
    for plot_i, (x_colname, y_colname) in enumerate(colname_pairs, start=1):
        ax = plt.subplot(1, len(colname_pairs), plot_i)
        df.plot(kind='scatter', x=x_colname, y=y_colname,
                s=(32 * figscale), alpha=alpha, ax=ax)
        ax.spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = scatter_plots(_df_68, *[[['Age', 'Subscription_Length_Months'],
                                 ['Subscription_Length_Months', 'Monthly_Bill'],
                                 ['Monthly_Bill', 'Total_Usage_GB'],
                                 ['Total_Usage_GB', 'Churn']]], **{})
chart


import numpy as np
from google.colab import autoviz

def heatmap(df, x_colname, y_colname, figscale=1, mpl_palette_name='viridis'):
    from matplotlib import pyplot as plt
    import seaborn as sns
    import pandas as pd
    plt.subplots(figsize=(8 * figscale, 8 * figscale))
    df_2dhist = pd.DataFrame({
        x_label: grp[y_colname].value_counts()
        for x_label, grp in df.groupby(x_colname)
    })
    sns.heatmap(df_2dhist, cmap=mpl_palette_name)
    plt.xlabel(x_colname)
    plt.ylabel(y_colname)
    return autoviz.MplChart.from_current_mpl_state()

chart = heatmap(_df_69, *['Gender', 'Location'], **{})
chart


import numpy as np
from google.colab import autoviz

def violin_plot(df, value_colname, facet_colname, figscale=1,
                mpl_palette_name='Dark2', **kwargs):
    from matplotlib import pyplot as plt
    import seaborn as sns
    figsize = (12 * figscale,
               1.2 * figscale * len(df[facet_colname].unique()))
    plt.figure(figsize=figsize)
    sns.violinplot(df, x=value_colname, y=facet_colname,
                   palette=mpl_palette_name, **kwargs)
    sns.despine(top=True, right=True, bottom=True, left=True)
    return autoviz.MplChart.from_current_mpl_state()

chart = violin_plot(_df_70, *['Age', 'Gender'], **{'inner': 'stick'})
chart

chart = violin_plot(_df_71, *['Age', 'Location'], **{'inner': 'stick'})
chart

chart = violin_plot(_df_72, *['Subscription_Length_Months', 'Gender'],
                    **{'inner': 'stick'})
chart

chart = violin_plot(_df_73, *['Subscription_Length_Months', 'Location'],
                    **{'inner': 'stick'})
chart

Check for null values

df.isnull().any()

Age False
Gender False
Location False
Subscription_Length_Months False
Monthly_Bill False
Total_Usage_GB False
Churn False
dtype: bool

Hence we see that there are no missing values

df.min()

Age 18
Gender Female
Location Chicago
Subscription_Length_Months 1
Monthly_Bill 30.0
Total_Usage_GB 50
Churn 0
dtype: object

df.max()
Age 70
Gender Male
Location New York
Subscription_Length_Months 24
Monthly_Bill 100.0
Total_Usage_GB 500
Churn 1
dtype: object

There are no zeros either, so we can say that there are no missing values, and the min-max ranges are also reasonable.
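As a quick sketch of the same check in code (assuming the df loaded above), we can count zeros and nulls explicitly:

# Sketch: count zeros in the numeric columns and nulls everywhere;
# all counts coming back as zero supports the statement above.
numeric_cols = ['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']
print((df[numeric_cols] == 0).sum())
print(df.isnull().sum())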

Let us look for correlation between the data

sns.heatmap(df.corr(),cmap='RdBu');

<ipython-input-28-b0c0b226e882>:1: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
sns.heatmap(df.corr(),cmap='RdBu');
From the heat map it is clear that the features are highly uncorrelated, so all the features are important and each can make a difference.
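The FutureWarning above can be silenced by restricting corr() to the numeric columns explicitly; a minimal sketch (assuming pandas 1.5 or later):

# Sketch: compute correlations over numeric columns only and annotate the cells.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, cmap='RdBu', annot=True, fmt='.2f');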

Now let us find out what really matters: what drives churn and what does not.

chu = df.loc[df.Churn == 1]
nchu = df.loc[df.Churn == 0]

chu.count()

Age 49779
Gender 49779
Location 49779
Subscription_Length_Months 49779
Monthly_Bill 49779
Total_Usage_GB 49779
Churn 49779
dtype: int64

nchu.count()

Age 50221
Gender 50221
Location 50221
Subscription_Length_Months 50221
Monthly_Bill 50221
Total_Usage_GB 50221
Churn 50221
dtype: int64

The data set is more or less balanced, so this is good data to work with.
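A small sketch to quantify the balance directly (using the Churn column as loaded above):

# Sketch: absolute counts and proportions of the two classes.
print(df['Churn'].value_counts())
print(df['Churn'].value_counts(normalize=True))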

Now let us compare the correlations within the churn and non-churn subsets.

sns.heatmap(chu.corr(),cmap='RdBu',);

<ipython-input-35-dedbc7672ed7>:1: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
sns.heatmap(chu.corr(),cmap='RdBu',);
sns.heatmap(nchu.corr(),cmap='RdBu');

<ipython-input-38-c0011f0edf45>:1: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
sns.heatmap(nchu.corr(),cmap='RdBu');
Once again there isn't much correlation between the features; this is partly because several variables are categorical. But even the non-categorical variables, like monthly bill and subscription length, are largely uncorrelated.

But what is it that really matters?

var = df.groupby('Gender').Churn.sum()
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
var.plot(kind='bar')
ax1.set_title("Total churn by gender")
ax1.set_xlabel('Gender')
ax1.set_ylabel('Number of churned customers');

In the above graph the x axis is gender and the y axis is the total churn count.

var = df.groupby('Location').Churn.sum()
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
var.plot(kind='bar')
ax1.set_title("Total churn by location")
ax1.set_xlabel('Location')
ax1.set_ylabel('Number of churned customers');

In the above graph the x axis is location and the y axis is the total churn count.

The data is very balanced; churn and non-churn appear to be almost random with respect to these features, which makes this a challenging problem.

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(chu['Age'],chu['Churn'],c='g')
ax.scatter(nchu['Age'],nchu['Churn'],c='r')
plt.show()
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(chu['Monthly_Bill'],chu['Churn'],c='g')
ax.scatter(nchu['Monthly_Bill'],nchu['Churn'],c='r')
plt.show()
We can confirm that the distribution of churn and non-churn across the different features is even, so a simple if-else rule cannot separate the classes based on any single feature. Let us confirm this further: there is no threshold on any feature that separates churn from non-churn cleanly.
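A minimal sketch of this check (assuming the unscaled df): comparing per-class means shows how close the two groups are on every feature.

# Sketch: near-identical per-class means suggest no single-feature threshold
# can separate churn from non-churn.
print(df.groupby('Churn')[['Age', 'Monthly_Bill', 'Total_Usage_GB',
                           'Subscription_Length_Months']].mean())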

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(chu['Total_Usage_GB'],chu['Churn'],c='g')
ax.scatter(nchu['Total_Usage_GB'],nchu['Churn'],c='r')
plt.show()
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(chu['Subscription_Length_Months'],chu['Churn'],c='g')
ax.scatter(nchu['Subscription_Length_Months'],nchu['Churn'],c='r')
plt.show()
Now let us look at outlier detection. But before that, let us encode the categorical variables as numbers and normalize the data.

df.describe()

                 Age  Subscription_Length_Months   Monthly_Bill  Total_Usage_GB          Churn
count  100000.000000               100000.000000  100000.000000   100000.000000  100000.000000
mean       44.027020                   12.490100      65.053197      274.393650       0.497790
std        15.280283                    6.926461      20.230696      130.463063       0.499998
min        18.000000                    1.000000      30.000000       50.000000       0.000000
25%        31.000000                    6.000000      47.540000      161.000000       0.000000
50%        44.000000                   12.000000      65.010000      274.000000       0.000000
75%        57.000000                   19.000000      82.640000      387.000000       1.000000
max        70.000000                   24.000000     100.000000      500.000000       1.000000
describe() gives the overall range of the data, including percentile information, which will help us find the upper and lower limits.

df.head()

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63    Male  Los Angeles                          17         73.36             236      0
1   62  Female     New York                           1         48.76             172      0
2   24  Female  Los Angeles                           5         85.47             460      0
3   36  Female        Miami                           3         97.94             297      1
4   46  Female        Miami                          19         58.14             266      0

df1=df.copy()

df['Location'].unique().tolist()

['Los Angeles', 'New York', 'Miami', 'Chicago', 'Houston']

df = df.replace(to_replace={'Male':1,'Female':2})
df.head()

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63       1  Los Angeles                          17         73.36             236      0
1   62       2     New York                           1         48.76             172      0
2   24       2  Los Angeles                           5         85.47             460      0
3   36       2        Miami                           3         97.94             297      1
4   46       2        Miami                          19         58.14             266      0

df = df.replace(to_replace={'Los Angeles': 1, 'New York': 2, 'Miami': 3,
                            'Chicago': 4, 'Houston': 5})
df.head()

   Age  Gender  Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63       1         1                          17         73.36             236      0
1   62       2         2                           1         48.76             172      0
2   24       2         1                           5         85.47             460      0
3   36       2         3                           3         97.94             297      1
4   46       2         3                          19         58.14             266      0
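As an alternative to mapping the categories to integers (which imposes an artificial ordering on Location), one-hot encoding could be used instead. A sketch on the unmodified copy df1, not used in the rest of this notebook:

# Sketch: one-hot encode Gender and Location on the copy kept earlier.
df_onehot = pd.get_dummies(df1, columns=['Gender', 'Location'], drop_first=True)
print(df_onehot.head())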

Min Max Normalization

x = df.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)

df.head()

0 1 2 3 4 5 6
0 0.865385 0.0 0.00 0.695652 0.619429 0.413333 0.0
1 0.846154 1.0 0.25 0.000000 0.268000 0.271111 0.0
2 0.115385 1.0 0.00 0.173913 0.792429 0.911111 0.0
3 0.346154 1.0 0.50 0.086957 0.970571 0.548889 1.0
4 0.538462 1.0 0.50 0.782609 0.402000 0.480000 0.0
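MinMaxScaler rescales each column as x_scaled = (x - x_min) / (x_max - x_min). A small sketch replicating this by hand for Age (column 0 above), using the unscaled copy df1:

# Sketch: manual min-max scaling of one column; it should match column 0
# of the scaled frame shown above.
age = df1['Age']
print(((age - age.min()) / (age.max() - age.min())).head())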

Outlier detection using Inter Quartile Range

def outlier(A):
    Q1 = np.percentile(A, 25, method='midpoint')
    Q3 = np.percentile(A, 75, method='midpoint')
    IQR = Q3 - Q1
    upper = Q3 + 1.5 * IQR
    lower = Q1 - 1.5 * IQR
    upper_array = np.where(A >= upper)[0]
    lower_array = np.where(A <= lower)[0]
    print(upper)
    print(lower)
    print(upper_array)
    print(lower_array)
    A = A.drop(index=upper_array)  # remove the upper outliers
    A = A.drop(index=lower_array)  # remove the lower outliers

    return A

print('subscription',outlier(df[3]))
print('Bill',outlier(df[4]))
print('Total Use GB',outlier(df[5]))

subscription (1.6304347826086958, -0.6304347826086958)
Bill (1.504142857142857, -0.5015714285714283)
Total Use GB (1.5022222222222226, -0.5066666666666668)

These are the upper and lower bounds; anything that lies beyond them is to be considered an outlier.
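A compact sketch of the same check on the normalized columns (3, 4, 5): counting rows outside the IQR fences should give zero everywhere.

# Sketch: count IQR outliers per normalized column.
for col in [3, 4, 5]:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    mask = (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
    print(col, int(mask.sum()))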

df_o=df.copy()
df_o.head()

0 1 2 3 4 5 6
0 0.865385 0.0 0.00 0.695652 0.619429 0.413333 0.0
1 0.846154 1.0 0.25 0.000000 0.268000 0.271111 0.0
2 0.115385 1.0 0.00 0.173913 0.792429 0.911111 0.0
3 0.346154 1.0 0.50 0.086957 0.970571 0.548889 1.0
4 0.538462 1.0 0.50 0.782609 0.402000 0.480000 0.0

df_o[5]=outlier(df_o[5])

1.5022222222222226
-0.5066666666666668
[]
[]

The IQR upper and lower limits are well beyond the observed maximum and minimum, so there are no outliers to remove; this confirms the structure seen in the scatter plots and other plots. In other words, the limits fall well outside the normalized range, which further confirms that there are no outliers in the data.

df_o[5].max()

1.0

df_o[5].min()

0.0
df_o[5]

0 0.413333
1 0.271111
2 0.911111
3 0.548889
4 0.480000
...
99995 0.391111
99996 0.668889
99997 0.446667
99998 0.853333
99999 0.273333
Name: 5, Length: 100000, dtype: float64

df_o[4]=outlier(df_o[4])
print(df_o[4].count())
print(df_o[4].max())
print(df_o[4].min())

1.504142857142857
-0.5015714285714283
[]
[]
100000
1.0
0.0

df_o[3]=outlier(df_o[3])
print(df_o[3].count())
print(df_o[3].max())
print(df_o[3].min())

1.6304347826086958
-0.6304347826086958
[]
[]
100000
1.0
0.0

Hence we confirm that there are no outliers in the data: the data naturally contains none, and min-max normalization has removed any need for outlier treatment.

df_o[3]

0 0.695652
1 0.000000
2 0.173913
3 0.086957
4 0.782609
...
99995 0.956522
99996 0.782609
99997 0.695652
99998 0.826087
99999 0.782609
Name: 3, Length: 100000, dtype: float64

Model creation and optimization

The normalized data looks like this

df.sample(5)

0 1 2 3 4 5 6
90208 0.923077 1.0 0.5 0.217391 0.421143 0.862222 1.0
65316 0.942308 1.0 0.0 0.739130 0.110857 0.600000 1.0
10386 0.576923 0.0 1.0 0.260870 0.116857 0.102222 0.0
73335 0.711538 0.0 1.0 0.043478 0.495714 0.640000 0.0
15963 0.019231 1.0 0.5 0.347826 0.819286 0.891111 0.0


import numpy as np
from google.colab import autoviz

def value_plot(df, y, figscale=1):
    from matplotlib import pyplot as plt
    df[y].plot(kind='line', figsize=(8 * figscale, 4 * figscale), title=y)
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = value_plot(_df_74, *[0], **{})
chart

chart = value_plot(_df_75, *[1], **{})
chart

chart = value_plot(_df_76, *[2], **{})
chart

chart = value_plot(_df_77, *[3], **{})
chart


import numpy as np
from google.colab import autoviz

def histogram(df, colname, num_bins=20, figscale=1):
    from matplotlib import pyplot as plt
    df[colname].plot(kind='hist', bins=num_bins, title=colname,
                     figsize=(8 * figscale, 4 * figscale))
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = histogram(_df_78, *[0], **{})
chart

chart = histogram(_df_79, *[1], **{})
chart

chart = histogram(_df_80, *[2], **{})
chart

chart = histogram(_df_81, *[3], **{})
chart


import numpy as np
from google.colab import autoviz

def scatter_plots(df, colname_pairs, figscale=1, alpha=.8):
    from matplotlib import pyplot as plt
    plt.figure(figsize=(len(colname_pairs) * 6 * figscale, 6 * figscale))
    for plot_i, (x_colname, y_colname) in enumerate(colname_pairs, start=1):
        ax = plt.subplot(1, len(colname_pairs), plot_i)
        df.plot(kind='scatter', x=x_colname, y=y_colname,
                s=(32 * figscale), alpha=alpha, ax=ax)
        ax.spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = scatter_plots(_df_82, *[[[0, 1], [1, 2], [2, 3], [3, 4]]], **{})
chart

Y = df[[6]]
X = df.drop([6],axis=1)

X.sample(3)

0 1 2 3 4 5
20012 0.846154 1.0 0.50 0.652174 0.916857 0.111111
6045 0.596154 1.0 0.00 0.608696 0.531429 0.695556
88476 0.961538 1.0 0.75 0.000000 0.434857 0.108889

df[5]

0 0.413333
1 0.271111
2 0.911111
3 0.548889
4 0.480000
...
99995 0.391111
99996 0.668889
99997 0.446667
99998 0.853333
99999 0.273333
Name: 5, Length: 100000, dtype: float64

df[4]

0 0.619429
1 0.268000
2 0.792429
3 0.970571
4 0.402000
...
99995 0.359000
99996 0.452143
99997 0.944429
99998 0.275000
99999 0.665286
Name: 4, Length: 100000, dtype: float64

df.count()
0 100000
1 100000
2 100000
3 100000
4 100000
5 100000
6 100000
dtype: int64

Now let us move on to the modelling.

Model optimization and hyperparameter tuning

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import average_precision_score

Train-test split

train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.3,
                                                     random_state=121)

train_X_, val_X, train_y_, val_y = train_test_split(X, Y, test_size=0.2,
                                                     random_state=121)
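Note that both calls above sample from the full data, so validation rows can also appear in the test set. A sketch of how a disjoint split could be made instead:

# Sketch: hold out the test set first, then split validation data off the
# remaining training rows so the three sets are disjoint.
train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.3,
                                                     random_state=121)
train_X_, val_X, train_y_, val_y = train_test_split(train_X, train_y,
                                                     test_size=0.2,
                                                     random_state=121)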

Finding out the best random forest hyperparameters for the model

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=200, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid; these are the hyperparameters
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# pprint(random_grid)

random_grid

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
'max_features': ['auto', 'sqrt'],
'max_depth': [10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'bootstrap': [True, False]}

# Use the random grid to search for the best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3-fold cross validation;
# search across n_iter combinations and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=10, cv=3, verbose=2, random_state=42,
                               n_jobs=-1)
# Fit the random search model
rf_random.fit(train_X, train_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/
_search.py:909: DataConversionWarning: A column-vector y was passed
when a 1d array was expected. Please change the shape of y to
(n_samples,), for example using ravel().
self.best_estimator_.fit(X, y, **fit_params)
/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:42
4: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and
will be removed in 1.3. To keep the past behaviour, explicitly set
`max_features='sqrt'` or remove this parameter as it is also the
default value for RandomForestClassifiers and ExtraTreesClassifiers.
warn(

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 14, 18, 22, 26, 30,
                                                      34, 38, 42, 46, 50,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 20, 30, 40, 50,
                                                         60, 70, 80, 90, 100]},
                   random_state=42, verbose=2)

These are the best parameters found with this limited number of iterations.

rf_random.best_params_

{'n_estimators': 60,
'min_samples_split': 2,
'min_samples_leaf': 4,
'max_features': 'auto',
'max_depth': 46,
'bootstrap': True}

print(rf_random.best_params_)
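If needed, a single forest can be refit with these settings; a sketch (RandomizedSearchCV with the default refit=True already exposes the same model as rf_random.best_estimator_):

# Sketch: refit a forest with the selected hyperparameters.
best_rf = RandomForestClassifier(**rf_random.best_params_, random_state=42)
best_rf.fit(train_X, train_y.values.ravel())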

Saving the model

import joblib

# save
joblib.dump(rf_random, "/content/drive/MyDrive/Sunbase/random.joblib")

# load

['/content/drive/MyDrive/Sunbase/random.joblib']

t_pred=rf_random.predict(test_X)

from sklearn.metrics import classification_report

t_pred

array([1., 1., 1., ..., 1., 0., 0.])

print(classification_report(t_pred,test_y))

              precision    recall  f1-score   support

         0.0       0.51      0.50      0.51     15272
         1.0       0.49      0.50      0.49     14728

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.50      0.50      0.50     30000

The performance so far is only about 50 percent; let us see if we can improve it further.
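Accuracy alone is uninformative near 0.5, so a threshold-independent metric is worth checking as well; a short sketch using ROC AUC on the test set:

# Sketch: ROC AUC close to 0.5 means the model finds essentially no signal.
from sklearn.metrics import roc_auc_score
proba = rf_random.predict_proba(test_X)[:, 1]
print('ROC AUC:', roc_auc_score(test_y.values.ravel(), proba))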

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42,
                               n_jobs=-1)
# Fit the random search model
rf_random.fit(train_X, train_y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/
_search.py:909: DataConversionWarning: A column-vector y was passed
when a 1d array was expected. Please change the shape of y to
(n_samples,), for example using ravel().
self.best_estimator_.fit(X, y, **fit_params)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 19, 28, 37, 46, 55,
                                                      64, 73, 82, 91, 100,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 31, 52, 73, 94,
                                                         115, 136, 157, 178,
                                                         200]},
                   random_state=42, verbose=2)

rf_random.best_params_

{'n_estimators': 200,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 10,
'bootstrap': False}
t_pred=rf_random.predict(test_X)
print(classification_report(t_pred,test_y))

              precision    recall  f1-score   support

         0.0       0.57      0.51      0.54     17074
         1.0       0.43      0.50      0.46     12926

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.51      0.50      0.51     30000

There has been a slight increase in the evaluation metrics for the non-churn class and a slight dip for the churn class, so it is a trade-off. It is also clear from the metrics on the validation data that there has been considerable overfitting.
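A quick sketch to quantify that overfitting by comparing train and test accuracy:

# Sketch: a large gap between train and test accuracy indicates the forest
# is memorizing the training set.
from sklearn.metrics import accuracy_score
print('train accuracy:', accuracy_score(train_y.values.ravel(),
                                        rf_random.predict(train_X)))
print('test accuracy :', accuracy_score(test_y.values.ravel(),
                                        rf_random.predict(test_X)))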

t_pred_=rf_random.predict(val_X)
print(classification_report(t_pred_,val_y))

              precision    recall  f1-score   support

         0.0       0.57      0.51      0.54     11405
         1.0       0.43      0.50      0.46      8595

    accuracy                           0.51     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.51      0.51      0.51     20000

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

This data looks very amenable to logistic regression, although similar data sets have fared better
with random forest classifiers.

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(train_X,train_y)

/usr/local/lib/python3.10/dist-packages/sklearn/utils/
validation.py:1143: DataConversionWarning: A column-vector y was
passed when a 1d array was expected. Please change the shape of y to
(n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)

LogisticRegression()

pred=lr.predict(test_X)
print(classification_report(test_y,pred))

              precision    recall  f1-score   support

         0.0       0.50      0.69      0.58     15108
         1.0       0.50      0.31      0.38     14892

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.48     30000
weighted avg       0.50      0.50      0.48     30000

The recall is good for the non-churn class and very poor for the churn class. Overall, every model so far predicts correctly for only about half the cases.
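The confusion matrix makes these class-wise errors explicit; a small sketch for the logistic regression predictions above:

# Sketch: rows are true classes, columns are predicted classes.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_y, pred))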

from keras import models, layers

model = models.Sequential()

model.add(layers.Dense(10, input_shape=(X.shape[1],),
activation='relu'))

model.add(layers.Dense(8, activation='relu'))

model.add(layers.Dense(6, activation='relu'))

model.add(layers.Dense(1, activation='sigmoid'))

from sklearn.metrics import classification_report

model.compile(optimizer='rmsprop', loss='binary_crossentropy')

For deep learning the improvement has been extremely small. We tried hyperparameter tuning such as adding dropout and changing the number of layers, epochs, and so on. We conclude that the data is insufficient for deep learning, and that generating more data would be appropriate before training neural networks.
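One of the variants mentioned above, sketched here with illustrative (untuned) settings: dropout for regularization, early stopping on validation loss, and fitting on the training split rather than the full data.

# Sketch: a regularized variant of the network above.
from keras import callbacks

model2 = models.Sequential([
    layers.Dense(16, input_shape=(X.shape[1],), activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model2.compile(optimizer='rmsprop', loss='binary_crossentropy',
               metrics=['accuracy'])
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                     restore_best_weights=True)
model2.fit(train_X, train_y, epochs=100, batch_size=64,
           validation_data=(val_X, val_y), callbacks=[early_stop], verbose=0)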

model.fit(X,Y,
epochs=100,
batch_size=64,
validation_data=(val_X, val_y))

Epoch 1/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6936 - val_loss: 0.6935
Epoch 2/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6933 - val_loss: 0.6931
Epoch 3/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 4/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6933
Epoch 5/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 6/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6932 - val_loss: 0.6932
Epoch 7/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 8/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6934
Epoch 9/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 10/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 11/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6932
Epoch 12/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 13/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 14/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 15/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 16/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 17/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 18/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 19/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 20/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 21/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 22/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 23/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6933
Epoch 24/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 25/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 26/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 27/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 28/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 29/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 30/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 31/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6932
Epoch 32/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 33/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 34/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 35/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 36/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 37/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6932
Epoch 38/100
1563/1563 [==============================] - 2s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 39/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6932
Epoch 40/100
1563/1563 [==============================] - 4s 3ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 41/100
1563/1563 [==============================] - 4s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 42/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 43/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 44/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 45/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 46/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6933
Epoch 47/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 48/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 49/100
1563/1563 [==============================] - 4s 3ms/step - loss:
0.6930 - val_loss: 0.6931
Epoch 50/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 51/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 52/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6930
Epoch 53/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6929
Epoch 54/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6929
Epoch 55/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6929
Epoch 56/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6932
Epoch 57/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6931
Epoch 58/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6929
Epoch 59/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6930
Epoch 60/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6929
Epoch 61/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6928
Epoch 62/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6928
Epoch 63/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 64/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6929
Epoch 65/100
1563/1563 [==============================] - 2s 2ms/step - loss:
0.6930 - val_loss: 0.6930
Epoch 66/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6931
Epoch 67/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 68/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6929
Epoch 69/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 70/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 71/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6928
Epoch 72/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6929
Epoch 73/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 74/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6929
Epoch 75/100
1563/1563 [==============================] - 4s 2ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 76/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 77/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 78/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6928 - val_loss: 0.6928
Epoch 79/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 80/100
1563/1563 [==============================] - 2s 2ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 81/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6932
Epoch 82/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 83/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 84/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 85/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 86/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 87/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 88/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 89/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6929
Epoch 90/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6930
Epoch 91/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6926
Epoch 92/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 93/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 94/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6928 - val_loss: 0.6926
Epoch 95/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6928 - val_loss: 0.6927
Epoch 96/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6928 - val_loss: 0.6926
Epoch 97/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6928 - val_loss: 0.6927
Epoch 98/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6928 - val_loss: 0.6926
Epoch 99/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6926
Epoch 100/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6925

<keras.src.callbacks.History at 0x7e0c41d35ff0>
val_predictions = model.predict(val_X)

625/625 [==============================] - 1s 1ms/step

preds = np.around(val_predictions)

print(classification_report(val_y, preds))

              precision    recall  f1-score   support

         0.0       0.51      0.68      0.59     10113
         1.0       0.51      0.33      0.40      9887

    accuracy                           0.51     20000
   macro avg       0.51      0.51      0.49     20000
weighted avg       0.51      0.51      0.50     20000

Conclusion: multiple machine learning algorithms have given similar results. More data, or some statistical adjustment during preprocessing, may be needed; the data may simply be hard to separate.
