
1. Cleanup: The data does not include any nulls or even zeros; the smallest value in any numeric column is 1.

2. Outliers: From graphical analysis of the scatter plots and interquartile-range outlier detection, we have shown that the data does not have any outliers.

3. Feature engineering: The feature engineering is minimal; we have removed unnecessary columns. We have checked for correlations between features using a heat map, but the features appear highly uncorrelated. The data set is balanced and not linearly separable.

4. We have tried a Random Forest classifier and optimized the hyperparameters using randomized grid search, trying to improve the evaluation metrics.

5. We have tried deep learning as well; however, the results are not good enough. The data is insufficient for deep learning, so generating more data may be appropriate (not all of those experiments are included here).

6. For normalization we have used a min-max scaler, and this has likely eliminated the need for outlier removal.

7. We have separated the data into train, test, and validation sets and report the results using the Random Forest classifier.

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing

Removing unnecessary columns (CustomerID and Name)

df = pd.read_excel('/content/drive/MyDrive/Sunbase/customer_churn_large_dataset(2).xlsx')

df = df.drop(['Name'], axis=1)
df.head()

   CustomerID  Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0           1   63    Male  Los Angeles                          17         73.36             236      0
1           2   62  Female     New York                           1         48.76             172      0
2           3   24  Female  Los Angeles                           5         85.47             460      0
3           4   36  Female        Miami                           3         97.94             297      1
4           5   46  Female        Miami                          19         58.14             266      0

df=df.drop(['CustomerID'],axis=1)

df.head()

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63    Male  Los Angeles                          17         73.36             236      0
1   62  Female     New York                           1         48.76             172      0
2   24  Female  Los Angeles                           5         85.47             460      0
3   36  Female        Miami                           3         97.94             297      1
4   46  Female        Miami                          19         58.14             266      0

len(df['Gender'])

100000

df.head()

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63    Male  Los Angeles                          17         73.36             236      0
1   62  Female     New York                           1         48.76             172      0
2   24  Female  Los Angeles                           5         85.47             460      0
3   36  Female        Miami                           3         97.94             297      1
4   46  Female        Miami                          19         58.14             266      0


import numpy as np
from google.colab import autoviz

def value_plot(df, y, figscale=1):
    from matplotlib import pyplot as plt
    df[y].plot(kind='line', figsize=(8 * figscale, 4 * figscale), title=y)
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = value_plot(_df_58, *['Age'], **{})
chart

chart = value_plot(_df_59, *['Subscription_Length_Months'], **{})
chart

chart = value_plot(_df_60, *['Monthly_Bill'], **{})
chart

chart = value_plot(_df_61, *['Total_Usage_GB'], **{})
chart


import numpy as np
from google.colab import autoviz

def histogram(df, colname, num_bins=20, figscale=1):
    from matplotlib import pyplot as plt
    df[colname].plot(kind='hist', bins=num_bins, title=colname,
                     figsize=(8 * figscale, 4 * figscale))
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = histogram(_df_62, *['Age'], **{})
chart

chart = histogram(_df_63, *['Subscription_Length_Months'], **{})
chart

chart = histogram(_df_64, *['Monthly_Bill'], **{})
chart

chart = histogram(_df_65, *['Total_Usage_GB'], **{})
chart


import numpy as np
from google.colab import autoviz

def categorical_histogram(df, colname, figscale=1, mpl_palette_name='Dark2'):
    from matplotlib import pyplot as plt
    import seaborn as sns
    df.groupby(colname).size().plot(
        kind='barh', color=sns.palettes.mpl_palette(mpl_palette_name),
        figsize=(8 * figscale, 4.8 * figscale))
    plt.gca().spines[['top', 'right']].set_visible(False)
    return autoviz.MplChart.from_current_mpl_state()

chart = categorical_histogram(_df_66, *['Gender'], **{})
chart

chart = categorical_histogram(_df_67, *['Location'], **{})
chart

import numpy as np
from google.colab import autoviz

def scatter_plots(df, colname_pairs, figscale=1, alpha=.8):
    from matplotlib import pyplot as plt
    plt.figure(figsize=(len(colname_pairs) * 6 * figscale, 6 * figscale))
    for plot_i, (x_colname, y_colname) in enumerate(colname_pairs, start=1):
        ax = plt.subplot(1, len(colname_pairs), plot_i)
        df.plot(kind='scatter', x=x_colname, y=y_colname,
                s=(32 * figscale), alpha=alpha, ax=ax)
        ax.spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = scatter_plots(_df_68, *[[['Age', 'Subscription_Length_Months'],
                                 ['Subscription_Length_Months', 'Monthly_Bill'],
                                 ['Monthly_Bill', 'Total_Usage_GB'],
                                 ['Total_Usage_GB', 'Churn']]], **{})
chart


import numpy as np
from google.colab import autoviz

def heatmap(df, x_colname, y_colname, figscale=1, mpl_palette_name='viridis'):
    from matplotlib import pyplot as plt
    import seaborn as sns
    import pandas as pd
    plt.subplots(figsize=(8 * figscale, 8 * figscale))
    df_2dhist = pd.DataFrame({
        x_label: grp[y_colname].value_counts()
        for x_label, grp in df.groupby(x_colname)
    })
    sns.heatmap(df_2dhist, cmap=mpl_palette_name)
    plt.xlabel(x_colname)
    plt.ylabel(y_colname)
    return autoviz.MplChart.from_current_mpl_state()

chart = heatmap(_df_69, *['Gender', 'Location'], **{})
chart


import numpy as np
from google.colab import autoviz

def violin_plot(df, value_colname, facet_colname, figscale=1,
                mpl_palette_name='Dark2', **kwargs):
    from matplotlib import pyplot as plt
    import seaborn as sns
    figsize = (12 * figscale,
               1.2 * figscale * len(df[facet_colname].unique()))
    plt.figure(figsize=figsize)
    sns.violinplot(df, x=value_colname, y=facet_colname,
                   palette=mpl_palette_name, **kwargs)
    sns.despine(top=True, right=True, bottom=True, left=True)
    return autoviz.MplChart.from_current_mpl_state()

chart = violin_plot(_df_70, *['Age', 'Gender'], **{'inner': 'stick'})
chart

chart = violin_plot(_df_71, *['Age', 'Location'], **{'inner': 'stick'})
chart

chart = violin_plot(_df_72, *['Subscription_Length_Months', 'Gender'],
                    **{'inner': 'stick'})
chart

chart = violin_plot(_df_73, *['Subscription_Length_Months', 'Location'],
                    **{'inner': 'stick'})
chart

Check for null values

df.isnull().any()

Age False
Gender False
Location False
Subscription_Length_Months False
Monthly_Bill False
Total_Usage_GB False
Churn False
dtype: bool

Hence we see that there are no missing values

df.min()

Age 18
Gender Female
Location Chicago
Subscription_Length_Months 1
Monthly_Bill 30.0
Total_Usage_GB 50
Churn 0
dtype: object

df.max()
Age 70
Gender Male
Location New York
Subscription_Length_Months 24
Monthly_Bill 100.0
Total_Usage_GB 500
Churn 1
dtype: object

There are no zeros either, so we can say that there are no missing values, and the min-max ranges are also reasonable.
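As a quick sketch of the same check in code (assuming the df loaded above), we can count zeros and nulls explicitly:

# Sketch: count zeros in the numeric columns and nulls everywhere;
# all counts coming back as zero supports the statement above.
numeric_cols = ['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']
print((df[numeric_cols] == 0).sum())
print(df.isnull().sum())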

Let us look for correlation between the data

sns.heatmap(df.corr(),cmap='RdBu');

<ipython-input-28-b0c0b226e882>:1: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
sns.heatmap(df.corr(),cmap='RdBu');
From the heat map it is clear that the features are highly uncorrelated, so all the features are important and each can make a difference.
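The FutureWarning above can be silenced by restricting corr() to the numeric columns explicitly; a minimal sketch (assuming pandas 1.5 or later):

# Sketch: compute correlations over numeric columns only and annotate the cells.
corr = df.corr(numeric_only=True)
sns.heatmap(corr, cmap='RdBu', annot=True, fmt='.2f');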

Now let us find out what really matters: what drives churn and what does not.

chu = df.loc[df.Churn == 1]
nchu = df.loc[df.Churn == 0]

chu.count()

Age 49779
Gender 49779
Location 49779
Subscription_Length_Months 49779
Monthly_Bill 49779
Total_Usage_GB 49779
Churn 49779
dtype: int64

nchu.count()

Age 50221
Gender 50221
Location 50221
Subscription_Length_Months 50221
Monthly_Bill 50221
Total_Usage_GB 50221
Churn 50221
dtype: int64

The data set is more or less balanced, so this is good data to work with.
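A small sketch to quantify the balance directly (using the Churn column as loaded above):

# Sketch: absolute counts and proportions of the two classes.
print(df['Churn'].value_counts())
print(df['Churn'].value_counts(normalize=True))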

Now let us compare the correlations within the churn and non-churn subsets.

sns.heatmap(chu.corr(),cmap='RdBu',);

<ipython-input-35-dedbc7672ed7>:1: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
sns.heatmap(chu.corr(),cmap='RdBu',);
sns.heatmap(nchu.corr(),cmap='RdBu');

<ipython-input-38-c0011f0edf45>:1: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
sns.heatmap(nchu.corr(),cmap='RdBu');
Once again there isn't much correlation between the features; this is partly because several variables are categorical. But even the non-categorical variables, like monthly bill and subscription length, are largely uncorrelated.

But what is it that really matters?

var = df.groupby('Gender').Churn.sum()
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
var.plot(kind='bar')
ax1.set_title("Total churn by gender")
ax1.set_xlabel('Gender')
ax1.set_ylabel('Number of churned customers');

In the above graph the x axis is gender and the y axis is the total churn count.

var = df.groupby('Location').Churn.sum()
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
var.plot(kind='bar')
ax1.set_title("Total churn by location")
ax1.set_xlabel('Location')
ax1.set_ylabel('Number of churned customers');

In the above graph the x axis is location and the y axis is the total churn count.

The data is very balanced; churn and non-churn appear to be almost random with respect to these features, which makes this a challenging problem.

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(chu['Age'],chu['Churn'],c='g')
ax.scatter(nchu['Age'],nchu['Churn'],c='r')
plt.show()
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(chu['Monthly_Bill'],chu['Churn'],c='g')
ax.scatter(nchu['Monthly_Bill'],nchu['Churn'],c='r')
plt.show()
We can confirm that the distribution of churn and non-churn across the different features is even, so a simple if-else rule cannot separate the classes based on any single feature. Let us confirm this further: there is no threshold on any feature that separates churn from non-churn cleanly.
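A minimal sketch of this check (assuming the unscaled df): comparing per-class means shows how close the two groups are on every feature.

# Sketch: near-identical per-class means suggest no single-feature threshold
# can separate churn from non-churn.
print(df.groupby('Churn')[['Age', 'Monthly_Bill', 'Total_Usage_GB',
                           'Subscription_Length_Months']].mean())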

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(chu['Total_Usage_GB'],chu['Churn'],c='g')
ax.scatter(nchu['Total_Usage_GB'],nchu['Churn'],c='r')
plt.show()
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(chu['Subscription_Length_Months'],chu['Churn'],c='g')
ax.scatter(nchu['Subscription_Length_Months'],nchu['Churn'],c='r')
plt.show()
Now let us look at outlier detection. But before that, let us encode the categorical variables as numbers and normalize the data.

df.describe()

                 Age  Subscription_Length_Months   Monthly_Bill  Total_Usage_GB          Churn
count  100000.000000               100000.000000  100000.000000   100000.000000  100000.000000
mean       44.027020                   12.490100      65.053197      274.393650       0.497790
std        15.280283                    6.926461      20.230696      130.463063       0.499998
min        18.000000                    1.000000      30.000000       50.000000       0.000000
25%        31.000000                    6.000000      47.540000      161.000000       0.000000
50%        44.000000                   12.000000      65.010000      274.000000       0.000000
75%        57.000000                   19.000000      82.640000      387.000000       1.000000
max        70.000000                   24.000000     100.000000      500.000000       1.000000
describe() gives the overall range of the data, including percentile information, which will help us find the upper and lower limits.

df.head()

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63    Male  Los Angeles                          17         73.36             236      0
1   62  Female     New York                           1         48.76             172      0
2   24  Female  Los Angeles                           5         85.47             460      0
3   36  Female        Miami                           3         97.94             297      1
4   46  Female        Miami                          19         58.14             266      0

df1=df.copy()

df['Location'].unique().tolist()

['Los Angeles', 'New York', 'Miami', 'Chicago', 'Houston']

df = df.replace(to_replace={'Male':1,'Female':2})
df.head()

   Age  Gender     Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63       1  Los Angeles                          17         73.36             236      0
1   62       2     New York                           1         48.76             172      0
2   24       2  Los Angeles                           5         85.47             460      0
3   36       2        Miami                           3         97.94             297      1
4   46       2        Miami                          19         58.14             266      0

df = df.replace(to_replace={'Los Angeles': 1, 'New York': 2, 'Miami': 3,
                            'Chicago': 4, 'Houston': 5})
df.head()

   Age  Gender  Location  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn
0   63       1         1                          17         73.36             236      0
1   62       2         2                           1         48.76             172      0
2   24       2         1                           5         85.47             460      0
3   36       2         3                           3         97.94             297      1
4   46       2         3                          19         58.14             266      0
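As an alternative to mapping the categories to integers (which imposes an artificial ordering on Location), one-hot encoding could be used instead. A sketch on the unmodified copy df1, not used in the rest of this notebook:

# Sketch: one-hot encode Gender and Location on the copy kept earlier.
df_onehot = pd.get_dummies(df1, columns=['Gender', 'Location'], drop_first=True)
print(df_onehot.head())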

Min Max Normalization

x = df.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)

df.head()

0 1 2 3 4 5 6
0 0.865385 0.0 0.00 0.695652 0.619429 0.413333 0.0
1 0.846154 1.0 0.25 0.000000 0.268000 0.271111 0.0
2 0.115385 1.0 0.00 0.173913 0.792429 0.911111 0.0
3 0.346154 1.0 0.50 0.086957 0.970571 0.548889 1.0
4 0.538462 1.0 0.50 0.782609 0.402000 0.480000 0.0
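MinMaxScaler rescales each column as x_scaled = (x - x_min) / (x_max - x_min). A small sketch replicating this by hand for Age (column 0 above), using the unscaled copy df1:

# Sketch: manual min-max scaling of one column; it should match column 0
# of the scaled frame shown above.
age = df1['Age']
print(((age - age.min()) / (age.max() - age.min())).head())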

Outlier detection using Inter Quartile Range

def outlier(A):
    Q1 = np.percentile(A, 25, method='midpoint')
    Q3 = np.percentile(A, 75, method='midpoint')
    IQR = Q3 - Q1
    upper = Q3 + 1.5 * IQR
    lower = Q1 - 1.5 * IQR
    upper_array = np.where(A >= upper)[0]
    lower_array = np.where(A <= lower)[0]
    print(upper)
    print(lower)
    print(upper_array)
    print(lower_array)
    A = A.drop(index=upper_array)  # remove the upper outliers
    A = A.drop(index=lower_array)  # remove the lower outliers

    return A

print('subscription',outlier(df[3]))
print('Bill',outlier(df[4]))
print('Total Use GB',outlier(df[5]))

subscription (1.6304347826086958, -0.6304347826086958)
Bill (1.504142857142857, -0.5015714285714283)
Total Use GB (1.5022222222222226, -0.5066666666666668)

These are the upper and lower bounds; anything that lies beyond them is to be considered an outlier.
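A compact sketch of the same check on the normalized columns (3, 4, 5): counting rows outside the IQR fences should give zero everywhere.

# Sketch: count IQR outliers per normalized column.
for col in [3, 4, 5]:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    mask = (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
    print(col, int(mask.sum()))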

df_o=df.copy()
df_o.head()

0 1 2 3 4 5 6
0 0.865385 0.0 0.00 0.695652 0.619429 0.413333 0.0
1 0.846154 1.0 0.25 0.000000 0.268000 0.271111 0.0
2 0.115385 1.0 0.00 0.173913 0.792429 0.911111 0.0
3 0.346154 1.0 0.50 0.086957 0.970571 0.548889 1.0
4 0.538462 1.0 0.50 0.782609 0.402000 0.480000 0.0

df_o[5]=outlier(df_o[5])

1.5022222222222226
-0.5066666666666668
[]
[]

The IQR upper and lower limits are well beyond the observed maximum and minimum, so there are no outliers to remove; this confirms the structure seen in the scatter plots and other plots. In other words, the limits fall well outside the normalized range, which further confirms that there are no outliers in the data.

df_o[5].max()

1.0

df_o[5].min()

0.0
df_o[5]

0 0.413333
1 0.271111
2 0.911111
3 0.548889
4 0.480000
...
99995 0.391111
99996 0.668889
99997 0.446667
99998 0.853333
99999 0.273333
Name: 5, Length: 100000, dtype: float64

df_o[4]=outlier(df_o[4])
print(df_o[4].count())
print(df_o[4].max())
print(df_o[4].min())

1.504142857142857
-0.5015714285714283
[]
[]
100000
1.0
0.0

df_o[3]=outlier(df_o[3])
print(df_o[3].count())
print(df_o[3].max())
print(df_o[3].min())

1.6304347826086958
-0.6304347826086958
[]
[]
100000
1.0
0.0

Hence we confirm that there are no outliers in the data: the data naturally contains none, and min-max normalization has removed any need for outlier treatment.

df_o[3]

0 0.695652
1 0.000000
2 0.173913
3 0.086957
4 0.782609
...
99995 0.956522
99996 0.782609
99997 0.695652
99998 0.826087
99999 0.782609
Name: 3, Length: 100000, dtype: float64

Model creation and optimization

The normalized data looks like this

df.sample(5)

0 1 2 3 4 5 6
90208 0.923077 1.0 0.5 0.217391 0.421143 0.862222 1.0
65316 0.942308 1.0 0.0 0.739130 0.110857 0.600000 1.0
10386 0.576923 0.0 1.0 0.260870 0.116857 0.102222 0.0
73335 0.711538 0.0 1.0 0.043478 0.495714 0.640000 0.0
15963 0.019231 1.0 0.5 0.347826 0.819286 0.891111 0.0


import numpy as np
from google.colab import autoviz

def value_plot(df, y, figscale=1):
    from matplotlib import pyplot as plt
    df[y].plot(kind='line', figsize=(8 * figscale, 4 * figscale), title=y)
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = value_plot(_df_74, *[0], **{})
chart

chart = value_plot(_df_75, *[1], **{})
chart

chart = value_plot(_df_76, *[2], **{})
chart

chart = value_plot(_df_77, *[3], **{})
chart


import numpy as np
from google.colab import autoviz

def histogram(df, colname, num_bins=20, figscale=1):
    from matplotlib import pyplot as plt
    df[colname].plot(kind='hist', bins=num_bins, title=colname,
                     figsize=(8 * figscale, 4 * figscale))
    plt.gca().spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = histogram(_df_78, *[0], **{})
chart

chart = histogram(_df_79, *[1], **{})
chart

chart = histogram(_df_80, *[2], **{})
chart

chart = histogram(_df_81, *[3], **{})
chart


import numpy as np
from google.colab import autoviz

def scatter_plots(df, colname_pairs, figscale=1, alpha=.8):
    from matplotlib import pyplot as plt
    plt.figure(figsize=(len(colname_pairs) * 6 * figscale, 6 * figscale))
    for plot_i, (x_colname, y_colname) in enumerate(colname_pairs, start=1):
        ax = plt.subplot(1, len(colname_pairs), plot_i)
        df.plot(kind='scatter', x=x_colname, y=y_colname,
                s=(32 * figscale), alpha=alpha, ax=ax)
        ax.spines[['top', 'right']].set_visible(False)
    plt.tight_layout()
    return autoviz.MplChart.from_current_mpl_state()

chart = scatter_plots(_df_82, *[[[0, 1], [1, 2], [2, 3], [3, 4]]], **{})
chart

Y = df[[6]]
X = df.drop([6],axis=1)

X.sample(3)

0 1 2 3 4 5
20012 0.846154 1.0 0.50 0.652174 0.916857 0.111111
6045 0.596154 1.0 0.00 0.608696 0.531429 0.695556
88476 0.961538 1.0 0.75 0.000000 0.434857 0.108889

df[5]

0 0.413333
1 0.271111
2 0.911111
3 0.548889
4 0.480000
...
99995 0.391111
99996 0.668889
99997 0.446667
99998 0.853333
99999 0.273333
Name: 5, Length: 100000, dtype: float64

df[4]

0 0.619429
1 0.268000
2 0.792429
3 0.970571
4 0.402000
...
99995 0.359000
99996 0.452143
99997 0.944429
99998 0.275000
99999 0.665286
Name: 4, Length: 100000, dtype: float64

df.count()
0 100000
1 100000
2 100000
3 100000
4 100000
5 100000
6 100000
dtype: int64

Now let us move on to the modelling.

Model optimization and hyperparameter tuning

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import average_precision_score

Train-test split

train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.3,
                                                     random_state=121)

train_X_, val_X, train_y_, val_y = train_test_split(X, Y, test_size=0.2,
                                                     random_state=121)
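Note that both calls above sample from the full data, so validation rows can also appear in the test set. A sketch of how a disjoint split could be made instead:

# Sketch: hold out the test set first, then split validation data off the
# remaining training rows so the three sets are disjoint.
train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.3,
                                                     random_state=121)
train_X_, val_X, train_y_, val_y = train_test_split(train_X, train_y,
                                                     test_size=0.2,
                                                     random_state=121)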

Finding out the best random forest hyperparameters for the model

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=200, num=10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid; these are the hyperparameters
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# pprint(random_grid)

random_grid

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
'max_features': ['auto', 'sqrt'],
'max_depth': [10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'bootstrap': [True, False]}

# Use the random grid to search for the best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3-fold cross validation;
# search across n_iter combinations and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=10, cv=3, verbose=2, random_state=42,
                               n_jobs=-1)
# Fit the random search model
rf_random.fit(train_X, train_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/
_search.py:909: DataConversionWarning: A column-vector y was passed
when a 1d array was expected. Please change the shape of y to
(n_samples,), for example using ravel().
self.best_estimator_.fit(X, y, **fit_params)
/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py:42
4: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and
will be removed in 1.3. To keep the past behaviour, explicitly set
`max_features='sqrt'` or remove this parameter as it is also the
default value for RandomForestClassifiers and ExtraTreesClassifiers.
warn(

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 14, 18, 22, 26, 30,
                                                      34, 38, 42, 46, 50,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 20, 30, 40, 50,
                                                         60, 70, 80, 90, 100]},
                   random_state=42, verbose=2)

These are the best parameters found with this limited number of iterations.

rf_random.best_params_

{'n_estimators': 60,
'min_samples_split': 2,
'min_samples_leaf': 4,
'max_features': 'auto',
'max_depth': 46,
'bootstrap': True}

print(rf_random.best_params_)
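If needed, a single forest can be refit with these settings; a sketch (RandomizedSearchCV with the default refit=True already exposes the same model as rf_random.best_estimator_):

# Sketch: refit a forest with the selected hyperparameters.
best_rf = RandomForestClassifier(**rf_random.best_params_, random_state=42)
best_rf.fit(train_X, train_y.values.ravel())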

Saving the model

import joblib

# save
joblib.dump(rf_random, "/content/drive/MyDrive/Sunbase/random.joblib")

# load

['/content/drive/MyDrive/Sunbase/random.joblib']

t_pred=rf_random.predict(test_X)

from sklearn.metrics import classification_report

t_pred

array([1., 1., 1., ..., 1., 0., 0.])

print(classification_report(t_pred,test_y))

              precision    recall  f1-score   support

         0.0       0.51      0.50      0.51     15272
         1.0       0.49      0.50      0.49     14728

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.50      0.50      0.50     30000

The performance so far is only about 50 percent; let us see if we can improve it further.
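Accuracy alone is uninformative near 0.5, so a threshold-independent metric is worth checking as well; a short sketch using ROC AUC on the test set:

# Sketch: ROC AUC close to 0.5 means the model finds essentially no signal.
from sklearn.metrics import roc_auc_score
proba = rf_random.predict_proba(test_X)[:, 1]
print('ROC AUC:', roc_auc_score(test_y.values.ravel(), proba))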

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42,
                               n_jobs=-1)
# Fit the random search model
rf_random.fit(train_X, train_y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits

/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/
_search.py:909: DataConversionWarning: A column-vector y was passed
when a 1d array was expected. Please change the shape of y to
(n_samples,), for example using ravel().
self.best_estimator_.fit(X, y, **fit_params)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 19, 28, 37, 46, 55,
                                                      64, 73, 82, 91, 100,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [10, 31, 52, 73, 94,
                                                         115, 136, 157, 178,
                                                         200]},
                   random_state=42, verbose=2)

rf_random.best_params_

{'n_estimators': 200,
'min_samples_split': 10,
'min_samples_leaf': 1,
'max_features': 'sqrt',
'max_depth': 10,
'bootstrap': False}
t_pred=rf_random.predict(test_X)
print(classification_report(t_pred,test_y))

              precision    recall  f1-score   support

         0.0       0.57      0.51      0.54     17074
         1.0       0.43      0.50      0.46     12926

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.51      0.50      0.51     30000

There has been a slight increase in the evaluation metrics for the non-churn class and a slight dip for the churn class, so it is a trade-off. It is also clear from the metrics on the validation data that there has been considerable overfitting.
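A quick sketch to quantify that overfitting by comparing train and test accuracy:

# Sketch: a large gap between train and test accuracy indicates the forest
# is memorizing the training set.
from sklearn.metrics import accuracy_score
print('train accuracy:', accuracy_score(train_y.values.ravel(),
                                        rf_random.predict(train_X)))
print('test accuracy :', accuracy_score(test_y.values.ravel(),
                                        rf_random.predict(test_X)))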

t_pred_=rf_random.predict(val_X)
print(classification_report(t_pred_,val_y))

              precision    recall  f1-score   support

         0.0       0.57      0.51      0.54     11405
         1.0       0.43      0.50      0.46      8595

    accuracy                           0.51     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.51      0.51      0.51     20000

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

This data looks very amenable to logistic regression, although similar data sets have fared better
with random forest classifiers.

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(train_X,train_y)

/usr/local/lib/python3.10/dist-packages/sklearn/utils/
validation.py:1143: DataConversionWarning: A column-vector y was
passed when a 1d array was expected. Please change the shape of y to
(n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)

LogisticRegression()

pred=lr.predict(test_X)
print(classification_report(test_y,pred))

              precision    recall  f1-score   support

         0.0       0.50      0.69      0.58     15108
         1.0       0.50      0.31      0.38     14892

    accuracy                           0.50     30000
   macro avg       0.50      0.50      0.48     30000
weighted avg       0.50      0.50      0.48     30000

The recall is good for the non-churn class and very poor for the churn class. Overall, every model so far predicts correctly for only about half the cases.
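The confusion matrix makes these class-wise errors explicit; a small sketch for the logistic regression predictions above:

# Sketch: rows are true classes, columns are predicted classes.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_y, pred))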

from keras import models, layers

model = models.Sequential()

model.add(layers.Dense(10, input_shape=(X.shape[1],),
activation='relu'))

model.add(layers.Dense(8, activation='relu'))

model.add(layers.Dense(6, activation='relu'))

model.add(layers.Dense(1, activation='sigmoid'))

from sklearn.metrics import classification_report

model.compile(optimizer='rmsprop', loss='binary_crossentropy')

For deep learning the improvement has been extremely small. We tried hyperparameter tuning such as adding dropout and changing the number of layers, epochs, and so on. We conclude that the data is insufficient for deep learning, and that generating more data would be appropriate before training neural networks.
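One of the variants mentioned above, sketched here with illustrative (untuned) settings: dropout for regularization, early stopping on validation loss, and fitting on the training split rather than the full data.

# Sketch: a regularized variant of the network above.
from keras import callbacks

model2 = models.Sequential([
    layers.Dense(16, input_shape=(X.shape[1],), activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model2.compile(optimizer='rmsprop', loss='binary_crossentropy',
               metrics=['accuracy'])
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                     restore_best_weights=True)
model2.fit(train_X, train_y, epochs=100, batch_size=64,
           validation_data=(val_X, val_y), callbacks=[early_stop], verbose=0)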

model.fit(X,Y,
epochs=100,
batch_size=64,
validation_data=(val_X, val_y))

Epoch 1/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6936 - val_loss: 0.6935
Epoch 2/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6933 - val_loss: 0.6931
Epoch 3/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 4/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6933
Epoch 5/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 6/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6932 - val_loss: 0.6932
Epoch 7/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 8/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6934
Epoch 9/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 10/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 11/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6932
Epoch 12/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 13/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 14/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 15/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 16/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 17/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 18/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 19/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 20/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6932 - val_loss: 0.6931
Epoch 21/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 22/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 23/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6933
Epoch 24/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 25/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 26/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 27/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 28/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 29/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 30/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 31/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6932
Epoch 32/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 33/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 34/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 35/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 36/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 37/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6932
Epoch 38/100
1563/1563 [==============================] - 2s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 39/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6932
Epoch 40/100
1563/1563 [==============================] - 4s 3ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 41/100
1563/1563 [==============================] - 4s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 42/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 43/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 44/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6931
Epoch 45/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 46/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6933
Epoch 47/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 48/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 49/100
1563/1563 [==============================] - 4s 3ms/step - loss:
0.6930 - val_loss: 0.6931
Epoch 50/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 51/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6930
Epoch 52/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6930
Epoch 53/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6929
Epoch 54/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6929
Epoch 55/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6931 - val_loss: 0.6929
Epoch 56/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6932
Epoch 57/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6931
Epoch 58/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6929
Epoch 59/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6930
Epoch 60/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6929
Epoch 61/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6928
Epoch 62/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6928
Epoch 63/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 64/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6930 - val_loss: 0.6929
Epoch 65/100
1563/1563 [==============================] - 2s 2ms/step - loss:
0.6930 - val_loss: 0.6930
Epoch 66/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6931
Epoch 67/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 68/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6929
Epoch 69/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 70/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 71/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6930 - val_loss: 0.6928
Epoch 72/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6929
Epoch 73/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 74/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6929
Epoch 75/100
1563/1563 [==============================] - 4s 2ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 76/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 77/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 78/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6928 - val_loss: 0.6928
Epoch 79/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 80/100
1563/1563 [==============================] - 2s 2ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 81/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6932
Epoch 82/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 83/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 84/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 85/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 86/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6928
Epoch 87/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 88/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 89/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6929
Epoch 90/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6929 - val_loss: 0.6930
Epoch 91/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6926
Epoch 92/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 93/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6927
Epoch 94/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6928 - val_loss: 0.6926
Epoch 95/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6928 - val_loss: 0.6927
Epoch 96/100
1563/1563 [==============================] - 3s 2ms/step - loss:
0.6928 - val_loss: 0.6926
Epoch 97/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6928 - val_loss: 0.6927
Epoch 98/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6928 - val_loss: 0.6926
Epoch 99/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6926
Epoch 100/100
1563/1563 [==============================] - 2s 1ms/step - loss:
0.6929 - val_loss: 0.6925

<keras.src.callbacks.History at 0x7e0c41d35ff0>
val_predictions = model.predict(val_X)

625/625 [==============================] - 1s 1ms/step

preds = np.around(val_predictions)

print(classification_report(val_y, preds))

              precision    recall  f1-score   support

         0.0       0.51      0.68      0.59     10113
         1.0       0.51      0.33      0.40      9887

    accuracy                           0.51     20000
   macro avg       0.51      0.51      0.49     20000
weighted avg       0.51      0.51      0.50     20000

Conclusion: multiple machine learning algorithms have given similar results. More data, or some statistical adjustment during preprocessing, may be needed; the data may simply be hard to separate.
