LDA Code

You might also like

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 19

import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
In [145]:
# Loading the data set
cmd= pd.read_excel('Contraceptive_method_dataset.xlsx')
In [146]:
cmd.head().T
Out[146]:
0 1 2 3 4

Wife_age 24.0 45.0 43.0 42.0 36.0

Wife_ education Primary Uneducated Primary Secondary Secondary

Husband_education Secondary Secondary Secondary Primary Secondary

No_of_children_born 3.0 10.0 7.0 9.0 8.0

Scientolog
Wife_religion Scientology Scientology Scientology Scientology
y

Wife_Working No No No No No

Husband_Occupation 2 3 3 3 3

Standard_of_living_index High Very High Very High High Low

Media_exposure Exposed Exposed Exposed Exposed Exposed

Contraceptive_method_use
No No No No No
d

In [147]:
cmd.shape
Out[147]:
(1473, 10)
In [148]:
cmd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Wife_age 1402 non-null float64
1 Wife_ education 1473 non-null object
2 Husband_education 1473 non-null object
3 No_of_children_born 1452 non-null float64
4 Wife_religion 1473 non-null object
5 Wife_Working 1473 non-null object
6 Husband_Occupation 1473 non-null int64
7 Standard_of_living_index 1473 non-null object
8 Media_exposure 1473 non-null object
9 Contraceptive_method_used 1473 non-null object
dtypes: float64(2), int64(1), object(7)
memory usage: 115.2+ KB
In [149]:
cmd.describe().transpose()
Out[149]:
count mean std min 25% 50% 75% max

16.
Wife_age 1402.0 32.606277 8.274927 26.0 32.0 39.0 49.0
0

No_of_children_born 1452.0 3.254132 2.365212 0.0 1.0 3.0 4.0 16.0

Husband_Occupation 1473.0 2.137814 0.864857 1.0 1.0 2.0 3.0 4.0

In [150]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[150]:
Wife_age 71
Wife_ education 0
Husband_education 0
No_of_children_born 21
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64
In [151]:
dups = cmd.duplicated()
print("There are ",cmd.duplicated().sum()," duplicates")
cmd[dups]
There are 80 duplicates
Out[151]:
Wif
Wif e_ Husban No_of_ch Wife_ Wife_ Husband Standard_ Media Contracepti
e_a edu d_educa ildren_b religi Worki _Occupat of_living_i _expos ve_method_
ge cati tion orn on ng ion ndex ure used
on

7 Tert Scient Expose


38.0 Tertiary 1.0 Yes 1 Very High No
9 iary ology d

1
Tert Scient Expose
6 26.0 Tertiary 1.0 No 1 Very High No
iary ology d
7

2
Tert Scient Expose
2 47.0 Tertiary 4.0 No 1 Very High No
iary ology d
4

2
Tert Scient Expose
7 30.0 Tertiary 2.0 No 1 Very High No
iary ology d
0

2
Tert Scient Expose
9 26.0 Tertiary 1.0 No 1 Very High No
iary ology d
9

... ... ... ... ... ... ... ... ... ... ...

1
3 Tert Scient Expose
44.0 Tertiary 5.0 Yes 1 Very High Yes
6 iary ology d
7

1
Sec
3 Na Scient Expose
ond Tertiary 2.0 Yes 2 Very High Yes
8 N ology d
ary
7

1
Non-
4 Na Tert Expose
Tertiary 2.0 Scient No 1 Very High Yes
2 N iary d
ology
3

1
Non-
4 Na Tert Expose
Tertiary 1.0 Scient Yes 2 Very High Yes
4 N iary d
ology
0
Wif
Wif e_ Husban No_of_ch Wife_ Wife_ Husband Standard_ Media Contracepti
e_a edu d_educa ildren_b religi Worki _Occupat of_living_i _expos ve_method_
ge cati tion orn on ng ion ndex ure used
on

1
Non-
4 Na Tert Expose
Tertiary 2.0 Scient Yes 2 Very High Yes
4 N iary d
ology
7

80 rows × 10 columns

In [152]:
# Removing the duplicate value
cmd.drop_duplicates(inplace=True)
In [153]:
dups = cmd.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
cmd[dups]
Number of duplicate rows = 0
Out[153]:
Wif
Wif e_ Husband No_of_ch Wife_ Wife_ Husband Standard_o Media_ Contracepti
e_a edu _educati ildren_bo religi Worki _Occupat f_living_in exposu ve_method_
ge cati on rn on ng ion dex re used
on

In [154]:
# To print number of rows and columns present in the dataset after removing
the
print("no.of rows: ",cmd.shape[0], "no.of columns:",cmd.shape[1])
no.of rows: 1393 no.of columns: 10
In [155]:
# To describe the dataset
cmd.describe().T
Out[155]:
count mean std min 25% 50% 75% max

16.
Wife_age 1326.0 32.557315 8.289259 26.0 32.0 39.0 49.0
0

No_of_children_born 1372.0 3.290816 2.399697 0.0 1.0 3.0 5.0 16.0

Husband_Occupation 1393.0 2.174444 0.854590 1.0 1.0 2.0 3.0 4.0

In [156]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[156]:
Wife_age 67
Wife_ education 0
Husband_education 0
No_of_children_born 21
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64
In [157]:
# Replace the null value with Median
cmd[['Wife_age', 'No_of_children_born']] = cmd[['Wife_age',
'No_of_children_born']].fillna(cmd[['Wife_age',
'No_of_children_born']].median())
In [158]:
# To Check for null values in the dataset
cmd.isnull().sum()
Out[158]:
Wife_age 0
Wife_ education 0
Husband_education 0
No_of_children_born 0
Wife_religion 0
Wife_Working 0
Husband_Occupation 0
Standard_of_living_index 0
Media_exposure 0
Contraceptive_method_used 0
dtype: int64

Outlier Treatment
In [85]:
cmd_plot = cmd.select_dtypes(include = ['float64', 'int64'])
In [162]:
#Check for presence of outliers in each feature
plt.figure(figsize = (10,8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
plt.subplot(4, 3, i + 1)
sns.boxplot(y = cmd_plot[feature_list[i]], data = cmd_plot)
plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()
In [163]:
# Define a function which results the upper and lower limit to detect
outliers
def remove_outlier(col):
Q1,Q3=col.quantile([0.25,0.75])
IQR=Q3-Q1
lower_range= Q1-(1.5 * IQR)
upper_range= Q3+(1.5 * IQR)
return lower_range, upper_range
In [164]:
# Cap & floor the values beyond the outlier boundaries
for i in feature_list:
LL,UL = remove_outlier(cmd_plot[i])
cmd_plot[i] = np.where (cmd_plot[i] > UL,UL, cmd_plot[i])
cmd_plot[i] = np.where (cmd_plot[i] < LL,LL, cmd_plot[i])
In [166]:
#Check for presence of outliers in each feature
plt.figure(figsize = (10,8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
plt.subplot(4, 3, i + 1)
sns.boxplot(y = cmd_plot[feature_list[i]], data = cmd_plot)
plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()
In [93]:
# Univariate Analysis
plt.figure(figsize=(12,15))
feature_list = cmd.columns
for i in range (len(feature_list)):
plt.subplot(4, 3, i+1)
sns.histplot(y=cmd[feature_list[i]],data=cmd)
plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()
In [94]:
# Bivariate Analysis
plt.figure(figsize=(12,8))
sns.pairplot(cmd, diag_kind="kde")
plt.show()
<Figure size 1200x800 with 0 Axes>
In [95]:
cmd.corr()
Out[95]:
Wife_age No_of_children_born Husband_Occupation

Wife_age 1.000000 0.528428 -0.187070

No_of_children_born 0.528428 1.000000 -0.024213

-
Husband_Occupation -0.024213 1.000000
0.187070

In [96]:
# Multivariate Analysis
plt.figure(figsize=(12,8))
sns.heatmap(cmd.iloc[:, 0:10].corr(),annot=True)
plt.show()

In [99]:
#The following code loops through each column and checks if the column type
is object
#then converts those columns into categorical with each distinct value
becoming a category.
for feature in cmd.columns:
if cmd[feature].dtype == 'object':
print('\n')
print('feature:',feature)
print(pd.Categorical(cmd[feature].unique()))
print(pd.Categorical(cmd[feature].unique()).codes)
cmd[feature] = pd.Categorical(cmd[feature]).codes

feature: Husband_education
['Secondary', 'Primary', 'Tertiary', 'Uneducated']
Categories (4, object): ['Primary', 'Secondary', 'Tertiary', 'Uneducated']
[1 0 2 3]

feature: Wife_religion
['Scientology', 'Non-Scientology']
Categories (2, object): ['Non-Scientology', 'Scientology']
[1 0]

feature: Wife_Working
['No', 'Yes']
Categories (2, object): ['No', 'Yes']
[0 1]

feature: Standard_of_living_index
['High', 'Very High', 'Low', 'Very Low']
Categories (4, object): ['High', 'Low', 'Very High', 'Very Low']
[0 2 1 3]

feature: Media_exposure
['Exposed', 'Not-Exposed']
Categories (2, object): ['Exposed', 'Not-Exposed']
[0 1]

feature: Contraceptive_method_used
['No', 'Yes']
Categories (2, object): ['No', 'Yes']
[0 1]
In [100]:
cmd.head().T
Out[100]:
0 1 2 3 4

24.
Wife_age 45.0 43.0 42.0 36.0
0

Wife_ education 2.0 1.0 2.0 3.0 3.0

Husband_education 1.0 1.0 1.0 0.0 1.0

No_of_children_born 3.0 10.0 7.0 9.0 8.0

Wife_religion 1.0 1.0 1.0 1.0 1.0

Wife_Working 0.0 0.0 0.0 0.0 0.0

Husband_Occupation 2.0 3.0 3.0 3.0 3.0

Standard_of_living_index 0.0 2.0 2.0 0.0 1.0

Media_exposure 0.0 0.0 0.0 0.0 0.0

Contraceptive_method_used 0.0 0.0 0.0 0.0 0.0

In [102]:
cmd.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1393 entries, 0 to 1472
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Wife_age 1393 non-null float64
1 Wife_ education 1393 non-null int64
2 Husband_education 1393 non-null int8
3 No_of_children_born 1393 non-null float64
4 Wife_religion 1393 non-null int8
5 Wife_Working 1393 non-null int8
6 Husband_Occupation 1393 non-null int64
7 Standard_of_living_index 1393 non-null int8
8 Media_exposure 1393 non-null int8
9 Contraceptive_method_used 1393 non-null int8
dtypes: float64(2), int64(2), int8(6)
memory usage: 62.6 KB
In [109]:
#Check for presence of outliers in each feature
cmd_plot = cmd.select_dtypes(include = ['float64', 'int64', 'int8'])
plt.figure(figsize = (12,8))
feature_list = cmd_plot.columns
for i in range(len(feature_list)):
plt.subplot(3, 4, i + 1)
sns.boxplot(y = cmd_plot[feature_list[i]], data = cmd_plot)
plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()

In [110]:
# Define a function which results the upper and lower limit to detect
outliers
def remove_outlier(col):
Q1,Q3=col.quantile([0.25,0.75])
IQR=Q3-Q1
lower_range= Q1-(1.5 * IQR)
upper_range= Q3+(1.5 * IQR)
return lower_range, upper_range
In [111]:
# Cap & floor the values beyond the outlier boundaries
for i in feature_list:
LL,UL = remove_outlier(cmd[i])
cmd[i] = np.where (cmd[i] > UL,UL, cmd[i])
cmd[i] = np.where (cmd[i] < LL,LL, cmd[i])
In [112]:
# Check the presence of outliers in the each features
plt.figure(figsize=(12,8))
feature_list = cmd.columns
for i in range (len(feature_list)):
plt.subplot(3, 4, i+1)
sns.boxplot(y=cmd[feature_list[i]],data=cmd)
plt.title('Boxplot of {}'.format(feature_list[i]))
plt.tight_layout()

In [113]:
cmd.corr()
# Check the Heatmap
plt.figure(figsize=(12,8))
sns.heatmap(cmd.iloc[:, 0:10].corr(),annot=True)
plt.show()
CART
In [114]:
#Creating a backup
cmd_1 = cmd.copy()
In [115]:
# capture the target column ("Contraceptive_method_used") into separate
vectors
X = cmd_1.drop("Contraceptive_method_used" , axis=1)
y = cmd_1.pop("Contraceptive_method_used")
In [116]:
from sklearn.model_selection import train_test_split

X_train, X_test, train_labels, test_labels = train_test_split(X, y,


test_size=.30, random_state=1)
In [117]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion = 'gini' )
In [118]:
dt_model.fit(X_train, train_labels)
Out[118]:
DecisionTreeClassifier
DecisionTreeClassifier()
In [121]:
from sklearn import tree

train_char_label = ['No', 'Yes']


ld_Tree_File = open('ld_Tree_File.dot','w')
dot_data = tree.export_graphviz(dt_model,
out_file=ld_Tree_File,
feature_names = list(X_train),
class_names = list(train_char_label))

ld_Tree_File.close()
In [122]:
print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index
= X_train.columns).sort_values('Imp',ascending=False))
Imp
Wife_age 0.313523
No_of_children_born 0.261942
Standard_of_living_index 0.109489
Wife_ education 0.103750
Husband_Occupation 0.097337
Wife_Working 0.064697
Husband_education 0.049262
Wife_religion 0.000000
Media_exposure 0.000000
In [123]:
y_predict = dt_model.predict(X_test)
In [124]:
y_predict.shape
Out[124]:
(418,)

Regularising the Decision Tree


Adding Tuning Parameters

In [125]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_features': ['auto', 'sqrt', 'log2'],


'ccp_alpha': [0.1, .01, .001],
# cost complexity parameter, ccp_alpha. Greater values of
ccp_alpha increase the
# number of nodes pruned
'max_depth' : [1,5,10,15,20],
'min_samples_leaf':[1,5,10,15,20],
'criterion' :['gini', 'entropy']
}
tree_clas = DecisionTreeClassifier(random_state=1024)
grid_search = GridSearchCV(estimator=tree_clas, param_grid=param_grid,
cv=5, verbose=True)
grid_search.fit(X_train, train_labels)
Fitting 5 folds for each of 450 candidates, totalling 2250 fits
Out[125]:
GridSearchCV
estimator: DecisionTreeClassifier
DecisionTreeClassifier
In [126]:
grid_search.best_estimator_
Out[126]:
DecisionTreeClassifier
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=10,
max_features='auto', min_samples_leaf=20,
random_state=1024)
In [127]:
reg_dt_model = DecisionTreeClassifier( ccp_alpha=0.001,criterion='entropy',
max_depth=15,
max_features='auto',
random_state=1024)
reg_dt_model.fit(X_train, train_labels)
Out[127]:
DecisionTreeClassifier
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=15,
max_features='auto', random_state=1024)
In [128]:
ld_tree_regularized = open('ld_tree_regularized.dot','w')
dot_data = tree.export_graphviz(reg_dt_model, out_file= ld_tree_regularized
, feature_names = list(X_train), class_names = list(train_char_label))

ld_tree_regularized.close()
dot_data
In [129]:
print (pd.DataFrame(reg_dt_model.feature_importances_, columns = ["Imp"],
index = X_train.columns).sort_values('Imp',ascending=False))
Imp
Wife_age 0.335927
No_of_children_born 0.301169
Standard_of_living_index 0.112077
Husband_Occupation 0.075047
Husband_education 0.066805
Wife_ education 0.062557
Wife_Working 0.046418
Wife_religion 0.000000
Media_exposure 0.000000
In [130]:
ytrain_predict = reg_dt_model.predict(X_train)
ytest_predict = reg_dt_model.predict(X_test)
In [131]:
# AUC and ROC for the training data
# predict probabilities
probs = reg_dt_model.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(train_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(train_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.989

In [132]:
# AUC and ROC for the test data
# predict probabilities
probs = reg_dt_model.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.621
In [133]:
# Import the classification_report & Confusion_matrix library
from sklearn.metrics import classification_report,confusion_matrix

# To view the confusion matrix for train_labels, ytrain_predict


confusion_matrix(train_labels, ytrain_predict)
print(classification_report(train_labels, ytrain_predict))
precision recall f1-score support

0.0 0.93 0.92 0.93 422


1.0 0.94 0.95 0.94 553

accuracy 0.94 975


macro avg 0.94 0.93 0.94 975
weighted avg 0.94 0.94 0.94 975

In [134]:
# To view the confusion matrix for test_labels, ytest_predict
confusion_matrix(test_labels, ytest_predict)
print(classification_report(test_labels, ytest_predict))
precision recall f1-score support

0.0 0.60 0.55 0.57 192


1.0 0.64 0.69 0.67 226

accuracy 0.62 418


macro avg 0.62 0.62 0.62 418
weighted avg 0.62 0.62 0.62 418

In [135]:
# To view the model score for X_train, train_labels
reg_dt_model.score(X_train,train_labels)
Out[135]:
0.9364102564102564
In [136]:
# To view the model score for X_test, test_labels
reg_dt_model.score(X_test,test_labels)
Out[136]:
0.6244019138755981

Logistics Regression
In [137]:
# Creating a copy of the original data frame
cmd_2 = cmd.copy()
# Import the LabelEncoder library
from sklearn.preprocessing import LabelEncoder
# Defining a Label Encoder object instance
LE = LabelEncoder()
LE
Out[137]:
LabelEncoder
LabelEncoder()

You might also like