Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 20

Data Divers BDA Project Codes

1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats.mstats import winsorize

2. Read Dataset
# Load the clinical-record CSV straight from its pinned GitHub raw URL
cardio_df_g4 = pd.read_csv(
    "https://raw.githubusercontent.com/sarahsuhaimi/bda-project/"
    "b4501d7c475ba1e31f0f2b3ce932450e60263ac0/"
    "D4_Cardiovascular%20Patients%20Clinical%20Record.csv"
)

3. Preview Dataset Properties


# Show the dataframe, then a seeded random sample of 10 rows
display(cardio_df_g4)
np.random.seed(4)
display(cardio_df_g4.sample(10))

# Print the column-level summary of the dataset
cardio_df_g4.info()
# Observations from the summary above:
#   - the dataset does not have missing values
#   - only two datatypes are present: float and integer

# Report the dataframe's shape and its total number of cells
print(f'Shape of the dataframe: {cardio_df_g4.shape}')
print(f'Size of the dataframe: {cardio_df_g4.size}')
4. Check Missing Values
# Count every missing cell in the dataframe, then draw the missingno matrix
print(f'Total missing value: {cardio_df_g4.isnull().sum().sum()}')
msno.matrix(cardio_df_g4)

5. Check Datatypes in the Dataframe

# Recheck the datatypes of the dataframe
print(cardio_df_g4.dtypes)

# Keep only the rows whose age and platelets values can be parsed as numbers
age_is_numeric = pd.to_numeric(cardio_df_g4['age'], errors='coerce').notnull()
platelets_is_numeric = pd.to_numeric(
    cardio_df_g4['platelets'], errors='coerce'
).notnull()

# .copy() detaches the filtered frame from the original, so the column
# assignments made later in the script do not raise SettingWithCopyWarning
cardio_df_g4 = cardio_df_g4[age_is_numeric & platelets_is_numeric].copy()

# Convert the age and platelets columns to integer, the form these two
# measurements are normally reported in (any fractional part is truncated)
cardio_df_g4 = cardio_df_g4.astype({'age': 'int', 'platelets': 'int'})

# Recheck the updated datatypes
print(cardio_df_g4.dtypes)

6. Check the Description of Updated Dataset

# Descriptive statistics: pandas' describe() extended with variance,
# skewness, kurtosis and the coefficient of variation for every numeric column.

# Import the submodule explicitly; a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded.
import scipy.stats

df_summary_numerical = cardio_df_g4.describe()

# Collect one column of scipy statistics per variable and concatenate once,
# instead of growing a dataframe inside the loop.
per_column_stats = []
for col in df_summary_numerical.columns:
    d = scipy.stats.describe(cardio_df_g4[col])._asdict()
    # Replace scipy's default (biased) estimates with bias-corrected ones
    d['skewness'] = round(scipy.stats.skew(cardio_df_g4[col], bias=False), 5)
    d['kurtosis'] = scipy.stats.kurtosis(cardio_df_g4[col], bias=False)
    per_column_stats.append(pd.Series(d, name=col))

scipy_df = pd.concat(per_column_stats, axis=1)

# Only these three rows are missing from describe(); cast them to float so the
# combined summary does not degrade to object dtype.
df_temp = scipy_df.loc[['variance', 'skewness', 'kurtosis']].astype(float)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df_summary_numerical = pd.concat([df_summary_numerical, df_temp])

# One row per variable, one column per statistic
df_summary_numerical = df_summary_numerical.T

df_summary_numerical['coefficient_of_variation'] = (
    df_summary_numerical['std'] / df_summary_numerical['mean']
)

df_summary_numerical

7. Check Correlation between Variables

# Pairwise correlation matrix of all columns
corr = cardio_df_g4.corr()
display(corr)

# Plot the heatmap with the upper triangle (diagonal included) masked out,
# so each pair of variables is shown only once
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(20, 10))
    ax = sns.heatmap(
        corr,
        mask=mask,
        vmax=1,
        cmap='YlOrBr',
        linewidth=.2,
        square=True,
        annot=True,
    )
# Print the list of correlations, one entry per unordered pair of variables:
# keep only the cells strictly above the diagonal (k=1), blank out the rest
upper_corr_mat = corr.where(
    np.triu(np.ones(corr.shape), k=1).astype(bool)
)

# Flatten to a 1-D series and drop the blanked-out (null) cells
unique_corr_pairs = upper_corr_mat.unstack().dropna()

# Order the pairs from the highest correlation to the lowest
sorted_mat = unique_corr_pairs.sort_values(ascending=False)
print(sorted_mat)

8. Check Outliers for Categorical Data (Nominal) - Column Sex, Smoking, Diabetes,
Anaemia, High BP, Deceased

# sex, smoking, diabetes, anaemia, high_bp and deceased are treated as
# nominal categorical (binary) columns

# Display the count of each value
print('Unique value for sex, smoking, diabetes, anaemia, high_bp and deceased\n')

binary_list = [
    cardio_df_g4[name]
    for name in ('sex', 'smoking', 'diabetes', 'anaemia', 'high_bp', 'deceased')
]
for x in binary_list:
    print(f'\n{x.value_counts()}')

# Box plot of the same six columns on a single figure
fig = plt.figure(figsize=(10, 7))
cardio_df_g4.boxplot(
    column=['sex', 'smoking', 'diabetes', 'anaemia', 'high_bp', 'deceased']
)
plt.show()
9. Check Outliers for Continuous Variables - Columns: Age, Ejection Fraction, Serum
Sodium, Fup Days, CPK, Serum Creatinine, Platelets

# Four continuous columns drawn together on one set of axes
fig = plt.figure(figsize=(10, 7))
cardio_df_g4.boxplot(
    column=['age', 'ejection_fraction', 'serum_sodium', 'fup_days']
)
plt.show()

# cpk, serum_creatinine and platelets each get their own panel in a
# 1 x 3 grid
fig = plt.figure(figsize=(10, 7))
for position, column in enumerate(['cpk', 'serum_creatinine', 'platelets'], start=1):
    plt.subplot(1, 3, position)
    cardio_df_g4.boxplot(column=column)
plt.show()

# Detect outliers using z-score

# Formula attached above

def outlier_zscore(data, threshold=3):
    """Return the values of *data* whose absolute z-score exceeds *threshold*.

    The z-score of a value is ``(value - mean) / std``, where the mean and
    the standard deviation are computed over *data* with ``np.mean`` and
    ``np.std``.

    Args:
        data: A sized iterable of numbers (list, NumPy array, pandas Series).
        threshold: Cut-off on the absolute z-score; defaults to 3.

    Returns:
        A list of the outlying values, in their original order. The list is
        empty when *data* is empty or has no spread.
    """
    if len(data) == 0:
        return []

    mean = np.mean(data)
    std = np.std(data)

    # A constant column has std == 0 (and an all-NaN one has std == NaN);
    # the z-score is undefined there, so report no outliers instead of
    # dividing by zero.
    if not std > 0:
        return []

    return [value for value in data if np.abs((value - mean) / std) > threshold]

# Run the z-score detector on every column and print what it flags
for col in cardio_df_g4.columns:
    sample_outliers = outlier_zscore(cardio_df_g4[col])
    print(f"Outliers using z-score method for {col}: {sample_outliers}")

10. Handling Outliers

# Winsorization limits per column, given as (lower fraction, upper fraction).
#   90% winsorization (0.05, 0.05): top 5% replaced by the value at the 95th
#       percentile, bottom 5% replaced by the value at the 5th percentile
#       -> serum_sodium, ejection_fraction, platelets
#   80% winsorization (0.10, 0.10): top 10% replaced by the value at the 90th
#       percentile, bottom 10% replaced by the value at the 10th percentile
#       -> cpk, serum_creatinine
winsor_limits = {
    'serum_sodium': (0.05, 0.05),
    'cpk': (0.10, 0.10),
    'ejection_fraction': (0.05, 0.05),
    'platelets': (0.05, 0.05),
    'serum_creatinine': (0.10, 0.10),
}

for column, limits in winsor_limits.items():
    cardio_df_g4[column] = winsorize(cardio_df_g4[column], limits)

# Redraw the box plots after winsorization: four columns on shared axes
fig = plt.figure(figsize=(10, 7))
cardio_df_g4.boxplot(
    column=['age', 'ejection_fraction', 'serum_sodium', 'fup_days']
)
plt.show()

# ...and cpk, serum_creatinine and platelets side by side in a 1 x 3 grid
fig = plt.figure(figsize=(10, 7))
for position, column in enumerate(['cpk', 'serum_creatinine', 'platelets'], start=1):
    plt.subplot(1, 3, position)
    cardio_df_g4.boxplot(column=column)
plt.show()

11. Exploratory Data Analysis

# Scatter plot of age against follow-up days, coloured by outcome.
# sns.relplot is figure-level and creates its own figure, so no plt.figure()
# call precedes it (that would only leave an empty figure behind).
sns.relplot(data=cardio_df_g4, x='fup_days', y='age', hue='deceased')
plt.show()

# Line plot of age against serum creatinine, coloured by outcome.
# sns.lineplot is axes-level and draws on the current axes, so it gets a
# fresh figure instead of landing on top of the scatter plot.
fig = plt.figure(figsize=(10, 7))
sns.lineplot(data=cardio_df_g4, x='serum_creatinine', y='age', hue='deceased')
plt.show()

# Source: https://towardsdatascience.com/100-stacked-charts-in-python-6ca3e1962d2b


def _plot_deceased_proportion(df, column, xlabel):
    """Draw a 100% stacked bar chart of `deceased` within each level of *column*.

    Args:
        df: Dataframe holding both *column* and a `deceased` column.
        column: Name of the categorical column placed on the x axis.
        xlabel: Text used as the x-axis label.

    Returns:
        The row-normalised crosstab (rows: levels of *column*, columns:
        levels of `deceased`) that the chart was drawn from.
    """
    data = df.loc[:, ['deceased', column]].copy()

    # normalize='index' makes every row sum to 1, i.e. proportions per level
    cross = pd.crosstab(index=data[column],
                        columns=data['deceased'],
                        normalize='index')
    cross.plot(kind='bar',
               stacked=True,
               colormap='tab10',
               figsize=(7, 5))

    plt.legend(loc='lower left', ncol=2)
    plt.xlabel(xlabel)
    plt.ylabel("Proportion")

    # Write each percentage at the vertical centre of its bar segment:
    # the running total marks the top of a segment, so stepping back by the
    # segment's height and forward by half of it gives the midpoint.
    for i, j in enumerate(cross.index.values):
        for proportion, y_loc in zip(cross.loc[j], cross.loc[j].cumsum()):
            plt.text(x=i - 0.1,
                     y=(y_loc - proportion) + (proportion / 2),
                     s=f'{np.round(proportion*100, 1)}%',
                     color='black',
                     fontsize=10)

    plt.show()
    return cross


# One chart per binary risk factor; the crosstabs keep their original names
cross = _plot_deceased_proportion(cardio_df_g4, 'smoking', "Smoking")
cross1 = _plot_deceased_proportion(cardio_df_g4, 'sex', "Gender")
cross2 = _plot_deceased_proportion(cardio_df_g4, 'diabetes', "Diabetes")
cross3 = _plot_deceased_proportion(cardio_df_g4, 'anaemia', "Anaemia")
cross4 = _plot_deceased_proportion(cardio_df_g4, 'high_bp', "High Blood Pressure")

12. Predictive Modelling

cardio_df_g4.columns

# Select the twelve predictor columns (everything except the target)
feature_df = cardio_df_g4[
    [
        'age', 'sex', 'smoking', 'diabetes',
        'anaemia', 'cpk', 'ejection_fraction', 'high_bp', 'platelets',
        'serum_creatinine', 'serum_sodium', 'fup_days',
    ]
]

# x holds the predictors and y the target, both as NumPy arrays
x = np.asarray(feature_df)
y = np.asarray(cardio_df_g4['deceased'])

# Notebook-style previews of the first five rows of each
feature_df.head()
x[:5]
y[:5]

# The train/test split helper comes from the scikit-learn package.
# random_state is the seed that determines which rows go to train and which
# to test, so the split is reproducible. (The original ran this identical
# split twice; once is enough.)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=4
)

# Training predictors: 80% of the rows x 12 feature columns
X_train.shape

# Training target: one label per training row
y_train.shape

# Test predictors: the remaining 20% of the rows x 12 feature columns
X_test.shape

# Test target: one label per test row
y_test.shape

13. Evaluate the Model

from sklearn import tree

# Depth-limited decision tree fitted on the training split (all predictors)
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(X_train, y_train)

# Test the model on the held-out split
print(f'Score: {classifier.score(X_test,y_test)}')
14. Decision Tree – Decrease by age

# Import decision tree from scikit-learn module
from sklearn import tree

# Single-feature training frame (age) and the target labels; train_labels is
# reused by the other single-feature trees that follow
train_data_age = cardio_df_g4[['age']].copy()
train_labels = cardio_df_g4['deceased']

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_age, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["Age"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_age = graphviz.Source(dot_data)
graph_age

15. Decision Tree – Decrease by sex

# Import decision tree from scikit-learn module
from sklearn import tree

# Single-feature training frame (sex); train_labels is defined earlier in
# the script
train_data_sex = cardio_df_g4[['sex']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_sex, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["sex"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_sex = graphviz.Source(dot_data)
graph_sex

16. Decision Tree – Decrease by diabetes

from sklearn import tree

# Single-feature training frame (diabetes); train_labels is defined earlier
# in the script
train_data_diabetes = cardio_df_g4[['diabetes']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_diabetes, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["diabetes"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_diabetes = graphviz.Source(dot_data)
graph_diabetes
17. Decision Tree – Decrease by anaemia

from sklearn import tree

# Single-feature training frame (anaemia); train_labels is defined earlier
# in the script
train_data_anaemia = cardio_df_g4[['anaemia']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_anaemia, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["anaemia"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_anaemia = graphviz.Source(dot_data)
graph_anaemia

18. Decision Tree – Decrease by cpk

from sklearn import tree

# Single-feature training frame (cpk); train_labels is defined earlier in
# the script
train_data_cpk = cardio_df_g4[['cpk']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_cpk, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["cpk"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_cpk = graphviz.Source(dot_data)
graph_cpk

19. Decision Tree – Decrease by ejection_fraction

from sklearn import tree

# Single-feature training frame (ejection_fraction); train_labels is defined
# earlier in the script
train_data_ejection_fraction = cardio_df_g4[['ejection_fraction']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_ejection_fraction, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["ejection_fraction"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_ejection_fraction = graphviz.Source(dot_data)
graph_ejection_fraction
20. Decision Tree – Decrease by high_bp

from sklearn import tree

# Single-feature training frame (high_bp); train_labels is defined earlier
# in the script
train_data_high_bp = cardio_df_g4[['high_bp']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_high_bp, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["high_bp"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_high_bp = graphviz.Source(dot_data)
graph_high_bp
21. Decision Tree – Decrease by platelets

from sklearn import tree

# Single-feature training frame (platelets); train_labels is defined earlier
# in the script
train_data_platelets = cardio_df_g4[['platelets']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_platelets, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["platelets"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_platelets = graphviz.Source(dot_data)
graph_platelets

22. Decision Tree – Decrease by serum_creatinine

from sklearn import tree

# Single-feature training frame (serum_creatinine); train_labels is defined
# earlier in the script
train_data_serum_creatinine = cardio_df_g4[['serum_creatinine']].copy()

# This tree is capped at depth 3, unlike the depth-5 trees used for the
# other features
classifier = tree.DecisionTreeClassifier(max_depth=3, random_state=0)
classifier.fit(train_data_serum_creatinine, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["serum_creatinine"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Stored under its own name; the original reused graph_high_bp here and
# overwrote the high_bp graph
graph_serum_creatinine = graphviz.Source(dot_data)
graph_serum_creatinine

23. Decision Tree – Decrease by serum_sodium

from sklearn import tree

# Single-feature training frame (serum_sodium); train_labels is defined
# earlier in the script
train_data_serum_sodium = cardio_df_g4[['serum_sodium']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_serum_sodium, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["serum_sodium"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_serum_sodium = graphviz.Source(dot_data)
graph_serum_sodium
24. Decision Tree – Decrease by fup_days

from sklearn import tree

# Single-feature training frame (fup_days); train_labels is defined earlier
# in the script
train_data_fup_days = cardio_df_g4[['fup_days']].copy()

classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_fup_days, train_labels)

import graphviz

# export_graphviz expects class_names in ascending order of the class labels,
# so label 0 must come first. NOTE(review): assumes deceased == 0 means the
# patient survived and 1 means the patient died — confirm against the dataset.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["fup_days"],
    class_names=["Survived", "Dead"],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph_fup_days = graphviz.Source(dot_data)
graph_fup_days

You might also like