BDA Project Codes

Data Divers BDA Project Codes
1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats.mstats import winsorize
2. Read Dataset
# Retrieve the dataset from the project's GitHub repository
# (the URL embeds a commit hash in its path).
cardio_df_g4 = pd.read_csv(
    "https://raw.githubusercontent.com/sarahsuhaimi/bda-project/"
    "b4501d7c475ba1e31f0f2b3ce932450e60263ac0/"
    "D4_Cardiovascular%20Patients%20Clinical%20Record.csv"
)
3. Preview Dataset Properties

# Preview the dataset: the whole frame first, then a reproducible random sample
display(cardio_df_g4)
np.random.seed(4)  # fixes which 10 rows the sample below picks
preview_rows = cardio_df_g4.sample(10)
display(preview_rows)

# Display the information of the dataset
cardio_df_g4.info()
# From the information above, we noticed that this dataset does not have
# missing values.
# There are also only two data types in the dataset: float and integer.

# Shape and size of the dataframe
frame_shape = cardio_df_g4.shape
frame_size = cardio_df_g4.size
print(f'Shape of the dataframe: {frame_shape}')
print(f'Size of the dataframe: {frame_size}')
4. Check Missing Values
# Count the missing cells across the whole frame, then visualise them
total_missing = cardio_df_g4.isnull().sum().sum()
print(f'Total missing value: {total_missing}')
msno.matrix(cardio_df_g4)
5. Check Datatypes in the Dataframe
# Recheck the datatypes of the dataframe
print(cardio_df_g4.dtypes)

# Keep only the rows whose age and platelets values can be parsed as numbers
# (anything pd.to_numeric cannot parse is coerced to NaN and dropped).
for numeric_col in ('age', 'platelets'):
    parsable = pd.to_numeric(cardio_df_g4[numeric_col], errors='coerce').notnull()
    cardio_df_g4 = cardio_df_g4[parsable]

# Convert the age and platelets columns to integer, since both are best
# expressed as whole numbers.
# astype() with a mapping returns a new frame, so the cast is not written onto
# a filtered slice of the original (which triggers SettingWithCopyWarning).
# NOTE: casting floats with astype('int') drops any fractional part.
cardio_df_g4 = cardio_df_g4.astype({'age': 'int', 'platelets': 'int'})

# Recheck the updated datatypes
print(cardio_df_g4.dtypes)
6. Check the Description of Updated Dataset
# Descriptive Statistics
# Extends DataFrame.describe() with variance, skewness, kurtosis and the
# coefficient of variation; the final table has one row per numeric column.
from scipy import stats  # explicit submodule import; `import scipy` alone may not load scipy.stats

df_summary_numerical = cardio_df_g4.describe()

# Moments that describe() does not report. Skewness is rounded to 5 decimals,
# kurtosis is left unrounded; both use the bias-corrected estimators.
extra_stats = pd.DataFrame(
    {
        col: {
            'variance': stats.describe(cardio_df_g4[col]).variance,
            'skewness': round(stats.skew(cardio_df_g4[col], bias=False), 5),
            'kurtosis': stats.kurtosis(cardio_df_g4[col], bias=False),
        }
        for col in df_summary_numerical.columns
    }
).loc[['variance', 'skewness', 'kurtosis']]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the extra rows
# under the describe() rows instead.
df_summary_numerical = pd.concat([df_summary_numerical, extra_stats])
df_summary_numerical = df_summary_numerical.T
df_summary_numerical['coefficient_of_variation'] = (
    df_summary_numerical['std'] / df_summary_numerical['mean']
)
df_summary_numerical
7. Check Correlation between Variables
# Pairwise correlation between all columns
corr = cardio_df_g4.corr()
display(corr)

# Plot the heatmap, masking the upper triangle (diagonal included) so each
# pair is drawn only once
mask = np.zeros_like(corr)
upper_rows, upper_cols = np.triu_indices_from(mask)
mask[upper_rows, upper_cols] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(20, 10))
    ax = sns.heatmap(
        corr,
        mask=mask,
        vmax=1,
        cmap='YlOrBr',
        linewidth=.2,
        square=True,
        annot=True,
    )

# Print list of correlation
# Keep only the cells strictly above the diagonal so every pair appears once
strict_upper = np.triu(np.ones(corr.shape), k=1).astype(bool)
upper_corr_mat = corr.where(strict_upper)
## Convert to 1-D series and drop Null values
unique_corr_pairs = upper_corr_mat.unstack().dropna()
## Sort correlation pairs
sorted_mat = unique_corr_pairs.sort_values(ascending=False)
print(sorted_mat)
8. Check Outliers for Categorical Data (Nominal) - Column Sex, Smoking, Diabetes,
Anaemia, High BP, Deceased
# However, sex, smoking, diabetes, anaemia, high_bp and deceased are
# considered nominal categorical (binary) columns.
binary_columns = ['sex', 'smoking', 'diabetes', 'anaemia', 'high_bp', 'deceased']

# Display the count of each value
print('Unique value for sex, smoking, diabetes, anaemia, high_bp and deceased\n')
binary_list = [cardio_df_g4[name] for name in binary_columns]
for binary_series in binary_list:
    print(f'\n{binary_series.value_counts()}')

# Box plot of the same columns
fig = plt.figure(figsize=(10, 7))
cardio_df_g4.boxplot(column=binary_columns)
plt.show()
9. Check Outliers for Continuous Variables - Columns: Age, Ejection Fraction, Serum
Sodium, Fup Days, CPK, Serum Creatinine, Platelets
# Box plots of the continuous columns that are drawn together on one axes
fig = plt.figure(figsize=(10, 7))
cardio_df_g4.boxplot(
    column=['age', 'ejection_fraction', 'serum_sodium', 'fup_days']
)
plt.show()

# cpk, serum_creatinine and platelets each get their own panel in a 1x3 grid
for panel, column_name in enumerate(['cpk', 'serum_creatinine', 'platelets'], start=1):
    plt.subplot(1, 3, panel)
    cardio_df_g4.boxplot(column=column_name)
plt.show()
# Detect outliers using z-score
# Formula attached above
def outlier_zscore(data, threshold=3):
    """Return the values of `data` whose absolute z-score exceeds `threshold`.

    The z-score is (value - mean) / std, using the population standard
    deviation (ddof=0). NaN entries are ignored when computing the mean and
    standard deviation and are never reported as outliers.

    Args:
        data: Iterable of numbers (list, NumPy array, pandas Series, ...).
        threshold: Cut-off on the absolute z-score; defaults to 3.

    Returns:
        A list of the outlying values, in their original order. The list is
        empty when `data` is empty, all-NaN, or has zero spread.
    """
    values = np.asarray(data, dtype=float)
    finite = values[~np.isnan(values)]
    if finite.size == 0:
        return []

    std = finite.std()
    if std == 0:
        # All values are identical: no z-score is defined, so nothing is an
        # outlier (and we avoid dividing by zero).
        return []

    is_outlier = np.abs((values - finite.mean()) / std) > threshold
    # Return the caller's own items rather than the float copies.
    return [item for item, flagged in zip(data, is_outlier) if flagged]
# Report the z-score outliers found in every column
for column_name in cardio_df_g4.columns:
    sample_outliers = outlier_zscore(cardio_df_g4[column_name])
    print(f"Outliers using z-score method for {column_name}: {sample_outliers}")
10. Handling Outliers
# Serum sodium, ejection_fraction and platelets get 90% winsorization:
# the top 5% is replaced by the value at the 95th percentile and the bottom 5%
# by the value at the 5th percentile.
# cpk and serum creatinine get 80% winsorization: the top 10% is replaced by
# the value at the 90th percentile and the bottom 10% by the value at the
# 10th percentile.
winsor_limits = {
    'serum_sodium': (0.05, 0.05),
    'cpk': (0.10, 0.10),
    'ejection_fraction': (0.05, 0.05),
    'platelets': (0.05, 0.05),
    'serum_creatinine': (0.10, 0.10),
}
for column_name, limits in winsor_limits.items():
    cardio_df_g4[column_name] = winsorize(cardio_df_g4[column_name], limits)

# Re-draw the box plots after winsorization. The explicit 10x7 figure keeps
# this plot the same size as the one drawn before the outliers were handled.
fig = plt.figure(figsize=(10, 7))
cardio_df_g4.boxplot(
    column=['age', 'ejection_fraction', 'serum_sodium', 'fup_days']
)
plt.show()

# cpk, serum_creatinine and platelets each get their own panel in a 1x3 grid
for panel, column_name in enumerate(['cpk', 'serum_creatinine', 'platelets'], start=1):
    plt.subplot(1, 3, panel)
    cardio_df_g4.boxplot(column=column_name)
plt.show()
11. Exploratory Data Analysis
# scatter plot
# relplot is a figure-level function: it creates and owns its own figure.
sns.relplot(data=cardio_df_g4, x='fup_days', y='age', hue='deceased')
plt.show()

# line plot
# lineplot is axes-level and draws on the current axes, so give it a fresh
# figure; otherwise it would be drawn over the scatter plot above.
plt.figure(figsize=(10, 7))
sns.lineplot(data=cardio_df_g4, x='serum_creatinine', y='age', hue='deceased')
plt.show()
# Source: https://towardsdatascience.com/100-stacked-charts-in-python-6ca3e1962d2b
def plot_deceased_proportions(frame, column, label):
    """Draw a 100% stacked bar chart of `deceased` within each level of `column`.

    Args:
        frame: DataFrame containing `column` and a `deceased` column.
        column: Name of the categorical column used for the bars.
        label: Text for the x-axis label.

    Returns:
        The row-normalised crosstab (index: `column`, columns: `deceased`)
        that was plotted.
    """
    subset = frame.loc[:, ['deceased', column]].copy()

    # normalize='index' makes every row sum to 1, i.e. each bar is 100%.
    cross = pd.crosstab(
        index=subset[column],
        columns=subset['deceased'],
        normalize='index',
    )
    cross.plot(kind='bar', stacked=True, colormap='tab10', figsize=(7, 5))
    plt.legend(loc='lower left', ncol=2)
    plt.xlabel(label)
    plt.ylabel("Proportion")

    # Write each segment's percentage at the vertical middle of that segment.
    for bar_pos, level in enumerate(cross.index.values):
        shares = cross.loc[level]
        for proportion, y_loc in zip(shares, shares.cumsum()):
            plt.text(
                x=bar_pos - 0.1,
                y=(y_loc - proportion) + (proportion / 2),
                s=f'{np.round(proportion*100, 1)}%',
                color='black',
                fontsize=10,
            )
    plt.show()
    return cross


# One chart per binary risk factor
cross = plot_deceased_proportions(cardio_df_g4, 'smoking', "Smoking")
cross1 = plot_deceased_proportions(cardio_df_g4, 'sex', "Gender")
cross2 = plot_deceased_proportions(cardio_df_g4, 'diabetes', "Diabetes")
cross3 = plot_deceased_proportions(cardio_df_g4, 'anaemia', "Anaemia")
cross4 = plot_deceased_proportions(cardio_df_g4, 'high_bp', "High Blood Pressure")
12. Predictive Modelling
cardio_df_g4.columns

# Select the predictor columns
feature_df = cardio_df_g4[[
    'age', 'sex', 'smoking', 'diabetes',
    'anaemia', 'cpk', 'ejection_fraction', 'high_bp', 'platelets',
    'serum_creatinine', 'serum_sodium', 'fup_days',
]]

# Select the columns for x (features) and y (target)
x = np.asarray(feature_df)
y = np.asarray(cardio_df_g4['deceased'])
feature_df.head()
x[0:5]
y[0:5]

# The method to split train and test data is available in the scikit-learn
# package. random_state is the seed that determines which rows are used for
# train and which for test, so the split is the same on every run.
# test_size=0.2 holds out 20% of the rows for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=4
)

# (number of training rows, 12) - one column per feature selected above
X_train.shape
# (number of training rows,) - y is a 1-D array
y_train.shape
# (number of test rows, 12)
X_test.shape
# (number of test rows,)
y_test.shape
13. Evaluate the Model
from sklearn import tree

# Fit a depth-limited decision tree on the training split
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(X_train, y_train)

# Test the model on the held-out split
test_score = classifier.score(X_test, y_test)
print(f'Score: {test_score}')
14. Decision Tree – Decrease by age
# Single-feature decision tree on age, fitted on the whole dataset
# (uses the `tree` module imported from scikit-learn earlier in the file).
train_data_age = pd.DataFrame({'age': cardio_df_g4.age})
train_labels = cardio_df_g4['deceased']
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_age, train_labels)

import graphviz
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["Age"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_age = graphviz.Source(dot_data)
graph_age
15. Decision Tree – Decrease by sex
# Single-feature decision tree on sex, fitted on the whole dataset
train_data_sex = pd.DataFrame({'sex': cardio_df_g4.sex})
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_sex, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["sex"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_sex = graphviz.Source(dot_data)
graph_sex
16. Decision Tree – Decrease by diabetes
# Single-feature decision tree on diabetes, fitted on the whole dataset
train_data_diabetes = pd.DataFrame({'diabetes': cardio_df_g4.diabetes})
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_diabetes, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["diabetes"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_diabetes = graphviz.Source(dot_data)
graph_diabetes
17. Decision Tree – Decrease by anaemia
# Single-feature decision tree on anaemia, fitted on the whole dataset
train_data_anaemia = pd.DataFrame({'anaemia': cardio_df_g4.anaemia})
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_anaemia, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["anaemia"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_anaemia = graphviz.Source(dot_data)
graph_anaemia
18. Decision Tree – Decrease by cpk
# Single-feature decision tree on cpk, fitted on the whole dataset
train_data_cpk = pd.DataFrame({'cpk': cardio_df_g4.cpk})
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_cpk, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["cpk"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_cpk = graphviz.Source(dot_data)
graph_cpk
19. Decision Tree – Decrease by ejection_fraction
# Single-feature decision tree on ejection_fraction, fitted on the whole dataset
train_data_ejection_fraction = pd.DataFrame(
    {'ejection_fraction': cardio_df_g4.ejection_fraction}
)
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_ejection_fraction, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["ejection_fraction"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_ejection_fraction = graphviz.Source(dot_data)
graph_ejection_fraction
20. Decision Tree – Decrease by high_bp
# Single-feature decision tree on high_bp, fitted on the whole dataset
train_data_high_bp = pd.DataFrame({'high_bp': cardio_df_g4.high_bp})
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_high_bp, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["high_bp"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_high_bp = graphviz.Source(dot_data)
graph_high_bp
21. Decision Tree – Decrease by platelets
# Single-feature decision tree on platelets, fitted on the whole dataset
train_data_platelets = pd.DataFrame({'platelets': cardio_df_g4.platelets})
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_platelets, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["platelets"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_platelets = graphviz.Source(dot_data)
graph_platelets
22. Decision Tree – Decrease by serum_creatinine
# Single-feature decision tree on serum_creatinine, fitted on the whole dataset
train_data_serum_creatinine = pd.DataFrame(
    {'serum_creatinine': cardio_df_g4.serum_creatinine}
)
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_serum_creatinine, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["serum_creatinine"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
# Stored under its own name; the original reused graph_high_bp here, which
# overwrote the high_bp graph.
graph_serum_creatinine = graphviz.Source(dot_data)
graph_serum_creatinine
23. Decision Tree – Decrease by serum_sodium
# Single-feature decision tree on serum_sodium, fitted on the whole dataset
train_data_serum_sodium = pd.DataFrame(
    {'serum_sodium': cardio_df_g4.serum_sodium}
)
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_serum_sodium, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["serum_sodium"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_serum_sodium = graphviz.Source(dot_data)
graph_serum_sodium
24. Decision Tree – Decrease by fup_days
# Single-feature decision tree on fup_days, fitted on the whole dataset
train_data_fup_days = pd.DataFrame({'fup_days': cardio_df_g4.fup_days})
classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=0)
classifier.fit(train_data_fup_days, train_labels)

import graphviz
# Export the tree fitted just above; without this call the graph would be
# built from a stale dot_data left over from an earlier section.
# NOTE(review): export_graphviz expects class_names in ascending label order -
# confirm that the smallest `deceased` value really means "Dead".
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=["fup_days"],
    class_names=["Dead", "Survived"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph_fup_days = graphviz.Source(dot_data)
graph_fup_days

BDA Project Codes

Uploaded by

Document Information

Original Description:

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

BDA Project Codes

Uploaded by

Copyright:

Available Formats

Data Divers BDA Project Codes

3. Preview Dataset Properties

# Display the information of the dataset

# Shape and size of the dataframe

5. Check Datatypes in the Dataframe

# Recheck the datatypes of the dataframe

# Pick the data which is possible to be converted as numeric values

# Convert the age and platelets column as integer.

# This is because age and platelets are best known to be in numeric

# Recheck the updated datatypes

6. Check the Description of Updated Dataset

df_temp = pd.DataFrame([d], columns=d.keys()).T

scipy_df = pd.concat([scipy_df, df_temp], axis=1)

7. Check Correlation between Variables

# Plot the heatmap

ax = sns.heatmap(corr, mask=mask, vmax=1, cmap='YlOrBr',

upper_corr_mat = corr.where( np.triu(np.ones(corr.shape),

## Convert to 1-D series and drop Null values

## Sort correlation pairs

# However, sex, smoking, diabetes, anaemia, high_bp and deceased are

# Display the count of the value

print('Unique value for sex, smoking, diabetes, anaemia, high_bp and

binary_list = [cardio_df_g4['sex'], cardio_df_g4['smoking'],

for x in binary_list: print(f'\n{x.value_counts()}')

fig = plt.figure(figsize =(10, 7))

fig = plt.figure(figsize =(10, 7))

# Detect outliers using z-score

# Formula attached above

zscore = (i - mean) / std

if (np.abs(zscore) > threshold): outliers.append(i)

for col in cardio_df_g4.columns:

print(f"Outliers using z-score method for {col}: {sample_outliers}

10. Handling Outliers

# Serum sodium, ejection_fraction, platelets have 90% winsorization;

# cpk and serum creatinine have 80% winsorization; top 10% -

cardio_df_g4['cpk'] = winsorize(cardio_df_g4['cpk'], (0.10, 0.10))

fig = plt.figure(figsize =(10, 7))

11. Exploratory Data Analysis

fig = plt.figure(figsize =(10, 7))

sns.relplot(data=cardio_df_g4, x='fup_days', y='age',

sns.lineplot(data=cardio_df_g4, x='serum_creatinine', y='age',

data = cardio_df_g4.loc[:, ['deceased', 'smoking']].copy()

plt.legend(loc='lower left', ncol=2)

for (proportion, y_loc) in zip(cross.loc[j],

y = (y_loc - proportion) + (proportion/2),

data1 = cardio_df_g4.loc[:, ['deceased', 'sex']].copy()

plt.legend(loc='lower left', ncol=2)

for (proportion, y_loc) in zip(cross1.loc[j],

plt.legend(loc='lower left', ncol=2)

for (proportion, y_loc) in zip(cross2.loc[j],

data3 = cardio_df_g4.loc[:, ['deceased', 'anaemia']].copy()

data4 = cardio_df_g4.loc[:, ['deceased', 'high_bp']].copy()

plt.legend(loc='lower left', ncol=2)

plt.xlabel("High Blood Pressure")

12. Predictive Modelling

feature_df = cardio_df_g4[['age', 'sex', 'smoking', 'diabetes',

#select column for x and y

#method to split train and test is available in scikit-learn package

#random_state is a number used as seed value for machine to

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y,

#random_state is a number used as seed value for machine to

from sklearn.model_selection import train_test_split