Professional Documents
Culture Documents
BDA Project Codes
BDA Project Codes
1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats.mstats import winsorize
2. Read Dataset
# Retrieve the cardiovascular clinical-records dataset. The URL is pinned to a
# specific commit hash, so the file contents do not change between runs.
cardio_df_g4 = pd.read_csv(
    "https://raw.githubusercontent.com/sarahsuhaimi/bda-project/"
    "b4501d7c475ba1e31f0f2b3ce932450e60263ac0/"
    "D4_Cardiovascular%20Patients%20Clinical%20Record.csv"
)
print(cardio_df_g4.dtypes)

# Keep only the rows whose 'age' and 'platelets' entries parse as numbers, then
# store both columns as integers. The parsed values (not the raw column) are
# cast, so a numeric string with a decimal part is truncated instead of
# making astype('int') raise.
for numeric_col in ('age', 'platelets'):
    parsed = pd.to_numeric(cardio_df_g4[numeric_col], errors='coerce')
    is_numeric = parsed.notnull()
    # .copy() detaches the filtered frame so the column assignment below does
    # not write into a slice of the previous frame.
    cardio_df_g4 = cardio_df_g4[is_numeric].copy()
    cardio_df_g4[numeric_col] = parsed[is_numeric].astype('int')
print(cardio_df_g4.dtypes)
# Descriptive Statistics
# Extend DataFrame.describe() with variance, sample skewness, sample kurtosis
# and the coefficient of variation for every numeric column.
import scipy.stats

df_summary_numerical = cardio_df_g4.describe()

# One {statistic: value} mapping per numeric column.
extra_stats = {}
for col in df_summary_numerical.columns:
    d = scipy.stats.describe(cardio_df_g4[col])._asdict()
    # Replace describe()'s skewness/kurtosis with the bias-corrected versions.
    d['skewness'] = round(scipy.stats.skew(cardio_df_g4[col], bias=False), 5)
    d['kurtosis'] = scipy.stats.kurtosis(cardio_df_g4[col], bias=False)
    extra_stats[col] = {
        name: d[name] for name in ('variance', 'skewness', 'kurtosis')
    }

# Rows = statistics, columns = variables (same orientation as describe()).
scipy_df = pd.DataFrame(extra_stats)
df_temp = scipy_df.loc[['variance', 'skewness', 'kurtosis']]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the extra rows.
df_summary_numerical = pd.concat([df_summary_numerical, df_temp])
df_summary_numerical = df_summary_numerical.T
df_summary_numerical['coefficient_of_variation'] = (
    df_summary_numerical['std'] / df_summary_numerical['mean']
)
df_summary_numerical
# Pairwise correlation matrix of the dataset columns.
corr = cardio_df_g4.corr()
display(corr)

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(20, 10))
    # NOTE(review): the drawing call is missing from the visible source; a
    # masked, annotated heatmap is assumed here - confirm the styling options.
    ax = sns.heatmap(corr, mask=mask, annot=True, ax=ax)

# Keep only the strict upper triangle (k=1 drops the diagonal) so every
# variable pair appears once, then list the pairs from strongest positive to
# strongest negative correlation.
upper_corr_mat = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
unique_corr_pairs = upper_corr_mat.unstack().dropna()
sorted_mat = unique_corr_pairs.sort_values(ascending=False)
print(sorted_mat)
# 8. Check Outliers for Categorical Data (Nominal) - Columns: Sex, Smoking, Diabetes, Anaemia, High BP, Deceased
# Box plot of the nominal columns, all drawn on one 10x7 figure.
nominal_columns = ['sex', 'smoking', 'diabetes', 'anaemia', 'high_bp', 'deceased']
fig = plt.figure(figsize=(10, 7))
cardio_df_g4.boxplot(column=nominal_columns)
plt.show()
# 9. Check Outliers for Continuous Variables - Columns: Age, Ejection Fraction, Serum Sodium, Fup Days, CPK, Serum Creatinine, Platelets
# Age, ejection fraction, serum sodium and follow-up days share one axes.
# The column name 'serum_sodium' was split across two lines in the source,
# which broke the string literal; it is restored here.
cardio_df_g4.boxplot(
    column=['age', 'ejection_fraction', 'serum_sodium', 'fup_days']
)
plt.show()

# CPK, serum creatinine and platelets each get their own subplot in a
# 1x3 grid; the figure size matches the post-winsorization plots.
fig = plt.figure(figsize=(10, 7))
plt.subplot(1, 3, 1)
cardio_df_g4.boxplot(column='cpk')
plt.subplot(1, 3, 2)
cardio_df_g4.boxplot(column='serum_creatinine')
plt.subplot(1, 3, 3)
cardio_df_g4.boxplot(column='platelets')
plt.show()
def outlier_zscore(data, threshold=3):
    """Return the values of *data* whose absolute z-score exceeds *threshold*.

    The z-score of a value is ``(value - mean) / std`` using the population
    standard deviation (``np.std`` default, ddof=0).

    Args:
        data: Iterable of numbers (list, NumPy array, pandas Series, ...).
        threshold: Cut-off on the absolute z-score; defaults to 3.

    Returns:
        A list with the outlying values in their original order. Empty when
        *data* is empty or all values are identical (zero spread).
    """
    values = list(data)
    if not values:
        return []

    mean = np.mean(values)
    std = np.std(values)
    # With zero spread every z-score would be a division by zero; nothing
    # can be called an outlier in that case.
    if std == 0:
        return []

    return [v for v in values if abs((v - mean) / std) > threshold]
# NOTE(review): `col` is not assigned in this part of the file; it is whatever
# value an earlier loop left behind - confirm which column is intended.
sample_outliers = outlier_zscore(cardio_df_g4[col])

# Winsorize the columns with extreme values. The limits tuple is the fraction
# clipped at the (lower, upper) end: 5% per side, 10% for serum creatinine.
cardio_df_g4['serum_sodium'] = winsorize(
    cardio_df_g4['serum_sodium'], (0.05, 0.05))
cardio_df_g4['ejection_fraction'] = winsorize(
    cardio_df_g4['ejection_fraction'], (0.05, 0.05))
cardio_df_g4['platelets'] = winsorize(
    cardio_df_g4['platelets'], (0.05, 0.05))
cardio_df_g4['serum_creatinine'] = winsorize(
    cardio_df_g4['serum_creatinine'], (0.10, 0.10))

# Re-draw the box plots after winsorization. The 'serum_sodium' literal was
# split across two lines in the source and is restored here.
cardio_df_g4.boxplot(
    column=['age', 'ejection_fraction', 'serum_sodium', 'fup_days']
)
plt.show()

fig = plt.figure(figsize=(10, 7))
plt.subplot(1, 3, 1)
cardio_df_g4.boxplot(column='cpk')
plt.subplot(1, 3, 2)
cardio_df_g4.boxplot(column='serum_creatinine')
plt.subplot(1, 3, 3)
cardio_df_g4.boxplot(column='platelets')
plt.show()
# scatter plot
# line plot
# Source: https://towardsdatascience.com/100-stacked-charts-in-python-6ca3e1962d2b
def _plot_deceased_share(pair_df, factor, x_label=None):
    """Draw a 100%-stacked bar chart of `deceased` per level of *factor*.

    Args:
        pair_df: DataFrame holding at least the columns *factor* and
            'deceased'.
        factor: Name of the column whose levels form the bars.
        x_label: Optional x-axis label; when None the label set by pandas
            is left untouched.

    Returns:
        The row-normalised crosstab (rows: levels of *factor*, columns:
        values of 'deceased') that was plotted.
    """
    shares = pd.crosstab(index=pair_df[factor],
                         columns=pair_df['deceased'],
                         normalize='index')
    shares.plot(kind='bar',
                stacked=True,
                colormap='tab10',
                figsize=(7, 5))
    if x_label is not None:
        plt.xlabel(x_label)
    plt.ylabel("Proportion")

    # Write each segment's percentage at the vertical centre of the segment:
    # the cumulative sum marks the segment top, so top - height + height/2.
    for bar_pos, level in enumerate(shares.index.values):
        row = shares.loc[level]
        for proportion, y_loc in zip(row, row.cumsum()):
            plt.text(x=bar_pos - 0.1,
                     y=(y_loc - proportion) + (proportion / 2),
                     s=f'{np.round(proportion*100, 1)}%',
                     color='black',
                     fontsize=10)
    plt.show()
    return shares


# NOTE(review): the source is truncated around these charts. The names `data`,
# `cross`, `data3` and the column lists for smoking/sex/anaemia/high_bp are
# reconstructed from the intact diabetes chart - confirm against the notebook.

# Deceased share by smoking status
data = cardio_df_g4.loc[:, ['deceased', 'smoking']].copy()
cross = _plot_deceased_share(data, 'smoking', "Smoking")

# Deceased share by sex
data1 = cardio_df_g4.loc[:, ['deceased', 'sex']].copy()
cross1 = _plot_deceased_share(data1, 'sex', "Gender")

# Deceased share by diabetes
data2 = cardio_df_g4.loc[:, ['deceased', 'diabetes']].copy()
cross2 = _plot_deceased_share(data2, 'diabetes', "Diabetes")

# Deceased share by anaemia
data3 = cardio_df_g4.loc[:, ['deceased', 'anaemia']].copy()
cross3 = _plot_deceased_share(data3, 'anaemia', "Anaemia")

# Deceased share by high blood pressure (the visible source sets no x label)
data4 = cardio_df_g4.loc[:, ['deceased', 'high_bp']].copy()
cross4 = _plot_deceased_share(data4, 'high_bp')
# List the available columns before choosing the model inputs.
cardio_df_g4.columns
# Select the feature matrix and the target vector as NumPy arrays.
# NOTE(review): `feature_df` is not defined anywhere in the visible source;
# presumably a column subset of cardio_df_g4 (the shape comments below
# mention 9 columns) - confirm.
x = np.asarray(feature_df)
y = np.asarray(cardio_df_g4['deceased'])
feature_df.head()
x[0:5]
y[0:5]
# NOTE(review): X_train, X_test, y_train, y_test and `classifier` are not
# created in the visible source; presumably a train/test split and a model
# construction were dropped before this point - confirm.
# 546 x 9
X_train.shape
# 546 x 1
y_train.shape
# 137 x 9
X_test.shape
# 137 x 1
y_test.shape
# Fit on the training split and report classifier.score() on the test split.
classifier.fit(X_train, y_train)
print(f'Score: {classifier.score(X_test,y_test)}')
def _fit_single_feature_tree(feature, max_depth=5):
    """Fit a decision tree on one column and build its Graphviz rendering.

    Reads the module-level names `cardio_df_g4`, `tree` and `train_labels`.
    NOTE(review): `tree` (presumably `from sklearn import tree`) and
    `train_labels` are not defined in the visible source - confirm they are
    created earlier in the notebook.

    Args:
        feature: Name of the cardio_df_g4 column used as the only predictor.
        max_depth: Maximum depth passed to DecisionTreeClassifier.

    Returns:
        Tuple ``(train_data, model, dot, graph)``: the one-column training
        frame, the fitted classifier, the DOT source string and the
        ``graphviz.Source`` object built from it.
    """
    import graphviz

    train_data = pd.DataFrame()
    train_data[feature] = cardio_df_g4[feature]
    model = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    model.fit(train_data, train_labels)
    # NOTE(review): export_graphviz matches class_names to the label values in
    # ascending order; if the labels are 0/1 with 1 meaning deceased, "Dead"
    # and "Survived" are swapped here - confirm the encoding of train_labels.
    dot = tree.export_graphviz(model, out_file=None,
                               feature_names=[feature],
                               class_names=["Dead", "Survived"],
                               filled=True,
                               rounded=True,
                               special_characters=True)
    return train_data, model, dot, graphviz.Source(dot)


# 14. Decision Tree – Decrease by age
# NOTE(review): the training lines for age are missing from the visible
# source; max_depth=5 is assumed from the sibling sections - confirm.
train_data_age, classifier, dot_data, graph_age = \
    _fit_single_feature_tree('age')
graph_age

# Decision tree on sex
train_data_sex, classifier, dot_data, graph_sex = \
    _fit_single_feature_tree('sex')
graph_sex

# Decision tree on diabetes
train_data_diabetes, classifier, dot_data, graph_diabetes = \
    _fit_single_feature_tree('diabetes')
graph_diabetes

# 17. Decision Tree – Decrease by anaemia
train_data_anaemia, classifier, dot_data, graph_anaemia = \
    _fit_single_feature_tree('anaemia')
graph_anaemia

# Decision tree on cpk
train_data_cpk, classifier, dot_data, graph_cpk = \
    _fit_single_feature_tree('cpk')
graph_cpk

# Decision tree on ejection_fraction
(train_data_ejection_fraction, classifier, dot_data,
 graph_ejection_fraction) = _fit_single_feature_tree('ejection_fraction')
graph_ejection_fraction

# 20. Decision Tree – Decrease by high_bp
train_data_high_bp, classifier, dot_data, graph_high_bp = \
    _fit_single_feature_tree('high_bp')
graph_high_bp

# 21. Decision Tree – Decrease by platelets
train_data_platelets, classifier, dot_data, graph_platelets = \
    _fit_single_feature_tree('platelets')
graph_platelets

# Decision tree on serum_creatinine (the source uses max_depth=3 here).
# The source stored this graph in `graph_high_bp`, overwriting the high_bp
# tree; it now gets its own name.
(train_data_serum_creatinine, classifier, dot_data,
 graph_serum_creatinine) = _fit_single_feature_tree('serum_creatinine',
                                                    max_depth=3)
graph_serum_creatinine

# Decision tree on serum_sodium
train_data_serum_sodium, classifier, dot_data, graph_serum_sodium = \
    _fit_single_feature_tree('serum_sodium')
graph_serum_sodium

# 24. Decision Tree – Decrease by fup_days
train_data_fup_days, classifier, dot_data, graph_fup_days = \
    _fit_single_feature_tree('fup_days')
graph_fup_days