Professional Documents
Culture Documents
The Goal of This Part Is To Use Descriptive Statistics and Visualization To Better Understand Your Data
The Goal of This Part Is To Use Descriptive Statistics and Visualization To Better Understand Your Data
The Goal of This Part Is To Use Descriptive Statistics and Visualization To Better Understand Your Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw dataset. `path` is not defined in the visible part of the
# script and must be set beforehand. The first CSV column becomes the index.
data = pd.read_csv(path, index_col=0, encoding="ISO-8859-1")

# Quick look at the first and last rows.
data.head()
data.tail()

# Every later step refers to `df`, which is never assigned in the visible
# code. Working on a copy keeps the raw `data` frame untouched.
# NOTE(review): assumes `df` is meant to be a copy of `data` - confirm.
df = data.copy()

# (rows, columns)
df.shape
# ---------------------------------------------------------------------------
# Column types: list the dtype of every column, then count how many columns
# share each dtype and show that breakdown as a pie chart.
# ---------------------------------------------------------------------------
column_types = df.dtypes
print(column_types)

type_counts = df.dtypes.value_counts()
print(type_counts)
type_counts.plot.pie()
#################################
# Missing values.
# df.isna() returns a frame of booleans: True where a value is missing,
# False otherwise.
df.isna()

# Heatmap of the boolean mask to visualise where the missing values are.
plt.figure(figsize=(20, 10))
sns.heatmap(df.isna(), cbar=False)

# Number of missing values per column.
df.isna().sum()

# Fraction of missing values per column, sorted from least to most missing.
(df.isna().sum() / df.shape[0]).sort_values(ascending=True)

# Keep the per-column missing fraction for later use.
missing_rate = df.isna().sum() / df.shape[0]
# In-depth analysis (original heading: "Analyse de Fond").
# Keep only the columns whose fraction of missing values is below 0.9.
# The mask is computed once and reused in the step-by-step expressions below.
keep_mask = df.isna().sum() / df.shape[0] < 0.9

# Step by step: boolean mask -> selected column labels -> selected sub-frame.
keep_mask
df.columns[keep_mask]
df[df.columns[keep_mask]]

# Apply the filter for real.
df = df[df.columns[keep_mask]]
df.head()

# Re-draw the missing-value heatmap on the filtered frame.
plt.figure(figsize=(20, 10))
sns.heatmap(df.isna(), cbar=False)
# Drop the 'Patient ID' column from the frame.
df = df.drop('Patient ID', axis=1)

# Relative frequency of each value in the 'SARS-Cov-2 exam result' column.
df['SARS-Cov-2 exam result'].value_counts(normalize=True)
# Float columns: print each column name and plot its distribution in a new
# figure.
# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
# here so the plots match what the original code produced.
for col in df.select_dtypes('float'):
    print(col)
    plt.figure()
    sns.distplot(df[col])

# Print the distinct values of each column and draw a pie chart of their
# counts.
# NOTE(review): the source shows these three statements directly after the
# float loop with all indentation lost. Listing unique values and drawing a
# pie chart only makes sense for categorical data, so a second loop over
# 'object' columns is assumed here - confirm against the original notebook.
for col in df.select_dtypes('object'):
    print(col, df[col].unique())
    plt.figure()
    df[col].value_counts().plot.pie()
# Recompute the per-column missing fraction on the current frame: columns
# have been removed since `missing_rate` was first computed, and the boolean
# masks below must line up with `df.columns`.
missing_rate = df.isna().sum() / df.shape[0]

# Group columns by missing-rate band:
#   blood_columns: strictly between 0.88 and 0.90
#   viral_columns: strictly between 0.75 and 0.88
blood_columns = df.columns[(missing_rate < 0.9) & (missing_rate > 0.88)]
viral_columns = df.columns[(missing_rate < 0.88) & (missing_rate > 0.75)]
# For each column, draw an annotated heatmap of the contingency table between
# 'SARS-Cov-2 exam result' and that column (integer counts, fmt='d').
# NOTE(review): the source uses `col` here without any visible loop header.
# Iterating over `viral_columns` (defined just above and otherwise unused)
# is an assumption - confirm against the original notebook.
for col in viral_columns:
    plt.figure()
    sns.heatmap(
        pd.crosstab(df['SARS-Cov-2 exam result'], df[col]),
        annot=True,
        fmt='d',
    )