
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import seaborn as sns
sns.set()

# Read csv file containing training data
train_df = pd.read_csv("D:/AI for Medical Diagnosis/train-small.csv")
# Print first 5 rows
print(f'There are {train_df.shape[0]} rows and {train_df.shape[1]} columns in this data frame')
train_df.head()

# Look at the data type of each column and whether null values are present
train_df.info()
print(f"The total patient ids are {train_df['PatientId'].count()}, from those the unique ids are {train_df['PatientId'].value_counts().shape[0]}")
columns = train_df.keys()
columns = list(columns)
print(columns)
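As a quick sanity check (an added note, not part of the original snippet), the null-value question that info() answers can also be made explicit:

# Optional explicit count of missing values per column
print(train_df.isnull().sum())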
# Remove unnecessary elements
columns.remove('Image')
columns.remove('PatientId')
# Get the total classes
print(f"There are {len(columns)} columns of labels for these conditions: {columns}")

# Print out the number of positive labels for each class
for column in columns:
    print(f"The class {column} has {train_df[column].sum()} samples")

# Extract numpy values from Image column in data frame
images = train_df['Image'].values

# Extract 9 random images from it
random_images = [np.random.choice(images) for i in range(9)]

# Location of the image dir
img_dir = 'nih/images-small/'

print('Display Random Images')

# Adjust the size of your images
plt.figure(figsize=(20, 10))

# Iterate and plot random images
for i in range(9):
    plt.subplot(3, 3, i + 1)
    img = plt.imread(os.path.join(img_dir, random_images[i]))
    plt.imshow(img, cmap='gray')
    plt.axis('off')

# Adjust subplot parameters to give specified padding
plt.tight_layout()

# Get the first image that was listed in the train_df dataframe
sample_img = train_df.Image[0]
raw_image = plt.imread(os.path.join(img_dir, sample_img))
plt.imshow(raw_image, cmap='gray')
plt.colorbar()
plt.title('Raw Chest X Ray Image')
print(f"The dimensions of the image are {raw_image.shape[0]} pixels width and {raw_image.shape[1]} pixels height, one single color channel")
print(f"The maximum pixel value is {raw_image.max():.4f} and the minimum is {raw_image.min():.4f}")
print(f"The mean value of the pixels is {raw_image.mean():.4f} and the standard deviation is {raw_image.std():.4f}")

# Plot a histogram of the distribution of the pixels
sns.distplot(raw_image.ravel(),
             label=f'Pixel Mean {np.mean(raw_image):.4f} & Standard Deviation {np.std(raw_image):.4f}',
             kde=False)
plt.legend(loc='upper center')
plt.title('Distribution of Pixel Intensities in the Image')
plt.xlabel('Pixel Intensity')
plt.ylabel('# Pixels in Image')

# Import data generator from keras
from keras.preprocessing.image import ImageDataGenerator
Using TensorFlow backend.

# Normalize images
image_generator = ImageDataGenerator(
    samplewise_center=True,             # Set each sample mean to 0.
    samplewise_std_normalization=True   # Divide each input by its standard deviation
)
# Flow from directory with specified batch size and target image size
generator = image_generator.flow_from_dataframe(
    dataframe=train_df,
    directory="nih/images-small/",
    x_col="Image",            # features
    y_col=['Mass'],           # labels
    class_mode="raw",         # 'Mass' column should be in train_df
    batch_size=1,             # images per batch
    shuffle=False,            # shuffle the rows or not
    target_size=(320, 320)    # width and height of output image
)

# Plot a processed image
sns.set_style("white")
generated_image, label = generator.__getitem__(0)
plt.imshow(generated_image[0], cmap='gray')
plt.colorbar()
plt.title('Raw Chest X Ray Image')
print(f"The dimensions of the image are {generated_image.shape[1]} pixels width and {generated_image.shape[2]} pixels height")
print(f"The maximum pixel value is {generated_image.max():.4f} and the minimum is {generated_image.min():.4f}")
print(f"The mean value of the pixels is {generated_image.mean():.4f} and the standard deviation is {generated_image.std():.4f}")

# Include a histogram of the distribution of the pixels
sns.set()
plt.figure(figsize=(10, 7))

# Plot histogram for original image
sns.distplot(raw_image.ravel(),
             label=f'Original Image: mean {np.mean(raw_image):.4f} - Standard Deviation {np.std(raw_image):.4f} \n'
                   f'Min pixel value {np.min(raw_image):.4} - Max pixel value {np.max(raw_image):.4}',
             color='blue',
             kde=False)

# Plot histogram for generated image
sns.distplot(generated_image[0].ravel(),
             label=f'Generated Image: mean {np.mean(generated_image[0]):.4f} - Standard Deviation {np.std(generated_image[0]):.4f} \n'
                   f'Min pixel value {np.min(generated_image[0]):.4} - Max pixel value {np.max(generated_image[0]):.4}',
             color='red',
             kde=False)

# Place legends
plt.legend()
plt.title('Distribution of Pixel Intensities in the Image')
plt.xlabel('Pixel Intensity')
plt.ylabel('# Pixel')

Counting labels
# Import the necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read csv file containing training data
train_df = pd.read_csv("nih/train-small.csv")

# Count up the number of instances of each class (drop non-class columns from the counts)
class_counts = train_df.sum().drop(['Image', 'PatientId'])
for column in class_counts.keys():
    print(f"The class {column} has {train_df[column].sum()} samples")

# Plot up the distribution of counts
sns.barplot(class_counts.values, class_counts.index, color='b')
plt.title('Distribution of Classes for Training Dataset', fontsize=15)
plt.xlabel('Number of Patients', fontsize=15)
plt.ylabel('Diseases', fontsize=15)
plt.show()

# Generate an array of 4 binary label values, 3 positive and 1 negative
y_true = np.array(
    [[1],
     [1],
     [1],
     [0]])
print(f"y_true: \n{y_true}")

# Make model predictions that are always 0.9 for all examples
y_pred_1 = 0.9 * np.ones(y_true.shape)
print(f"y_pred_1: \n{y_pred_1}")
print()

y_pred_2 = 0.1 * np.ones(y_true.shape)
print(f"y_pred_2: \n{y_pred_2}")

loss_reg_1 = -1 * np.sum(y_true * np.log(y_pred_1)) + \
             -1 * np.sum((1 - y_true) * np.log(1 - y_pred_1))
print(f"loss_reg_1: {loss_reg_1:.4f}")

loss_reg_2 = -1 * np.sum(y_true * np.log(y_pred_2)) + \
             -1 * np.sum((1 - y_true) * np.log(1 - y_pred_2))
print(f"loss_reg_2: {loss_reg_2:.4f}")
print(f"When the model 1 always predicts 0.9, the regular # Calculate and print out the second term in the loss func on,
loss is {loss_reg_1:.4f}") which we're calling 'loss_neg'
print(f"When the model 2 always predicts 0.1, the regular loss_2_neg = -1 * np.sum(w_n * (1 - y_true) * np.log(1 -
loss is {loss_reg_2:.4f}") y_pred_2))
# calculate the posi ve weight as the frac on of nega ve print(f"loss_2_neg: {loss_2_neg:.4f}")
labels # Sum posi ve and nega ve losses to calculate total loss when
w_p = 1/4 the predic on is y_pred_2
loss_2 = loss_2_pos + loss_2_neg
# calculate the nega ve weight as the frac on of posi ve print(f"loss_2: {loss_2:.4f}")
labels print(f"When the model always predicts 0.9, the total loss is
w_n = 3/4 {loss_1:.4f}")
print(f"When the model always predicts 0.1, the total loss is
print(f"posi ve weight w_p: {w_p}") {loss_2:.4f}")
print(f"nega ve weight w_n {w_n}") print(f"loss_1_pos: {loss_1_pos:.4f} \t loss_1_neg:
# Calculate and print out the first term in the loss func on, {loss_1_neg:.4f}")
which we are calling 'loss_pos' print()
loss_1_pos = -1 * np.sum(w_p * y_true * np.log(y_pred_1 print(f"loss_2_pos: {loss_2_pos:.4f} \t loss_2_neg:
)) {loss_2_neg:.4f}")
print(f"loss_1_pos: {loss_1_pos:.4f}") # View the labels (true values) that you will prac ce with
# Calculate and print out the second term in the loss y_true = np.array(
func on, which we're calling 'loss_neg' [[1,0],
loss_1_neg = -1 * np.sum(w_n * (1 - y_true) * np.log(1 - [1,0],
y_pred_1 )) [1,0],
print(f"loss_1_neg: {loss_1_neg:.4f}") [1,0],
# Sum posi ve and nega ve losses to calculate total loss [0,1]
loss_1 = loss_1_pos + loss_1_neg ])
print(f"loss_1: {loss_1:.4f}") y_true
# Calculate and print out the first term in the loss func on, # See what happens when you set axis=0
which we are calling 'loss_pos' print(f"using axis = 0 {np.sum(y_true,axis=0)}")
loss_2_pos = -1 * np.sum(w_p * y_true * np.log(y_pred_2))
print(f"loss_2_pos: {loss_2_pos:.4f}") # Compare this to what happens when you set axis=1
print(f"using axis = 1 {np.sum(y_true,axis=1)}")

# set the positive weights as the fraction of negative labels (0) for each class (each column)
w_p = np.sum(y_true == 0, axis=0) / y_true.shape[0]
w_p

# set the negative weights as the fraction of positive labels (1) for each class
w_n = np.sum(y_true == 1, axis=0) / y_true.shape[0]
w_n

# Set model predictions where all predictions are the same
y_pred = np.ones(y_true.shape)
y_pred[:, 0] = 0.3 * y_pred[:, 0]
y_pred[:, 1] = 0.7 * y_pred[:, 1]
y_pred

# Print and view column zero of the weight
print(f"w_p[0]: {w_p[0]}")
print(f"y_true[:,0]: {y_true[:,0]}")
print(f"y_pred[:,0]: {y_pred[:,0]}")

# calculate the loss from the positive predictions, for class 0
loss_0_pos = -1 * np.sum(w_p[0] *
                         y_true[:, 0] *
                         np.log(y_pred[:, 0]))
print(f"loss_0_pos: {loss_0_pos:.4f}")

# Print and view column zero of the weight
print(f"w_n[0]: {w_n[0]}")
print(f"y_true[:,0]: {y_true[:,0]}")
print(f"y_pred[:,0]: {y_pred[:,0]}")

# Calculate the loss from the negative predictions, for class 0
loss_0_neg = -1 * np.sum(w_n[0] *
                         (1 - y_true[:, 0]) *
                         np.log(1 - y_pred[:, 0]))
print(f"loss_0_neg: {loss_0_neg:.4f}")

# add the two loss terms to get the total loss for class 0
loss_0 = loss_0_neg + loss_0_pos
print(f"loss_0: {loss_0:.4f}")

# calculate the loss from the positive predictions, for class 1
loss_1_pos = None
# Calculate the loss from the negative predictions, for class 1
loss_1_neg = None
# add the two loss terms to get the total loss for class 1
loss_1 = None

# calculate the loss from the positive predictions, for class 1
loss_1_pos = -1 * np.sum(w_p[1] * y_true[:, 1] * np.log(y_pred[:, 1]))
print(f"loss_1_pos: {loss_1_pos:.4f}")

# Calculate the loss from the negative predictions, for class 1
loss_1_neg = -1 * np.sum(w_n[1] * (1 - y_true[:, 1]) * np.log(1 - y_pred[:, 1]))
print(f"loss_1_neg: {loss_1_neg:.4f}")

# add the two loss terms to get the total loss for class 1
loss_1 = loss_1_neg + loss_1_pos
print(f"loss_1: {loss_1:.4f}")
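The class-0 and class-1 computations above follow the same pattern, so a compact helper (a sketch added here, not the notebook's own code) can compute the weighted loss for every class at once; it reuses the w_p, w_n, y_true and y_pred arrays defined above:

# Vectorized per-class weighted loss (illustrative generalization of the steps above)
def weighted_loss_np(y_true, y_pred, w_p, w_n):
    pos_term = -np.sum(w_p * y_true * np.log(y_pred), axis=0)
    neg_term = -np.sum(w_n * (1 - y_true) * np.log(1 - y_pred), axis=0)
    return pos_term + neg_term

per_class_loss = weighted_loss_np(y_true, y_pred, w_p, w_n)
print(f"per-class weighted loss: {per_class_loss}")   # should reproduce loss_0 and loss_1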

# Import Densenet from Keras
from keras.applications.densenet import DenseNet121
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from keras import backend as K
Using TensorFlow backend.

# Create the base pre-trained model
base_model = DenseNet121(weights='./nih/densenet.hdf5', include_top=False)

# Print the model summary
base_model.summary()

# Print out the first five layers
layers_l = base_model.layers

# Print out the last five layers
print("Last 5 layers")
layers_l[-6:-1]

print("First 5 layers")
layers_l[0:5]

# Get the convolutional layers and print the first 5
conv2D_layers = [layer for layer in base_model.layers
                 if str(type(layer)).find('Conv2D') > -1]
print("The first five conv2D layers")
conv2D_layers[0:5]

# Print out the total number of convolutional layers
print(f"There are {len(conv2D_layers)} convolutional layers")

# Print the number of channels in the input
print("The input has 3 channels")
base_model.input

# Print the number of output channels
print("The output has 1024 channels")
x = base_model.output
x

# Add a global spatial average pooling layer
x_pool = GlobalAveragePooling2D()(x)
x_pool

# Define a set of five class labels to use as an example
labels = ['Emphysema',
          'Hernia',
          'Mass',
          'Pneumonia',
          'Edema']
n_classes = len(labels)
print(f"In this example, you want your model to identify {n_classes} classes")

# Add a logistic layer the same size as the number of classes you're trying to predict
predictions = Dense(n_classes, activation="sigmoid")(x_pool)
print(f"Predictions have {n_classes} units, one for each class")
predictions

# Create an updated model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy')
# (You'll customize the loss function in the assignment!)
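The cell above compiles with plain categorical cross-entropy and notes that the loss gets customized in the assignment. As a rough sketch only (the names pos_weights and neg_weights are assumptions, and this is not the assignment's reference solution), a class-weighted loss like the one worked through earlier can be wrapped so Keras accepts it in compile:

# Sketch of a class-weighted binary cross-entropy usable as a Keras loss
def get_weighted_loss(pos_weights, neg_weights, epsilon=1e-7):
    def weighted_loss(y_true, y_pred):
        loss = 0.0
        for i in range(len(pos_weights)):
            loss += K.mean(
                -(pos_weights[i] * y_true[:, i] * K.log(y_pred[:, i] + epsilon)
                  + neg_weights[i] * (1 - y_true[:, i]) * K.log(1 - y_pred[:, i] + epsilon)))
        return loss
    return weighted_loss

# Example usage (pos_weights / neg_weights would come from the label frequencies):
# model.compile(optimizer='adam', loss=get_weighted_loss(pos_weights, neg_weights))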

Patient Overlap and Data Leakage

# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import seaborn as sns
sns.set()

# Read csv file containing training data
train_df = pd.read_csv("nih/train-small.csv")
# Print first 5 rows
print(f'There are {train_df.shape[0]} rows and {train_df.shape[1]} columns in the training dataframe')
train_df.head()

# Read csv file containing validation data
valid_df = pd.read_csv("nih/valid-small.csv")
# Print first 5 rows
print(f'There are {valid_df.shape[0]} rows and {valid_df.shape[1]} columns in the validation dataframe')
valid_df.head()
# Extract patient id's for the training set
ids_train = train_df.PatientId.values
# Extract patient id's for the validation set
ids_valid = valid_df.PatientId.values

# Create a "set" datastructure of the training set id's to identify unique id's
ids_train_set = set(ids_train)
print(f'There are {len(ids_train_set)} unique Patient IDs in the training set')
# Create a "set" datastructure of the validation set id's to identify unique id's
ids_valid_set = set(ids_valid)
print(f'There are {len(ids_valid_set)} unique Patient IDs in the validation set')

# Identify patient overlap by looking at the intersection between the sets
patient_overlap = list(ids_train_set.intersection(ids_valid_set))
n_overlap = len(patient_overlap)
print(f'There are {n_overlap} Patient IDs in both the training and validation sets')
print('')
print(f'These patients are in both the training and validation datasets:')
print(f'{patient_overlap}')

train_overlap_idxs = []
valid_overlap_idxs = []
for idx in range(n_overlap):
    train_overlap_idxs.extend(train_df.index[train_df['PatientId'] == patient_overlap[idx]].tolist())
    valid_overlap_idxs.extend(valid_df.index[valid_df['PatientId'] == patient_overlap[idx]].tolist())

print(f'These are the indices of overlapping patients in the training set: ')
print(f'{train_overlap_idxs}')
print(f'These are the indices of overlapping patients in the validation set: ')
print(f'{valid_overlap_idxs}')

# Drop the overlapping rows from the validation set
valid_df.drop(valid_overlap_idxs, inplace=True)

# Extract patient id's for the validation set
ids_valid = valid_df.PatientId.values
# Create a "set" datastructure of the validation set id's to identify unique id's
ids_valid_set = set(ids_valid)
print(f'There are {len(ids_valid_set)} unique Patient IDs in the validation set')

# Identify patient overlap by looking at the intersection between the sets
patient_overlap = list(ids_train_set.intersection(ids_valid_set))
n_overlap = len(patient_overlap)
print(f'There are {n_overlap} Patient IDs in both the training and validation sets')
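To make this leakage check easy to repeat after any re-split, the overlap logic can be folded into a small helper; the function below is a hypothetical addition for illustration, not part of the original notebook.

# Reusable overlap check (illustrative helper)
def patients_in_both(df1, df2, patient_col='PatientId'):
    return list(set(df1[patient_col]) & set(df2[patient_col]))

assert len(patients_in_both(train_df, valid_df)) == 0, "Patient overlap still present!"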
