Q1 Statistics

You might also like

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 1

import numpy as np

import pandas as pd

# Q1: one-hot encoding, manual mean/variance, and z-score normalization
# of the Auto MPG data set.

# Discrete attributes that get one-hot encoded in part a).
DISCRETE_COLUMNS = ["cylinders", "model year", "origin"]

# Numerical attributes used for the statistics in parts b) and c).
NUMERIC_COLUMNS = ["mpg", "displacement", "horsepower", "weight", "acceleration"]


def mean_and_variance(data):
    """Return the per-column mean and population variance of *data*.

    Both statistics are computed "by hand" (sum divided by the number of
    rows) rather than with ``np.mean`` / ``np.var``.

    Args:
        data: 2-D array-like of numbers, one row per sample and one column
            per feature.

    Returns:
        A ``(mean_vector, variance_vector)`` tuple of 1-D arrays with one
        entry per column. The variance uses the divisor ``n`` (population
        variance).

    Raises:
        ValueError: if *data* has no rows.
    """
    data = np.asarray(data, dtype=float)
    n = data.shape[0]
    if n == 0:
        raise ValueError("cannot compute mean and variance of empty data")
    mean_vector = np.sum(data, axis=0) / n
    variance_vector = np.sum(np.square(data - mean_vector), axis=0) / n
    return mean_vector, variance_vector


def normalize(data, mean_vector, variance_vector):
    """Return the z-score normalization ``(data - mean) / std`` of *data*.

    Args:
        data: 2-D array-like of numbers, one column per feature.
        mean_vector: per-column means.
        variance_vector: per-column variances.

    Returns:
        An array of the same shape as *data*. Columns whose variance is
        zero are only centred (every entry becomes 0) instead of being
        divided by zero.
    """
    std_vector = np.sqrt(np.asarray(variance_vector, dtype=float))
    # A constant column has std 0; dividing by 1 instead keeps it at 0.
    std_vector = np.where(std_vector == 0, 1.0, std_vector)
    # The parentheses matter: the mean must be subtracted BEFORE dividing.
    return (np.asarray(data, dtype=float) - mean_vector) / std_vector


def main():
    """Run parts a), b) and c) on ``autoMPG.csv`` and print the results."""
    # a) One-hot encode the discrete attributes and save the result.
    autoMPG = pd.read_csv("autoMPG.csv")
    autoMPG = pd.get_dummies(autoMPG, columns=DISCRETE_COLUMNS)
    autoMPG.to_csv('autoMPGmodified.csv', index=False)
    print("Applied one hot encoding and saved in a new file!")
    # Since there are no non-numeric ordinal attributes, no integer mapping
    # has been applied.

    # b) Calculating mean and variance manually without inbuilt functions.
    # A few entries in the column horsepower are '?' and need to be removed.
    # .copy() gives an independent frame so the assignment below does not
    # write into a slice of the original.
    autoMPG = autoMPG[autoMPG['horsepower'] != '?'].copy()
    # Converting object type to float type.
    autoMPG['horsepower'] = autoMPG['horsepower'].astype(float)
    data = autoMPG[NUMERIC_COLUMNS].values

    meanVector, varianceVector = mean_and_variance(data)
    for i, (mean, variance) in enumerate(zip(meanVector, varianceVector), start=1):
        print(f"Feature {i} - Mean (x̄): {mean}, Variance (σ^2): {variance}")

    # c) Normalizing the data and calculating the variance again for the
    # normalized data. The same divisor (n) is used as in part b), so a
    # correctly normalized column reports a variance of 1.
    normalizedData = normalize(data, meanVector, varianceVector)
    print(f'\nNormalized data:\n {normalizedData}')
    _, varianceVectorNorm = mean_and_variance(normalizedData)
    print(f'\nNormalized data variance:\n {varianceVectorNorm}')


if __name__ == "__main__":
    main()

You might also like