Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 4

Pre-Processing

Example – 1:

import pandas as pd

# Load the raw job-listings dataset.
# Raw string avoids accidental escape sequences in the Windows path
# (a plain string containing e.g. "\t" or "\n" would silently corrupt it).
data = pd.read_csv(r"J:\Machine Learning\Class\Practical\Practical_1\RawData.csv")

# Basic shape / schema inspection.
print(data.columns)
print(len(data.columns))
print(len(data))
print(data.dtypes)

# Missing-value audit (checked once; the original printed it twice).
print(data.isnull().values.any())
print("\nTotal empty cells by column :\n", data.isnull().sum(), "\n\n")

# Cardinality of the categorical columns (the duplicate count print
# of unique salaries has been dropped).
print("\n\nNumber of Unique Locations : ", len(data['Location'].unique()))
print("\n\nNumber of Unique Salaries : ", len(data['Salary'].unique()))
print("\n\nUnique Salaries:\n", data['Salary'].unique())

# Cleaning the Experience column: strings like "2 - 5 yrs" become
# numeric minimum/maximum years of experience.
exp = list(data.Experience)
print(exp)

min_ex = []
max_ex = []
for raw in data.Experience:
    # Strip the "yrs" suffix, then split the "min - max" range once
    # (the original split the same string twice per row).
    parts = raw.replace("yrs", "").strip().split("-")
    min_ex.append(int(parts[0].strip()))
    max_ex.append(int(parts[1].strip()))

# Attach the parsed experience bounds to the original dataset.
data["minimum_exp"] = min_ex
data["maximum_exp"] = max_ex

# Label-encode the categorical columns. One encoder per column: the
# original reused a single LabelEncoder, whose second fit_transform
# overwrote the Location mapping, making a later inverse_transform of
# Location impossible.
from sklearn.preprocessing import LabelEncoder

location_encoder = LabelEncoder()
salary_encoder = LabelEncoder()
data['Location'] = location_encoder.fit_transform(data['Location'])
data['Salary'] = salary_encoder.fit_transform(data['Salary'])

print(data['Location'])
print(data['Salary'])

print(data)

# Pull the processed columns back out so they can be re-assembled
# into a fresh DataFrame.
Index = data['Index']
Company = data['Company']
Location = data['Location']
Salary = data['Salary']
minimum_exp = data['minimum_exp']
maximum_exp = data['maximum_exp']

# Dictionary of columns — renamed from 'dict' so the builtin dict
# type is no longer shadowed.
columns = {'Index': Index, 'Company': Company, 'Location': Location,
           'Salary': Salary, 'minimum_exp': minimum_exp,
           'maximum_exp': maximum_exp}
df = pd.DataFrame(columns)

# Saving the processed dataframe. Raw strings avoid escape-sequence
# surprises in the Windows paths.
# NOTE(review): to_csv without index=False writes the row index as an
# extra "Unnamed: 0" column, which the re-read below picks up as a
# feature — behavior kept as-is, but likely unintended; confirm.
df.to_csv(r'J:\Machine Learning\Class\Practical\Practical_1\File4.csv')

# Read the new dataset back for the modelling step.
data = pd.read_csv(r"J:\Machine Learning\Class\Practical\Practical_1\File4.csv")


# Splitting the dataset into training and validation sets.
from sklearn.model_selection import train_test_split

training_set, validation_set = train_test_split(data, test_size=0.2, random_state=21)

# Classifying the predictors and target variable as X and Y:
# every column except the last is a feature; the last is the target.
X_train = training_set.iloc[:, 0:-1].values
Y_train = training_set.iloc[:, -1].values
X_val = validation_set.iloc[:, 0:-1].values
y_val = validation_set.iloc[:, -1].values

# The originals were bare expressions — no-ops outside a notebook/REPL.
# Print them so the inspection actually produces output in a script.
print(X_train)
print(Y_train)
print(X_val)
print(y_val)

def accuracy(confusion_matrix):
    """Return overall accuracy computed from a confusion matrix.

    Accuracy is the count of correct predictions (the diagonal of the
    matrix) divided by the total number of predictions (all cells).
    """
    return confusion_matrix.trace() / confusion_matrix.sum()

# Gaussian Naive Bayes: fit on the training split, then evaluate on
# the held-out validation split via a confusion matrix.
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Build and fit the classifier on the training data.
classifier = GaussianNB()
classifier.fit(X_train, Y_train)

# Predict the validation targets.
y_pred = classifier.predict(X_val)

# Generate and report the confusion matrix and overall accuracy.
cm = confusion_matrix(y_val, y_pred)
print(cm)
print("__ACCURACY = ", accuracy(cm))

Example – 2:

import pandas as pd

# Load the example dataset. Raw string: the original mixed "\\" and
# "\" escaping in the same path, which only worked by accident.
dataset = pd.read_csv(r'J:\Machine Learning\Class\Practical\Preprocessing\Data1.csv')

print(dataset.columns)
print(dataset)  # was a bare expression — a no-op outside a notebook
dataset.info()

# Creating the independent variables: all rows, every column but the last.
X = dataset.iloc[:, :-1].values
# Creating the dependent variable: all rows of the last column.
Y = dataset.iloc[:, -1].values

# Dealing with missing values via mean imputation.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the replacement. It takes np.nan (not the string
# 'NaN') and has no 'axis' argument — it always imputes column-wise,
# which is what axis=0 meant here.
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

You might also like