Professional Documents
Culture Documents
20MIS1025 - Comparative Analysis - Ipynb - Colaboratory
20MIS1025 - Comparative Analysis - Ipynb - Colaboratory
ipynb - Colaboratory
import os
from pathlib import Path
import pandas as pd
df = pd.read_csv("KDD_Train.csv")
import warnings
warnings.filterwarnings('ignore')
df.shape
(125973, 42)
print(df.shape)
df.head(5)
(125973, 42)
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ...
5 rows × 42 columns
#DATA PREPROCESSING
# Map the strings 'normal' -> 0 and 'anomaly' -> 1 (presumably the values of
# the `class` label column - TODO confirm against the CSV).
# NOTE: replace() is applied to the whole frame, so a cell equal to either
# string in ANY column is rewritten, not only the label column.
df.replace(('normal','anomaly'), (0,1), inplace=True)
df.head(5)
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_srv_count dst_hos
5 rows × 42 columns
#CATEGORICAL FEATURES
# Report how many distinct values each string-typed (object) column holds,
# to see how wide the one-hot encoding of that column will be.
for column_name in df.columns:
    column = df[column_name]
    if column.dtypes != 'object':
        continue
    n_unique = len(column.unique())
    print(f"{column_name} has {n_unique} unique values. ")
#CONVERT CATEGORICAL DATA INTO BINARY VARIABLES BY ONE HOT ENCODING
df['protocol_type'].head(5)
0 tcp
1 udp
2 tcp
3 tcp
4 tcp
Name: protocol_type, dtype: object
df['protocol_type'].value_counts()
tcp 102689
udp 14993
icmp 8291
Name: protocol_type, dtype: int64
print(pd.get_dummies(df['protocol_type']).head(5))
https://colab.research.google.com/drive/1s4m7XDZg_PUoNMtOPJN9ZyExHaELvSQH 1/6
9/5/23, 10:14 PM 20MIS1025_comparative analysis.ipynb - Colaboratory
p (p g _ ( [ p _ yp ]) ( ))
def dummy_df(df):
    """One-hot encode the categorical columns of ``df``.

    Each of ``protocol_type``, ``service`` and ``flag`` is removed and
    replaced by one 0/1 indicator column per distinct value, appended on the
    right. Indicator columns are named after the raw value (no prefix), and
    NaN values get no indicator column of their own (``dummy_na=False``).

    Args:
        df: DataFrame containing the three categorical columns.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    todummy_list = ['protocol_type', 'service', 'flag']
    for x in todummy_list:
        # pandas >= 2.0 defaults get_dummies to bool; pin the historical
        # uint8 so the indicators stay numeric 0/1 on every version.
        dummies = pd.get_dummies(df[x], dummy_na=False, dtype='uint8')
        # `axis` can no longer be passed positionally (pandas 2.0), so name
        # the column to drop explicitly.
        df = df.drop(columns=x)
        # Append the indicator columns to the right of the remaining ones.
        df = pd.concat([df, dummies], axis=1)
    return df
#Appling one hot encoding function
df = dummy_df(df)
df.head(10)
duration src_bytes dst_bytes land wrong_fragment urgent hot num_failed_logins logged_in num_compromised ... REJ RSTO
0 0 491 0 0 0 0 0 0 0 0 ... 0 0
1 0 146 0 0 0 0 0 0 0 0 ... 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 1 0
6 0 0 0 0 0 0 0 0 0 0 ... 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0
8 0 0 0 0 0 0 0 0 0 0 ... 0 0
9 0 0 0 0 0 0 0 0 0 0 ... 0 0
# Checking how much of my data is missing?
df.isnull().sum().sort_values(ascending=False).head()
duration 0
red_i 0
printer 0
pop_3 0
pop_2 0
dtype: int64
# Fill any missing values with the column mean (sklearn.impute.SimpleImputer).
import numpy as np
from sklearn.impute import SimpleImputer
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# The imputer hands back a bare array, so rebuild the frame with the
# original column labels.
df = pd.DataFrame(data=imr.fit_transform(df), columns=df.columns)
df.isnull().sum().sort_values(ascending=False).head()
duration 0
red_i 0
printer 0
pop_3 0
pop_2 0
dtype: int64
# Separate the feature matrix from the target column `class`.
# `axis` can no longer be passed positionally to drop() (pandas 2.0), so the
# column is named via `columns=`.
X = df.drop(columns='class')
y = df['class']
X.shape
(125973, 122)
https://colab.research.google.com/drive/1s4m7XDZg_PUoNMtOPJN9ZyExHaELvSQH 2/6
9/5/23, 10:14 PM 20MIS1025_comparative analysis.ipynb - Colaboratory
#Split train and test dataset
# Hold out 25% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
# Dump each split for a quick visual check.
print(y_train)
print(X_train)
print(y_test)
print(X_test)
RSTR S0 S1 S2 S3 SF SH
58516 0.0 0.0 0.0 0.0 0.0 1.0 0.0
5800 0.0 0.0 0.0 0.0 0.0 1.0 0.0
109276 0.0 0.0 0.0 0.0 0.0 1.0 0.0
105855 0.0 0.0 0.0 0.0 0.0 1.0 0.0
112275 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ...
101554 0.0 0.0 0.0 0.0 0.0 1.0 0.0
45777 0.0 0.0 0.0 0.0 0.0 1.0 0.0
62341 0.0 0.0 0.0 0.0 0.0 1.0 0.0
66126 0.0 0.0 0.0 0.0 0.0 1.0 0.0
116409 0.0 0.0 0.0 0.0 0.0 1.0 0.0
# Train a linear perceptron with learning rate eta0=0.1 and a fixed seed.
# NOTE(review): no feature-scaling step is visible before this fit; a
# perceptron is usually sensitive to feature magnitude, so standardizing the
# inputs may be worth trying - TODO confirm.
from sklearn.linear_model import Perceptron
ppn = Perceptron(eta0=0.1, random_state=1)
ppn.fit(X_train, y_train)
▾ Perceptron
Perceptron(eta0=0.1, random_state=1)
from sklearn.metrics import accuracy_score
# Accuracy of the perceptron on the training split ...
predictions_train = ppn.predict(X_train)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
# ... and on the held-out split.
predictions_test = ppn.predict(X_test)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)
https://colab.research.google.com/drive/1s4m7XDZg_PUoNMtOPJN9ZyExHaELvSQH 3/6
9/5/23, 10:14 PM 20MIS1025_comparative analysis.ipynb - Colaboratory
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the perceptron on the held-out split.
# Layout: rows are true labels, columns are predicted labels, both in sorted
# label order.
y_pred = ppn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[ 812 16097]
[ 8120 6465]]
from sklearn.metrics import confusion_matrix
# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)
# Calculate the total number of actual positives (P).
# confusion_matrix puts true labels on rows and predictions on columns, in
# sorted label order, so for labels {0, 1} row 1 holds the actual positives
# (label 1): cm[1, 0] are false negatives and cm[1, 1] are true positives.
total_actual_positives = cm[1, 0] + cm[1, 1]
# True Positives (TP) as a percentage of total actual positives
true_positives_percentage = (cm[1, 1] / total_actual_positives) * 100
# False Negatives (FN) as a percentage of total actual positives
false_negatives_percentage = (cm[1, 0] / total_actual_positives) * 100
print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred): ground truth goes first so
# precision, recall and support are computed against the real labels.
# This report describes the training split.
print(classification_report(y_train, ppn.predict(X_train)))
# Train a decision tree that picks splits by entropy; random_state is fixed
# so repeated runs build the same tree.
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
▾ DecisionTreeClassifier
DecisionTreeClassifier(criterion='entropy', random_state=0)
from sklearn.metrics import accuracy_score
# Accuracy of the decision tree on the training split ...
predictions_train = classifier.predict(X_train)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
# ... and on the held-out split.
predictions_test = classifier.predict(X_test)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the decision tree on the held-out split.
# Layout: rows are true labels, columns are predicted labels, both in sorted
# label order.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[16884 25]
[ 21 14564]]
from sklearn.metrics import confusion_matrix
# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)
# Calculate the total number of actual positives (P)
https://colab.research.google.com/drive/1s4m7XDZg_PUoNMtOPJN9ZyExHaELvSQH 4/6
9/5/23, 10:14 PM 20MIS1025_comparative analysis.ipynb - Colaboratory
# confusion_matrix puts true labels on rows and predictions on columns, in
# sorted label order, so for labels {0, 1} row 1 holds the actual positives
# (label 1): cm[1, 0] are false negatives and cm[1, 1] are true positives.
total_actual_positives = cm[1, 0] + cm[1, 1]
# True Positives (TP) as a percentage of total actual positives
true_positives_percentage = (cm[1, 1] / total_actual_positives) * 100
# False Negatives (FN) as a percentage of total actual positives
false_negatives_percentage = (cm[1, 0] / total_actual_positives) * 100
print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred): ground truth goes first so
# precision, recall and support are computed against the real labels.
# This report describes the training split.
print(classification_report(y_train, classifier.predict(X_train)))
# Train a Gaussian naive Bayes classifier with its default settings.
from sklearn.naive_bayes import GaussianNB
classify = GaussianNB()
classify.fit(X_train, y_train)
▾ GaussianNB
GaussianNB()
from sklearn.metrics import accuracy_score
# Accuracy of the naive Bayes model on the training split ...
predictions_train = classify.predict(X_train)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
# ... and on the held-out split.
predictions_test = classify.predict(X_test)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of the naive Bayes model on the held-out split.
# Layout: rows are true labels, columns are predicted labels, both in sorted
# label order.
y_pred = classify.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[16641 268]
[14361 224]]
from sklearn.metrics import confusion_matrix
# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)
# Calculate the total number of actual positives (P).
# confusion_matrix puts true labels on rows and predictions on columns, in
# sorted label order, so for labels {0, 1} row 1 holds the actual positives
# (label 1): cm[1, 0] are false negatives and cm[1, 1] are true positives.
total_actual_positives = cm[1, 0] + cm[1, 1]
# True Positives (TP) as a percentage of total actual positives
true_positives_percentage = (cm[1, 1] / total_actual_positives) * 100
# False Negatives (FN) as a percentage of total actual positives
false_negatives_percentage = (cm[1, 0] / total_actual_positives) * 100
print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred): ground truth goes first so
# precision, recall and support are computed against the real labels.
# This report describes the training split.
print(classification_report(y_train, classify.predict(X_train)))
https://colab.research.google.com/drive/1s4m7XDZg_PUoNMtOPJN9ZyExHaELvSQH 5/6
9/5/23, 10:14 PM 20MIS1025_comparative analysis.ipynb - Colaboratory
https://colab.research.google.com/drive/1s4m7XDZg_PUoNMtOPJN9ZyExHaELvSQH 6/6