
20MIS1025_comparative analysis.ipynb - Colaboratory

import pandas as pd

# Load the KDD training set.
df = pd.read_csv("KDD_Train.csv")

import warnings
warnings.filterwarnings('ignore')

df.shape

(125973, 42)

print(df.shape)
df.head(5)

(125973, 42)
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ...

0 0 tcp ftp_data SF 491 0 0 0 0 0 ...

1 0 udp other SF 146 0 0 0 0 0 ...

2 0 tcp private S0 0 0 0 0 0 0 ...

3 0 tcp http SF 232 8153 0 0 0 0 ...

4 0 tcp http SF 199 420 0 0 0 0 ...

5 rows × 42 columns

#DATA PREPROCESSING
# Encode the target labels: 'normal' -> 0, 'anomaly' -> 1.
df.replace(('normal', 'anomaly'), (0, 1), inplace=True)
df.head(5)

duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_srv_count dst_hos

0 0 tcp ftp_data SF 491 0 0 0 0 0 ... 25

1 0 udp other SF 146 0 0 0 0 0 ... 1

2 0 tcp private S0 0 0 0 0 0 0 ... 26

3 0 tcp http SF 232 8153 0 0 0 0 ... 255

4 0 tcp http SF 199 420 0 0 0 0 ... 255

5 rows × 42 columns

#CATEGORICAL FEATURES
for column_name in df.columns:
    if df[column_name].dtype == 'object':
        unique_count = df[column_name].nunique()
        print(column_name + " has " + str(unique_count) + " unique values.")

protocol_type has 3 unique values.
service has 70 unique values.
flag has 11 unique values.
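For reference, the same check can be written more compactly with pandas' select_dtypes and nunique; a minimal equivalent sketch:

# Sketch: list the object-typed columns and their unique-value counts in one call.
cat_cols = df.select_dtypes(include='object').columns
print(df[cat_cols].nunique())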

#CONVERT CATEGORICAL DATA INTO BINARY VARIABLES BY ONE HOT ENCODING
df['protocol_type'].head(5)

0 tcp
1 udp
2 tcp
3 tcp
4 tcp
Name: protocol_type, dtype: object

df['protocol_type'].value_counts()

tcp 102689
udp 14993
icmp 8291
Name: protocol_type, dtype: int64

print(pd.get_dummies(df['protocol_type']).head(5))

icmp tcp udp


0 0 1 0
1 0 0 1
2 0 1 0
3 0 1 0
4 0 1 0

def dummy_df(df):
    todummy_list = ['protocol_type', 'service', 'flag']
    for x in todummy_list:
        # dummy_na=False: NaNs are ignored; no indicator column is added for them.
        dummies = pd.get_dummies(df[x], dummy_na=False)
        # Drop the original categorical column and append its dummy columns.
        df = df.drop(x, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df

# Applying the one-hot encoding function
df = dummy_df(df)
df.head(10)

duration src_bytes dst_bytes land wrong_fragment urgent hot num_failed_logins logged_in num_compromised ... REJ RSTO

0 0 491 0 0 0 0 0 0 0 0 ... 0 0

1 0 146 0 0 0 0 0 0 0 0 ... 0 0

2 0 0 0 0 0 0 0 0 0 0 ... 0 0

3 0 232 8153 0 0 0 0 0 1 0 ... 0 0

4 0 199 420 0 0 0 0 0 1 0 ... 0 0

5 0 0 0 0 0 0 0 0 0 0 ... 1 0

6 0 0 0 0 0 0 0 0 0 0 ... 0 0

7 0 0 0 0 0 0 0 0 0 0 ... 0 0

8 0 0 0 0 0 0 0 0 0 0 ... 0 0

9 0 0 0 0 0 0 0 0 0 0 ... 0 0

10 rows × 123 columns
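Because dummy_df calls get_dummies without a prefix, two source columns that happened to share a category name would collide in the encoded frame. A minimal alternative sketch (df_raw is hypothetical, standing for the frame before encoding); passing columns= makes pandas prefix each dummy with its source column name:

# Sketch: one-hot encode all three categorical columns in a single call,
# with automatic column-name prefixes (e.g. 'flag_SF' instead of 'SF').
# df_raw is hypothetical: the DataFrame as it was before encoding.
df_encoded = pd.get_dummies(df_raw, columns=['protocol_type', 'service', 'flag'])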

# Check how much of the data is missing.
df.isnull().sum().sort_values(ascending=False).head()

duration 0
red_i 0
printer 0
pop_3 0
pop_2 0
dtype: int64

# Impute missing values with SimpleImputer from sklearn.impute.
import numpy as np
from sklearn.impute import SimpleImputer

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr.fit(df)
df = pd.DataFrame(data=imr.transform(df), columns=df.columns)

df.isnull().sum().sort_values(ascending=False).head()

duration 0
red_i 0
printer 0
pop_3 0
pop_2 0
dtype: int64
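Strictly speaking, fitting the imputer on the full frame lets test-set statistics leak into training. A minimal leakage-free sketch, assuming the train/test split performed below:

# Sketch: learn imputation means from the training split only,
# then reuse them on the test split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_imp = imputer.fit_transform(X_train)  # fit on train
X_test_imp = imputer.transform(X_test)        # apply train means to test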

# Separate features and target.
X = df.drop('class', axis=1)
y = df['class']
X.shape

(125973, 122)


# Split into train and test sets (75/25).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
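With an imbalanced label, a stratified split keeps the normal/anomaly ratio equal across both sets; a minimal variant of the call above (stratify=y is an addition, not part of the original run):

# Sketch: stratify on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, stratify=y)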

print(y_train)
print(X_train)
print(y_test)
print(X_test)

        RSTR   S0   S1   S2   S3   SF   SH
98047    0.0  0.0  0.0  0.0  0.0  1.0  0.0
5192     0.0  1.0  0.0  0.0  0.0  0.0  0.0
77708    0.0  0.0  0.0  0.0  0.0  1.0  0.0
98539    0.0  0.0  0.0  0.0  0.0  1.0  0.0

[94479 rows x 122 columns]


58516 0.0
5800 1.0
109276 0.0
105855 1.0
112275 0.0
...
101554 0.0
45777 0.0
62341 0.0
66126 0.0
116409 0.0
Name: class, Length: 31494, dtype: float64
duration src_bytes dst_bytes land wrong_fragment urgent hot \
58516 0.0 45.0 135.0 0.0 0.0 0.0 0.0
5800 0.0 520.0 0.0 0.0 0.0 0.0 0.0
109276 0.0 30.0 0.0 0.0 0.0 0.0 0.0
105855 0.0 28.0 0.0 0.0 1.0 0.0 0.0
112275 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ...
101554 0.0 3690.0 0.0 0.0 0.0 0.0 0.0
45777 0.0 332.0 9211.0 0.0 0.0 0.0 0.0
62341 0.0 342.0 9395.0 0.0 0.0 0.0 0.0
66126 0.0 306.0 1738.0 0.0 0.0 0.0 0.0
116409 0.0 36.0 0.0 0.0 0.0 0.0 0.0

num_failed_logins logged_in num_compromised ... REJ RSTO RSTOS0 \
58516 0.0 0.0 0.0 ... 0.0 0.0 0.0
5800 0.0 0.0 0.0 ... 0.0 0.0 0.0
109276 0.0 0.0 0.0 ... 0.0 0.0 0.0
105855 0.0 0.0 0.0 ... 0.0 0.0 0.0
112275 0.0 0.0 0.0 ... 1.0 0.0 0.0
... ... ... ... ... ... ... ...
101554 0.0 1.0 0.0 ... 0.0 0.0 0.0
45777 0.0 1.0 0.0 ... 0.0 0.0 0.0
62341 0.0 1.0 0.0 ... 0.0 0.0 0.0
66126 0.0 1.0 0.0 ... 0.0 0.0 0.0
116409 0.0 0.0 0.0 ... 0.0 0.0 0.0

RSTR S0 S1 S2 S3 SF SH
58516 0.0 0.0 0.0 0.0 0.0 1.0 0.0
5800 0.0 0.0 0.0 0.0 0.0 1.0 0.0
109276 0.0 0.0 0.0 0.0 0.0 1.0 0.0
105855 0.0 0.0 0.0 0.0 0.0 1.0 0.0
112275 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ...
101554 0.0 0.0 0.0 0.0 0.0 1.0 0.0
45777 0.0 0.0 0.0 0.0 0.0 1.0 0.0
62341 0.0 0.0 0.0 0.0 0.0 1.0 0.0
66126 0.0 0.0 0.0 0.0 0.0 1.0 0.0
116409 0.0 0.0 0.0 0.0 0.0 1.0 0.0

[31494 rows x 122 columns]

from sklearn.linear_model import Perceptron

ppn = Perceptron(eta0=0.1, random_state=1)
ppn.fit(X_train, y_train)

Perceptron(eta0=0.1, random_state=1)

from sklearn.metrics import accuracy_score

# Accuracy on both splits (accuracy_score is symmetric in its arguments).
predictions_train = ppn.predict(X_train)
predictions_test = ppn.predict(X_test)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)


score on train data:  0.230167550460949
score on test data:  0.23105988442242967

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = ppn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 812 16097]
[ 8120 6465]]

from sklearn.metrics import confusion_matrix

# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)

# Note: with the encoding normal=0 / anomaly=1, sklearn lays cm out as
# [[TN, FP], [FN, TP]], so row 0 holds the actual *normal* records and
# the figures below effectively treat 'normal' as the positive class.
total_actual_normals = cm[0, 0] + cm[0, 1]

# Correctly classified normals as a percentage of actual normals
true_positives_percentage = (cm[0, 0] / total_actual_normals) * 100

# Anomalies misclassified as normal, divided by the actual-normal total
false_negatives_percentage = (cm[1, 0] / total_actual_normals) * 100

print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")

True Positives Percentage: 4.80217635578686 %
False Negatives Percentage: 48.02176355786859 %
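For the standard detection-rate figures with anomaly (label 1) as the positive class, the counts come from row 1 of cm instead; a minimal sketch (not the notebook's calculation):

# Sketch: anomaly = positive class; sklearn orders cm as [[TN, FP], [FN, TP]].
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn) * 100   # detected anomalies over actual anomalies
fnr = fn / (tp + fn) * 100   # missed anomalies over actual anomalies
print("TPR:", tpr, "%  FNR:", fnr, "%")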

from sklearn.metrics import classification_report

# Note: classification_report expects (y_true, y_pred); predictions are
# passed first here, so the precision and recall columns read swapped.
print(classification_report(ppn.predict(X_train), y_train))

              precision    recall  f1-score   support

         0.0       0.05      0.09      0.06     27125
         1.0       0.44      0.29      0.35     67354

    accuracy                           0.23     94479
   macro avg       0.24      0.19      0.20     94479
weighted avg       0.33      0.23      0.27     94479
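The Perceptron's ~23% accuracy is at least partly a scaling artifact: raw byte counts such as src_bytes dwarf the binary dummy features. A minimal sketch of standardizing first (the StandardScaler pipeline is an assumption, not part of the original run):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Sketch: scale features before the scale-sensitive Perceptron.
scaled_ppn = make_pipeline(StandardScaler(), Perceptron(eta0=0.1, random_state=1))
scaled_ppn.fit(X_train, y_train)
print("scaled test accuracy:", scaled_ppn.score(X_test, y_test))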

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

from sklearn.metrics import accuracy_score

predictions_train = classifier.predict(X_train)
predictions_test = classifier.predict(X_test)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)

score on train data:  0.9999364938240244
score on test data:  0.9985394043309836
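A near-perfect train score is worth sanity-checking for overfitting; a short sketch using cross-validation with a depth cap (max_depth=10 and cv=5 are illustrative choices, not from the original):

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Sketch: cap tree depth and cross-validate on the training split.
pruned = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)
scores = cross_val_score(pruned, X_train, y_train, cv=5)
print("5-fold CV accuracy:", scores.mean())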

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[16884 25]
[ 21 14564]]

from sklearn.metrics import confusion_matrix

# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)

# As above, row 0 of cm holds the actual normal (class 0) records.
total_actual_normals = cm[0, 0] + cm[0, 1]

# Correctly classified normals as a percentage of actual normals
true_positives_percentage = (cm[0, 0] / total_actual_normals) * 100

# Anomalies misclassified as normal, divided by the actual-normal total
false_negatives_percentage = (cm[1, 0] / total_actual_normals) * 100

print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")

True Positives Percentage: 99.85214974274055 %
False Negatives Percentage: 0.124194216097936 %

from sklearn.metrics import classification_report

# As above, arguments are (predictions, y_true), so precision/recall read swapped.
print(classification_report(classifier.predict(X_train), y_train))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     50440
         1.0       1.00      1.00      1.00     44039

    accuracy                           1.00     94479
   macro avg       1.00      1.00      1.00     94479
weighted avg       1.00      1.00      1.00     94479

from sklearn.naive_bayes import GaussianNB
classify = GaussianNB()
classify.fit(X_train, y_train)

GaussianNB()

from sklearn.metrics import accuracy_score

predictions_train = classify.predict(X_train)
predictions_test = classify.predict(X_test)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)

score on train data:  0.5337164872617195
score on test data:  0.5354988251730488

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classify.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[16641 268]
[14361 224]]

from sklearn.metrics import confusion_matrix

# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)

# As above, row 0 of cm holds the actual normal (class 0) records.
total_actual_normals = cm[0, 0] + cm[0, 1]

# Correctly classified normals as a percentage of actual normals
true_positives_percentage = (cm[0, 0] / total_actual_normals) * 100

# Anomalies misclassified as normal, divided by the actual-normal total
false_negatives_percentage = (cm[1, 0] / total_actual_normals) * 100

print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")

True Positives Percentage: 98.41504524217872 %
False Negatives Percentage: 84.9311017801171 %

from sklearn.metrics import classification_report

# As above, arguments are (predictions, y_true), so precision/recall read swapped.
print(classification_report(classify.predict(X_train), y_train))

              precision    recall  f1-score   support

         0.0       0.98      0.53      0.69     92934
         1.0       0.02      0.50      0.03      1545

    accuracy                           0.53     94479
   macro avg       0.50      0.52      0.36     94479
weighted avg       0.97      0.53      0.68     94479
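To close the comparison, the three fitted models can be scored side by side; a short sketch using the estimators defined above:

from sklearn.metrics import accuracy_score

# Sketch: side-by-side test accuracy for the three classifiers above.
models = {'Perceptron': ppn, 'Decision Tree': classifier, 'Gaussian NB': classify}
for name, model in models.items():
    print(name, "test accuracy:", accuracy_score(y_test, model.predict(X_test)))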
