
20MIS1025_comparative analysis.ipynb - Colaboratory

import pandas as pd

# Load the KDD training set.
df = pd.read_csv("KDD_Train.csv")

import warnings
warnings.filterwarnings('ignore')

df.shape

(125973, 42)

print(df.shape)
df.head(5)

(125973, 42)
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ...

0 0 tcp ftp_data SF 491 0 0 0 0 0 ...

1 0 udp other SF 146 0 0 0 0 0 ...

2 0 tcp private S0 0 0 0 0 0 0 ...

3 0 tcp http SF 232 8153 0 0 0 0 ...

4 0 tcp http SF 199 420 0 0 0 0 ...

5 rows × 42 columns

#DATA PREPROCESSING
# Encode the target labels: 'normal' -> 0, 'anomaly' -> 1.
df.replace(('normal', 'anomaly'), (0, 1), inplace=True)
df.head(5)

duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_srv_count dst_hos

0 0 tcp ftp_data SF 491 0 0 0 0 0 ... 25

1 0 udp other SF 146 0 0 0 0 0 ... 1

2 0 tcp private S0 0 0 0 0 0 0 ... 26

3 0 tcp http SF 232 8153 0 0 0 0 ... 255

4 0 tcp http SF 199 420 0 0 0 0 ... 255

5 rows × 42 columns

#CATEGORICAL FEATURES
for column_name in df.columns:
    if df[column_name].dtype == 'object':
        unique_count = df[column_name].nunique()
        print(column_name + " has " + str(unique_count) + " unique values.")

protocol_type has 3 unique values.
service has 70 unique values.
flag has 11 unique values.
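For reference, the same check can be written more compactly with pandas' select_dtypes and nunique; a minimal equivalent sketch:

# Sketch: list the object-typed columns and their unique-value counts in one call.
cat_cols = df.select_dtypes(include='object').columns
print(df[cat_cols].nunique())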

#CONVERT CATEGORICAL DATA INTO BINARY VARIABLES BY ONE HOT ENCODING
df['protocol_type'].head(5)

0 tcp
1 udp
2 tcp
3 tcp
4 tcp
Name: protocol_type, dtype: object

df['protocol_type'].value_counts()

tcp 102689
udp 14993
icmp 8291
Name: protocol_type, dtype: int64

print(pd.get_dummies(df['protocol_type']).head(5))

icmp tcp udp


0 0 1 0
1 0 0 1
2 0 1 0
3 0 1 0
4 0 1 0

def dummy_df(df):
    todummy_list = ['protocol_type', 'service', 'flag']
    for x in todummy_list:
        # dummy_na=False: NaNs are ignored; no indicator column is added for them.
        dummies = pd.get_dummies(df[x], dummy_na=False)
        # Drop the original categorical column and append its dummy columns.
        df = df.drop(x, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df

# Applying the one-hot encoding function
df = dummy_df(df)
df.head(10)

duration src_bytes dst_bytes land wrong_fragment urgent hot num_failed_logins logged_in num_compromised ... REJ RSTO

0 0 491 0 0 0 0 0 0 0 0 ... 0 0

1 0 146 0 0 0 0 0 0 0 0 ... 0 0

2 0 0 0 0 0 0 0 0 0 0 ... 0 0

3 0 232 8153 0 0 0 0 0 1 0 ... 0 0

4 0 199 420 0 0 0 0 0 1 0 ... 0 0

5 0 0 0 0 0 0 0 0 0 0 ... 1 0

6 0 0 0 0 0 0 0 0 0 0 ... 0 0

7 0 0 0 0 0 0 0 0 0 0 ... 0 0

8 0 0 0 0 0 0 0 0 0 0 ... 0 0

9 0 0 0 0 0 0 0 0 0 0 ... 0 0

10 rows × 123 columns
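Because dummy_df calls get_dummies without a prefix, two source columns that happened to share a category name would collide in the encoded frame. A minimal alternative sketch (df_raw is hypothetical, standing for the frame before encoding); passing columns= makes pandas prefix each dummy with its source column name:

# Sketch: one-hot encode all three categorical columns in a single call,
# with automatic column-name prefixes (e.g. 'flag_SF' instead of 'SF').
# df_raw is hypothetical: the DataFrame as it was before encoding.
df_encoded = pd.get_dummies(df_raw, columns=['protocol_type', 'service', 'flag'])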

# Check how much of the data is missing.
df.isnull().sum().sort_values(ascending=False).head()

duration 0
red_i 0
printer 0
pop_3 0
pop_2 0
dtype: int64

# Impute missing values with SimpleImputer from sklearn.impute.
import numpy as np
from sklearn.impute import SimpleImputer

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr.fit(df)
df = pd.DataFrame(data=imr.transform(df), columns=df.columns)

df.isnull().sum().sort_values(ascending=False).head()

duration 0
red_i 0
printer 0
pop_3 0
pop_2 0
dtype: int64
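Strictly speaking, fitting the imputer on the full frame lets test-set statistics leak into training. A minimal leakage-free sketch, assuming the train/test split performed below:

# Sketch: learn imputation means from the training split only,
# then reuse them on the test split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_imp = imputer.fit_transform(X_train)  # fit on train
X_test_imp = imputer.transform(X_test)        # apply train means to test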

# Separate features and target.
X = df.drop('class', axis=1)
y = df['class']
X.shape

(125973, 122)


# Split into train and test sets (75/25).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
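With an imbalanced label, a stratified split keeps the normal/anomaly ratio equal across both sets; a minimal variant of the call above (stratify=y is an addition, not part of the original run):

# Sketch: stratify on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, stratify=y)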

print(y_train)
print(X_train)
print(y_test)
print(X_test)

        RSTR   S0   S1   S2   S3   SF   SH
98047    0.0  0.0  0.0  0.0  0.0  1.0  0.0
5192     0.0  1.0  0.0  0.0  0.0  0.0  0.0
77708    0.0  0.0  0.0  0.0  0.0  1.0  0.0
98539    0.0  0.0  0.0  0.0  0.0  1.0  0.0

[94479 rows x 122 columns]


58516 0.0
5800 1.0
109276 0.0
105855 1.0
112275 0.0
...
101554 0.0
45777 0.0
62341 0.0
66126 0.0
116409 0.0
Name: class, Length: 31494, dtype: float64
duration src_bytes dst_bytes land wrong_fragment urgent hot \
58516 0.0 45.0 135.0 0.0 0.0 0.0 0.0
5800 0.0 520.0 0.0 0.0 0.0 0.0 0.0
109276 0.0 30.0 0.0 0.0 0.0 0.0 0.0
105855 0.0 28.0 0.0 0.0 1.0 0.0 0.0
112275 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ...
101554 0.0 3690.0 0.0 0.0 0.0 0.0 0.0
45777 0.0 332.0 9211.0 0.0 0.0 0.0 0.0
62341 0.0 342.0 9395.0 0.0 0.0 0.0 0.0
66126 0.0 306.0 1738.0 0.0 0.0 0.0 0.0
116409 0.0 36.0 0.0 0.0 0.0 0.0 0.0

num_failed_logins logged_in num_compromised ... REJ RSTO RSTOS0 \
58516 0.0 0.0 0.0 ... 0.0 0.0 0.0
5800 0.0 0.0 0.0 ... 0.0 0.0 0.0
109276 0.0 0.0 0.0 ... 0.0 0.0 0.0
105855 0.0 0.0 0.0 ... 0.0 0.0 0.0
112275 0.0 0.0 0.0 ... 1.0 0.0 0.0
... ... ... ... ... ... ... ...
101554 0.0 1.0 0.0 ... 0.0 0.0 0.0
45777 0.0 1.0 0.0 ... 0.0 0.0 0.0
62341 0.0 1.0 0.0 ... 0.0 0.0 0.0
66126 0.0 1.0 0.0 ... 0.0 0.0 0.0
116409 0.0 0.0 0.0 ... 0.0 0.0 0.0

RSTR S0 S1 S2 S3 SF SH
58516 0.0 0.0 0.0 0.0 0.0 1.0 0.0
5800 0.0 0.0 0.0 0.0 0.0 1.0 0.0
109276 0.0 0.0 0.0 0.0 0.0 1.0 0.0
105855 0.0 0.0 0.0 0.0 0.0 1.0 0.0
112275 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ...
101554 0.0 0.0 0.0 0.0 0.0 1.0 0.0
45777 0.0 0.0 0.0 0.0 0.0 1.0 0.0
62341 0.0 0.0 0.0 0.0 0.0 1.0 0.0
66126 0.0 0.0 0.0 0.0 0.0 1.0 0.0
116409 0.0 0.0 0.0 0.0 0.0 1.0 0.0

[31494 rows x 122 columns]

from sklearn.linear_model import Perceptron

ppn = Perceptron(eta0=0.1, random_state=1)
ppn.fit(X_train, y_train)

Perceptron(eta0=0.1, random_state=1)

from sklearn.metrics import accuracy_score

# Accuracy on both splits (accuracy_score is symmetric in its arguments).
predictions_train = ppn.predict(X_train)
predictions_test = ppn.predict(X_test)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)


score on train data:  0.230167550460949
score on test data:  0.23105988442242967

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = ppn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 812 16097]
[ 8120 6465]]

from sklearn.metrics import confusion_matrix

# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)

# Note: with the encoding normal=0 / anomaly=1, sklearn lays cm out as
# [[TN, FP], [FN, TP]], so row 0 holds the actual *normal* records and
# the figures below effectively treat 'normal' as the positive class.
total_actual_normals = cm[0, 0] + cm[0, 1]

# Correctly classified normals as a percentage of actual normals
true_positives_percentage = (cm[0, 0] / total_actual_normals) * 100

# Anomalies misclassified as normal, divided by the actual-normal total
false_negatives_percentage = (cm[1, 0] / total_actual_normals) * 100

print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")

True Positives Percentage: 4.80217635578686 %
False Negatives Percentage: 48.02176355786859 %
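For the standard detection-rate figures with anomaly (label 1) as the positive class, the counts come from row 1 of cm instead; a minimal sketch (not the notebook's calculation):

# Sketch: anomaly = positive class; sklearn orders cm as [[TN, FP], [FN, TP]].
tn, fp, fn, tp = cm.ravel()
tpr = tp / (tp + fn) * 100   # detected anomalies over actual anomalies
fnr = fn / (tp + fn) * 100   # missed anomalies over actual anomalies
print("TPR:", tpr, "%  FNR:", fnr, "%")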

from sklearn.metrics import classification_report

# Note: classification_report expects (y_true, y_pred); predictions are
# passed first here, so the precision and recall columns read swapped.
print(classification_report(ppn.predict(X_train), y_train))

              precision    recall  f1-score   support

         0.0       0.05      0.09      0.06     27125
         1.0       0.44      0.29      0.35     67354

    accuracy                           0.23     94479
   macro avg       0.24      0.19      0.20     94479
weighted avg       0.33      0.23      0.27     94479
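The Perceptron's ~23% accuracy is at least partly a scaling artifact: raw byte counts such as src_bytes dwarf the binary dummy features. A minimal sketch of standardizing first (the StandardScaler pipeline is an assumption, not part of the original run):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Sketch: scale features before the scale-sensitive Perceptron.
scaled_ppn = make_pipeline(StandardScaler(), Perceptron(eta0=0.1, random_state=1))
scaled_ppn.fit(X_train, y_train)
print("scaled test accuracy:", scaled_ppn.score(X_test, y_test))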

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

from sklearn.metrics import accuracy_score

predictions_train = classifier.predict(X_train)
predictions_test = classifier.predict(X_test)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)

score on train data:  0.9999364938240244
score on test data:  0.9985394043309836
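A near-perfect train score is worth sanity-checking for overfitting; a short sketch using cross-validation with a depth cap (max_depth=10 and cv=5 are illustrative choices, not from the original):

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Sketch: cap tree depth and cross-validate on the training split.
pruned = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)
scores = cross_val_score(pruned, X_train, y_train, cv=5)
print("5-fold CV accuracy:", scores.mean())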

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[16884 25]
[ 21 14564]]

from sklearn.metrics import confusion_matrix

# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)

# As above, row 0 of cm holds the actual normal (class 0) records.
total_actual_normals = cm[0, 0] + cm[0, 1]

# Correctly classified normals as a percentage of actual normals
true_positives_percentage = (cm[0, 0] / total_actual_normals) * 100

# Anomalies misclassified as normal, divided by the actual-normal total
false_negatives_percentage = (cm[1, 0] / total_actual_normals) * 100

print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")

True Positives Percentage: 99.85214974274055 %
False Negatives Percentage: 0.124194216097936 %

from sklearn.metrics import classification_report

# As above, arguments are (predictions, y_true), so precision/recall read swapped.
print(classification_report(classifier.predict(X_train), y_train))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     50440
         1.0       1.00      1.00      1.00     44039

    accuracy                           1.00     94479
   macro avg       1.00      1.00      1.00     94479
weighted avg       1.00      1.00      1.00     94479

from sklearn.naive_bayes import GaussianNB
classify = GaussianNB()
classify.fit(X_train, y_train)

GaussianNB()

from sklearn.metrics import accuracy_score

predictions_train = classify.predict(X_train)
predictions_test = classify.predict(X_test)
train_score = accuracy_score(y_train, predictions_train)
print("score on train data: ", train_score)
test_score = accuracy_score(y_test, predictions_test)
print("score on test data: ", test_score)

score on train data:  0.5337164872617195
score on test data:  0.5354988251730488

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classify.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[16641 268]
[14361 224]]

from sklearn.metrics import confusion_matrix

# Assuming you already have the confusion matrix 'cm'
# cm = confusion_matrix(y_test, y_pred)

# As above, row 0 of cm holds the actual normal (class 0) records.
total_actual_normals = cm[0, 0] + cm[0, 1]

# Correctly classified normals as a percentage of actual normals
true_positives_percentage = (cm[0, 0] / total_actual_normals) * 100

# Anomalies misclassified as normal, divided by the actual-normal total
false_negatives_percentage = (cm[1, 0] / total_actual_normals) * 100

print("True Positives Percentage:", true_positives_percentage, "%")
print("False Negatives Percentage:", false_negatives_percentage, "%")

True Positives Percentage: 98.41504524217872 %
False Negatives Percentage: 84.9311017801171 %

from sklearn.metrics import classification_report

# As above, arguments are (predictions, y_true), so precision/recall read swapped.
print(classification_report(classify.predict(X_train), y_train))

              precision    recall  f1-score   support

         0.0       0.98      0.53      0.69     92934
         1.0       0.02      0.50      0.03      1545

    accuracy                           0.53     94479
   macro avg       0.50      0.52      0.36     94479
weighted avg       0.97      0.53      0.68     94479
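To close the comparison, the three fitted models can be scored side by side; a short sketch using the estimators defined above:

from sklearn.metrics import accuracy_score

# Sketch: side-by-side test accuracy for the three classifiers above.
models = {'Perceptron': ppn, 'Decision Tree': classifier, 'Gaussian NB': classify}
for name, model in models.items():
    print(name, "test accuracy:", accuracy_score(y_test, model.predict(X_test)))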
