Download as pdf or txt
Download as pdf or txt
You are on page 1 of 19

Python Advance

title

1. Import Necessary Packages


In [1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

2. Import Dataset
In [4]: dataset = pd.read_csv('./Dataset/titanic.csv')  # path is relative to the notebook's working directory

In [6]: dataset.head(50)  # preview the first 50 rows of the raw frame
Out[6]: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket

Braund, Mr. A/5


0 1 0 3 male 22.0 1 0 7
Owen Harris 21171

Cumings,
Mrs. John
1 2 1 1 Bradley female 38.0 1 0 PC 17599 71
(Florence
Briggs Th...

Heikkinen, STON/O2.
2 3 1 3 female 26.0 0 0 7
Miss. Laina 3101282

Futrelle, Mrs.
Jacques
3 4 1 1 female 35.0 1 0 113803 53
Heath (Lily
May Peel)

Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8
Henry

Moran, Mr.
5 6 0 3 male NaN 0 0 330877 8
James

McCarthy,
6 7 0 1 Mr. Timothy male 54.0 0 0 17463 51
J

Palsson,
Master.
7 8 0 3 male 2.0 3 1 349909 21
Gosta
Leonard

Johnson,
Mrs. Oscar
8 9 1 3 W (Elisabeth female 27.0 0 2 347742 11
Vilhelmina
Berg)

Nasser, Mrs.
Nicholas
9 10 1 2 female 14.0 1 0 237736 30
(Adele
Achem)

Sandstrom,
Miss.
10 11 1 3 female 4.0 1 1 PP 9549 16
Marguerite
Rut

Bonnell,
11 12 1 1 Miss. female 58.0 0 0 113783 26
Elizabeth

Saundercock,
12 13 0 3 Mr. William male 20.0 0 0 A/5. 2151 8
Henry
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket

Andersson,
13 14 0 3 Mr. Anders male 39.0 1 5 347082 31
Johan

Vestrom,
Miss. Hulda
14 15 0 3 female 14.0 0 0 350406 7
Amanda
Adolfina

Hewlett, Mrs.
15 16 1 2 (Mary D female 55.0 0 0 248706 16
Kingcome)

Rice, Master.
16 17 0 3 male 2.0 4 1 382652 29
Eugene

Williams, Mr.
17 18 1 2 Charles male NaN 0 0 244373 13
Eugene

Vander
Planke, Mrs.
18 19 0 3 Julius (Emelia female 31.0 1 0 345763 18
Maria
Vande...

Masselmani,
19 20 1 3 female NaN 0 0 2649 7
Mrs. Fatima

Fynney, Mr.
20 21 0 2 male 35.0 0 0 239865 26
Joseph J

Beesley, Mr.
21 22 1 2 male 34.0 0 0 248698 13
Lawrence

McGowan,
22 23 1 3 Miss. Anna female 15.0 0 0 330923 8
"Annie"

Sloper, Mr.
23 24 1 1 William male 28.0 0 0 113788 35
Thompson

Palsson,
Miss.
24 25 0 3 female 8.0 3 1 349909 21
Torborg
Danira

Asplund,
Mrs. Carl
25 26 1 3 Oscar (Selma female 38.0 1 5 347077 31
Augusta
Emilia...

26 27 0 3 Emir, Mr. male NaN 0 0 2631 7


Farred
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket
Chehab

Fortune, Mr.
27 28 0 1 Charles male 19.0 3 2 19950 263
Alexander

O'Dwyer,
28 29 1 3 Miss. Ellen female NaN 0 0 330959 7
"Nellie"

Todoroff, Mr.
29 30 0 3 male NaN 0 0 349216 7
Lalio

Uruchurtu,
30 31 0 1 Don. Manuel male 40.0 0 0 PC 17601 27
E

Spencer,
Mrs. William
31 32 1 1 Augustus female NaN 1 0 PC 17569 146
(Marie
Eugenie)

Glynn, Miss.
32 33 1 3 female NaN 0 0 335677 7
Mary Agatha

Wheadon,
C.A.
33 34 0 2 Mr. Edward male 66.0 0 0 10
24579
H

Meyer, Mr.
34 35 0 1 Edgar male 28.0 1 0 PC 17604 82
Joseph

Holverson,
Mr.
35 36 0 1 male 42.0 1 0 113789 52
Alexander
Oskar

Mamee, Mr.
36 37 1 3 male NaN 0 0 2677 7
Hanna

Cann, Mr.
A./5.
37 38 0 3 Ernest male 21.0 0 0 8
2152
Charles

Vander
Planke, Miss.
38 39 0 3 female 18.0 2 0 345764 18
Augusta
Maria

Nicola-
39 40 1 3 Yarred, Miss. female 14.0 1 0 2651 11
Jamila

40 41 0 3 Ahlin, Mrs. female 40.0 1 0 7546 9


Johan
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket
(Johanna
Persdotter
Larsson)

Turpin, Mrs.
William John
41 42 0 2 Robert female 27.0 1 0 11668 21
(Dorothy
Ann ...

Kraeff, Mr.
42 43 0 3 male NaN 0 0 349253 7
Theodor

Laroche,
Miss.
SC/Paris
43 44 1 2 Simonne female 3.0 1 2 41
2123
Marie Anne
Andree

Devaney,
Miss.
44 45 1 3 female 19.0 0 0 330958 7
Margaret
Delia

Rogers, Mr. S.C./A.4.


45 46 0 3 male NaN 0 0 8
William John 23567

Lennon, Mr.
46 47 0 3 male NaN 1 0 370371 15
Denis

O'Driscoll,
47 48 1 3 female NaN 0 0 14311 7
Miss. Bridget

Samaan, Mr.
48 49 0 3 male NaN 2 0 2662 21
Youssef

Arnold-
Franchi, Mrs.
49 50 0 3 Josef female 18.0 1 0 349237 17
(Josefine
Franchi)

3. Data Preprocessing and Visualization


In [9]: dataset.info()  # column dtypes and non-null counts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

In [11]: dataset.isna().sum()  # number of missing values in each column

Out[11]: PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64

In [13]: dataset = dataset.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

In [15]: dataset.head(50)
Out[15]: Survived Pclass Sex Age SibSp Parch Fare Embarked

0 0 3 male 22.0 1 0 7.2500 S

1 1 1 female 38.0 1 0 71.2833 C

2 1 3 female 26.0 0 0 7.9250 S

3 1 1 female 35.0 1 0 53.1000 S

4 0 3 male 35.0 0 0 8.0500 S

5 0 3 male NaN 0 0 8.4583 Q

6 0 1 male 54.0 0 0 51.8625 S

7 0 3 male 2.0 3 1 21.0750 S

8 1 3 female 27.0 0 2 11.1333 S

9 1 2 female 14.0 1 0 30.0708 C

10 1 3 female 4.0 1 1 16.7000 S

11 1 1 female 58.0 0 0 26.5500 S

12 0 3 male 20.0 0 0 8.0500 S

13 0 3 male 39.0 1 5 31.2750 S

14 0 3 female 14.0 0 0 7.8542 S

15 1 2 female 55.0 0 0 16.0000 S

16 0 3 male 2.0 4 1 29.1250 Q

17 1 2 male NaN 0 0 13.0000 S

18 0 3 female 31.0 1 0 18.0000 S

19 1 3 female NaN 0 0 7.2250 C

20 0 2 male 35.0 0 0 26.0000 S

21 1 2 male 34.0 0 0 13.0000 S

22 1 3 female 15.0 0 0 8.0292 Q

23 1 1 male 28.0 0 0 35.5000 S

24 0 3 female 8.0 3 1 21.0750 S

25 1 3 female 38.0 1 5 31.3875 S

26 0 3 male NaN 0 0 7.2250 C

27 0 1 male 19.0 3 2 263.0000 S

28 1 3 female NaN 0 0 7.8792 Q

29 0 3 male NaN 0 0 7.8958 S


Survived Pclass Sex Age SibSp Parch Fare Embarked

30 0 1 male 40.0 0 0 27.7208 C

31 1 1 female NaN 1 0 146.5208 C

32 1 3 female NaN 0 0 7.7500 Q

33 0 2 male 66.0 0 0 10.5000 S

34 0 1 male 28.0 1 0 82.1708 C

35 0 1 male 42.0 1 0 52.0000 S

36 1 3 male NaN 0 0 7.2292 C

37 0 3 male 21.0 0 0 8.0500 S

38 0 3 female 18.0 2 0 18.0000 S

39 1 3 female 14.0 1 0 11.2417 C

40 0 3 female 40.0 1 0 9.4750 S

41 0 2 female 27.0 1 0 21.0000 S

42 0 3 male NaN 0 0 7.8958 C

43 1 2 female 3.0 1 2 41.5792 C

44 1 3 female 19.0 0 0 7.8792 Q

45 0 3 male NaN 0 0 8.0500 S

46 0 3 male NaN 1 0 15.5000 Q

47 1 3 female NaN 0 0 7.7500 Q

48 0 3 male NaN 2 0 21.6792 C

49 0 3 female 18.0 1 0 17.8000 S

In [17]: dataset.isna().sum()  # re-check missing values after dropping columns

Out[17]: Survived 0
Pclass 0
Sex 0
Age 177
SibSp 0
Parch 0
Fare 0
Embarked 2
dtype: int64

Filling NaN values


In [20]: # fill NaN values with the mean of each column
dataset['Age'].fillna(dataset['Age'].mean(), inplace=True)
Converting categorical values to numerical values
In [23]: dataset['Embarked'].value_counts()  # frequency of each embarkation code

Out[23]: Embarked
S 644
C 168
Q 77
Name: count, dtype: int64

In [25]: dataset['Embarked'].fillna('S', inplace=True)

In [27]: dataset['Embarked'].value_counts()  # confirm the missing entries were filled with 'S'

Out[27]: Embarked
S 646
C 168
Q 77
Name: count, dtype: int64

In [29]: sex_stat = {'female':0,'male':1}


embarked_stat = {'S':0,'C':1,'Q':2}

In [31]: dataset['Sex'] = dataset['Sex'].replace(sex_stat)


dataset['Embarked'] = dataset['Embarked'].replace(embarked_stat)

In [33]: dataset.head(50)  # check that Sex and Embarked now hold numeric codes
Out[33]: Survived Pclass Sex Age SibSp Parch Fare Embarked

0 0 3 1 22.000000 1 0 7.2500 0

1 1 1 0 38.000000 1 0 71.2833 1

2 1 3 0 26.000000 0 0 7.9250 0

3 1 1 0 35.000000 1 0 53.1000 0

4 0 3 1 35.000000 0 0 8.0500 0

5 0 3 1 29.699118 0 0 8.4583 2

6 0 1 1 54.000000 0 0 51.8625 0

7 0 3 1 2.000000 3 1 21.0750 0

8 1 3 0 27.000000 0 2 11.1333 0

9 1 2 0 14.000000 1 0 30.0708 1

10 1 3 0 4.000000 1 1 16.7000 0

11 1 1 0 58.000000 0 0 26.5500 0

12 0 3 1 20.000000 0 0 8.0500 0

13 0 3 1 39.000000 1 5 31.2750 0

14 0 3 0 14.000000 0 0 7.8542 0

15 1 2 0 55.000000 0 0 16.0000 0

16 0 3 1 2.000000 4 1 29.1250 2

17 1 2 1 29.699118 0 0 13.0000 0

18 0 3 0 31.000000 1 0 18.0000 0

19 1 3 0 29.699118 0 0 7.2250 1

20 0 2 1 35.000000 0 0 26.0000 0

21 1 2 1 34.000000 0 0 13.0000 0

22 1 3 0 15.000000 0 0 8.0292 2

23 1 1 1 28.000000 0 0 35.5000 0

24 0 3 0 8.000000 3 1 21.0750 0

25 1 3 0 38.000000 1 5 31.3875 0

26 0 3 1 29.699118 0 0 7.2250 1

27 0 1 1 19.000000 3 2 263.0000 0

28 1 3 0 29.699118 0 0 7.8792 2

29 0 3 1 29.699118 0 0 7.8958 0
Survived Pclass Sex Age SibSp Parch Fare Embarked

30 0 1 1 40.000000 0 0 27.7208 1

31 1 1 0 29.699118 1 0 146.5208 1

32 1 3 0 29.699118 0 0 7.7500 2

33 0 2 1 66.000000 0 0 10.5000 0

34 0 1 1 28.000000 1 0 82.1708 1

35 0 1 1 42.000000 1 0 52.0000 0

36 1 3 1 29.699118 0 0 7.2292 1

37 0 3 1 21.000000 0 0 8.0500 0

38 0 3 0 18.000000 2 0 18.0000 0

39 1 3 0 14.000000 1 0 11.2417 1

40 0 3 0 40.000000 1 0 9.4750 0

41 0 2 0 27.000000 1 0 21.0000 0

42 0 3 1 29.699118 0 0 7.8958 1

43 1 2 0 3.000000 1 2 41.5792 1

44 1 3 0 19.000000 0 0 7.8792 2

45 0 3 1 29.699118 0 0 8.0500 0

46 0 3 1 29.699118 1 0 15.5000 2

47 1 3 0 29.699118 0 0 7.7500 2

48 0 3 1 29.699118 2 0 21.6792 1

49 0 3 0 18.000000 1 0 17.8000 0

In [35]: # Pairplot to visualize pairwise relationships


sns.pairplot(dataset, hue='Survived')
plt.show()

C:\Users\ASHISH\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: Th
e figure layout has changed to tight
self._figure.tight_layout(*args, **kwargs)
In [36]: # Correlation heatmap
correlation_matrix = dataset.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
In [37]: # Distribution of Age by Outcome
plt.figure(figsize=(8, 6))
sns.histplot(data=dataset, x='Age', hue='Survived', kde=True)
plt.title('Distribution of Age by Survived')
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend(title='Outcome')
plt.show()

No artists with labels found to put in legend. Note that artists whose label start
with an underscore are ignored when legend() is called with no argument.
4. Data Split
In [39]: dataset.head(10)  # quick look at the encoded frame before splitting

Out[39]: Survived Pclass Sex Age SibSp Parch Fare Embarked

0 0 3 1 22.000000 1 0 7.2500 0

1 1 1 0 38.000000 1 0 71.2833 1

2 1 3 0 26.000000 0 0 7.9250 0

3 1 1 0 35.000000 1 0 53.1000 0

4 0 3 1 35.000000 0 0 8.0500 0

5 0 3 1 29.699118 0 0 8.4583 2

6 0 1 1 54.000000 0 0 51.8625 0

7 0 3 1 2.000000 3 1 21.0750 0

8 1 3 0 27.000000 0 2 11.1333 0

9 1 2 0 14.000000 1 0 30.0708 1
In [40]: X = dataset[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
y = dataset['Survived']

In [46]: x_train, x_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, stratify=y

5. Application of Algorithms
1. SVM
In [50]: model = SVC()  # support vector classifier with default settings

In [52]: model.fit(X=x_train, y=y_train)

Out[52]: ▾ SVC

SVC()

In [54]: y_pred = model.predict(x_test)

In [56]: accuracy_svc = accuracy_score(y_test,y_pred)


print('Accuracy of SVM Classifier:',accuracy_svc)

Accuracy of SVM Classifier: 0.6201117318435754

In [58]: cm = confusion_matrix(y_test, y_pred)

# fmt='d' prints the integer counts as-is; seaborn's default '.2g' switches to
# scientific notation (e.g. 1e+02) once a cell reaches 100.
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for SVC')
plt.show()
In [60]: cr = classification_report(y_test, y_pred)
print(cr)

precision recall f1-score support

0 0.64 0.86 0.74 110


1 0.52 0.23 0.32 69

accuracy 0.62 179


macro avg 0.58 0.55 0.53 179
weighted avg 0.59 0.62 0.58 179

2. KNN
In [63]: model = KNeighborsClassifier()  # k-nearest-neighbours with default settings

In [65]: model.fit(X=x_train, y=y_train)

Out[65]: ▾ KNeighborsClassifier

KNeighborsClassifier()

In [67]: y_pred = model.predict(x_test)


In [69]: accuracy_svc = accuracy_score(y_test,y_pred)
print('Accuracy of KNN Classifier:',accuracy_svc)

Accuracy of KNN Classifier: 0.6759776536312849

In [71]: cm = confusion_matrix(y_test, y_pred)

# fmt='d' prints the integer counts as-is; seaborn's default '.2g' switches to
# scientific notation (e.g. 1e+02) once a cell reaches 100.
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for KNN')
plt.show()

In [73]: cr = classification_report(y_test, y_pred)


print(cr)

precision recall f1-score support

0 0.72 0.77 0.75 110


1 0.59 0.52 0.55 69

accuracy 0.68 179


macro avg 0.66 0.65 0.65 179
weighted avg 0.67 0.68 0.67 179

3. Logistic Regression
In [76]: model = LogisticRegression()  # logistic regression with default settings

In [78]: model.fit(X=x_train, y=y_train)

Out[78]: ▾ LogisticRegression

LogisticRegression()

In [80]: y_pred = model.predict(x_test)

In [82]: accuracy_svc = accuracy_score(y_test,y_pred)


print('Accuracy of LR Classifier:',accuracy_svc)

Accuracy of LR Classifier: 0.8044692737430168

In [84]: cm = confusion_matrix(y_test, y_pred)

# fmt='d' prints the integer counts as-is; seaborn's default '.2g' switches to
# scientific notation (e.g. 1e+02) once a cell reaches 100.
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for LR')
plt.show()

In [86]: cr = classification_report(y_test, y_pred)


print(cr)
precision recall f1-score support

0 0.81 0.89 0.85 110


1 0.79 0.67 0.72 69

accuracy 0.80 179


macro avg 0.80 0.78 0.79 179
weighted avg 0.80 0.80 0.80 179

In [ ]:

You might also like