Professional Documents
Culture Documents
ML#05
ML#05
ML#05
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Read the air-quality CSV; the file is decoded as cp1252 rather than UTF-8.
df = pd.read_csv(filepath_or_buffer="Air_Quality.csv", encoding='cp1252')

# Dump the frame, then evaluate its per-column dtypes.
# NOTE(review): the bare `df.dtypes` expression is only rendered when it is
# the last expression of a notebook cell; in a plain script it shows nothing.
print(df)
df.dtypes
agency \
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
... ...
97418 National Environmental Engineering Research In...
97419 National Environmental Engineering Research In...
97420 National Environmental Engineering Research In...
97421 National Environmental Engineering Research In...
97422 NaN
stn_code float64
sampling_date object
state object
location object
agency object
type object
so2 float64
no2 float64
rspm float64
spm float64
location_monitoring_station object
pm2_5 float64
date object
dtype: object
pm2_5
count 1848.000000
mean 34.651136
std 37.800496
min 7.000000
25% 14.000000
50% 21.000000
75% 31.050000
max 318.000000
import pandas as pd
# Remove the selected low-value columns from df and show the result.
# NOTE(review): columns_to_drop is built outside this excerpt; presumably it
# has already been filtered to labels present in df. Confirm this, because
# DataFrame.drop raises KeyError for unknown labels.
if columns_to_drop:
    # Drop the columns if they exist
    df = df.drop(columns=columns_to_drop)
    print("Dropping less valued columns:")
    print(df)
else:
    # An empty selection means there is nothing to drop.
    print("Columns to drop not found in the DataFrame.")
97418 21.0 83.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
97419 19.0 74.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
97420 16.0 71.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
97421 21.0 82.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
date
0 1990-02-01
1 1990-02-01
2 1990-02-01
3 1990-03-01
4 1990-03-01
... ...
97418 2012-05-12
97419 2012-05-16
97420 2012-05-19
97421 2012-05-23
97422 NaN
import pandas as pd
97417 24.0 74.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
97418 21.0 83.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
97419 19.0 74.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
97420 16.0 71.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
97421 21.0 82.0 NaN R.C. High School, Mirzapur, Ahmadabad NaN
date
0 1990-02-01
1 1990-02-01
2 1990-02-01
3 1990-03-01
4 1990-03-01
... ...
97417 2012-05-09
97418 2012-05-12
97419 2012-05-16
97420 2012-05-19
97421 2012-05-23
# Visual separators between output sections. Each literal was split across
# two physical lines (unterminated string); adjacent-literal concatenation
# restores the original text without changing it.
print('---------------------------------------------------------------'
      '-----------------')
print('---------------------------------------------------------------'
      '-----------------')
# Titanic Dataset
# The URL literal was split mid-string by line wrapping; the two pieces are
# rejoined with adjacent-literal concatenation (no separator between them).
url = (
    "https://raw.githubusercontent.com/Geoyi/Cleaning-Titanic-Data/master/"
    "titanic_original.csv"
)
# Fetch the CSV over HTTP and decode it as cp1252. This rebinds `df`.
df = pd.read_csv(url, encoding='cp1252')
Dataset:
pclass survived
name \
0 1.0 1.0 Allen, Miss. Elisabeth
Walton
1 1.0 1.0 Allison, Master. Hudson
Trevor
2 1.0 0.0 Allison, Miss. Helen
Loraine
3 1.0 0.0 Allison, Mr. Hudson Joshua
Creighton
4 1.0 0.0 Allison, Mrs. Hudson J C (Bessie Waldo
Daniels)
... ... ... ..
.
1305 3.0 0.0 Zabour, Miss.
Thamine
1306 3.0 0.0 Zakarian, Mr.
Mapriededer
1307 3.0 0.0 Zakarian, Mr.
Ortin
1308 3.0 0.0 Zimmerman, Mr.
Leo
1309 NaN NaN
NaN
body home.dest
0 NaN St Louis, MO
1 NaN Montreal, PQ / Chesterville, ON
2 NaN Montreal, PQ / Chesterville, ON
3 135.0 Montreal, PQ / Chesterville, ON
4 NaN Montreal, PQ / Chesterville, ON
... ... ...
1305 NaN NaN
1306 304.0 NaN
1307 NaN NaN
1308 NaN NaN
1309 NaN NaN
# Visual separators between output sections. Each literal was split across
# two physical lines (unterminated string); adjacent-literal concatenation
# restores the original text without changing it.
print('---------------------------------------------------------------'
      '-----------------')
print('---------------------------------------------------------------'
      '-----------------')
print('---------------------------------------------------------------'
      '-----------------')
body home.dest
0 NaN St Louis, MO
1 NaN Montreal, PQ / Chesterville, ON
2 NaN Montreal, PQ / Chesterville, ON
3 135.0 Montreal, PQ / Chesterville, ON
4 NaN Montreal, PQ / Chesterville, ON
... ... ...
1304 328.0 NaN
1305 NaN NaN
1306 304.0 NaN
1307 NaN NaN
1308 NaN NaN
Descriptive Statistics:
pclass survived age sibsp parch
\
count 1309.000000 1309.000000 1046.000000 1309.000000 1309.000000
fare body
count 1308.000000 121.000000
mean 33.295479 160.809917
std 51.758668 97.696922
min 0.000000 1.000000
25% 7.895800 72.000000
50% 14.454200 155.000000
75% 31.275000 256.000000
max 512.329200 328.000000
# 7. Fill NaN values in each column
# c. Forward Fill
# Carry the last valid value in each column downward. Leading NaNs have no
# earlier value to copy and therefore stay NaN.
# DataFrame.ffill() replaces fillna(method='ffill'): the `method` argument
# of fillna is deprecated in current pandas releases.
df_filled_ffill = df.ffill()
print("c. Forward Fill:")
print(df_filled_ffill)
# c. Backward Fill
# Pull the next valid value in each column upward. Trailing NaNs have no
# later value to copy and therefore stay NaN.
df_filled_bfill = df.bfill()
print("c. Backward Fill:")
print(df_filled_bfill)
body home.dest
0 155.0 St Louis, MO
1 155.0 Montreal, PQ / Chesterville, ON
2 155.0 Montreal, PQ / Chesterville, ON
3 135.0 Montreal, PQ / Chesterville, ON
4 155.0 Montreal, PQ / Chesterville, ON
... ... ...
1304 328.0 NaN
1305 155.0 NaN
1306 304.0 NaN
1307 155.0 NaN
1308 155.0 NaN
body home.dest
0 0.0 St Louis, MO
1 0.0 Montreal, PQ / Chesterville, ON
2 0.0 Montreal, PQ / Chesterville, ON
3 135.0 Montreal, PQ / Chesterville, ON
4 0.0 Montreal, PQ / Chesterville, ON
... ... ...
1304 328.0 0
1305 0.0 0
1306 304.0 0
1307 0.0 0
1308 0.0 0
body home.dest
0 NaN St Louis, MO
1 NaN Montreal, PQ / Chesterville, ON
2 NaN Montreal, PQ / Chesterville, ON
3 135.0 Montreal, PQ / Chesterville, ON
4 135.0 Montreal, PQ / Chesterville, ON
... ... ...
1304 328.0 Antwerp, Belgium / Stanton, OH
1305 328.0 Antwerp, Belgium / Stanton, OH
1306 304.0 Antwerp, Belgium / Stanton, OH
1307 304.0 Antwerp, Belgium / Stanton, OH
1308 304.0 Antwerp, Belgium / Stanton, OH
body home.dest
0 135.0 St Louis, MO
1 135.0 Montreal, PQ / Chesterville, ON
2 135.0 Montreal, PQ / Chesterville, ON
3 135.0 Montreal, PQ / Chesterville, ON
4 22.0 Montreal, PQ / Chesterville, ON
... ... ...
1304 328.0 NaN
1305 304.0 NaN
1306 304.0 NaN
1307 NaN NaN
1308 NaN NaN
# Plot boxplots
# Draw the age and fare distributions side by side in one figure.
# plt.boxplot does not skip missing values: a NaN in the input makes the
# quartiles NaN and the box is not drawn. Both columns have fewer non-null
# entries than rows in the describe() output, so NaNs are dropped first.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.boxplot(df['age'].dropna())
plt.title("Age Boxplot")
plt.subplot(1, 2, 2)
plt.boxplot(df['fare'].dropna())
plt.title("Fare Boxplot")
plt.show()
home.dest
5 New York, NY
7 Belfast, NI
8 Bayside, Queens, NY
20 New York, NY
21 New York, NY
... ...
1301 NaN
1304 NaN
1306 NaN
1307 NaN
1308 NaN
home.dest
5 New York, NY
7 Belfast, NI
8 Bayside, Queens, NY
20 New York, NY
21 New York, NY
... ...
1301 NaN
1304 NaN
1306 NaN
1307 NaN
1308 NaN
Cleaned Dataset:
pclass survived
name \
5 1.0 1.0 Anderson, Mr.
Harry
7 1.0 0.0 Andrews, Mr. Thomas
Jr
8 1.0 1.0 Appleton, Mrs. Edward Dale (Charlotte
Lamson)
20 1.0 1.0 Beckwith, Mr. Richard
Leonard
21 1.0 1.0 Beckwith, Mrs. Richard Leonard (Sallie
Monypeny)
... ... ... .
..
1301 3.0 0.0 Youseff, Mr.
Gerious
1304 3.0 0.0 Zabour, Miss.
Hileni
1306 3.0 0.0 Zakarian, Mr.
Mapriededer
1307 3.0 0.0 Zakarian, Mr.
Ortin
1308 3.0 0.0 Zimmerman, Mr.
Leo
home.dest
5 New York, NY
7 Belfast, NI
8 Bayside, Queens, NY
20 New York, NY
21 New York, NY
... ...
1301 NaN
1304 NaN
1306 NaN
1307 NaN
1308 NaN