ML#05


# PRACTICAL 5 - Data Cleaning and Exploration

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

#Air Quality Dataset

df = pd.read_csv("Air_Quality.csv",encoding='cp1252')
print(df)

df.dtypes

       stn_code       sampling_date           state   location                                              agency                                type   so2   no2  rspm  spm            location_monitoring_station  pm2_5        date
0         150.0  February - M021990  Andhra Pradesh  Hyderabad                                                 NaN  Residential, Rural and other Areas   4.8  17.4   NaN  NaN                                    NaN    NaN  1990-02-01
1         151.0  February - M021990  Andhra Pradesh  Hyderabad                                                 NaN                     Industrial Area   3.1   7.0   NaN  NaN                                    NaN    NaN  1990-02-01
2         152.0  February - M021990  Andhra Pradesh  Hyderabad                                                 NaN  Residential, Rural and other Areas   6.2  28.5   NaN  NaN                                    NaN    NaN  1990-02-01
3         150.0     March - M031990  Andhra Pradesh  Hyderabad                                                 NaN  Residential, Rural and other Areas   6.3  14.7   NaN  NaN                                    NaN    NaN  1990-03-01
4         151.0     March - M031990  Andhra Pradesh  Hyderabad                                                 NaN                     Industrial Area   4.7   7.5   NaN  NaN                                    NaN    NaN  1990-03-01
...         ...                 ...             ...        ...                                                 ...                                 ...   ...   ...   ...  ...                                    ...    ...         ...
97418     155.0           12/5/2012         Gujarat  Ahmedabad  National Environmental Engineering Research In...  Residential, Rural and other Areas  11.0  21.0  83.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-12
97419     155.0          16/05/2012         Gujarat  Ahmedabad  National Environmental Engineering Research In...  Residential, Rural and other Areas  10.0  19.0  74.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-16
97420     155.0          19/05/2012         Gujarat  Ahmedabad  National Environmental Engineering Research In...  Residential, Rural and other Areas   9.0  16.0  71.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-19
97421     155.0          23/05/2012         Gujarat  Ahmedabad  National Environmental Engineering Research In...  Residential, Rural and other Areas  10.0  21.0  82.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-23
97422     155.0          26/05/2012         Gujarat  Ahmedabad                                                 NaN                                 NaN   NaN   NaN   NaN  NaN                                    NaN    NaN         NaN

[97423 rows x 13 columns]

stn_code float64
sampling_date object
state object
location object
agency object
type object
so2 float64
no2 float64
rspm float64
spm float64
location_monitoring_station object
pm2_5 float64
date object
dtype: object

print("1. Displaying all columns:", df.keys())


print('---------------------------------------------------------------
-----------------')
print("2. The first ten rows of the dataset are:",df.head(10))
print('---------------------------------------------------------------
-----------------')
print("3. The data shape is:", df.shape)
print('---------------------------------------------------------------
-----------------')
print("4. Return the number of null values in each column:",
df.isnull().sum())
print('---------------------------------------------------------------
-----------------')
print("5. Using describe() to display the generated descriptive
statistics:", df.describe())
1. Displaying all columns: Index(['stn_code', 'sampling_date', 'state', 'location', 'agency', 'type',
       'so2', 'no2', 'rspm', 'spm', 'location_monitoring_station', 'pm2_5', 'date'],
      dtype='object')
--------------------------------------------------------------------------------
2. The first ten rows of the dataset are:
   stn_code       sampling_date           state   location agency  \
0 150.0 February - M021990 Andhra Pradesh Hyderabad NaN
1 151.0 February - M021990 Andhra Pradesh Hyderabad NaN
2 152.0 February - M021990 Andhra Pradesh Hyderabad NaN
3 150.0 March - M031990 Andhra Pradesh Hyderabad NaN
4 151.0 March - M031990 Andhra Pradesh Hyderabad NaN
5 152.0 March - M031990 Andhra Pradesh Hyderabad NaN
6 150.0 April - M041990 Andhra Pradesh Hyderabad NaN
7 151.0 April - M041990 Andhra Pradesh Hyderabad NaN
8 152.0 April - M041990 Andhra Pradesh Hyderabad NaN
9 151.0 May - M051990 Andhra Pradesh Hyderabad NaN

type so2 no2 rspm spm \


0 Residential, Rural and other Areas 4.8 17.4 NaN NaN
1 Industrial Area 3.1 7.0 NaN NaN
2 Residential, Rural and other Areas 6.2 28.5 NaN NaN
3 Residential, Rural and other Areas 6.3 14.7 NaN NaN
4 Industrial Area 4.7 7.5 NaN NaN
5 Residential, Rural and other Areas 6.4 25.7 NaN NaN
6 Residential, Rural and other Areas 5.4 17.1 NaN NaN
7 Industrial Area 4.7 8.7 NaN NaN
8 Residential, Rural and other Areas 4.2 23.0 NaN NaN
9 Industrial Area 4.0 8.9 NaN NaN

location_monitoring_station pm2_5 date


0 NaN NaN 1990-02-01
1 NaN NaN 1990-02-01
2 NaN NaN 1990-02-01
3 NaN NaN 1990-03-01
4 NaN NaN 1990-03-01
5 NaN NaN 1990-03-01
6 NaN NaN 1990-04-01
7 NaN NaN 1990-04-01
8 NaN NaN 1990-04-01
9 NaN NaN 1990-05-01
----------------------------------------------------------------------
----------
3. The data shape is: (97423, 13)
----------------------------------------------------------------------
----------
4. Return the number of null values in each column:
stn_code                       36670
sampling_date                      0
state                              0
location                           0
agency                         38202
type                            1629
so2                             7494
no2                             3047
rspm                            8767
spm                            46934
location_monitoring_station     7374
pm2_5                          95575
date                               1
dtype: int64
----------------------------------------------------------------------
----------
5. Using describe() to display the generated descriptive statistics:
           stn_code           so2           no2          rspm           spm
count  60753.000000  89929.000000  94376.000000  88656.000000  50489.000000
mean     383.033019      9.666180     23.268016     99.148675    215.214880
std      220.903480      8.782058     16.194474     64.431394    134.409625
min       21.000000      0.000000      0.000000      0.000000      0.000000
25%      193.000000      5.000000     13.300000     60.000000    135.000000
50%      389.000000      7.000000     19.000000     85.000000    190.000000
75%      581.000000     12.000000     28.400000    119.000000    268.000000
max      788.000000    228.000000    334.900000    892.000000   1885.000000

             pm2_5
count  1848.000000
mean     34.651136
std      37.800496
min       7.000000
25%      14.000000
50%      21.000000
75%      31.050000
max     318.000000

import pandas as pd

#6. Cleaning The Dataset:

# Checking if the columns exist in the DataFrame before attempting to drop them
columns_to_drop = ['stn_code', 'agency', 'sampling_date', 'location_monitoring_agency']
columns_to_drop = [col for col in columns_to_drop if col in df.columns]

if columns_to_drop:
    # Drop the columns if they exist
    df = df.drop(columns=columns_to_drop)
    print("Dropping less valued columns:")
    print(df)
else:
    print("Columns to drop not found in the DataFrame.")

Dropping less valued columns:


                state   location                                type   so2   no2  rspm  spm            location_monitoring_station  pm2_5        date
0      Andhra Pradesh  Hyderabad  Residential, Rural and other Areas   4.8  17.4   NaN  NaN                                    NaN    NaN  1990-02-01
1      Andhra Pradesh  Hyderabad                     Industrial Area   3.1   7.0   NaN  NaN                                    NaN    NaN  1990-02-01
2      Andhra Pradesh  Hyderabad  Residential, Rural and other Areas   6.2  28.5   NaN  NaN                                    NaN    NaN  1990-02-01
3      Andhra Pradesh  Hyderabad  Residential, Rural and other Areas   6.3  14.7   NaN  NaN                                    NaN    NaN  1990-03-01
4      Andhra Pradesh  Hyderabad                     Industrial Area   4.7   7.5   NaN  NaN                                    NaN    NaN  1990-03-01
...               ...        ...                                 ...   ...   ...   ...  ...                                    ...    ...         ...
97418         Gujarat  Ahmedabad  Residential, Rural and other Areas  11.0  21.0  83.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-12
97419         Gujarat  Ahmedabad  Residential, Rural and other Areas  10.0  19.0  74.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-16
97420         Gujarat  Ahmedabad  Residential, Rural and other Areas   9.0  16.0  71.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-19
97421         Gujarat  Ahmedabad  Residential, Rural and other Areas  10.0  21.0  82.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-23
97422         Gujarat  Ahmedabad                                 NaN   NaN   NaN   NaN  NaN                                    NaN    NaN         NaN

[97423 rows x 10 columns]
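A more compact way to express the same guarded drop, should it be useful, is pandas' own errors='ignore' option; this is a minimal sketch, not part of the original practical:

# Equivalent one-liner: any listed column that is absent is silently skipped.
df = df.drop(columns=['stn_code', 'agency', 'sampling_date',
                      'location_monitoring_agency'], errors='ignore')

It avoids the manual membership check while producing the same 10-column frame shown above.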

import pandas as pd

#6. Cleaning The Dataset:

# b. Drop rows where no date is available


df = df.dropna(subset=['date'])
print("Dropping rows where no date is available:")
print(df)
print('--------------------------------------------------------------------------------')

# c. Clean 'type' column
df['type'] = df['type'].str.replace('Industrial Areas', 'Industrial Area')
print("Cleaning the 'type' column:")
print(df['type'])
print('--------------------------------------------------------------------------------')

# d. Create a new column holding year values
df['year'] = pd.to_datetime(df['date']).dt.year
print("Creating a new column holding year values:")
print(df['year'])
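One optional sanity check, not part of the original practical, is to confirm that the two spellings of the industrial category have actually been merged before moving on to the captured output below:

# Count each remaining category; 'Industrial Areas' should no longer appear.
print(df['type'].value_counts(dropna=False))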

Dropping rows where no date is available:


                state   location                                type   so2   no2  rspm  spm            location_monitoring_station  pm2_5        date
0      Andhra Pradesh  Hyderabad  Residential, Rural and other Areas   4.8  17.4   NaN  NaN                                    NaN    NaN  1990-02-01
1      Andhra Pradesh  Hyderabad                     Industrial Area   3.1   7.0   NaN  NaN                                    NaN    NaN  1990-02-01
2      Andhra Pradesh  Hyderabad  Residential, Rural and other Areas   6.2  28.5   NaN  NaN                                    NaN    NaN  1990-02-01
3      Andhra Pradesh  Hyderabad  Residential, Rural and other Areas   6.3  14.7   NaN  NaN                                    NaN    NaN  1990-03-01
4      Andhra Pradesh  Hyderabad                     Industrial Area   4.7   7.5   NaN  NaN                                    NaN    NaN  1990-03-01
...               ...        ...                                 ...   ...   ...   ...  ...                                    ...    ...         ...
97417         Gujarat  Ahmedabad  Residential, Rural and other Areas  13.0  24.0  74.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-09
97418         Gujarat  Ahmedabad  Residential, Rural and other Areas  11.0  21.0  83.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-12
97419         Gujarat  Ahmedabad  Residential, Rural and other Areas  10.0  19.0  74.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-16
97420         Gujarat  Ahmedabad  Residential, Rural and other Areas   9.0  16.0  71.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-19
97421         Gujarat  Ahmedabad  Residential, Rural and other Areas  10.0  21.0  82.0  NaN  R.C. High School, Mirzapur, Ahmadabad    NaN  2012-05-23

[97422 rows x 10 columns]


----------------------------------------------------------------------
----------
Cleaning the 'type' column:
0 Residential, Rural and other Areas
1 Industrial Area
2 Residential, Rural and other Areas
3 Residential, Rural and other Areas
4 Industrial Area
...
97417 Residential, Rural and other Areas
97418 Residential, Rural and other Areas
97419 Residential, Rural and other Areas
97420 Residential, Rural and other Areas
97421 Residential, Rural and other Areas
Name: type, Length: 97422, dtype: object
----------------------------------------------------------------------
----------
Creating a new column holding year values:
0 1990
1 1990
2 1990
3 1990
4 1990
...
97417 2012
97418 2012
97419 2012
97420 2012
97421 2012
Name: year, Length: 97422, dtype: int64
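Step e is not visible in this extract, but the null counts below (zero for so2, no2, rspm, spm and pm2_5) and the later df_imp output (rspm = 99.148675 and spm = 215.21488, i.e. the column means) imply that the missing step filled the numeric pollutant columns with their column means. A hedged reconstruction, assuming that is what was run:

# e. (reconstructed) Fill missing pollutant readings with the column mean
for col in ['so2', 'no2', 'rspm', 'spm', 'pm2_5']:
    df[col] = df[col].fillna(df[col].mean())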
# f. Check the number of null values in each column
print("\nNumber of Null Values in Each Column After Cleansing:")
df2=df.isnull().sum()
print(df2)

Number of Null Values in Each Column After Cleansing:


state 0
location 0
type 1628
so2 0
no2 0
rspm 0
spm 0
location_monitoring_station 7373
pm2_5 0
date 0
year 0
dtype: int64

# 7. State with higher SO2 content
so2_median = df.groupby('state')['so2'].median().sort_values(ascending=False)
so2_median.plot(kind='bar')
plt.title('SO2 Content by State')
plt.show()

# 8. State with higher NO2 content
no2_median = df.groupby('state')['no2'].median().sort_values(ascending=False)
no2_median.plot(kind='bar')
plt.title('NO2 Content by State')
plt.show()

df['year'] = pd.to_datetime(df['date'].astype(str), format='%Y-%m-%d').dt.year
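The bar charts answer questions 7 and 8 visually; if a single named answer is wanted, the sorted medians computed above already carry it. A small addition, not in the original:

# The index label with the largest median is the state with the highest content.
print("Highest median SO2:", so2_median.idxmax())
print("Highest median NO2:", no2_median.idxmax())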

# 9. Create a new data frame containing only relevant columns


df_imp = df[['so2', 'state', 'year', 'no2', 'rspm', 'spm']]
print(df_imp)
print(df_imp.columns)

so2 state year no2 rspm spm


0 4.8 Andhra Pradesh 1990 17.4 99.148675 215.21488
1 3.1 Andhra Pradesh 1990 7.0 99.148675 215.21488
2 6.2 Andhra Pradesh 1990 28.5 99.148675 215.21488
3 6.3 Andhra Pradesh 1990 14.7 99.148675 215.21488
4 4.7 Andhra Pradesh 1990 7.5 99.148675 215.21488
... ... ... ... ... ... ...
97417 13.0 Gujarat 2012 24.0 74.000000 215.21488
97418 11.0 Gujarat 2012 21.0 83.000000 215.21488
97419 10.0 Gujarat 2012 19.0 74.000000 215.21488
97420 9.0 Gujarat 2012 16.0 71.000000 215.21488
97421 10.0 Gujarat 2012 21.0 82.000000 215.21488

[97422 rows x 6 columns]


Index(['so2', 'state', 'year', 'no2', 'rspm', 'spm'], dtype='object')

# 10. Yearly trend in 'Andhra Pradesh'


import pandas as pd
import matplotlib.pyplot as plt

print ("Filter the data for Andhra Pradesh:")


andhra_pradesh = df_imp[df_imp['state'] == 'Andhra
Pradesh'].groupby('year').median()
print(andhra_pradesh)

print('---------------------------------------------------------------
-----------------')

print("Plot SO2 and NO2:")


andhra_pradesh[['so2', 'no2']].plot(kind='line')
plt.title('Yearly Trend in Andhra Pradesh - SO2 and NO2')
plt.xlabel('Year')
plt.ylabel('Concentration')
plt.legend(['SO2', 'NO2'])
plt.show()

print('---------------------------------------------------------------
-----------------')

print ("Plot of RSPM and SPM:")


andhra_pradesh[['rspm', 'spm']].plot(kind='line')
plt.title('Yearly Trend in Andhra Pradesh - RSPM and SPM')
plt.xlabel('Year')
plt.ylabel('Concentration')
plt.legend(['RSPM', 'SPM'])
plt.show()
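On pandas 2.0 and later, groupby('year').median() may raise a TypeError because df_imp still carries the non-numeric state column; if that happens, a hedged fix is to restrict the aggregation to numeric columns:

# Aggregate only the numeric columns (needed on recent pandas versions).
andhra_pradesh = (df_imp[df_imp['state'] == 'Andhra Pradesh']
                  .groupby('year')
                  .median(numeric_only=True))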

Filter the data for Andhra Pradesh:


so2 no2 rspm spm
year
1990 5.60 13.60 99.148675 179.00000
1991 8.25 12.80 99.148675 141.50000
1992 12.40 27.60 99.148675 192.00000
1993 6.00 11.40 99.148675 215.21488
1994 8.70 14.20 99.148675 215.21488
1995 14.10 26.30 99.148675 147.00000
1996 18.85 30.15 99.148675 181.50000
1997 16.40 28.40 99.148675 145.00000
1998 10.15 20.25 99.148675 152.00000
1999 13.35 17.05 99.148675 152.50000
2000 12.15 20.45 99.148675 128.50000
2001 12.50 24.00 99.148675 132.00000
2002 6.70 23.00 99.148675 86.00000
2004 8.00 32.00 85.150000 145.00000
2005 5.10 28.55 77.000000 208.00000
2006 5.80 30.30 88.000000 191.00000
2007 5.70 29.85 85.000000 184.00000
2008 5.20 26.40 81.500000 189.00000
2009 4.90 21.30 80.000000 195.00000
2010 4.00 13.00 67.000000 215.21488
2011 4.30 23.00 74.000000 215.21488
2012 5.00 13.00 73.000000 215.21488
2013 5.00 16.00 67.000000 215.21488
2014 6.00 18.00 70.000000 215.21488
2015 6.00 19.00 69.000000 215.21488
----------------------------------------------------------------------
----------
Plot SO2 and NO2:
----------------------------------------------------------------------
----------
Plot of RSPM and SPM:

# PRACTICAL 5B - Data Cleaning and Exploration

#Titanic Dataset

url = "https://raw.githubusercontent.com/Geoyi/Cleaning-Titanic-Data/master/titanic_original.csv"

df = pd.read_csv(url, encoding='cp1252')

# 2. Display the dataset


print("Dataset:")
print(df)

Dataset:
      pclass  survived                                              name     sex      age  sibsp  parch  ticket      fare    cabin embarked boat   body                        home.dest
0        1.0       1.0                     Allen, Miss. Elisabeth Walton  female  29.0000    0.0    0.0   24160  211.3375       B5        S    2    NaN                     St Louis, MO
1        1.0       1.0                    Allison, Master. Hudson Trevor    male   0.9167    1.0    2.0  113781  151.5500  C22 C26        S   11    NaN  Montreal, PQ / Chesterville, ON
2        1.0       0.0                      Allison, Miss. Helen Loraine  female   2.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN    NaN  Montreal, PQ / Chesterville, ON
3        1.0       0.0              Allison, Mr. Hudson Joshua Creighton    male  30.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN  135.0  Montreal, PQ / Chesterville, ON
4        1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN    NaN  Montreal, PQ / Chesterville, ON
...      ...       ...                                               ...     ...      ...    ...    ...     ...       ...      ...      ...  ...    ...                              ...
1305     3.0       0.0                             Zabour, Miss. Thamine  female      NaN    1.0    0.0    2665   14.4542      NaN        C  NaN    NaN                              NaN
1306     3.0       0.0                         Zakarian, Mr. Mapriededer    male  26.5000    0.0    0.0    2656    7.2250      NaN        C  NaN  304.0                              NaN
1307     3.0       0.0                               Zakarian, Mr. Ortin    male  27.0000    0.0    0.0    2670    7.2250      NaN        C  NaN    NaN                              NaN
1308     3.0       0.0                                Zimmerman, Mr. Leo    male  29.0000    0.0    0.0  315082    7.8750      NaN        S  NaN    NaN                              NaN
1309     NaN       NaN                                               NaN     NaN      NaN    NaN    NaN     NaN       NaN      NaN      NaN  NaN    NaN                              NaN

[1310 rows x 14 columns]

# 3. Find out if any column contains Null values


null_values = df.isnull()
print("\nNull Values in Each Column:")
print(null_values)

print('---------------------------------------------------------------
-----------------')

# 4. Drop rows with all NaN values


df = df.dropna(how="all")
print(df)

print('---------------------------------------------------------------
-----------------')

# 5. Check each column data type


print("\nData Types for Each Column:")
print(df.dtypes)

print('---------------------------------------------------------------
-----------------')

# 6. Generate descriptive statistics using describe()


print("\nDescriptive Statistics:")
print(df.describe())

Null Values in Each Column:


      pclass  survived   name    sex    age  sibsp  parch  ticket   fare  cabin  embarked   boat   body  home.dest
0      False     False  False  False  False  False  False   False  False  False     False  False   True      False
1      False     False  False  False  False  False  False   False  False  False     False  False   True      False
2      False     False  False  False  False  False  False   False  False  False     False   True   True      False
3      False     False  False  False  False  False  False   False  False  False     False   True  False      False
4      False     False  False  False  False  False  False   False  False  False     False   True   True      False
...      ...       ...    ...    ...    ...    ...    ...     ...    ...    ...       ...    ...    ...        ...
1305   False     False  False  False   True  False  False   False  False   True     False   True   True       True
1306   False     False  False  False  False  False  False   False  False   True     False   True  False       True
1307   False     False  False  False  False  False  False   False  False   True     False   True   True       True
1308   False     False  False  False  False  False  False   False  False   True     False   True   True       True
1309    True      True   True   True   True   True   True    True   True   True      True   True   True       True

[1310 rows x 14 columns]
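The raw boolean frame above is hard to scan; a per-column count, a small addition not in the original listing, answers step 3 more directly:

# Count the missing entries per column instead of printing the full mask.
print(df.isnull().sum())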


----------------------------------------------------------------------
----------
      pclass  survived                                              name     sex      age  sibsp  parch  ticket      fare    cabin embarked boat   body                        home.dest
0        1.0       1.0                     Allen, Miss. Elisabeth Walton  female  29.0000    0.0    0.0   24160  211.3375       B5        S    2    NaN                     St Louis, MO
1        1.0       1.0                    Allison, Master. Hudson Trevor    male   0.9167    1.0    2.0  113781  151.5500  C22 C26        S   11    NaN  Montreal, PQ / Chesterville, ON
2        1.0       0.0                      Allison, Miss. Helen Loraine  female   2.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN    NaN  Montreal, PQ / Chesterville, ON
3        1.0       0.0              Allison, Mr. Hudson Joshua Creighton    male  30.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN  135.0  Montreal, PQ / Chesterville, ON
4        1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN    NaN  Montreal, PQ / Chesterville, ON
...      ...       ...                                               ...     ...      ...    ...    ...     ...       ...      ...      ...  ...    ...                              ...
1304     3.0       0.0                              Zabour, Miss. Hileni  female  14.5000    1.0    0.0    2665   14.4542      NaN        C  NaN  328.0                              NaN
1305     3.0       0.0                             Zabour, Miss. Thamine  female      NaN    1.0    0.0    2665   14.4542      NaN        C  NaN    NaN                              NaN
1306     3.0       0.0                         Zakarian, Mr. Mapriededer    male  26.5000    0.0    0.0    2656    7.2250      NaN        C  NaN  304.0                              NaN
1307     3.0       0.0                               Zakarian, Mr. Ortin    male  27.0000    0.0    0.0    2670    7.2250      NaN        C  NaN    NaN                              NaN
1308     3.0       0.0                                Zimmerman, Mr. Leo    male  29.0000    0.0    0.0  315082    7.8750      NaN        S  NaN    NaN                              NaN

[1309 rows x 14 columns]


----------------------------------------------------------------------
----------
Data Types for Each Column:
pclass float64
survived float64
name object
sex object
age float64
sibsp float64
parch float64
ticket object
fare float64
cabin object
embarked object
boat object
body float64
home.dest object
dtype: object
----------------------------------------------------------------------
----------

Descriptive Statistics:
            pclass     survived          age        sibsp        parch
count  1309.000000  1309.000000  1046.000000  1309.000000  1309.000000
mean      2.294882     0.381971    29.881135     0.498854     0.385027
std       0.837836     0.486055    14.413500     1.041658     0.865560
min       1.000000     0.000000     0.166700     0.000000     0.000000
25%       2.000000     0.000000    21.000000     0.000000     0.000000
50%       3.000000     0.000000    28.000000     0.000000     0.000000
75%       3.000000     1.000000    39.000000     1.000000     0.000000
max       3.000000     1.000000    80.000000     8.000000     9.000000

              fare        body
count  1308.000000  121.000000
mean     33.295479  160.809917
std      51.758668   97.696922
min       0.000000    1.000000
25%       7.895800   72.000000
50%      14.454200  155.000000
75%      31.275000  256.000000
max     512.329200  328.000000
# 7. Fill NaN values in each column

# a. Fill with median for numeric columns
numeric_columns = df.select_dtypes(include=['number'])
df_filled_median = df.copy()
# numeric_only=True keeps the median computation to numeric columns on recent pandas
df_filled_median[numeric_columns.columns] = (
    df_filled_median[numeric_columns.columns].fillna(df_filled_median.median(numeric_only=True))
)
print("a. Fill with median for numeric columns:")
print(df_filled_median)
print('--------------------------------------------------------------------------------')

# b. Fill with a constant (e.g., 0)
df_filled_constant = df.fillna(0)
print("b. Fill with a constant:")
print(df_filled_constant)
print('--------------------------------------------------------------------------------')

# c. Forward fill
df_filled_ffill = df.ffill()
print("c. Forward Fill:")
print(df_filled_ffill)

# d. Backward fill
df_filled_bfill = df.bfill()
print("d. Backward Fill:")
print(df_filled_bfill)
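For non-numeric columns such as embarked, a common companion strategy, not exercised in the original practical, is to fill with the most frequent value; a minimal sketch:

# Fill missing embarkation ports with the modal value of the column.
df_filled_mode = df.copy()
df_filled_mode['embarked'] = df_filled_mode['embarked'].fillna(df_filled_mode['embarked'].mode()[0])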

a. Fill with median for numeric columns:


      pclass  survived                                              name     sex      age  sibsp  parch  ticket      fare    cabin embarked boat   body                        home.dest
0        1.0       1.0                     Allen, Miss. Elisabeth Walton  female  29.0000    0.0    0.0   24160  211.3375       B5        S    2  155.0                     St Louis, MO
1        1.0       1.0                    Allison, Master. Hudson Trevor    male   0.9167    1.0    2.0  113781  151.5500  C22 C26        S   11  155.0  Montreal, PQ / Chesterville, ON
2        1.0       0.0                      Allison, Miss. Helen Loraine  female   2.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN  155.0  Montreal, PQ / Chesterville, ON
3        1.0       0.0              Allison, Mr. Hudson Joshua Creighton    male  30.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN  135.0  Montreal, PQ / Chesterville, ON
4        1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000    1.0    2.0  113781  151.5500  C22 C26        S  NaN  155.0  Montreal, PQ / Chesterville, ON
...      ...       ...                                               ...     ...      ...    ...    ...     ...       ...      ...      ...  ...    ...                              ...
1304     3.0       0.0                              Zabour, Miss. Hileni  female  14.5000    1.0    0.0    2665   14.4542      NaN        C  NaN  328.0                              NaN
1305     3.0       0.0                             Zabour, Miss. Thamine  female  28.0000    1.0    0.0    2665   14.4542      NaN        C  NaN  155.0                              NaN
1306     3.0       0.0                         Zakarian, Mr. Mapriededer    male  26.5000    0.0    0.0    2656    7.2250      NaN        C  NaN  304.0                              NaN
1307     3.0       0.0                               Zakarian, Mr. Ortin    male  27.0000    0.0    0.0    2670    7.2250      NaN        C  NaN  155.0                              NaN
1308     3.0       0.0                                Zimmerman, Mr. Leo    male  29.0000    0.0    0.0  315082    7.8750      NaN        S  NaN  155.0                              NaN

[1309 rows x 14 columns]


----------------------------------------------------------------------
----------
b. Fill with a constant:
      pclass  survived                                              name     sex      age  sibsp  parch  ticket      fare    cabin embarked boat   body                        home.dest
0        1.0       1.0                     Allen, Miss. Elisabeth Walton  female  29.0000    0.0    0.0   24160  211.3375       B5        S    2    0.0                     St Louis, MO
1        1.0       1.0                    Allison, Master. Hudson Trevor    male   0.9167    1.0    2.0  113781  151.5500  C22 C26        S   11    0.0  Montreal, PQ / Chesterville, ON
2        1.0       0.0                      Allison, Miss. Helen Loraine  female   2.0000    1.0    2.0  113781  151.5500  C22 C26        S    0    0.0  Montreal, PQ / Chesterville, ON
3        1.0       0.0              Allison, Mr. Hudson Joshua Creighton    male  30.0000    1.0    2.0  113781  151.5500  C22 C26        S    0  135.0  Montreal, PQ / Chesterville, ON
4        1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000    1.0    2.0  113781  151.5500  C22 C26        S    0    0.0  Montreal, PQ / Chesterville, ON
...      ...       ...                                               ...     ...      ...    ...    ...     ...       ...      ...      ...  ...    ...                              ...
1304     3.0       0.0                              Zabour, Miss. Hileni  female  14.5000    1.0    0.0    2665   14.4542        0        C    0  328.0                                0
1305     3.0       0.0                             Zabour, Miss. Thamine  female   0.0000    1.0    0.0    2665   14.4542        0        C    0    0.0                                0
1306     3.0       0.0                         Zakarian, Mr. Mapriededer    male  26.5000    0.0    0.0    2656    7.2250        0        C    0  304.0                                0
1307     3.0       0.0                               Zakarian, Mr. Ortin    male  27.0000    0.0    0.0    2670    7.2250        0        C    0    0.0                                0
1308     3.0       0.0                                Zimmerman, Mr. Leo    male  29.0000    0.0    0.0  315082    7.8750        0        S    0    0.0                                0

[1309 rows x 14 columns]


----------------------------------------------------------------------
----------
c. Forward Fill:
      pclass  survived                                              name     sex      age  sibsp  parch  ticket      fare    cabin embarked boat   body                        home.dest
0        1.0       1.0                     Allen, Miss. Elisabeth Walton  female  29.0000    0.0    0.0   24160  211.3375       B5        S    2    NaN                     St Louis, MO
1        1.0       1.0                    Allison, Master. Hudson Trevor    male   0.9167    1.0    2.0  113781  151.5500  C22 C26        S   11    NaN  Montreal, PQ / Chesterville, ON
2        1.0       0.0                      Allison, Miss. Helen Loraine  female   2.0000    1.0    2.0  113781  151.5500  C22 C26        S   11    NaN  Montreal, PQ / Chesterville, ON
3        1.0       0.0              Allison, Mr. Hudson Joshua Creighton    male  30.0000    1.0    2.0  113781  151.5500  C22 C26        S   11  135.0  Montreal, PQ / Chesterville, ON
4        1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000    1.0    2.0  113781  151.5500  C22 C26        S   11  135.0  Montreal, PQ / Chesterville, ON
...      ...       ...                                               ...     ...      ...    ...    ...     ...       ...      ...      ...  ...    ...                              ...
1304     3.0       0.0                              Zabour, Miss. Hileni  female  14.5000    1.0    0.0    2665   14.4542      F38        C    C  328.0   Antwerp, Belgium / Stanton, OH
1305     3.0       0.0                             Zabour, Miss. Thamine  female  14.5000    1.0    0.0    2665   14.4542      F38        C    C  328.0   Antwerp, Belgium / Stanton, OH
1306     3.0       0.0                         Zakarian, Mr. Mapriededer    male  26.5000    0.0    0.0    2656    7.2250      F38        C    C  304.0   Antwerp, Belgium / Stanton, OH
1307     3.0       0.0                               Zakarian, Mr. Ortin    male  27.0000    0.0    0.0    2670    7.2250      F38        C    C  304.0   Antwerp, Belgium / Stanton, OH
1308     3.0       0.0                                Zimmerman, Mr. Leo    male  29.0000    0.0    0.0  315082    7.8750      F38        S    C  304.0   Antwerp, Belgium / Stanton, OH

[1309 rows x 14 columns]


d. Backward Fill:
      pclass  survived                                              name     sex      age  sibsp  parch  ticket      fare    cabin embarked boat   body                        home.dest
0        1.0       1.0                     Allen, Miss. Elisabeth Walton  female  29.0000    0.0    0.0   24160  211.3375       B5        S    2  135.0                     St Louis, MO
1        1.0       1.0                    Allison, Master. Hudson Trevor    male   0.9167    1.0    2.0  113781  151.5500  C22 C26        S   11  135.0  Montreal, PQ / Chesterville, ON
2        1.0       0.0                      Allison, Miss. Helen Loraine  female   2.0000    1.0    2.0  113781  151.5500  C22 C26        S    3  135.0  Montreal, PQ / Chesterville, ON
3        1.0       0.0              Allison, Mr. Hudson Joshua Creighton    male  30.0000    1.0    2.0  113781  151.5500  C22 C26        S    3  135.0  Montreal, PQ / Chesterville, ON
4        1.0       0.0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000    1.0    2.0  113781  151.5500  C22 C26        S    3   22.0  Montreal, PQ / Chesterville, ON
...      ...       ...                                               ...     ...      ...    ...    ...     ...       ...      ...      ...  ...    ...                              ...
1304     3.0       0.0                              Zabour, Miss. Hileni  female  14.5000    1.0    0.0    2665   14.4542      NaN        C  NaN  328.0                              NaN
1305     3.0       0.0                             Zabour, Miss. Thamine  female  26.5000    1.0    0.0    2665   14.4542      NaN        C  NaN  304.0                              NaN
1306     3.0       0.0                         Zakarian, Mr. Mapriededer    male  26.5000    0.0    0.0    2656    7.2250      NaN        C  NaN  304.0                              NaN
1307     3.0       0.0                               Zakarian, Mr. Ortin    male  27.0000    0.0    0.0    2670    7.2250      NaN        C  NaN    NaN                              NaN
1308     3.0       0.0                                Zimmerman, Mr. Leo    male  29.0000    0.0    0.0  315082    7.8750      NaN        S  NaN    NaN                              NaN

[1309 rows x 14 columns]

print("8. Convert appropriate columns to numeric data types:")


df['age'] = pd.to_numeric(df['age'], errors='coerce')
print(df['age'])
print('--------------------------------------------------------------------------------')
df['fare'] = pd.to_numeric(df['fare'], errors='coerce')
print(df['fare'])

8. Convert appropriate columns to numeric data types:


0 29.0000
1 0.9167
2 2.0000
3 30.0000
4 25.0000
...
1304 14.5000
1305 NaN
1306 26.5000
1307 27.0000
1308 29.0000
Name: age, Length: 1309, dtype: float64
----------------------------------------------------------------------
----------
0 211.3375
1 151.5500
2 151.5500
3 151.5500
4 151.5500
...
1304 14.4542
1305 14.4542
1306 7.2250
1307 7.2250
1308 7.8750
Name: fare, Length: 1309, dtype: float64
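age and fare are already floating point here, so to_numeric mostly guards against stray strings. Columns that are conceptually integer, such as pclass and survived, could additionally be cast to pandas' nullable integer type; a hedged suggestion, not part of the original:

# Nullable Int64 keeps integer semantics while still allowing missing values.
df['pclass'] = df['pclass'].astype('Int64')
df['survived'] = df['survived'].astype('Int64')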

print("9. Plot each numeric column with a box plot:")


numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

for column in numeric_columns:
    df.boxplot(column=column)
    plt.title(f"{column} Boxplot")
    plt.show()

9. Plot each numeric column with a box plot:


import pandas as pd
import matplotlib.pyplot as plt

# Plot boxplots (drop NaNs first; matplotlib's boxplot does not ignore them)
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.boxplot(df['age'].dropna())
plt.title("Age Boxplot")

plt.subplot(1, 2, 2)
plt.boxplot(df['fare'].dropna())
plt.title("Fare Boxplot")

plt.show()

# Identify and address outliers for the 'age' column
Q1_age = df['age'].quantile(0.25)
Q3_age = df['age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
outlier_threshold_age = 1.5 * IQR_age

print("Remove or cap outliers for 'age':")
df2 = df[(df['age'] >= (Q1_age - outlier_threshold_age)) & (df['age'] <= (Q3_age + outlier_threshold_age))]
print(df2)
print('--------------------------------------------------------------------------------')

# Identify and address outliers for the 'fare' column
Q1_fare = df['fare'].quantile(0.25)
Q3_fare = df['fare'].quantile(0.75)
IQR_fare = Q3_fare - Q1_fare
outlier_threshold_fare = 1.5 * IQR_fare

print("Remove or cap outliers for 'fare':")
df3 = df[(df['fare'] >= (Q1_fare - outlier_threshold_fare)) & (df['fare'] <= (Q3_fare + outlier_threshold_fare))]
print(df3)
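The filters above implement the "remove" option (note that rows with a missing age also drop out, since NaN comparisons evaluate to False). The "cap" alternative mentioned in the print statements can be sketched with clip, which pins values to the IQR fences instead of dropping rows; a minimal sketch, not part of the original:

# Cap instead of remove: values outside the fences are pulled back to them.
df_capped = df.copy()
df_capped['age'] = df_capped['age'].clip(lower=Q1_age - outlier_threshold_age,
                                         upper=Q3_age + outlier_threshold_age)
df_capped['fare'] = df_capped['fare'].clip(lower=Q1_fare - outlier_threshold_fare,
                                           upper=Q3_fare + outlier_threshold_fare)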

Remove or cap outliers for 'age':


      pclass  survived                                               name     sex   age  sibsp  parch  ticket     fare cabin embarked boat   body            home.dest
5        1.0       1.0                                Anderson, Mr. Harry    male  48.0    0.0    0.0   19952  26.5500   E12        S    3    NaN         New York, NY
7        1.0       0.0                             Andrews, Mr. Thomas Jr    male  39.0    0.0    0.0  112050   0.0000   A36        S  NaN    NaN          Belfast, NI
8        1.0       1.0      Appleton, Mrs. Edward Dale (Charlotte Lamson)  female  53.0    2.0    0.0   11769  51.4792  C101        S    D    NaN  Bayside, Queens, NY
20       1.0       1.0                      Beckwith, Mr. Richard Leonard    male  37.0    1.0    1.0   11751  52.5542   D35        S    5    NaN         New York, NY
21       1.0       1.0  Beckwith, Mrs. Richard Leonard (Sallie Monypeny)  female  47.0    1.0    1.0   11751  52.5542   D35        S    5    NaN         New York, NY
...      ...       ...                                                ...     ...   ...    ...    ...     ...      ...   ...      ...  ...    ...                  ...
1301     3.0       0.0                               Youseff, Mr. Gerious    male  45.5    0.0    0.0    2628   7.2250   NaN        C  NaN  312.0                  NaN
1304     3.0       0.0                               Zabour, Miss. Hileni  female  14.5    1.0    0.0    2665  14.4542   NaN        C  NaN  328.0                  NaN
1306     3.0       0.0                          Zakarian, Mr. Mapriededer    male  26.5    0.0    0.0    2656   7.2250   NaN        C  NaN  304.0                  NaN
1307     3.0       0.0                                Zakarian, Mr. Ortin    male  27.0    0.0    0.0    2670   7.2250   NaN        C  NaN    NaN                  NaN
1308     3.0       0.0                                 Zimmerman, Mr. Leo    male  29.0    0.0    0.0  315082   7.8750   NaN        S  NaN    NaN                  NaN

[825 rows x 14 columns]


----------------------------------------------------------------------
----------
Remove or cap outliers for 'fare':
      pclass  survived                                               name     sex   age  sibsp  parch  ticket     fare cabin embarked boat   body            home.dest
5        1.0       1.0                                Anderson, Mr. Harry    male  48.0    0.0    0.0   19952  26.5500   E12        S    3    NaN         New York, NY
7        1.0       0.0                             Andrews, Mr. Thomas Jr    male  39.0    0.0    0.0  112050   0.0000   A36        S  NaN    NaN          Belfast, NI
8        1.0       1.0      Appleton, Mrs. Edward Dale (Charlotte Lamson)  female  53.0    2.0    0.0   11769  51.4792  C101        S    D    NaN  Bayside, Queens, NY
20       1.0       1.0                      Beckwith, Mr. Richard Leonard    male  37.0    1.0    1.0   11751  52.5542   D35        S    5    NaN         New York, NY
21       1.0       1.0  Beckwith, Mrs. Richard Leonard (Sallie Monypeny)  female  47.0    1.0    1.0   11751  52.5542   D35        S    5    NaN         New York, NY
...      ...       ...                                                ...     ...   ...    ...    ...     ...      ...   ...      ...  ...    ...                  ...
1301     3.0       0.0                               Youseff, Mr. Gerious    male  45.5    0.0    0.0    2628   7.2250   NaN        C  NaN  312.0                  NaN
1304     3.0       0.0                               Zabour, Miss. Hileni  female  14.5    1.0    0.0    2665  14.4542   NaN        C  NaN  328.0                  NaN
1306     3.0       0.0                          Zakarian, Mr. Mapriededer    male  26.5    0.0    0.0    2656   7.2250   NaN        C  NaN  304.0                  NaN
1307     3.0       0.0                                Zakarian, Mr. Ortin    male  27.0    0.0    0.0    2670   7.2250   NaN        C  NaN    NaN                  NaN
1308     3.0       0.0                                 Zimmerman, Mr. Leo    male  29.0    0.0    0.0  315082   7.8750   NaN        S  NaN    NaN                  NaN

[833 rows x 14 columns]

# Display the cleaned dataset after handling outliers
# (df3 holds the frame with fare outliers removed, which is what is shown below)
print("Cleaned Dataset:")
print(df3)

Cleaned Dataset:
      pclass  survived                                               name     sex   age  sibsp  parch  ticket     fare cabin embarked boat   body            home.dest
5        1.0       1.0                                Anderson, Mr. Harry    male  48.0    0.0    0.0   19952  26.5500   E12        S    3    NaN         New York, NY
7        1.0       0.0                             Andrews, Mr. Thomas Jr    male  39.0    0.0    0.0  112050   0.0000   A36        S  NaN    NaN          Belfast, NI
8        1.0       1.0      Appleton, Mrs. Edward Dale (Charlotte Lamson)  female  53.0    2.0    0.0   11769  51.4792  C101        S    D    NaN  Bayside, Queens, NY
20       1.0       1.0                      Beckwith, Mr. Richard Leonard    male  37.0    1.0    1.0   11751  52.5542   D35        S    5    NaN         New York, NY
21       1.0       1.0  Beckwith, Mrs. Richard Leonard (Sallie Monypeny)  female  47.0    1.0    1.0   11751  52.5542   D35        S    5    NaN         New York, NY
...      ...       ...                                                ...     ...   ...    ...    ...     ...      ...   ...      ...  ...    ...                  ...
1301     3.0       0.0                               Youseff, Mr. Gerious    male  45.5    0.0    0.0    2628   7.2250   NaN        C  NaN  312.0                  NaN
1304     3.0       0.0                               Zabour, Miss. Hileni  female  14.5    1.0    0.0    2665  14.4542   NaN        C  NaN  328.0                  NaN
1306     3.0       0.0                          Zakarian, Mr. Mapriededer    male  26.5    0.0    0.0    2656   7.2250   NaN        C  NaN  304.0                  NaN
1307     3.0       0.0                                Zakarian, Mr. Ortin    male  27.0    0.0    0.0    2670   7.2250   NaN        C  NaN    NaN                  NaN
1308     3.0       0.0                                 Zimmerman, Mr. Leo    male  29.0    0.0    0.0  315082   7.8750   NaN        S  NaN    NaN                  NaN

[833 rows x 14 columns]
