Professional Documents
Culture Documents
Data Cleaning With Python by Raju Gajelli
import numpy as np
import pandas as pd
import re
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas; consider narrowing it to a category.
warnings.filterwarnings('ignore')
[ ]:
[24]: df
1
99 Rob Robertson Graphic Designer USA
Email \
0 taras.kolcio@evidera.com
1 duane.centola@microban.com
2 jolonr@millerslab.com
3 d.schmieder@schumacherandfarley.com
4 sukumar@harpercollins-india.com
.. …
95 nicki.robson@bp.com
96 melanie.wittmann@sncf.com
97 girmanna@revelers.com
98 cdezoysa@freewheel.tv
99 rrobertson@norgren.com
Linkedin
0 http://www.linkedin.com/in/taras-kolcio-73ab0921
1 http://www.linkedin.com/in/duane-centola-86831712
2 NaN
3 NaN
4 NaN
.. …
95 http://www.linkedin.com/in/nicki-robson-2b9aa989
96 NaN
97 NaN
98 http://www.linkedin.com/in/chamildz
99 NaN
df.columns
[ ]:
[26]: # Check each column's dtype and non-null count, plus the frame's memory usage
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
# Column Non-Null Count Dtype
2
--- ------ -------------- -----
0 CompanyName 100 non-null object
1 WebAddress 100 non-null object
2 ContactName 100 non-null object
3 Title 100 non-null object
4 Country 100 non-null object
5 Email 100 non-null object
6 Linkedin 47 non-null object
dtypes: object(7)
memory usage: 5.6+ KB
[ ]:
1 Renaming Columns
[27]: df.rename(columns={"CompanyName":"Company" , "WebAddress":"Website" ,␣
↪"ContactName" : "Contact Name" },inplace = True)
[28]: df
Email \
3
0 taras.kolcio@evidera.com
1 duane.centola@microban.com
2 jolonr@millerslab.com
3 d.schmieder@schumacherandfarley.com
4 sukumar@harpercollins-india.com
.. …
95 nicki.robson@bp.com
96 melanie.wittmann@sncf.com
97 girmanna@revelers.com
98 cdezoysa@freewheel.tv
99 rrobertson@norgren.com
Linkedin
0 http://www.linkedin.com/in/taras-kolcio-73ab0921
1 http://www.linkedin.com/in/duane-centola-86831712
2 NaN
3 NaN
4 NaN
.. …
95 http://www.linkedin.com/in/nicki-robson-2b9aa989
96 NaN
97 NaN
98 http://www.linkedin.com/in/chamildz
99 NaN
2 Removing Spaces
[29]: # Trim leading and trailing whitespace from every company name
df['Company'] = df['Company'].str.strip()
[ ]:
4
5 Checking for Null Values
[32]: # Count the missing values in each column
df.isnull().sum()
[32]: Company 0
Website 0
Contact Name 0
Title 0
Country 0
Email 0
Linkedin 53
dtype: int64
[ ]:
[33]: # Replace missing LinkedIn URLs with an empty string so the column has no nulls
df["Linkedin"]=df["Linkedin"].fillna("")
[34]: df
Email \
0 taras.kolcio@evidera.com
5
1 duane.centola@microban.com
2 jolonr@millerslab.com
3 d.schmieder@schumacherandfarley.com
4 sukumar@harpercollins-india.com
.. …
95 nicki.robson@bp.com
96 melanie.wittmann@sncf.com
97 girmanna@revelers.com
98 cdezoysa@freewheel.tv
99 rrobertson@norgren.com
Linkedin
0 http://www.linkedin.com/in/taras-kolcio-73ab0921
1 http://www.linkedin.com/in/duane-centola-86831712
2
3
4
.. …
95 http://www.linkedin.com/in/nicki-robson-2b9aa989
96
97
98 http://www.linkedin.com/in/chamildz
99
[35]: # Re-count missing values per column to confirm none remain
df.isnull().sum()
[35]: Company 0
Website 0
Contact Name 0
Title 0
Country 0
Email 0
Linkedin 0
dtype: int64
[ ]:
6
6.1 Ensuring emails are in lowercase for consistency
[37]: # Lowercase every e-mail address so the same address is always spelled the same way
df['Email'] = df['Email'].str.lower()
[39]: # NOTE(review): the output below shows None in Linkedin, although cell [33]
# filled missing values with ""; execution counts 36 and 38 are not shown here,
# so presumably an unshown cell changed the column. Confirm with Restart & Run All.
df
Email \
0 taras.kolcio@evidera.com
1 duane.centola@microban.com
2 jolonr@millerslab.com
3 d.schmieder@schumacherandfarley.com
4 sukumar@harpercollins-india.com
.. …
95 nicki.robson@bp.com
96 melanie.wittmann@sncf.com
97 girmanna@revelers.com
7
98 cdezoysa@freewheel.tv
99 rrobertson@norgren.com
Linkedin
0 http://www.linkedin.com/in/taras-kolcio-73ab0921
1 http://www.linkedin.com/in/duane-centola-86831712
2 None
3 None
4 None
.. …
95 http://www.linkedin.com/in/nicki-robson-2b9aa989
96 None
97 None
98 http://www.linkedin.com/in/chamildz
99 None
[ ]: