Professional Documents
Culture Documents
Pandas
Pandas
In [2]:
import pandas as pd
In [1]:
# Load 'Tweets.csv' into df. This needs `pd` from the import cell to exist in the kernel:
# the NameError recorded below comes from this cell having been executed (In [1]) before
# the import cell (In [2]).
df = pd.read_csv('Tweets.csv')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-1-8cbae2239962> in <module>
----> 1 df = pd.read_csv('Tweets.csv')
NameError: name 'pd' is not defined
In [4]:
df.shape
In [5]:
df.columns
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tweet_id 14640 non-null int64
1 airline_sentiment 14640 non-null object
2 airline_sentiment_confidence 14640 non-null float64
3 negativereason 9178 non-null object
4 negativereason_confidence 10522 non-null float64
5 airline 14640 non-null object
6 airline_sentiment_gold 40 non-null object
7 name 14640 non-null object
8 negativereason_gold 32 non-null object
9 retweet_count 14640 non-null int64
10 text 14640 non-null object
11 tweet_coord 1019 non-null object
12 tweet_created 14640 non-null object
13 tweet_location 9907 non-null object
14 user_timezone 9820 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB
In [7]:
df.describe()
localhost:8888/nbconvert/html/pandas.ipynb?download=false 1/23
4/4/2021 pandas
In [12]:
df
Customer
14636 569587371693355008 negative 1.0000
Service Issue
localhost:8888/nbconvert/html/pandas.ipynb?download=false 2/23
4/4/2021 pandas
Customer
14638 569587188687634433 negative 1.0000
Service Issue
In [8]:
df.head()
In [9]:
df.head(10)
localhost:8888/nbconvert/html/pandas.ipynb?download=false 3/23
4/4/2021 pandas
In [15]:
df.head(60)
localhost:8888/nbconvert/html/pandas.ipynb?download=false 4/23
4/4/2021 pandas
localhost:8888/nbconvert/html/pandas.ipynb?download=false 5/23
4/4/2021 pandas
Customer
24 570256553502068736 negative 1.0000
Service Issue
Customer
25 570249102404923392 negative 1.0000
Service Issue
Flight Booking
30 570114021854212096 negative 1.0000
Problems
Customer
32 570088404156698625 negative 1.0000
Service Issue
Customer
33 570084582780899328 negative 1.0000
Service Issue
localhost:8888/nbconvert/html/pandas.ipynb?download=false 6/23
4/4/2021 pandas
Customer
39 570035876845084672 negative 1.0000
Service Issue
Flight Booking
41 570025482344898560 negative 0.6688
Problems
localhost:8888/nbconvert/html/pandas.ipynb?download=false 7/23
4/4/2021 pandas
Flight Booking
55 569996412286582784 negative 0.6939
Problems
In [16]:
df.head(70)
localhost:8888/nbconvert/html/pandas.ipynb?download=false 8/23
4/4/2021 pandas
Customer
66 569976620158578688 negative 1.0000
Service Issue
70 rows × 15 columns
In [17]:
df.tail()
Customer
14636 569587371693355008 negative 1.0000
Service Issue
Customer
14638 569587188687634433 negative 1.0000
Service Issue
localhost:8888/nbconvert/html/pandas.ipynb?download=false 9/23
4/4/2021 pandas
In [18]:
df.tail(10)
Customer
14636 569587371693355008 negative 1.0000
Service Issue
Customer
14638 569587188687634433 negative 1.0000
Service Issue
localhost:8888/nbconvert/html/pandas.ipynb?download=false 10/23
4/4/2021 pandas
In [20]:
df[100:110]
Customer
106 569884551712886785 negative 1.0000
Service Issue
Flight Booking
107 569884407852437504 negative 1.0000
Problems
In [10]:
df['tweet_id']
Out[10]: 0 570306133677760513
localhost:8888/nbconvert/html/pandas.ipynb?download=false 11/23
4/4/2021 pandas
1 570301130888122368
2 570301083672813571
3 570301031407624196
4 570300817074462722
...
14635 569587686496825344
14636 569587371693355008
14637 569587242672398336
14638 569587188687634433
14639 569587140490866689
Name: tweet_id, Length: 14640, dtype: int64
In [11]:
type(df['tweet_id'])
Out[11]: pandas.core.series.Series
In [12]:
type(df[['tweet_id']])
Out[12]: pandas.core.frame.DataFrame
In [14]:
df[['tweet_id', 'airline_sentiment']]
0 570306133677760513 neutral
1 570301130888122368 positive
2 570301083672813571 neutral
3 570301031407624196 negative
4 570300817074462722 negative
In [27]:
df[['negativereason','tweet_id', 'airline_sentiment']]
localhost:8888/nbconvert/html/pandas.ipynb?download=false 12/23
4/4/2021 pandas
In [15]:
# Load 'Suicides in India 2001-2012.csv' (path relative to the notebook's working directory) into df2.
df2 = pd.read_csv('Suicides in India 2001-2012.csv')
In [16]:
df2
A&N
0 2001 Causes Illness (Aids/STD) Female 0-14 0
Islands
A&N Cancellation/Non-Settlement of
2 2001 Causes Female 0-14 0
Islands Marriage
A&N
3 2001 Causes Physical Abuse (Rape/Incest Etc.) Female 0-14 0
Islands
A&N
4 2001 Causes Dowry Dispute Female 0-14 0
Islands
West
237514 2012 Social_Status Seperated Male 0-100+ 149
Bengal
West
237515 2012 Social_Status Widowed/Widower Male 0-100+ 233
Bengal
West
237516 2012 Social_Status Married Male 0-100+ 5451
Bengal
West
237517 2012 Social_Status Divorcee Male 0-100+ 189
Bengal
West
237518 2012 Social_Status Never Married Male 0-100+ 2658
Bengal
localhost:8888/nbconvert/html/pandas.ipynb?download=false 13/23
4/4/2021 pandas
In [17]:
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237519 entries, 0 to 237518
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 State 237519 non-null object
1 Year 237519 non-null int64
2 Type_code 237519 non-null object
3 Type 237519 non-null object
4 Gender 237519 non-null object
5 Age_group 237519 non-null object
6 Total 237519 non-null int64
dtypes: int64(2), object(5)
memory usage: 12.7+ MB
In [18]:
df2.shape
Out[18]: (237519, 7)
In [19]:
df2['Year'].unique()
Out[19]: array([2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
2012], dtype=int64)
In [20]:
df2['Year'].value_counts()
In [37]:
df2['Type'].value_counts()
In [38]:
df2['Gender'].value_counts()
localhost:8888/nbconvert/html/pandas.ipynb?download=false 14/23
4/4/2021 pandas
In [43]:
df2['State'].value_counts()
In [21]:
df2[df2['State'] == 'Karnataka']
localhost:8888/nbconvert/html/pandas.ipynb?download=false 15/23
4/4/2021 pandas
In [23]:
# Rows whose State is either Karnataka or Punjab.
df2[df2['State'].isin(['Karnataka', 'Punjab'])]
In [25]:
# (rows, columns) of the subset where State is Bihar, Gender is Male and Type is Cancer.
# The exported line was cut off after the trailing '.'; the recorded output is a
# (rows, columns) tuple, so the attribute being read is .shape.
df2[(df2['State'] == 'Bihar') & (df2['Gender'] == 'Male') & (df2['Type'] == 'Cancer')].shape
Out[25]: (60, 7)
In [45]:
# Subset of df2 restricted to rows whose State is Karnataka.
state_is_karnataka = df2['State'] == 'Karnataka'
karnataka_df = df2[state_is_karnataka]
In [46]:
karnataka_df
localhost:8888/nbconvert/html/pandas.ipynb?download=false 16/23
4/4/2021 pandas
In [47]:
# Write the Karnataka subset to 'karnataka_df.xlsx' in the working directory.
# The row index is written as the first column unless index=False is passed.
karnataka_df.to_excel('karnataka_df.xlsx')
In [50]:
karnataka_df[karnataka_df['Year'] == 2005]
110409 Karnataka 2005 Causes Other Causes (Please Specity) Female 0-14 38
In [51]:
# Rows for Kerala in the year 2005.
state_is_kerala = df2['State'] == 'Kerala'
year_is_2005 = df2['Year'] == 2005
df2[state_is_kerala & year_is_2005]
localhost:8888/nbconvert/html/pandas.ipynb?download=false 17/23
4/4/2021 pandas
Cancellation/Non-Settlement of
117202 Kerala 2005 Causes Female 0-14 0
Marriage
In [52]:
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237519 entries, 0 to 237518
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 State 237519 non-null object
1 Year 237519 non-null int64
2 Type_code 237519 non-null object
3 Type 237519 non-null object
4 Gender 237519 non-null object
5 Age_group 237519 non-null object
6 Total 237519 non-null int64
dtypes: int64(2), object(5)
memory usage: 12.7+ MB
In [33]:
df2[(df2['State']=='Gujarat') & (df2['Year'] >= 2003) & (df2['Year'] <= 2005 ) & (df2[
localhost:8888/nbconvert/html/pandas.ipynb?download=false 18/23
4/4/2021 pandas
In [40]:
# Read 'sucide df.csv' into df3, then display its (rows, columns) tuple.
df3 = pd.read_csv('sucide df.csv')
df3.shape
Out[40]: (237530, 7)
In [36]:
df3
A&N
0 2001 Causes Illness (Aids/STD) Female 0-14 0
Islands
A&N Cancellation/Non-Settlement of
2 2001 Causes Female 0-14 0
Islands Marriage
A&N
3 2001 Causes Physical Abuse (Rape/Incest Etc.) Female 0-14 0
Islands
A&N
4 2001 Causes Dowry Dispute Female 0-14 0
Islands
West
237525 2012 Social_Status Seperated Male 0-100+ 149
Bengal
West
237526 2012 Social_Status Widowed/Widower Male 0-100+ 233
Bengal
West
237527 2012 Social_Status Married Male 0-100+ 5451
Bengal
West
237528 2012 Social_Status Divorcee Male 0-100+ 189
Bengal
West
237529 2012 Social_Status Never Married Male 0-100+ 2658
Bengal
In [37]:
# Show rows that exactly repeat an earlier row; by default duplicated() flags every
# occurrence after the first, so the first copy of each duplicate is not listed.
df3[df3.duplicated()]
localhost:8888/nbconvert/html/pandas.ipynb?download=false 19/23
4/4/2021 pandas
237524 West Bengal 2012 Social_Status Never Married Female 0-100+ 1513
237529 West Bengal 2012 Social_Status Never Married Male 0-100+ 2658
In [41]:
# Rebind df3 to a frame with fully duplicated rows removed (the first occurrence is kept by default).
# The original index labels are retained, so the index is no longer contiguous afterwards.
df3 = df3.drop_duplicates()
In [42]:
df3[df3.duplicated()]
In [43]:
# Keep only the first row encountered for each distinct State value; the remaining
# columns simply carry whatever that first row contained (they are not aggregated).
df4 = df3.drop_duplicates(subset=['State'])
In [44]:
df4
A&N
0 2001 Causes Illness (Aids/STD) Female 0-14 0
Islands
Andhra
6712 2001 Causes Paralysis Female 0-14 0
Pradesh
Arunachal Cancellation/Non-Settlement
13503 2001 Causes Female 0-14 0
Pradesh of Marriage
Daman &
53997 2001 Causes Illegitimate Pregnancy Female 0-14 0
Diu
localhost:8888/nbconvert/html/pandas.ipynb?download=false 20/23
4/4/2021 pandas
Cancellation/Non-Settlement
67489 Goa 2001 Causes Female 0-14 0
of Marriage
Himachal
87824 2001 Causes Dowry Dispute Female 0-14 0
Pradesh
Jammu &
94598 2001 Causes Illness (Aids/STD) Female 0-14 0
Kashmir
Madhya
128398 2001 Causes Insanity/Mental Illness Female 0-14 2
Pradesh
Ideological Causes/Hero
135190 Maharashtra 2001 Causes Female 0-14 0
Worshipping
Bankruptcy or Sudden
168857 Odisha 2001 Causes Female 0-14 0
change in Economic
202690 Tamil Nadu 2001 Causes Death of Dear Person Female 0-14 2
Hr.
Total (All
209476 2001 Education_Status Secondary/Intermediate/Pre- Female 0-100+ 2391
India)
Universit
localhost:8888/nbconvert/html/pandas.ipynb?download=false 21/23
4/4/2021 pandas
In [45]:
df3.describe()
In [46]:
import matplotlib.pyplot as plt
In [48]:
churn_df = pd.read_csv('Churn.csv')
In [49]:
churn_df
Out[49]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance
... ... ... ... ... ... ... ... ... ...
localhost:8888/nbconvert/html/pandas.ipynb?download=false 22/23
4/4/2021 pandas
In [50]:
churn_df.describe()
In [51]:
churn_df.boxplot(column=['Age'])
In [53]:
# Keep only rows with Age below 62 and rebind churn_df to the filtered frame.
# NOTE(review): the 62 cutoff is presumably read off the Age boxplot above — confirm.
age_below_cutoff = churn_df['Age'] < 62
churn_df = churn_df[age_below_cutoff]
In [ ]:
localhost:8888/nbconvert/html/pandas.ipynb?download=false 23/23