Download as pdf or txt
Download as pdf or txt
You are on page 1 of 23

4/4/2021 pandas

In [2]:
import pandas as pd

In [1]:
# Load the airline-tweet sentiment CSV into `df`.
# NOTE(review): the NameError printed under this cell is stale output — this cell
# ran as In [1], before the pandas import cell (In [2]). Later cells display `df`,
# so it was presumably re-run afterwards; Restart & Run All to clear the traceback.
df = pd.read_csv('Tweets.csv')

---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-1-8cbae2239962> in <module>
----> 1 df = pd.read_csv('Tweets.csv')

NameError: name 'pd' is not defined

In [4]:
df.shape

Out[4]: (14640, 15)

In [5]:
df.columns

Out[5]: Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 tweet_id 14640 non-null int64
1 airline_sentiment 14640 non-null object
2 airline_sentiment_confidence 14640 non-null float64
3 negativereason 9178 non-null object
4 negativereason_confidence 10522 non-null float64
5 airline 14640 non-null object
6 airline_sentiment_gold 40 non-null object
7 name 14640 non-null object
8 negativereason_gold 32 non-null object
9 retweet_count 14640 non-null int64
10 text 14640 non-null object
11 tweet_coord 1019 non-null object
12 tweet_created 14640 non-null object
13 tweet_location 9907 non-null object
14 user_timezone 9820 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.7+ MB

In [7]:
df.describe()

Out[7]: tweet_id airline_sentiment_confidence negativereason_confidence retweet_count

count 1.464000e+04 14640.000000 10522.000000 14640.000000

localhost:8888/nbconvert/html/pandas.ipynb?download=false 1/23
4/4/2021 pandas

tweet_id airline_sentiment_confidence negativereason_confidence retweet_count

mean 5.692184e+17 0.900169 0.638298 0.082650

std 7.791112e+14 0.162830 0.330440 0.745778

min 5.675883e+17 0.335000 0.000000 0.000000

25% 5.685592e+17 0.692300 0.360600 0.000000

50% 5.694779e+17 1.000000 0.670600 0.000000

75% 5.698905e+17 1.000000 1.000000 0.000000

max 5.703106e+17 1.000000 1.000000 44.000000

In [12]:
df

Out[12]: tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereas

0 570306133677760513 neutral 1.0000 NaN

1 570301130888122368 positive 0.3486 NaN

2 570301083672813571 neutral 0.6837 NaN

3 570301031407624196 negative 1.0000 Bad Flight

4 570300817074462722 negative 1.0000 Can't Tell

... ... ... ... ...

14635 569587686496825344 positive 0.3487 NaN

Customer
14636 569587371693355008 negative 1.0000
Service Issue

14637 569587242672398336 neutral 1.0000 NaN

localhost:8888/nbconvert/html/pandas.ipynb?download=false 2/23
4/4/2021 pandas

tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereas

Customer
14638 569587188687634433 negative 1.0000
Service Issue

14639 569587140490866689 neutral 0.6771 NaN

14640 rows × 15 columns

In [8]:
df.head()

Out[8]: tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_c

0 570306133677760513 neutral 1.0000 NaN

1 570301130888122368 positive 0.3486 NaN

2 570301083672813571 neutral 0.6837 NaN

3 570301031407624196 negative 1.0000 Bad Flight

4 570300817074462722 negative 1.0000 Can't Tell

In [9]:
df.head(10)

Out[9]: tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_c

0 570306133677760513 neutral 1.0000 NaN

1 570301130888122368 positive 0.3486 NaN

localhost:8888/nbconvert/html/pandas.ipynb?download=false 3/23
4/4/2021 pandas

tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_c

2 570301083672813571 neutral 0.6837 NaN

3 570301031407624196 negative 1.0000 Bad Flight

4 570300817074462722 negative 1.0000 Can't Tell

5 570300767074181121 negative 1.0000 Can't Tell

6 570300616901320704 positive 0.6745 NaN

7 570300248553349120 neutral 0.6340 NaN

8 570299953286942721 positive 0.6559 NaN

9 570295459631263746 positive 1.0000 NaN

In [15]:
df.head(60)

Out[15]: tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_

0 570306133677760513 neutral 1.0000 NaN

1 570301130888122368 positive 0.3486 NaN

2 570301083672813571 neutral 0.6837 NaN

3 570301031407624196 negative 1.0000 Bad Flight

4 570300817074462722 negative 1.0000 Can't Tell

localhost:8888/nbconvert/html/pandas.ipynb?download=false 4/23
4/4/2021 pandas

tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_

5 570300767074181121 negative 1.0000 Can't Tell

6 570300616901320704 positive 0.6745 NaN

7 570300248553349120 neutral 0.6340 NaN

8 570299953286942721 positive 0.6559 NaN

9 570295459631263746 positive 1.0000 NaN

10 570294189143031808 neutral 0.6769 NaN

11 570289724453216256 positive 1.0000 NaN

12 570289584061480960 positive 1.0000 NaN

13 570287408438120448 positive 0.6451 NaN

14 570285904809598977 positive 1.0000 NaN

15 570282469121007616 negative 0.6842 Late Flight

16 570277724385734656 positive 1.0000 NaN

17 570276917301137409 negative 1.0000 Bad Flight

18 570270684619923457 positive 1.0000 NaN

19 570267956648792064 positive 1.0000 NaN

20 570265883513384960 negative 0.6705 Can't Tell

localhost:8888/nbconvert/html/pandas.ipynb?download=false 5/23
4/4/2021 pandas

tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_

21 570264145116819457 positive 1.0000 NaN

22 570259420287868928 positive 1.0000 NaN

23 570258822297579520 neutral 1.0000 NaN

Customer
24 570256553502068736 negative 1.0000
Service Issue

Customer
25 570249102404923392 negative 1.0000
Service Issue

26 570239632807370753 negative 1.0000 Can't Tell

27 570217831557677057 neutral 0.6854 NaN

28 570207886493782019 negative 1.0000 Bad Flight

29 570124596180955136 neutral 0.6150 NaN

Flight Booking
30 570114021854212096 negative 1.0000
Problems

31 570094701371469825 neutral 1.0000 NaN

Customer
32 570088404156698625 negative 1.0000
Service Issue

Customer
33 570084582780899328 negative 1.0000
Service Issue

34 570076792993611776 positive 1.0000 NaN

35 570051991277342720 neutral 0.6207 NaN

36 570051381534396416 positive 1.0000 NaN

localhost:8888/nbconvert/html/pandas.ipynb?download=false 6/23
4/4/2021 pandas

tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_

37 570045393565691904 positive 1.0000 NaN

38 570038941497192448 neutral 0.6791 NaN

Customer
39 570035876845084672 negative 1.0000
Service Issue

40 570033593394667521 positive 0.6639 NaN

Flight Booking
41 570025482344898560 negative 0.6688
Problems

42 570016304284901379 neutral 1.0000 NaN

43 570015408788414464 neutral 0.6578 NaN

44 570013523650048002 neutral 1.0000 NaN

45 570012257549070337 positive 1.0000 NaN

46 570011341483843584 neutral 0.6799 NaN

47 570010571707256832 positive 1.0000 NaN

48 570010539499393025 neutral 1.0000 NaN

49 570009713447825408 neutral 0.6436 NaN

50 570009035455344640 neutral 0.6764 NaN

51 570006886012973056 positive 0.6570 NaN

localhost:8888/nbconvert/html/pandas.ipynb?download=false 7/23
4/4/2021 pandas

tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_

52 570004391731847169 neutral 1.0000 NaN

53 570001194900426752 neutral 0.7118 NaN

54 570000071644872704 neutral 1.0000 NaN

Flight Booking
55 569996412286582784 negative 0.6939
Problems

56 569996245462159361 positive 1.0000 NaN

57 569990222609412097 positive 0.6350 NaN

58 569990163209850881 neutral 0.7007 NaN

59 569989504431316993 neutral 1.0000 NaN

In [16]:
df.head(70)

Out[16]: tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_

0 570306133677760513 neutral 1.0000 NaN

1 570301130888122368 positive 0.3486 NaN

2 570301083672813571 neutral 0.6837 NaN

3 570301031407624196 negative 1.0000 Bad Flight

localhost:8888/nbconvert/html/pandas.ipynb?download=false 8/23
4/4/2021 pandas

tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason_

4 570300817074462722 negative 1.0000 Can't Tell

... ... ... ... ...

65 569982307634794497 neutral 0.6814 NaN

Customer
66 569976620158578688 negative 1.0000
Service Issue

67 569973821396152323 negative 1.0000 Late Flight

68 569972508499283968 positive 0.6922 NaN

69 569967019958730753 negative 1.0000 Lost Luggage

70 rows × 15 columns

In [17]:
df.tail()

Out[17]: tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereas

14635 569587686496825344 positive 0.3487 NaN

Customer
14636 569587371693355008 negative 1.0000
Service Issue

14637 569587242672398336 neutral 1.0000 NaN

Customer
14638 569587188687634433 negative 1.0000
Service Issue

localhost:8888/nbconvert/html/pandas.ipynb?download=false 9/23
4/4/2021 pandas

tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereas

14639 569587140490866689 neutral 0.6771 NaN

In [18]:
df.tail(10)

Out[18]: tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereas

14630 569588473050611712 positive 1.0000 NaN

14631 569588464896876545 negative 1.0000 Bad Flight

14632 569587813856841728 neutral 0.6760 NaN

14633 569587705937600512 negative 1.0000 Cancelled Flight

14634 569587691626622976 negative 0.6684 Late Flight

14635 569587686496825344 positive 0.3487 NaN

Customer
14636 569587371693355008 negative 1.0000
Service Issue

14637 569587242672398336 neutral 1.0000 NaN

Customer
14638 569587188687634433 negative 1.0000
Service Issue

14639 569587140490866689 neutral 0.6771 NaN

localhost:8888/nbconvert/html/pandas.ipynb?download=false 10/23
4/4/2021 pandas

In [20]:
df[100:110]

Out[20]: tweet_id airline_sentiment airline_sentiment_confidence negativereason negativereason

100 569894407001939968 neutral 1.0000 NaN

101 569892199690678272 negative 1.0000 Late Flight

102 569891469210755074 neutral 1.0000 NaN

103 569891436100874241 negative 0.6925 Late Flight

104 569887310713479168 negative 1.0000 Late Flight

105 569887049446076416 positive 1.0000 NaN

Customer
106 569884551712886785 negative 1.0000
Service Issue

Flight Booking
107 569884407852437504 negative 1.0000
Problems

108 569881548515708928 neutral 0.6593 NaN

109 569873669700358144 positive 0.6823 NaN

In [10]:
df['tweet_id']

Out[10]: 0 570306133677760513

localhost:8888/nbconvert/html/pandas.ipynb?download=false 11/23
4/4/2021 pandas
1 570301130888122368
2 570301083672813571
3 570301031407624196
4 570300817074462722
...
14635 569587686496825344
14636 569587371693355008
14637 569587242672398336
14638 569587188687634433
14639 569587140490866689
Name: tweet_id, Length: 14640, dtype: int64

In [11]:
type(df['tweet_id'])

Out[11]: pandas.core.series.Series

In [12]:
type(df[['tweet_id']])

Out[12]: pandas.core.frame.DataFrame

In [14]:
df[['tweet_id', 'airline_sentiment']]

Out[14]: tweet_id airline_sentiment

0 570306133677760513 neutral

1 570301130888122368 positive

2 570301083672813571 neutral

3 570301031407624196 negative

4 570300817074462722 negative

... ... ...

14635 569587686496825344 positive

14636 569587371693355008 negative

14637 569587242672398336 neutral

14638 569587188687634433 negative

14639 569587140490866689 neutral

14640 rows × 2 columns

In [27]:
df[['negativereason','tweet_id', 'airline_sentiment']]

Out[27]: negativereason tweet_id airline_sentiment

0 NaN 570306133677760513 neutral

1 NaN 570301130888122368 positive

2 NaN 570301083672813571 neutral

localhost:8888/nbconvert/html/pandas.ipynb?download=false 12/23
4/4/2021 pandas

negativereason tweet_id airline_sentiment

3 Bad Flight 570301031407624196 negative

4 Can't Tell 570300817074462722 negative

... ... ... ...

14635 NaN 569587686496825344 positive

14636 Customer Service Issue 569587371693355008 negative

14637 NaN 569587242672398336 neutral

14638 Customer Service Issue 569587188687634433 negative

14639 NaN 569587140490866689 neutral

14640 rows × 3 columns

In [15]:
# Load the "Suicides in India 2001-2012" CSV into `df2`
# (the display below reports 237519 rows x 7 columns).
df2 = pd.read_csv('Suicides in India 2001-2012.csv')

In [16]:
df2

Out[16]: State Year Type_code Type Gender Age_group Total

0 A&N Islands 2001 Causes Illness (Aids/STD) Female 0-14 0

1 A&N Islands 2001 Causes Bankruptcy or Sudden change in Economic Female 0-14 0

2 A&N Islands 2001 Causes Cancellation/Non-Settlement of Marriage Female 0-14 0

3 A&N Islands 2001 Causes Physical Abuse (Rape/Incest Etc.) Female 0-14 0

4 A&N Islands 2001 Causes Dowry Dispute Female 0-14 0

... ... ... ... ... ... ... ...

237514 West Bengal 2012 Social_Status Seperated Male 0-100+ 149

237515 West Bengal 2012 Social_Status Widowed/Widower Male 0-100+ 233

237516 West Bengal 2012 Social_Status Married Male 0-100+ 5451

237517 West Bengal 2012 Social_Status Divorcee Male 0-100+ 189

237518 West Bengal 2012 Social_Status Never Married Male 0-100+ 2658

237519 rows × 7 columns

localhost:8888/nbconvert/html/pandas.ipynb?download=false 13/23
4/4/2021 pandas

In [17]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237519 entries, 0 to 237518
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 State 237519 non-null object
1 Year 237519 non-null int64
2 Type_code 237519 non-null object
3 Type 237519 non-null object
4 Gender 237519 non-null object
5 Age_group 237519 non-null object
6 Total 237519 non-null int64
dtypes: int64(2), object(5)
memory usage: 12.7+ MB

In [18]:
df2.shape

Out[18]: (237519, 7)

In [19]:
df2['Year'].unique()

Out[19]: array([2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
2012], dtype=int64)

In [20]:
df2['Year'].value_counts()

Out[20]: 2011 19806


2005 19803
2012 19799
2008 19797
2001 19797
2007 19794
2010 19792
2002 19790
2009 19786
2006 19786
2003 19786
2004 19783
Name: Year, dtype: int64

In [37]:
df2['Type'].value_counts()

Out[37]: Others (Please Specify) 7263


Illness (Aids/STD) 4200
Professional/Career Problem 4200
Dowry Dispute 4200
Love Affairs 4200
...
Seperated 912
No Education 912
Not having Children (Barrenness/Impotency 350
Bankruptcy or Sudden change in Economic Status 350
By Other means 350
Name: Type, Length: 69, dtype: int64

In [38]:
df2['Gender'].value_counts()

localhost:8888/nbconvert/html/pandas.ipynb?download=false 14/23
4/4/2021 pandas

Out[38]: Male 118879


Female 118640
Name: Gender, dtype: int64

In [43]:
df2['State'].value_counts()

Out[43]: Maharashtra 6792


Madhya Pradesh 6792
Karnataka 6792
Rajasthan 6791
Andhra Pradesh 6791
Odisha 6791
Haryana 6790
Chhattisgarh 6790
Bihar 6790
Kerala 6788
Uttar Pradesh 6787
Tamil Nadu 6786
Gujarat 6786
Assam 6786
Jharkhand 6785
Delhi (Ut) 6782
Tripura 6782
West Bengal 6780
Punjab 6779
Himachal Pradesh 6774
Jammu & Kashmir 6761
Goa 6759
Uttarakhand 6758
Sikkim 6742
Mizoram 6737
Meghalaya 6733
Puducherry 6730
Chandigarh 6717
A & N Islands 6712
Daman & Diu 6710
Arunachal Pradesh 6707
Nagaland 6705
D & N Haveli 6704
Manipur 6700
Lakshadweep 6674
Total (States) 312
Total (All India) 312
Total (Uts) 312
Name: State, dtype: int64

In [21]:
# Show every record whose State is exactly 'Karnataka'.
df2.loc[df2['State'].eq('Karnataka')]

Out[21]: State Year Type_code Type Gender Age_group Total

108144 Karnataka 2001 Causes Insanity/Mental Illness Female 0-14 11

108145 Karnataka 2001 Causes Causes Not known Female 0-14 42

108146 Karnataka 2001 Causes Property Dispute Female 0-14 0

108147 Karnataka 2001 Causes Drug Abuse/Addiction Female 0-14 5

108148 Karnataka 2001 Causes Cancer Female 0-14 0

... ... ... ... ... ... ... ...

localhost:8888/nbconvert/html/pandas.ipynb?download=false 15/23
4/4/2021 pandas

State Year Type_code Type Gender Age_group Total

114931 Karnataka 2012 Social_Status Seperated Male 0-100+ 215

114932 Karnataka 2012 Social_Status Married Male 0-100+ 6216

114933 Karnataka 2012 Social_Status Never Married Male 0-100+ 1968

114934 Karnataka 2012 Social_Status Widowed/Widower Male 0-100+ 180

114935 Karnataka 2012 Social_Status Divorcee Male 0-100+ 17

6792 rows × 7 columns

In [23]:
# Show records from either of the two states; `isin` replaces the chained `==` / `|` tests.
df2[df2['State'].isin(['Karnataka', 'Punjab'])]

Out[23]: State Year Type_code Type Gender Age_group Total

108144 Karnataka 2001 Causes Insanity/Mental Illness Female 0-14 11

108145 Karnataka 2001 Causes Causes Not known Female 0-14 42

108146 Karnataka 2001 Causes Property Dispute Female 0-14 0

108147 Karnataka 2001 Causes Drug Abuse/Addiction Female 0-14 5

108148 Karnataka 2001 Causes Cancer Female 0-14 0

... ... ... ... ... ... ... ...

189152 Punjab 2012 Social_Status Seperated Male 0-100+ 10

189153 Punjab 2012 Social_Status Divorcee Male 0-100+ 1

189154 Punjab 2012 Social_Status Widowed/Widower Male 0-100+ 4

189155 Punjab 2012 Social_Status Never Married Male 0-100+ 299

189156 Punjab 2012 Social_Status Married Male 0-100+ 470

13571 rows × 7 columns

In [25]:
# Size of the subset of Bihar records with Gender 'Male' and Type 'Cancer'.
# The exported line was cut off after the trailing '.'; `.shape` is restored because
# the recorded output of this cell is the (rows, columns) tuple (60, 7).
df2[(df2['State'] == 'Bihar') & (df2['Gender'] == 'Male') & (df2['Type'] == 'Cancer')].shape

Out[25]: (60, 7)

In [45]:
# Subset of df2 restricted to the state of Karnataka (original row labels are kept).
karnataka_df = df2.loc[df2['State'].eq('Karnataka')]

In [46]:
karnataka_df

Out[46]: State Year Type_code Type Gender Age_group Total

108144 Karnataka 2001 Causes Insanity/Mental Illness Female 0-14 11

108145 Karnataka 2001 Causes Causes Not known Female 0-14 42

108146 Karnataka 2001 Causes Property Dispute Female 0-14 0

localhost:8888/nbconvert/html/pandas.ipynb?download=false 16/23
4/4/2021 pandas

State Year Type_code Type Gender Age_group Total

108147 Karnataka 2001 Causes Drug Abuse/Addiction Female 0-14 5

108148 Karnataka 2001 Causes Cancer Female 0-14 0

... ... ... ... ... ... ... ...

114931 Karnataka 2012 Social_Status Seperated Male 0-100+ 215

114932 Karnataka 2012 Social_Status Married Male 0-100+ 6216

114933 Karnataka 2012 Social_Status Never Married Male 0-100+ 1968

114934 Karnataka 2012 Social_Status Widowed/Widower Male 0-100+ 180

114935 Karnataka 2012 Social_Status Divorcee Male 0-100+ 17

6792 rows × 7 columns

In [47]:
karnataka_df.to_excel('karnataka_df.xlsx')

In [50]:
# Karnataka records for the year 2005 only.
karnataka_df.loc[karnataka_df['Year'].eq(2005)]

Out[50]: State Year Type_code Type Gender Age_group Total

Bankruptcy or Sudden change in


110408 Karnataka 2005 Causes Female 0-14 0
Economic

110409 Karnataka 2005 Causes Other Causes (Please Specity) Female 0-14 38

110410 Karnataka 2005 Causes Paralysis Female 0-14 0

110411 Karnataka 2005 Causes Poverty Female 0-14 1

110412 Karnataka 2005 Causes Death of Dear Person Female 0-14 0

... ... ... ... ... ... ... ...

110969 Karnataka 2005 Social_Status Seperated Male 0-100+ 232

110970 Karnataka 2005 Social_Status Married Male 0-100+ 5715

110971 Karnataka 2005 Social_Status Divorcee Male 0-100+ 28

110972 Karnataka 2005 Social_Status Widowed/Widower Male 0-100+ 170

110973 Karnataka 2005 Social_Status Never Married Male 0-100+ 1562

566 rows × 7 columns

In [51]:
# Kerala records for the year 2005 (both conditions must hold).
df2.loc[df2['State'].eq('Kerala') & df2['Year'].eq(2005)]

Out[51]: State Year Type_code Type Gender Age_group Total

117200 Kerala 2005 Causes Cancer Female 0-14 0

Bankruptcy or Sudden change in


117201 Kerala 2005 Causes Female 0-14 0
Economic

localhost:8888/nbconvert/html/pandas.ipynb?download=false 17/23
4/4/2021 pandas

State Year Type_code Type Gender Age_group Total

Cancellation/Non-Settlement of
117202 Kerala 2005 Causes Female 0-14 0
Marriage

117203 Kerala 2005 Causes Illegitimate Pregnancy Female 0-14 0

117204 Kerala 2005 Causes Family Problems Female 0-14 13

... ... ... ... ... ... ... ...

117761 Kerala 2005 Social_Status Widowed/Widower Male 0-100+ 128

117762 Kerala 2005 Social_Status Married Male 0-100+ 5388

117763 Kerala 2005 Social_Status Divorcee Male 0-100+ 16

117764 Kerala 2005 Social_Status Seperated Male 0-100+ 60

117765 Kerala 2005 Social_Status Never Married Male 0-100+ 1238

566 rows × 7 columns

In [52]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237519 entries, 0 to 237518
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 State 237519 non-null object
1 Year 237519 non-null int64
2 Type_code 237519 non-null object
3 Type 237519 non-null object
4 Gender 237519 non-null object
5 Age_group 237519 non-null object
6 Total 237519 non-null int64
dtypes: int64(2), object(5)
memory usage: 12.7+ MB

In [33]:
# Gujarat records for the years 2003 through 2005 (inclusive).
# NOTE(review): the exported line was cut off after "& (df2[", leaving the brackets
# unbalanced. The last two conditions are reconstructed from the displayed output,
# in which every row has Type == 'Cancer' and Gender == 'Female' — confirm against
# the original notebook.
df2[
    (df2['State'] == 'Gujarat')
    & (df2['Year'] >= 2003)
    & (df2['Year'] <= 2005)
    & (df2['Type'] == 'Cancer')
    & (df2['Gender'] == 'Female')
]

Out[33]: State Year Type_code Type Gender Age_group Total

75403 Gujarat 2003 Causes Cancer Female 0-14 1

75442 Gujarat 2003 Causes Cancer Female 15-29 1

75486 Gujarat 2003 Causes Cancer Female 30-44 4

75549 Gujarat 2003 Causes Cancer Female 45-59 4

75602 Gujarat 2003 Causes Cancer Female 60+ 2

75970 Gujarat 2004 Causes Cancer Female 0-14 0

76003 Gujarat 2004 Causes Cancer Female 15-29 0

76072 Gujarat 2004 Causes Cancer Female 30-44 4

76105 Gujarat 2004 Causes Cancer Female 45-59 1

localhost:8888/nbconvert/html/pandas.ipynb?download=false 18/23
4/4/2021 pandas

State Year Type_code Type Gender Age_group Total

76164 Gujarat 2004 Causes Cancer Female 60+ 1

76532 Gujarat 2005 Causes Cancer Female 0-14 0

76570 Gujarat 2005 Causes Cancer Female 15-29 1

76636 Gujarat 2005 Causes Cancer Female 30-44 1

76671 Gujarat 2005 Causes Cancer Female 45-59 3

76725 Gujarat 2005 Causes Cancer Female 60+ 2

In [40]:
# Load another CSV with the same seven columns as df2 and report its size.
# It has 237530 rows, 11 more than df2; the duplicated() check further down lists
# 11 rows, which presumably accounts for the difference.
# NOTE(review): 'sucide df.csv' looks like a misspelling of "suicide"; the path is
# left untouched because it has to match the file name on disk.
df3 = pd.read_csv('sucide df.csv')
df3.shape

Out[40]: (237530, 7)

In [36]:
df3

Out[36]: State Year Type_code Type Gender Age_group Total

A&N
0 2001 Causes Illness (Aids/STD) Female 0-14 0
Islands

A&N Bankruptcy or Sudden change in


1 2001 Causes Female 0-14 0
Islands Economic

A&N Cancellation/Non-Settlement of
2 2001 Causes Female 0-14 0
Islands Marriage

A&N
3 2001 Causes Physical Abuse (Rape/Incest Etc.) Female 0-14 0
Islands

A&N
4 2001 Causes Dowry Dispute Female 0-14 0
Islands

... ... ... ... ... ... ... ...

West
237525 2012 Social_Status Seperated Male 0-100+ 149
Bengal

West
237526 2012 Social_Status Widowed/Widower Male 0-100+ 233
Bengal

West
237527 2012 Social_Status Married Male 0-100+ 5451
Bengal

West
237528 2012 Social_Status Divorcee Male 0-100+ 189
Bengal

West
237529 2012 Social_Status Never Married Male 0-100+ 2658
Bengal

237530 rows × 7 columns

In [37]:
df3[df3.duplicated()]

localhost:8888/nbconvert/html/pandas.ipynb?download=false 19/23
4/4/2021 pandas

Out[37]: State Year Type_code Type Gender Age_group Total

237519 West Bengal 2012 Professional_Profile Service (Government) Male 60+ 0

237520 West Bengal 2012 Social_Status Seperated Female 0-100+ 200

237521 West Bengal 2012 Social_Status Married Female 0-100+ 3927

237522 West Bengal 2012 Social_Status Divorcee Female 0-100+ 182

237523 West Bengal 2012 Social_Status Widowed/Widower Female 0-100+ 455

237524 West Bengal 2012 Social_Status Never Married Female 0-100+ 1513

237525 West Bengal 2012 Social_Status Seperated Male 0-100+ 149

237526 West Bengal 2012 Social_Status Widowed/Widower Male 0-100+ 233

237527 West Bengal 2012 Social_Status Married Male 0-100+ 5451

237528 West Bengal 2012 Social_Status Divorcee Male 0-100+ 189

237529 West Bengal 2012 Social_Status Never Married Male 0-100+ 2658

In [41]:
# Remove fully duplicated rows, keeping the first occurrence of each; row labels are preserved.
df3 = df3.loc[~df3.duplicated()]

In [42]:
df3[df3.duplicated()]

Out[42]: State Year Type_code Type Gender Age_group Total

In [43]:
# One row per State: keep the first record encountered for each distinct State value.
df4 = df3.loc[~df3.duplicated(subset=['State'])]

In [44]:
df4

Out[44]: State Year Type_code Type Gender Age_group Total

A&N
0 2001 Causes Illness (Aids/STD) Female 0-14 0
Islands

Andhra
6712 2001 Causes Paralysis Female 0-14 0
Pradesh

Arunachal Cancellation/Non-Settlement
13503 2001 Causes Female 0-14 0
Pradesh of Marriage

20210 Assam 2001 Causes Drug Abuse/Addiction Female 0-14 0

26996 Bihar 2001 Causes Drug Abuse/Addiction Female 0-14 0

33786 Chandigarh 2001 Causes Drug Abuse/Addiction Female 0-14 0

40503 Chhattisgarh 2001 Causes Causes Not known Female 0-14 13

47293 D & N Haveli 2001 Causes Insanity/Mental Illness Female 0-14 0

Daman &
53997 2001 Causes Illegitimate Pregnancy Female 0-14 0
Diu

localhost:8888/nbconvert/html/pandas.ipynb?download=false 20/23
4/4/2021 pandas

State Year Type_code Type Gender Age_group Total

60707 Delhi (Ut) 2001 Causes Professional/Career Problem Female 0-14 0

Cancellation/Non-Settlement
67489 Goa 2001 Causes Female 0-14 0
of Marriage

74248 Gujarat 2001 Causes Illegitimate Pregnancy Female 0-14 0

81034 Haryana 2001 Causes Death of Dear Person Female 0-14 0

Himachal
87824 2001 Causes Dowry Dispute Female 0-14 0
Pradesh

Jammu &
94598 2001 Causes Illness (Aids/STD) Female 0-14 0
Kashmir

101359 Jharkhand 2001 Causes Failure in Examination Female 0-14 1

108144 Karnataka 2001 Causes Insanity/Mental Illness Female 0-14 11

114936 Kerala 2001 Causes Love Affairs Female 0-14 0

121724 Lakshadweep 2001 Causes Drug Abuse/Addiction Female 0-14 0

Madhya
128398 2001 Causes Insanity/Mental Illness Female 0-14 2
Pradesh

Ideological Causes/Hero
135190 Maharashtra 2001 Causes Female 0-14 0
Worshipping

141982 Manipur 2001 Causes Paralysis Female 0-14 0

148682 Meghalaya 2001 Causes Death of Dear Person Female 0-14 0

155415 Mizoram 2001 Causes Death of Dear Person Female 0-14 0

162152 Nagaland 2001 Causes Fall in Social Reputation Female 0-14 0

Bankruptcy or Sudden
168857 Odisha 2001 Causes Female 0-14 0
change in Economic

175648 Puducherry 2001 Causes Illegitimate Pregnancy Female 0-14 0

182378 Punjab 2001 Causes Dowry Dispute Female 0-14 0

189157 Rajasthan 2001 Causes Causes Not known Female 0-14 12

195948 Sikkim 2001 Causes Failure in Examination Female 0-14 0

202690 Tamil Nadu 2001 Causes Death of Dear Person Female 0-14 2

Hr.
Total (All
209476 2001 Education_Status Secondary/Intermediate/Pre- Female 0-100+ 2391
India)
Universit

209788 Total (States) 2001 Education_Status Primary Female 0-100+ 11118

210100 Total (Uts) 2001 Education_Status No Education Female 0-100+ 148

210412 Tripura 2001 Causes Poverty Female 0-14 1

Uttar Other Causes (Please


217194 2001 Causes Female 0-14 13
Pradesh Specity)

223981 Uttarakhand 2001 Causes Unemployment Female 0-14 0

localhost:8888/nbconvert/html/pandas.ipynb?download=false 21/23
4/4/2021 pandas

State Year Type_code Type Gender Age_group Total

Physical Abuse (Rape/Incest


230739 West Bengal 2001 Causes Female 0-14 1
Etc.)

In [45]:
df3.describe()

Out[45]: Year Total

count 237519.000000 237519.000000

mean 2006.500448 55.034477

std 3.452240 792.749038

min 2001.000000 0.000000

25% 2004.000000 0.000000

50% 2007.000000 0.000000

75% 2010.000000 6.000000

max 2012.000000 63343.000000

In [46]:
# NOTE(review): late import — it belongs in the first cell next to `import pandas as pd`.
# `plt` is not referenced directly in the visible cells (the boxplot below goes
# through the pandas plotting API).
import matplotlib.pyplot as plt

In [48]:
churn_df = pd.read_csv('Churn.csv')

In [49]:
churn_df

Out[49]: RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure Balance

0 1 15634602 Hargrave 619 France Female 42 2 0.00

1 2 15647311 Hill 608 Spain Female 41 1 83807.86

2 3 15619304 Onio 502 France Female 42 8 159660.80

3 4 15701354 Boni 699 France Female 39 1 0.00

4 5 15737888 Mitchell 850 Spain Female 43 2 125510.82

... ... ... ... ... ... ... ... ... ...

9995 9996 15606229 Obijiaku 771 France Male 39 5 0.00

9996 9997 15569892 Johnstone 516 France Male 35 10 57369.61

9997 9998 15584532 Liu 709 France Female 36 7 0.00

9998 9999 15682355 Sabbatini 772 Germany Male 42 3 75075.31

9999 10000 15628319 Walker 792 France Female 28 4 130142.79

10000 rows × 14 columns

localhost:8888/nbconvert/html/pandas.ipynb?download=false 22/23
4/4/2021 pandas

In [50]:
churn_df.describe()

Out[50]: RowNumber CustomerId CreditScore Age Tenure Balance NumOfPro

count 10000.00000 1.000000e+04 10000.000000 10000.000000 10000.000000 10000.000000 10000.00

mean 5000.50000 1.569094e+07 650.528800 38.921800 5.012800 76485.889288 1.53

std 2886.89568 7.193619e+04 96.653299 10.487806 2.892174 62397.405202 0.58

min 1.00000 1.556570e+07 350.000000 18.000000 0.000000 0.000000 1.00

25% 2500.75000 1.562853e+07 584.000000 32.000000 3.000000 0.000000 1.00

50% 5000.50000 1.569074e+07 652.000000 37.000000 5.000000 97198.540000 1.00

75% 7500.25000 1.575323e+07 718.000000 44.000000 7.000000 127644.240000 2.00

max 10000.00000 1.581569e+07 850.000000 92.000000 10.000000 250898.090000 4.00

In [51]:
churn_df.boxplot(column=['Age'])

Out[51]: <matplotlib.axes._subplots.AxesSubplot at 0x1ab45349848>

In [53]:
# Keep only customers whose Age is strictly below 62; this rebinds `churn_df`,
# so the unfiltered frame is no longer available without re-reading the CSV.
# NOTE(review): 62 is presumably the outlier cutoff read off the Age boxplot above —
# confirm, and consider naming it as a constant.
churn_df = churn_df.loc[churn_df['Age'].lt(62)]

In [ ]:

localhost:8888/nbconvert/html/pandas.ipynb?download=false 23/23

You might also like