Download as pdf or txt
Download as pdf or txt
You are on page 1of 24

11/14/21, 6:31 PM 2.

3 - Jupyter Notebook

In [1]:

import pandas as pd
pd.set_option('display.max_rows', 130, 'display.max_columns', 130)
pd.options.display.float_format = '{:,.2f}'.format

import matplotlib.pyplot as plt


import seaborn as sns

import numpy as np

In [3]:

data = pd.read_csv('D:\\sem3\\CSf\\py lab\\2.3\loan.csv', error_bad_lines=False, index_col=

In [4]:

data.head()

Out[4]:

id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installmen

36
0 1077501 1296599 5000 5000 4975 10.65% 162.8
months

60
1 1077430 1314167 2500 2500 2500 15.27% 59.8
months

36
2 1077175 1313524 2400 2400 2400 15.96% 84.3
months

36
3 1076863 1277178 10000 10000 10000 13.49% 339.3
months

60
4 1075358 1311748 3000 3000 3000 12.69% 67.7
months

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 1/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [5]:

data.shape

Out[5]:

(39717, 111)

In [6]:

data.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 39717 entries, 0 to 39716

Columns: 111 entries, id to total_il_high_credit_limit

dtypes: object(111)

memory usage: 33.6+ MB

In [7]:

percent_missing = data.isnull().sum() * 100 / len(data)


missing_value_df = pd.DataFrame({'column_name': data.columns,
'percent_missing': percent_missing.round(2)})
missing_value_df.sort_values('percent_missing', inplace=True)
(missing_value_df['percent_missing'] == 100.00).sum()

Out[7]:

54

In [9]:

missing_values_columns = missing_value_df[(missing_value_df['percent_missing'] == 100.00)].


null_columns_dropped_data = data.drop(missing_values_columns, axis=1)
null_columns_dropped_data = null_columns_dropped_data.drop(['desc', 'member_id', 'id', 'url
'mths_since_last_record', 'mths_
null_columns_dropped_data = null_columns_dropped_data.drop(['delinq_amnt', 'acc_now_delinq'
'policy_code', 'pymnt_plan', 'in
'tax_liens', 'collections_12_mth
null_columns_dropped_data = null_columns_dropped_data.drop(['emp_title'], axis=1)
((null_columns_dropped_data.isnull().sum() * 100 / len(null_columns_dropped_data)).sort_val
ascending=False)).head(11)

Out[9]:

emp_length 2.71

pub_rec_bankruptcies 1.75

last_pymnt_d 0.18

revol_util 0.13

title 0.03

last_credit_pull_d 0.01

total_pymnt 0.00

pub_rec 0.00

revol_bal 0.00

total_acc 0.00

out_prncp 0.00

dtype: float64

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 2/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [10]:

null_columns_dropped_data.shape

Out[10]:

(39717, 40)

In [11]:

data = []
for row in null_columns_dropped_data.columns.tolist():
data.append({'column': row, 'count': len(null_columns_dropped_data[row].unique())})
unique = pd.DataFrame(data).sort_values('count')
unique.head(19)

Out[11]:

column count

3 term 2

13 loan_status 3

11 verification_status 3

39 pub_rec_bankruptcies 4

23 pub_rec 5

9 home_ownership 5

6 grade 7

21 inq_last_6mths 9

19 delinq_2yrs 11

8 emp_length 12

14 purpose 14

7 sub_grade 35

22 open_acc 40

17 addr_state 50

12 issue_d 55

26 total_acc 82

36 last_pymnt_d 102

38 last_credit_pull_d 107

4 int_rate 371

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 3/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [13]:

null_columns_dropped_data['term_months'] = null_columns_dropped_data['term'].str.rstrip('mo
del null_columns_dropped_data['term']
null_columns_dropped_data['int_rate_percentage'] = null_columns_dropped_data['int_rate'].st
del null_columns_dropped_data['int_rate']
null_columns_dropped_data['emp_length_years'] = null_columns_dropped_data['emp_length'].str
null_columns_dropped_data['emp_length_years'].replace({'10+': '10'}, inplace=True)
null_columns_dropped_data['emp_length_years'][null_columns_dropped_data['emp_length_years']
del null_columns_dropped_data['emp_length']
null_columns_dropped_data['revol_util_percentage'] = null_columns_dropped_data['revol_util'
del null_columns_dropped_data['revol_util']
null_columns_dropped_data['issue_d_month'], null_columns_dropped_data['issue_d_year'] = nul
'issue_d'].str.split('-').str
null_columns_dropped_data['last_credit_pull_d_month'], null_columns_dropped_data['last_cred
'last_credit_pull_d'].str.split('-').str
null_columns_dropped_data['last_pymnt_d_month'], null_columns_dropped_data['last_pymnt_d_ye
'last_pymnt_d'].str.split('-').str
null_columns_dropped_data['earliest_cr_line_month'], null_columns_dropped_data['earliest_cr
'earliest_cr_line'].str.split('-').str
null_columns_dropped_data['issue_d_year'] = '20' + null_columns_dropped_data['issue_d_year'
columns = ['loan_amnt', 'funded_amnt','funded_amnt_inv', 'installment', 'annual_inc', 'emp_
'revol_bal', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'tot
'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'int_rate_percenta
'revol_util_percentage']
null_columns_dropped_data[columns] = null_columns_dropped_data[columns].astype(np.float)
null_columns_dropped_data['PnL']=(
null_columns_dropped_data['total_pymnt']-null_columns_dropped_data['funded_amnt']
)*100/null_columns_dropped_data['funded_amnt']
null_columns_dropped_data['loan_inc_ratio'
] = null_columns_dropped_data.funded_amnt*100/null_columns_dropped
columns = ['total_acc', 'term_months', 'issue_d_year']
null_columns_dropped_data[columns] = null_columns_dropped_data[columns].astype(np.int)
columns = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'loan_status', 'p
'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'pub_rec_bankruptcies']
null_columns_dropped_data[columns] = null_columns_dropped_data[columns].apply(lambda x: x.a
null_columns_dropped_data['annual_inc_lakhs'] = null_columns_dropped_data['annual_inc'] / 1
null_columns_dropped_data['annual_inc_lakhs'].describe()

<ipython-input-13-7f15929f3711>:11: FutureWarning: Columnar iteration over c


haracters will be deprecated in future releases.

null_columns_dropped_data['issue_d_month'], null_columns_dropped_data['iss
ue_d_year'] = null_columns_dropped_data[

<ipython-input-13-7f15929f3711>:13: FutureWarning: Columnar iteration over c


haracters will be deprecated in future releases.

null_columns_dropped_data['last_credit_pull_d_month'], null_columns_droppe
d_data['last_credit_pull_d_year'] = null_columns_dropped_data[

<ipython-input-13-7f15929f3711>:15: FutureWarning: Columnar iteration over c


haracters will be deprecated in future releases.

null_columns_dropped_data['last_pymnt_d_month'], null_columns_dropped_data
['last_pymnt_d_year'] = null_columns_dropped_data[

<ipython-input-13-7f15929f3711>:17: FutureWarning: Columnar iteration over c


haracters will be deprecated in future releases.

null_columns_dropped_data['earliest_cr_line_month'], null_columns_dropped_
data['earliest_cr_line_year'] = null_columns_dropped_data[

<ipython-input-13-7f15929f3711>:24: DeprecationWarning: `np.float` is a depr


ecated alias for the builtin `float`. To silence this warning, use `float` b
y itself. Doing this will not modify any behavior and is safe. If you specif
ically wanted the numpy scalar type, use `np.float64` here.

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 4/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/d


evdocs/release/1.20.0-notes.html#deprecations (https://numpy.org/devdocs/rel
ease/1.20.0-notes.html#deprecations)

null_columns_dropped_data[columns] = null_columns_dropped_data[columns].as
type(np.float)

<ipython-input-13-7f15929f3711>:31: DeprecationWarning: `np.int` is a deprec


ated alias for the builtin `int`. To silence this warning, use `int` by itse
lf. Doing this will not modify any behavior and is safe. When replacing `np.
int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the preci
sion. If you wish to review your current use, check the release note link fo
r additional information.

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/d


evdocs/release/1.20.0-notes.html#deprecations (https://numpy.org/devdocs/rel
ease/1.20.0-notes.html#deprecations)

null_columns_dropped_data[columns] = null_columns_dropped_data[columns].as
type(np.int)

Out[13]:

count 39,717.00

mean 68.97

std 63.79

min 4.00

25% 40.40

50% 59.00

75% 82.30

max 6,000.00

Name: annual_inc_lakhs, dtype: float64

In [14]:

null_columns_dropped_data.head()

Out[14]:

loan_amnt funded_amnt funded_amnt_inv installment grade sub_grade home_ownership

0 5,000.00 5,000.00 4,975.00 162.87 B B2 RENT

1 2,500.00 2,500.00 2,500.00 59.83 C C4 RENT

2 2,400.00 2,400.00 2,400.00 84.33 C C5 RENT

3 10,000.00 10,000.00 10,000.00 339.31 C C1 RENT

4 3,000.00 3,000.00 3,000.00 67.79 B B5 RENT

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 5/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [15]:

numeric_data = null_columns_dropped_data.select_dtypes(include = [np.number])

categorical_data = null_columns_dropped_data.select_dtypes(exclude = [np.number, np.object]

string_data = null_columns_dropped_data.select_dtypes(include = [np.object])

print('There are {0} numerical , {1} categorical and {2} string features in the training da
format(numeric_data.shape[1], categorical_data.shape[1], string_data.shape[1]))

There are 25 numerical , 12 categorical and 14 string features in the traini


ng data

<ipython-input-15-f7fe6adb94b3>:3: DeprecationWarning: `np.object` is a depr


ecated alias for the builtin `object`. To silence this warning, use `object`
by itself. Doing this will not modify any behavior and is safe.

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/d


evdocs/release/1.20.0-notes.html#deprecations (https://numpy.org/devdocs/rel
ease/1.20.0-notes.html#deprecations)

categorical_data = null_columns_dropped_data.select_dtypes(exclude = [np.n


umber, np.object])

<ipython-input-15-f7fe6adb94b3>:5: DeprecationWarning: `np.object` is a depr


ecated alias for the builtin `object`. To silence this warning, use `object`
by itself. Doing this will not modify any behavior and is safe.

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/d


evdocs/release/1.20.0-notes.html#deprecations (https://numpy.org/devdocs/rel
ease/1.20.0-notes.html#deprecations)

string_data = null_columns_dropped_data.select_dtypes(include = [np.objec


t])

In [16]:

null_columns_dropped_data = null_columns_dropped_data[null_columns_dropped_data.loan_status
null_columns_dropped_data = null_columns_dropped_data.loc[:,null_columns_dropped_data.nuniq

df = null_columns_dropped_data

df["loan_status"].value_counts()

Out[16]:

Fully Paid 32950

Charged Off 5627

Current 0

Name: loan_status, dtype: int64

In [17]:

df["loan_status"].value_counts()

Out[17]:

Fully Paid 32950

Charged Off 5627

Current 0

Name: loan_status, dtype: int64

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 6/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [18]:

sns.boxplot( y=df["loan_amnt"] )
plt.ylabel('Loan Amount')
plt.show()

In [19]:

sns.countplot(x= null_columns_dropped_data["loan_status"])
plt.show()

In [20]:

df["loan_status"].value_counts()

Out[20]:

Fully Paid 32950

Charged Off 5627

Current 0

Name: loan_status, dtype: int64

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 7/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [21]:

sns.countplot(x='grade', data=df)
plt.show()

In [22]:

sns.countplot(x='emp_length_years', data=df)
plt.xlabel('Employee Working Experience')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 8/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [23]:

sns.countplot(x='emp_length_years', hue='loan_status', data=df)


plt.xlabel('Employee Working Experience')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 9/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [24]:

sns.distplot(df['funded_amnt'])
plt.show()

C:\Users\DEVI PRASAD\anaconda3\lib\site-packages\seaborn\distributions.py:25
57: FutureWarning: `distplot` is a deprecated function and will be removed i
n a future version. Please adapt your code to use either `displot` (a figure
-level function with similar flexibility) or `histplot` (an axes-level funct
ion for histograms).

warnings.warn(msg, FutureWarning)

In [25]:

sns.boxplot(x=df['annual_inc_lakhs'])
plt.xlabel('Annual Income in Lakhs')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 10/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [26]:

sns.distplot(df[df['annual_inc_lakhs'] < 3000]['annual_inc_lakhs'], bins=5)


plt.xlabel('Annual Income in Lakhs')
plt.show()

C:\Users\DEVI PRASAD\anaconda3\lib\site-packages\seaborn\distributions.py:25
57: FutureWarning: `distplot` is a deprecated function and will be removed i
n a future version. Please adapt your code to use either `displot` (a figure
-level function with similar flexibility) or `histplot` (an axes-level funct
ion for histograms).

warnings.warn(msg, FutureWarning)

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 11/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [27]:

plt.subplot(1,2,1)
plt.title('Default')
sns.boxplot(y=df[df.loan_status=='Charged Off'].PnL)

plt.subplot(1,2,2)
plt.title('Fully Paid')
sns.boxplot(y=df[df.loan_status=='Fully Paid'].PnL)
plt.show()

In [28]:

sns.boxplot(y='int_rate_percentage', x='grade', data=df)


plt.ylabel('Interest Rate Percentage')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 12/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [29]:

sns.boxplot(y='int_rate_percentage', x='loan_status', data=df)


plt.ylabel('Interest Rate Percentage')
plt.xlabel('Loan Status')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 13/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [30]:

sns.countplot(x='purpose', hue="loan_status", data=df)


plt.xticks(rotation=90)
plt.show()

In [31]:

sns.boxplot( y="loan_amnt", x='loan_status' , data=df)


plt.ylabel('Loan Amount')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 14/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [32]:

sns.countplot( x="term_months", hue='loan_status', data=df)


plt.xlabel('Loan Term in Months')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 15/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [33]:

plt.figure(figsize=(10,7))
sns.countplot( x="term_months", hue='purpose', data=df)
plt.xlabel('Loan Term in Months')
plt.show()

In [34]:

sns.countplot(x='purpose', hue='loan_status', data=df)


plt.title('No of loans granted for various purpose')
plt.xticks(rotation=90)
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 16/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [35]:

sns.countplot(x='home_ownership',hue='loan_status', data=df)
plt.xticks(rotation=90)
plt.show()

In [36]:

sns.barplot(y='annual_inc_lakhs', x='pub_rec_bankruptcies', data=df)


plt.xticks(rotation=90)
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 17/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [37]:

plt.figure(figsize=(10,10))
sns.heatmap(df.corr())
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 18/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [38]:

del numeric_data['out_prncp']
del numeric_data['out_prncp_inv']

plt.figure(figsize=(15,40))

for i in range(len(numeric_data.columns)):
plt.subplot(8,3, i + 1)
sns.boxplot(y=numeric_data.columns[i], x='loan_status', data=df)

plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 19/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [39]:

g = df[df['loan_status']=='Charged Off'].groupby('addr_state')['loan_status'].count().reset
plt.figure(figsize=(10,10))
sns.barplot(y='addr_state', x='loan_status', data=g)
plt.xlabel('Count of loan status to be defaulter')
plt.ylabel('State')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 20/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [40]:

g = df.groupby('issue_d_year')['loan_status'].count()

g.plot.line(x_compat=True)
plt.xticks(np.arange(min(g.index), max(g.index)+1, 1.0))
plt.title('No of loan granted over the years')
plt.xlabel('Loan Issue Year')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 21/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [41]:

sns.barplot(x='loan_status',y='PnL',data=df)
plt.xlabel("Loan Status")
plt.ylabel("Profit and Loss")

plt.title("Profit n Loss vs status relationship")

plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 22/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [42]:

plt.figure(figsize=(10,10))

sns.barplot(x='loan_status',y='loan_inc_ratio',hue='purpose',data=df)
plt.xlabel('Loan Status')
plt.ylabel('Ratio of loan granted vs annual salary')
plt.show()

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 23/24


11/14/21, 6:31 PM 2.3 - Jupyter Notebook

In [43]:

df_agg = df[df['funded_amnt'] <= 7000].groupby(['loan_status'])['funded_amnt'].count().rese


sns.barplot(x='loan_status', y='funded_amnt', data=df_agg)
plt.title('Loan Amount granted less than 7 lakhs')
plt.xlabel('Loan Status')
plt.ylabel('Loan Amount Granted')
plt.show()

In [ ]:

In [ ]:

localhost:8888/notebooks/worksheet/exp 2.3/2.3.ipynb 24/24

You might also like