Professional Documents
Culture Documents
2.3 - Jupyter Notebook
2.3 - Jupyter Notebook
3 - Jupyter Notebook
In [1]:
import pandas as pd
pd.set_option('display.max_rows', 130, 'display.max_columns', 130)
pd.options.display.float_format = '{:,.2f}'.format
import numpy as np
In [3]:
In [4]:
data.head()
Out[4]:
36
0 1077501 1296599 5000 5000 4975 10.65% 162.8
months
60
1 1077430 1314167 2500 2500 2500 15.27% 59.8
months
36
2 1077175 1313524 2400 2400 2400 15.96% 84.3
months
36
3 1076863 1277178 10000 10000 10000 13.49% 339.3
months
60
4 1075358 1311748 3000 3000 3000 12.69% 67.7
months
In [5]:
data.shape
Out[5]:
(39717, 111)
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
dtypes: object(111)
In [7]:
Out[7]:
54
In [9]:
Out[9]:
emp_length 2.71
pub_rec_bankruptcies 1.75
last_pymnt_d 0.18
revol_util 0.13
title 0.03
last_credit_pull_d 0.01
total_pymnt 0.00
pub_rec 0.00
revol_bal 0.00
total_acc 0.00
out_prncp 0.00
dtype: float64
In [10]:
null_columns_dropped_data.shape
Out[10]:
(39717, 40)
In [11]:
data = []
for row in null_columns_dropped_data.columns.tolist():
data.append({'column': row, 'count': len(null_columns_dropped_data[row].unique())})
unique = pd.DataFrame(data).sort_values('count')
unique.head(19)
Out[11]:
column count
3 term 2
13 loan_status 3
11 verification_status 3
39 pub_rec_bankruptcies 4
23 pub_rec 5
9 home_ownership 5
6 grade 7
21 inq_last_6mths 9
19 delinq_2yrs 11
8 emp_length 12
14 purpose 14
7 sub_grade 35
22 open_acc 40
17 addr_state 50
12 issue_d 55
26 total_acc 82
36 last_pymnt_d 102
38 last_credit_pull_d 107
4 int_rate 371
In [13]:
null_columns_dropped_data['term_months'] = null_columns_dropped_data['term'].str.rstrip('mo
del null_columns_dropped_data['term']
null_columns_dropped_data['int_rate_percentage'] = null_columns_dropped_data['int_rate'].st
del null_columns_dropped_data['int_rate']
null_columns_dropped_data['emp_length_years'] = null_columns_dropped_data['emp_length'].str
null_columns_dropped_data['emp_length_years'].replace({'10+': '10'}, inplace=True)
null_columns_dropped_data['emp_length_years'][null_columns_dropped_data['emp_length_years']
del null_columns_dropped_data['emp_length']
null_columns_dropped_data['revol_util_percentage'] = null_columns_dropped_data['revol_util'
del null_columns_dropped_data['revol_util']
null_columns_dropped_data['issue_d_month'], null_columns_dropped_data['issue_d_year'] = nul
'issue_d'].str.split('-').str
null_columns_dropped_data['last_credit_pull_d_month'], null_columns_dropped_data['last_cred
'last_credit_pull_d'].str.split('-').str
null_columns_dropped_data['last_pymnt_d_month'], null_columns_dropped_data['last_pymnt_d_ye
'last_pymnt_d'].str.split('-').str
null_columns_dropped_data['earliest_cr_line_month'], null_columns_dropped_data['earliest_cr
'earliest_cr_line'].str.split('-').str
null_columns_dropped_data['issue_d_year'] = '20' + null_columns_dropped_data['issue_d_year'
columns = ['loan_amnt', 'funded_amnt','funded_amnt_inv', 'installment', 'annual_inc', 'emp_
'revol_bal', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'tot
'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'int_rate_percenta
'revol_util_percentage']
null_columns_dropped_data[columns] = null_columns_dropped_data[columns].astype(np.float)
null_columns_dropped_data['PnL']=(
null_columns_dropped_data['total_pymnt']-null_columns_dropped_data['funded_amnt']
)*100/null_columns_dropped_data['funded_amnt']
null_columns_dropped_data['loan_inc_ratio'
] = null_columns_dropped_data.funded_amnt*100/null_columns_dropped
columns = ['total_acc', 'term_months', 'issue_d_year']
null_columns_dropped_data[columns] = null_columns_dropped_data[columns].astype(np.int)
columns = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'loan_status', 'p
'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'pub_rec_bankruptcies']
null_columns_dropped_data[columns] = null_columns_dropped_data[columns].apply(lambda x: x.a
null_columns_dropped_data['annual_inc_lakhs'] = null_columns_dropped_data['annual_inc'] / 1
null_columns_dropped_data['annual_inc_lakhs'].describe()
null_columns_dropped_data['issue_d_month'], null_columns_dropped_data['iss
ue_d_year'] = null_columns_dropped_data[
null_columns_dropped_data['last_credit_pull_d_month'], null_columns_droppe
d_data['last_credit_pull_d_year'] = null_columns_dropped_data[
null_columns_dropped_data['last_pymnt_d_month'], null_columns_dropped_data
['last_pymnt_d_year'] = null_columns_dropped_data[
null_columns_dropped_data['earliest_cr_line_month'], null_columns_dropped_
data['earliest_cr_line_year'] = null_columns_dropped_data[
null_columns_dropped_data[columns] = null_columns_dropped_data[columns].as
type(np.float)
null_columns_dropped_data[columns] = null_columns_dropped_data[columns].as
type(np.int)
Out[13]:
count 39,717.00
mean 68.97
std 63.79
min 4.00
25% 40.40
50% 59.00
75% 82.30
max 6,000.00
In [14]:
null_columns_dropped_data.head()
Out[14]:
In [15]:
print('There are {0} numerical , {1} categorical and {2} string features in the training da
format(numeric_data.shape[1], categorical_data.shape[1], string_data.shape[1]))
In [16]:
null_columns_dropped_data = null_columns_dropped_data[null_columns_dropped_data.loan_status
null_columns_dropped_data = null_columns_dropped_data.loc[:,null_columns_dropped_data.nuniq
df = null_columns_dropped_data
df["loan_status"].value_counts()
Out[16]:
Current 0
In [17]:
df["loan_status"].value_counts()
Out[17]:
Current 0
In [18]:
sns.boxplot( y=df["loan_amnt"] )
plt.ylabel('Loan Amount')
plt.show()
In [19]:
sns.countplot(x= null_columns_dropped_data["loan_status"])
plt.show()
In [20]:
df["loan_status"].value_counts()
Out[20]:
Current 0
In [21]:
sns.countplot(x='grade', data=df)
plt.show()
In [22]:
sns.countplot(x='emp_length_years', data=df)
plt.xlabel('Employee Working Experience')
plt.show()
In [23]:
In [24]:
sns.distplot(df['funded_amnt'])
plt.show()
C:\Users\DEVI PRASAD\anaconda3\lib\site-packages\seaborn\distributions.py:25
57: FutureWarning: `distplot` is a deprecated function and will be removed i
n a future version. Please adapt your code to use either `displot` (a figure
-level function with similar flexibility) or `histplot` (an axes-level funct
ion for histograms).
warnings.warn(msg, FutureWarning)
In [25]:
sns.boxplot(x=df['annual_inc_lakhs'])
plt.xlabel('Annual Income in Lakhs')
plt.show()
In [26]:
C:\Users\DEVI PRASAD\anaconda3\lib\site-packages\seaborn\distributions.py:25
57: FutureWarning: `distplot` is a deprecated function and will be removed i
n a future version. Please adapt your code to use either `displot` (a figure
-level function with similar flexibility) or `histplot` (an axes-level funct
ion for histograms).
warnings.warn(msg, FutureWarning)
In [27]:
plt.subplot(1,2,1)
plt.title('Default')
sns.boxplot(y=df[df.loan_status=='Charged Off'].PnL)
plt.subplot(1,2,2)
plt.title('Fully Paid')
sns.boxplot(y=df[df.loan_status=='Fully Paid'].PnL)
plt.show()
In [28]:
In [29]:
In [30]:
In [31]:
In [32]:
In [33]:
plt.figure(figsize=(10,7))
sns.countplot( x="term_months", hue='purpose', data=df)
plt.xlabel('Loan Term in Months')
plt.show()
In [34]:
In [35]:
sns.countplot(x='home_ownership',hue='loan_status', data=df)
plt.xticks(rotation=90)
plt.show()
In [36]:
In [37]:
plt.figure(figsize=(10,10))
sns.heatmap(df.corr())
plt.show()
In [38]:
del numeric_data['out_prncp']
del numeric_data['out_prncp_inv']
plt.figure(figsize=(15,40))
for i in range(len(numeric_data.columns)):
plt.subplot(8,3, i + 1)
sns.boxplot(y=numeric_data.columns[i], x='loan_status', data=df)
plt.show()
In [39]:
g = df[df['loan_status']=='Charged Off'].groupby('addr_state')['loan_status'].count().reset
plt.figure(figsize=(10,10))
sns.barplot(y='addr_state', x='loan_status', data=g)
plt.xlabel('Count of loan status to be defaulter')
plt.ylabel('State')
plt.show()
In [40]:
g = df.groupby('issue_d_year')['loan_status'].count()
g.plot.line(x_compat=True)
plt.xticks(np.arange(min(g.index), max(g.index)+1, 1.0))
plt.title('No of loan granted over the years')
plt.xlabel('Loan Issue Year')
plt.show()
In [41]:
sns.barplot(x='loan_status',y='PnL',data=df)
plt.xlabel("Loan Status")
plt.ylabel("Profit and Loss")
plt.show()
In [42]:
plt.figure(figsize=(10,10))
sns.barplot(x='loan_status',y='loan_inc_ratio',hue='purpose',data=df)
plt.xlabel('Loan Status')
plt.ylabel('Ratio of loan granted vs annual salary')
plt.show()
In [43]:
In [ ]:
In [ ]: