Bond Investment Risk Prediction

In
[1]: import pandas as pd
import numpy as np
from matplotlib import pyplot,pylab as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScale

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso

import seaborn as sns
from scipy.stats import zscore
from sklearn.metrics import r2_score,roc_auc_score,adjusted_rand_score
import statsmodels.formula.api as sm
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
In [2]: df=pd.read_csv("Kern_County bond dataset 2.csv")
df
Out[2]:
ADTR
CDIAC Issuance Sold ADTR ADT
Issuer Sale Date Filing
Number Documents Status Report Reportabl
Status
Richland 07/14/2022
2022-
0 School submited SOLD 12:00:00 REPORTED PENDING
1204
District AM
Delano
Joint
1996- Union High 12-11-1996
1 Pending SOLD No Report N/A
1769 School 00:00
District
(CSCRPA)
Richland 07/14/2022
2022-
2 School submited SOLD 12:00:00 REPORTED PENDING
1205
District AM
2015- 06-10-
3 Bakersfield submited SOLD No Report N/A
1538 2015 00:00
Kern
County 06/21/2001
2001-
4 Board of Pending SOLD 12:00:00 No Report N/A
0720
Education AM
(CSCRPA)
... ... ... ... ... ... ... ...
Mojave
01/15/2009
2008- Unified
1195 submited SOLD 12:00:00 No Report N/A
1301 School
AM
District
Kern High 07/28/1994

1994-
1196 School Pending SOLD 12:00:00 No Report N/A
1262
District AM
04/19/1988
1988-
1197 Bakersfield Pending SOLD 12:00:00 No Report N/A
0037
AM
1198 2003- North of submited SOLD 05/15/2003 No Report N/A
0562 River 12:00:00
Sanitary AM
District No
1
Beardsley 07/18/2007
2007-
1199 School submited SOLD 12:00:00 No Report N/A
0941
District AM
1200 rows × 55 columns
In [3]: df.shape
(1200, 55)
Out[3]:
In [4]: df.size
66000
Out[4]:
In [5]: df.describe()
Out[5]: TIC NI
Principal Refunding Net Issue
New Money Interest Interes
Amount Amount Discount/Premium
Rate Rat
count 1.200000e+03 1.200000e+03 1.081000e+03 9.190000e+02 559.000000 847.00000
mean 1.250888e+07 8.618718e+06 4.318402e+06 2.385701e+05 3.797231 3.98918
std 2.665435e+07 2.256718e+07 1.655262e+07 1.180353e+06 3.472626 2.67965
min 1.000000e+04 -1.309768e+06 0.000000e+00 -6.337712e+06 0.000000 0.00000
25% 2.000000e+06 5.000000e+05 0.000000e+00 0.000000e+00 2.577500 2.31200
50% 4.563000e+06 2.682260e+06 0.000000e+00 0.000000e+00 3.687000 3.85100
75% 1.088905e+07 6.960277e+06 1.768974e+06 6.706200e+04 4.777000 5.70950
max 2.381771e+08 2.278184e+08 2.381771e+08 2.552913e+07 69.604000 28.27100
In [6]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 55 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CDIAC Number 1200 non-null object
1 Issuer 1200 non-null object
2 Issuance Documents 1200 non-null object
3 Sold Status 1200 non-null object
4 Sale Date 1200 non-null object
5 ADTR Report 1200 non-null object
6 ADTR Filing Status 1200 non-null object
7 ADTR Reportable 1200 non-null object
8 ADTR Reportable Next Fiscal Year 151 non-null object
9 ADTR Last Reported Year 117 non-null object
10 Debt Policy 158 non-null object
11 Issuer County 1200 non-null object
12 MKR Authority 1200 non-null object
13 Local Obligation 1200 non-null object
14 MKR CDIAC Number 8 non-null object
15 Issuer Group 1200 non-null object
16 Issuer Type 1200 non-null object
17 Project Name 1012 non-null object
18 Principal Amount 1200 non-null float64
19 New Money 1200 non-null float64
20 Refunding Amount 1081 non-null float64
21 Net Issue Discount/Premium 919 non-null float64
22 Environmental/Social Impact Bonds 1191 non-null object
23 Debt Type 1200 non-null object
24 Purpose 1198 non-null object
25 Source of Repayment 1199 non-null object
26 TIC Interest Rate 559 non-null float64
27 NIC Interest Rate 847 non-null float64
28 Interest Type 915 non-null object
29 Other Interest Type 6 non-null object
30 Federally Taxable 1200 non-null object
31 First Optional Call Date 480 non-null object
32 Final Maturity Date 1057 non-null object
33 CAB Flag 750 non-null object
34 S and P Rating 1198 non-null object
35 Moody Rating 1198 non-null object
36 Fitch Rating 1198 non-null object
37 Other Rating 1198 non-null object
38 Guarantor Flag 579 non-null object
39 Guarantor 513 non-null object
40 Sale Type (Comp/Neg) 1200 non-null object
41 Private Placement Flag 1200 non-null object
42 Underwriter 1024 non-null object
43 Lender 0 non-null float64
44 Purchaser 123 non-null object
45 Placement Agent 52 non-null object
46 Financial Advisor 657 non-null object
47 Co-Financial Advisor 0 non-null float64
48 Bond Counsel 1189 non-null object
49 Co-Bond Counsel 0 non-null float64
50 Disclosure Counsel 227 non-null object
51 Borrower Counsel 2 non-null object
52 Trustee 814 non-null object
53 Issue Costs Pct of Principal Amt 1198 non-null float64
54 Total Issuance Costs 1198 non-null float64
dtypes: float64(11), object(44)
memory usage: 515.8+ KB
In [7]: df.nunique()
CDIAC Number 1200
Out[7]:
Issuer 178
Issuance Documents 3
Sold Status 2
Sale Date 833
ADTR Report 3
ADTR Filing Status 4
ADTR Reportable 2
ADTR Reportable Next Fiscal Year 1
ADTR Last Reported Year 4
Debt Policy 2
Issuer County 1
MKR Authority 2
Local Obligation 2
MKR CDIAC Number 8
Issuer Group 6
Issuer Type 23
Project Name 403
Principal Amount 882
New Money 745
Refunding Amount 301
Net Issue Discount/Premium 342
Environmental/Social Impact Bonds 2
Debt Type 23
Purpose 29
Source of Repayment 13
TIC Interest Rate 447
NIC Interest Rate 581
Interest Type 4
Other Interest Type 6
Federally Taxable 3
First Optional Call Date 247
Final Maturity Date 477
CAB Flag 2
S and P Rating 51
Moody Rating 38
Fitch Rating 16
Other Rating 3
Guarantor Flag 3
Guarantor 27
Sale Type (Comp/Neg) 2
Private Placement Flag 3
Underwriter 138
Lender 0
Purchaser 50
Placement Agent 13
Financial Advisor 62
Co-Financial Advisor 0
Bond Counsel 65
Co-Bond Counsel 0
Disclosure Counsel 22
Borrower Counsel 2
Trustee 40
Issue Costs Pct of Principal Amt 376
Total Issuance Costs 789
dtype: int64
checking for null values

In [8]: df.isnull().sum()
CDIAC Number 0
Out[8]:
Issuer 0
Sold Status 0
Sale Date 0
ADTR Report 0
ADTR Reportable 0
Debt Policy 1042
Issuer County 0
MKR Authority 0
Local Obligation 0
MKR CDIAC Number 1192
Issuer Group 0
Issuer Type 0
Project Name 188
Principal Amount 0
New Money 0
Refunding Amount 119
Debt Type 0
Purpose 2
TIC Interest Rate 641
NIC Interest Rate 353
Interest Type 285
Federally Taxable 0
CAB Flag 450
S and P Rating 2
Moody Rating 2
Fitch Rating 2
Other Rating 2
Guarantor Flag 621
Guarantor 687
Underwriter 176
Lender 1200
Purchaser 1077
Placement Agent 1148
Financial Advisor 543
Bond Counsel 11
Co-Bond Counsel 1200
Borrower Counsel 1198
Trustee 386
dtype: int64
Replacing null value

In [9]: mode= df["ADTR Reportable Next Fiscal Year"].mode().values[0]
df["ADTR Reportable Next Fiscal Year"]=df["ADTR Reportable Next Fiscal Ye
In [10]: mode= df["ADTR Last Reported Year"].mode().values[0]
df["ADTR Last Reported Year"]=df["ADTR Last Reported Year"].fillna(mode)
In [11]: mode= df["Debt Policy"].mode().values[0]
df["Debt Policy"]=df["Debt Policy"].fillna(mode)
In [12]: mode= df["MKR CDIAC Number"].mode().values[0]
df["MKR CDIAC Number"]=df["MKR CDIAC Number"].fillna(mode)
In [13]: mode= df["Project Name"].mode().values[0]
df["Project Name"]=df["Project Name"].fillna(mode)
In [14]: mean= df["Refunding Amount"].mean()
df["Refunding Amount"]=df["Refunding Amount"].fillna(mean)
In [15]: mean= df["Net Issue Discount/Premium"].mean()
df["Net Issue Discount/Premium"]=df["Net Issue Discount/Premium"].fillna(

In [16]: mode= df["Environmental/Social Impact Bonds"].mode().values[0]
df["Environmental/Social Impact Bonds"]=df["Environmental/Social Impact B
In [17]: mode= df["Purpose"].mode().values[0]
df["Purpose"]=df["Purpose"].fillna(mode)
In [18]: mode= df["Source of Repayment"].mode().values[0]
df["Source of Repayment"]=df["Source of Repayment"].fillna(mode)
In [19]: mean= df["TIC Interest Rate"].mean()
df["TIC Interest Rate"]=df["TIC Interest Rate"].fillna(mean)
In [20]: mean= df["NIC Interest Rate"].mean()
df["NIC Interest Rate"]=df["NIC Interest Rate"].fillna(mean)
In [21]: mode= df["Interest Type"].mode().values[0]
df["Interest Type"]=df["Interest Type"].fillna(mode)
In [22]: mode= df["Other Interest Type"].mode().values[0]
df["Other Interest Type"]=df["Other Interest Type"].fillna(mode)
In [23]: mode= df["First Optional Call Date"].mode().values[0]
df["First Optional Call Date"]=df["First Optional Call Date"].fillna(mode
In [24]: mode= df["Final Maturity Date"].mode().values[0]
df["Final Maturity Date"]=df["Final Maturity Date"].fillna(mode)
In [25]: mode= df["CAB Flag"].mode().values[0]
df["CAB Flag"]=df["CAB Flag"].fillna(mode)
In [26]: mode= df["S and P Rating"].mode().values[0]
df["S and P Rating"]=df["S and P Rating"].fillna(mode)
In [27]: mode= df["Moody Rating"].mode().values[0]
df["Moody Rating"]=df["Moody Rating"].fillna(mode)
In [28]: mode= df["Fitch Rating"].mode().values[0]
df["Fitch Rating"]=df["Fitch Rating"].fillna(mode)
In [29]: mode= df["Other Rating"].mode().values[0]
df["Other Rating"]=df["Other Rating"].fillna(mode)
In [30]: mode= df["Guarantor Flag"].mode().values[0]
df["Guarantor Flag"]=df["Guarantor Flag"].fillna(mode)
In [31]: mode= df["Guarantor"].mode().values[0]
df["Guarantor"]=df["Guarantor"].fillna(mode)
In [32]: mode= df["Underwriter"].mode().values[0]
df["Underwriter"]=df["Underwriter"].fillna(mode)
In [33]: mean= df["Lender"].mean()
df["Lender"]=df["Lender"].fillna(mean)
In [34]: mode= df["Purchaser"].mode().values[0]
df["Purchaser"]=df["Purchaser"].fillna(mode)
In [35]: mode= df["Placement Agent"].mode().values[0]
df["Placement Agent"]=df["Placement Agent"].fillna(mode)
In [36]: mode= df["Financial Advisor"].mode().values[0]
df["Financial Advisor"]=df["Financial Advisor"].fillna(mode)
In [37]: mean= df["Co-Financial Advisor"].mean()
df["Co-Financial Advisor"]=df["Co-Financial Advisor"].fillna(mean)
In [38]: mode = df["Bond Counsel"].mode().values[0]
df["Bond Counsel"]=df["Bond Counsel"].fillna(mode)
In [39]: mode = df["Disclosure Counsel"].mode().values[0]
df["Disclosure Counsel"]=df["Disclosure Counsel"].fillna(mode)
In [40]: mode = df["Trustee"].mode().values[0]
df["Trustee"]=df["Trustee"].fillna(mode)
In [41]: mean= df["Issue Costs Pct of Principal Amt"].mean()

df["Issue Costs Pct of Principal Amt"]=df["Issue Costs Pct of Principal A
In [42]: mean= df["Total Issuance Costs"].mean()
df["Total Issuance Costs"]=df["Total Issuance Costs"].fillna(mean)
In [43]: df.isnull().sum()
CDIAC Number 0
Out[43]:
Issuer 0
Sold Status 0
Sale Date 0
ADTR Report 0
ADTR Reportable 0
Debt Policy 0
Issuer County 0
MKR Authority 0
Local Obligation 0
MKR CDIAC Number 0
Issuer Group 0
Issuer Type 0
Project Name 0
Principal Amount 0
New Money 0
Refunding Amount 0
Debt Type 0
Purpose 0
TIC Interest Rate 0
NIC Interest Rate 0
Interest Type 0
Federally Taxable 0
CAB Flag 0
S and P Rating 0
Moody Rating 0
Fitch Rating 0
Other Rating 0
Guarantor Flag 0
Guarantor 0
Underwriter 0
Lender 1200
Purchaser 0
Placement Agent 0
Financial Advisor 0
Bond Counsel 0
Co-Bond Counsel 1200
Borrower Counsel 1198
Trustee 0
dtype: int64
As we can see that there are not a single value in column like
lender,co-financial advisor,co-bond counsel,borrower counsel
(only have 2 values). so we can drop this columns
Droping the columns

In [44]: df.drop(columns=["CDIAC Number","Issuer","MKR CDIAC Number","Fitch Rating
In [45]: df
Out[45]: ADTR
ADTR
Issuance Sold ADTR ADTR Last Debt MKR
Filing
Documents Status Report Reportable Reported Policy Authority
Status
Year
06/30/2022
0 submited SOLD REPORTED PENDING Y 12:00:00 Y NO
AM
06/30/2022
1 Pending SOLD No Report N/A N 12:00:00 Y NO
AM
06/30/2022
2 submited SOLD REPORTED PENDING Y 12:00:00 Y NO
AM
06/30/2022
3 submited SOLD No Report N/A N 12:00:00 Y NO
AM
4 Pending SOLD No Report N/A N 06/30/2022 Y NO
12:00:00
AM
... ... ... ... ... ... ... ... ...
06/30/2022
AM
06/30/2022
AM
06/30/2022
AM
06/30/2022
AM
06/30/2022
AM
looking for duplicate values

In [46]: df.duplicated().sum()
2
Out[46]:
In [47]: df.drop_duplicates(inplace=True)
In [48]: df.duplicated().sum()
0
Out[48]:
looking for outliers

In [49]: plt.figure(figsize=(30,30))
plt.subplot(4,2,1)
sns.boxplot(df["Principal Amount"])
plt.subplot(4,2,2)
sns.boxplot(df["New Money"])
plt.subplot(4,2,3)
sns.boxplot(df["Refunding Amount"])
Out[49]: <AxesSubplot:xlabel='Refunding Amount'>
plt.subplot(2,2,1)
sns.boxplot(df["Issue Costs Pct of Principal Amt"])
plt.subplot(2,2,2)
sns.boxplot(df["Total Issuance Costs"])
<AxesSubplot:xlabel='Total Issuance Costs'>

Out[50]:
Treating the outliers

In [51]: mean = df['Principal Amount'].mean()
median = df['Principal Amount'].median()
df['Principal Amount'] = np.where(df['Principal Amount'] > median,mean, d
In [52]: mean = df['New Money'].mean()
median = df['New Money'].median()
df['New Money'] = np.where(df['New Money'] > median,mean, df['New Money']
In [53]: mean = df['Refunding Amount'].mean()
median = df['Refunding Amount'].median()
df['Refunding Amount'] = np.where(df['Refunding Amount'] > median,mean, d
In [54]: mean = df['Issue Costs Pct of Principal Amt'].mean()
median = df['Issue Costs Pct of Principal Amt'].median()
df['Issue Costs Pct of Principal Amt'] = np.where(df['Issue Costs Pct of
In [55]: mean = df['Total Issuance Costs'].mean()
median = df['Total Issuance Costs'].median()
df['Total Issuance Costs'] = np.where(df['Total Issuance Costs'] > median
plt.subplot(4,2,1)
sns.boxplot(df["Principal Amount"])
plt.subplot(4,2,2)
sns.boxplot(df["New Money"])
plt.subplot(4,2,3)
sns.boxplot(df["Refunding Amount"])
<AxesSubplot:xlabel='Refunding Amount'>
Out[56]:
plt.subplot(2,2,3)
sns.boxplot(df["Issue Costs Pct of Principal Amt"])
plt.subplot(2,2,4)
sns.boxplot(df["Total Issuance Costs"])
<AxesSubplot:xlabel='Total Issuance Costs'>

Out[57]:
AS we can see that the 4 5 6 graphs are showing outliers on
the tail of the barplot its reason is as followes:
It means that they are relatively high values compared to the rest of the data.
so we will use quantile method to remove the tail outliers
As we can see that the data is clean and we can

proceed towards visualization part
data visualization
In [58]: sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x29a73984f70>
Out[58]:
plt.subplot(2,2,1)
sns.countplot("Issuance Documents",hue="ADTR Report",data=df)
plt.subplot(2,2,2)
sns.countplot("Issuance Documents",hue="ADTR Reportable",data=df)
<AxesSubplot:xlabel='Issuance Documents', ylabel='count'>

Out[59]:
insights:
1. As we can see that the or organization who submited but not reported for ADTR
report(annual debt transparancy report) is higher as compared to the organization
who submited the report.
this means that investers can only invest in 150 bonds for now as per the data
provided for now. for non reported bonds the credit ratings will not be available for
now as per data
for pending and non submitted non reported organizations are high in no.
1. As we can see that the no. organizations whose issuance docs are submited but
ADTR is not repotable are higher as compared to reportable ADTR organizations
this means there are only 150 bonds in which investers can invest other than that all
are high risk bonds for now
1. as we can see that the organization who have submited there issuance docs. but
are not reportable will be able to report for ADTR by next fiscal year same goes for
pending and non-submited
plt.subplot(2,2,1)
sns.countplot(df["MKR Authority"])
plt.subplot(2,2,2)
sns.countplot(df["Local Obligation"])
<AxesSubplot:xlabel='Local Obligation', ylabel='count'>

Out[60]:
insights:
1. MKR : Maker typically refers to a corporation, government, or other organization that
seeks to raise capital by issuing debt securities in the form of bonds. The Maker
uses the proceeds from the sale of bonds to fund its operations, pay off existing
debt, or for other purposes.
MKR authority signifies the issuar of the bond
as we can see that the rate of no is higher as compared to yes this means investing
in the for now is highly volatile as the data because issuer is not a Municipal Bond
insurer or Municipal Bond Rating Agency.
1. As we can see in second graph in local obligation is high whic means that the bond
is not tied to a kern county local area or jurisdiction, but rather has a broader,
national reach.
plt.subplot(2,2,1)
sns.countplot(df["Issuer Group"])
plt.subplot(2,2,2)
sns.countplot(df["Issuer Group"])
<AxesSubplot:xlabel='Issuer Group', ylabel='count'>

Out[61]:
Insights:
1. As we can see that the of schools in issuer group is higher as compared to
government
2. jpa & marks-ross is bond which inlude mutiple govenment project in single bond
3. mello roos is a tax district in california which helps community project whith
financing
sns.countplot(df["S and P Rating"])
<AxesSubplot:xlabel='S and P Rating', ylabel='count'>

Out[62]:
insights:
1. As we can see that non rated organization are higher in no. . as we saw in the graph
before most of the organization will be gettng their ADTR report next fiscal year
2. But still the count of organizations with good ratings is high
sns.barplot("S and P Rating","Principal Amount",data=df,ci=True)
<AxesSubplot:xlabel='S and P Rating', ylabel='Principal Amount'>

Out[63]:
insight:
1. pricipal amount is properly distributed among all the organizations
In [64]: sns.countplot("CAB Flag",data=df)
<AxesSubplot:xlabel='CAB Flag', ylabel='count'>

Out[64]:
insights:
1. as we can see that the rate of no in CAB (call away bond) is higher as compared to
yes
whch means that in most of the bond issuer can not call or take the bond out even if
the issuer have good cedit score or issuer feels there is a financial risk in
proceeding ahead
plt.subplot(2,2,1)
sns.distplot(df["Principal Amount"])
plt.subplot(2,2,2)
sns.distplot(df["Total Issuance Costs"])
plt.subplot(2,2,3)
sns.histplot(df["S and P Rating"],kde=True)
<AxesSubplot:xlabel='S and P Rating', ylabel='Count'>

Out[65]:
As we can see taht the data is not normally distributed and it
is having a huge range seo we have to standardize it
encoding
In [66]: le=LabelEncoder()
df["Issuance Documents"]=le.fit_transform(df["Issuance Documents"])
df["Sold Status"]=le.fit_transform(df["Sold Status"])
df["ADTR Report"]=le.fit_transform(df["ADTR Report"])
df["ADTR Filing Status"]=le.fit_transform(df["ADTR Filing Status"])
df["ADTR Reportable"]=le.fit_transform(df["ADTR Reportable"])
df["ADTR Last Reported Year"]=le.fit_transform(df["ADTR Last Reported Yea

df["Debt Policy"]=le.fit_transform(df["Debt Policy"])
df["MKR Authority"]=le.fit_transform(df["MKR Authority"])
df["Local Obligation"]=le.fit_transform(df["Local Obligation"])
df["Issuer Group"]=le.fit_transform(df["Issuer Group"])
df["Issuer Type"]=le.fit_transform(df["Issuer Type"])
df["Environmental/Social Impact Bonds"]=le.fit_transform(df["Environmenta

df["Debt Type"]=le.fit_transform(df["Debt Type"])
df["Purpose"]=le.fit_transform(df["Purpose"])
df["Source of Repayment"]=le.fit_transform(df["Source of Repayment"])
df["Interest Type"]=le.fit_transform(df["Interest Type"])
df["Other Interest Type"]=le.fit_transform(df["Other Interest Type"])
df["Federally Taxable"]=le.fit_transform(df["Federally Taxable"])
df["CAB Flag"]=le.fit_transform(df["CAB Flag"])
df["S and P Rating"]=le.fit_transform(df["S and P Rating"])
df["Moody Rating"]=le.fit_transform(df["Moody Rating"])
df["Guarantor Flag"]=le.fit_transform(df["Guarantor Flag"])
df["Guarantor"]=le.fit_transform(df["Guarantor"])
df["Sale Type (Comp/Neg)"]=le.fit_transform(df["Sale Type (Comp/Neg)"])

df["Private Placement Flag"]=le.fit_transform(df["Private Placement Flag"
In [67]: ohe=pd.get_dummies(df,columns=["Underwriter","Total Issuance Costs","Disc

df=ohe
df
Out[67]: Issuance Sold ADTR ADTR ADTR ADTR Debt MKR Loc
Documents Status Report Filing Reportable Last Policy Authority Obligatio
Status Reported
Year
0 2 1 2 3 1 3 1 0
1 1 1 0 1 0 3 1 0
2 2 1 2 3 1 3 1 0
3 2 1 0 1 0 3 1 0
4 1 1 0 1 0 3 1 0
... ... ... ... ... ... ... ... ...
1195 2 1 0 1 0 3 1 0
1196 1 1 0 1 0 3 1 0
1197 1 1 0 1 0 3 1 0
1198 2 1 0 1 0 3 1 0
1199 2 1 0 1 0 3 1 0
In [68]: df.nunique()
Out[68]:
Sold Status 2
ADTR Report 3
ADTR Reportable 2
..
First Optional Call Date_12-08-2020 00:00 2
First Optional Call Date_12/15/2009 12:00:00 AM 2
Length: 1302, dtype: int64
corellation
In [69]: df.corr()
Out[69]:
ADTR
ADTR
Issuance Sold ADTR ADTR Last De
Filing
Documents Status Report Reportable Reported Polic
Status
Year
Issuance
1.000000 0.027858 0.277328 0.232318 0.279301 -0.126094 -0.05848
Documents
Sold Status 0.027858 1.000000 -0.045929 0.012459 0.015530 -0.007339 0.22098
ADTR Report 0.277328 -0.045929 1.000000 0.912576 0.998111 -0.351646 -0.19970
ADTR Filing
0.232318 0.012459 0.912576 1.000000 0.914196 -0.122844 -0.18440
Status
ADTR
0.279301 0.015530 0.998111 0.914196 1.000000 -0.352426 -0.18630
Reportable
... ... ... ... ... ... ...
First Optional
Call Date_12-08- 0.032059 0.001672 0.107476 0.060349 0.107680 -0.155483 0.00375
2020 00:00
First Optional
Call
0.022660 0.001182 -0.011039 -0.008806 -0.010977 0.005187 0.00265
Date_12/15/2009
12:00:00 AM
First Optional
Call
0.022660 0.001182 -0.011039 -0.008806 -0.010977 0.005187 0.00265
Date_12/22/2016
12:00:00 AM
First Optional
Call
0.022660 0.001182 -0.011039 -0.008806 -0.010977 0.005187 0.00265
Date_12/23/2008
12:00:00 AM
First Optional
Call
0.022660 0.001182 0.075966 0.042655 0.076109 0.005187 0.00265
Date_12/26/2019
12:00:00 AM
train_test_split
In [70]: x=df.drop("S and P Rating",axis=1)
y=df["S and P Rating"]
In [71]: x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.700,rando
zscore normalization
In [72]: mean = df.mean()
std = df.std()
df = (df - mean) / std
df
Out[72]:
ADTR
ADTR
Issuance Sold ADTR ADTR Last Debt MKR
Filing
Documents Status Report Reportable Reported Policy Authority
Status
Year
0 0.783647 0.040876 2.627135 3.25486 2.632107 0.179399 0.091709 -0.070918
1 -0.680954 0.040876 -0.381763 -0.30454 -0.379607 0.179399 0.091709 -0.070918
2 0.783647 0.040876 2.627135 3.25486 2.632107 0.179399 0.091709 -0.070918
3 0.783647 0.040876 -0.381763 -0.30454 -0.379607 0.179399 0.091709 -0.070918
4 -0.680954 0.040876 -0.381763 -0.30454 -0.379607 0.179399 0.091709 -0.070918
... ... ... ... ... ... ... ... ...
1195 0.783647 0.040876 -0.381763 -0.30454 -0.379607 0.179399 0.091709 -0.070918
1196 -0.680954 0.040876 -0.381763 -0.30454 -0.379607 0.179399 0.091709 -0.070918

1197 -0.680954 0.040876 -0.381763 -0.30454 -0.379607 0.179399 0.091709 -0.070918
1198 0.783647 0.040876 -0.381763 -0.30454 -0.379607 0.179399 0.091709 -0.070918
1199 0.783647 0.040876 -0.381763 -0.30454 -0.379607 0.179399 0.091709 -0.070918
Model building
1. linear regression model:
In [73]: lr= LinearRegression()
In [74]: lr.fit(x_train,y_train)
LinearRegression()
Out[74]:
In [75]: y_pred=lr.predict(x_test)
In [76]: r2=r2_score(y_test,y_pred)
r2
-2545768.6466048327
Out[76]:
using gridsearchcv to improve r2 score

In [77]: lr.get_params()
{'copy_X': True,
Out[77]:
'fit_intercept': True,
'n_jobs': None,
'normalize': 'deprecated',
'positive': False}
In [78]: params={"n_jobs":[5,20,30],"positive":[True],"fit_intercept":[False],"cop
lrt=GridSearchCV(lr,params,cv=7,verbose=1,scoring='r2')
lrt.fit(x_train,y_train)
Fitting 7 folds for each of 6 candidates, totalling 42 fits
GridSearchCV(cv=7, estimator=LinearRegression(),
Out[78]:
param_grid={'copy_X': [False, True], 'fit_intercept': [Fals
e],
'n_jobs': [5, 20, 30], 'positive': [True]},
scoring='r2', verbose=1)
In [79]: lrt.best_score_
0.23832445204531277
Out[79]:
In [80]: lrt.best_params_
{'copy_X': False, 'fit_intercept': False, 'n_jobs': 5, 'positive': True}

Out[80]:
2. ridge regression
In [81]: ridge=Ridge()
ridge.get_params()
ridge.fit(x_train,y_train)
y_pred_ridge=ridge.predict(x_test)
In [82]: r_2=r2_score(y_test,y_pred_ridge)
r_2
0.5982772629653259
Out[82]:

In [83]: ridge.get_params()
{'alpha': 1.0,
Out[83]:
'copy_X': True,
'max_iter': None,
'positive': False,
'random_state': None,
'solver': 'auto',
'tol': 0.001}
In [84]: params={"max_iter":[3,5,20],"positive":[True,False],"fit_intercept":[True
ridgetu=GridSearchCV(ridge,params,cv=4,verbose=1,n_jobs=20,scoring='r2')
ridgetu.fit(x_train,y_train)
GridSearchCV(cv=4, estimator=Ridge(), n_jobs=20,
Out[84]:
param_grid={'alpha': [1.0, 2.0, 4.0, 5.0], 'copy_X': [True,
False],
'fit_intercept': [True, False], 'max_iter': [3,

5, 20],
'positive': [True, False],
'random_state': [10, 20, 30, 45],
'tol': [0.0003, 0.0005, 0.0002, 0.0001]},
In [85]: ridgetu.best_score_
0.6057072416861131
Out[85]:
In [86]: ridgetu.best_params_
{'alpha': 5.0,
Out[86]:
'copy_X': True,
'fit_intercept': False,
'max_iter': 3,
'positive': False,
'random_state': 10,
'tol': 0.0003}
3. lasso regression
In [87]: lasso=Lasso()
lasso.fit(x_train,y_train)
Lasso()
Out[87]:
In [88]: y_pred_lasso = lasso.predict(x_test)
In [89]: r_2=r2_score(y_test,y_pred_lasso)
r_2
0.48015619366575135
Out[89]:

In [90]: lasso.get_params()
{'alpha': 1.0,
Out[90]:
'copy_X': True,
'max_iter': 1000,
'positive': False,
'precompute': False,
'selection': 'cyclic',
'tol': 0.0001,
'warm_start': False}
In [91]: params={"alpha":[1.0,2.0,3.0,4.0],"max_iter":[100,500,1000],"random_state
lass=GridSearchCV(lasso,params,cv=4,n_jobs=20,scoring='r2')
lass.fit(x_train,y_train)
GridSearchCV(cv=4, estimator=Lasso(), n_jobs=20,
Out[91]:
param_grid={'alpha': [1.0, 2.0, 3.0, 4.0], 'copy_X': [True,
False],
'fit_intercept': [True, False],
'max_iter': [100, 500, 1000],
'positive': [True, False], 'precompute': [True,

False],
'random_state': [30, 45],
'tol': [0.0003, 0.0005, 0.0002, 0.0001]},
scoring='r2')
In [92]: lass.best_score_
0.48653717075366076
Out[92]:
In [93]: lass.best_params_
{'alpha': 1.0,
Out[93]:
'copy_X': True,
'max_iter': 100,
'positive': False,
'precompute': True,
'random_state': 30,
'tol': 0.0001}
4. adaboost regressor
In [94]: ada=AdaBoostRegressor()
In [95]: ada.fit(x_train,y_train)
Out[95]: AdaBoostRegressor()
In [96]: y_pred_adb = ada.predict(x_test)
In [97]: r_2=r2_score(y_test,y_pred_adb)
r_2
0.5807366918340197
Out[97]:

In [98]: ada.get_params()
{'base_estimator': None,
Out[98]:
'learning_rate': 1.0,
'loss': 'linear',
'n_estimators': 50,
'random_state': None}
In [99]: params={"learning_rate":[1.0,2.0,3.0,4.0],"random_state":[30,45,60,64],'n
adat=GridSearchCV(ada,params,cv=3,n_jobs=5,verbose=1,scoring='r2')
adat.fit(x_train,y_train)
GridSearchCV(cv=3, estimator=AdaBoostRegressor(), n_jobs=5,
Out[99]:
param_grid={'learning_rate': [1.0, 2.0, 3.0, 4.0],
'n_estimators': [3, 5, 10, 20, 30, 40, 50, 70],
'random_state': [30, 45, 60, 64]},
In [100… adat.best_score_
0.652097756607407
Out[100]:
In [101… adat.best_params_
{'learning_rate': 1.0, 'n_estimators': 10, 'random_state': 30}

Out[101]:
5. decision tree regressor

In [102… dtcr=DecisionTreeRegressor()
dtcr.fit(x_train,y_train)
DecisionTreeRegressor()
Out[102]:
In [103… y_pred_dtcr=dtcr.predict(x_test)
In [104… r2=r2_score(y_test,y_pred_dtcr)
r2
0.6494079829434429
Out[104]:

In [105… dtcr.get_params()
{'ccp_alpha': 0.0,
Out[105]: 'criterion': 'squared_error',
'max_depth': None,
'max_features': None,
'max_leaf_nodes': None,
'min_impurity_decrease': 0.0,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'splitter': 'best'}
In [106… params={"max_depth":[10,20,30,40],"random_state":[20,40],'min_impurity_de
dtcrt=GridSearchCV(dtcr,params,cv=5,n_jobs=5,verbose=1,scoring='r2')
dtcrt.fit(x_train,y_train)
GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=5,
Out[106]:
param_grid={'ccp_alpha': [0.5, 0.6, 0.7],
'max_depth': [10, 20, 30, 40],
'min_impurity_decrease': [1, 2, 3, 4],
'min_samples_leaf': [1, 2, 3, 4, 5],
'min_samples_split': [2, 3, 4, 5, 6],
'random_state': [20, 40]},
In [107… dtcrt.best_params_
{'ccp_alpha': 0.5,
Out[107]:
'max_depth': 10,
'min_impurity_decrease': 3,
'min_samples_leaf': 1,
'random_state': 40}
In [108… dtcrt.best_score_
0.6491684757048557
Out[108]:
6. random forest regressor

In [109… rfr= RandomForestRegressor()
rfr.fit(x_train,y_train)
RandomForestRegressor()
Out[109]:
In [110… y_pred_rf=rfr.predict(x_test)
In [111… r2=r2_score(y_test,y_pred_rf)
r2
0.8054613972729647
Out[111]:

In [116… params={"max_depth":[100,200],"min_samples_split":[3,5],"min_impurity_dec
"n_estimators":[200,500],"random_state":[10,20,40],"bootstrap":[T
rfrt=GridSearchCV(rfr,params,cv=5,n_jobs=20,verbose=1,scoring="r2")
rfrt.fit(x_train,y_train)
GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=20,
Out[116]:
param_grid={'bootstrap': [True], 'max_depth': [100, 200],
'min_impurity_decrease': [0.5, 1.0],
'min_samples_split': [3, 5],
'n_estimators': [200, 500], 'oob_score': [Tru

e],
'random_state': [10, 20, 40]},
In [117… rfrt.best_params_
{'bootstrap': True,
Out[117]:
'max_depth': 100,
'min_impurity_decrease': 0.5,
'n_estimators': 500,
'oob_score': True,
'random_state': 20}
In [118… rfrt.best_score_
0.7255391911131108
Out[118]:
As we can see that even if we try our best parameters we are

getting less accuracy as compared to our default accuracy
score
Accuracy of all the models:

1. linear regression = 0.24
2. Ridge regression = 0.60
3. lasso regression = 0.48
4. Adaboost regression = 0.65
5. decision tree regressor = 0.64
6. Random forest regressor = 0.80 (Before grid search cv)

Bond Investment Risk Prediction

Uploaded by

Document Information

Original Description:

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Bond Investment Risk Prediction

Uploaded by

Copyright:

Available Formats

In

[1]: import pandas as pd

from matplotlib import pyplot,pylab as plt

from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScale

from sklearn.linear_model import LinearRegression,Ridge,Lasso

from scipy.stats import zscore

from sklearn.metrics import r2_score,roc_auc_score,adjusted_rand_score

from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor

from sklearn.tree import DecisionTreeRegressor

In [2]: df=pd.read_csv("Kern_County bond dataset 2.csv")

... ... ... ... ... ... ... ...

Kern High 07/28/1994

1200 rows × 55 columns

count 1.200000e+03 1.200000e+03 1.081000e+03 9.190000e+02 559.000000 847.00000

mean 1.250888e+07 8.618718e+06 4.318402e+06 2.385701e+05 3.797231 3.98918

std 2.665435e+07 2.256718e+07 1.655262e+07 1.180353e+06 3.472626 2.67965

min 1.000000e+04 -1.309768e+06 0.000000e+00 -6.337712e+06 0.000000 0.00000

25% 2.000000e+06 5.000000e+05 0.000000e+00 0.000000e+00 2.577500 2.31200

50% 4.563000e+06 2.682260e+06 0.000000e+00 0.000000e+00 3.687000 3.85100

75% 1.088905e+07 6.960277e+06 1.768974e+06 6.706200e+04 4.777000 5.70950

max 2.381771e+08 2.278184e+08 2.381771e+08 2.552913e+07 69.604000 28.27100

RangeIndex: 1200 entries, 0 to 1199

Data columns (total 55 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 CDIAC Number 1200 non-null object

1 Issuer 1200 non-null object

2 Issuance Documents 1200 non-null object

3 Sold Status 1200 non-null object

4 Sale Date 1200 non-null object

5 ADTR Report 1200 non-null object

6 ADTR Filing Status 1200 non-null object

7 ADTR Reportable 1200 non-null object

8 ADTR Reportable Next Fiscal Year 151 non-null object

9 ADTR Last Reported Year 117 non-null object

10 Debt Policy 158 non-null object

11 Issuer County 1200 non-null object

12 MKR Authority 1200 non-null object

13 Local Obligation 1200 non-null object

14 MKR CDIAC Number 8 non-null object

15 Issuer Group 1200 non-null object

16 Issuer Type 1200 non-null object

17 Project Name 1012 non-null object

18 Principal Amount 1200 non-null float64

19 New Money 1200 non-null float64

20 Refunding Amount 1081 non-null float64

21 Net Issue Discount/Premium 919 non-null float64

22 Environmental/Social Impact Bonds 1191 non-null object

23 Debt Type 1200 non-null object

24 Purpose 1198 non-null object

25 Source of Repayment 1199 non-null object

26 TIC Interest Rate 559 non-null float64

27 NIC Interest Rate 847 non-null float64

28 Interest Type 915 non-null object

29 Other Interest Type 6 non-null object

30 Federally Taxable 1200 non-null object

31 First Optional Call Date 480 non-null object

32 Final Maturity Date 1057 non-null object

33 CAB Flag 750 non-null object

34 S and P Rating 1198 non-null object

35 Moody Rating 1198 non-null object

36 Fitch Rating 1198 non-null object

37 Other Rating 1198 non-null object