Download as pdf or txt
Download as pdf or txt
You are on page 1 of 19

In 

[1]: import pandas as pd


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import datetime
import calendar
warnings.filterwarnings('ignore')

In [2]: df=pd.read_csv('Housing_Macroeconomic_Factors_US (2).csv')

In [3]: df.head()

Out[3]: Date house_price_index population house_supply gdp mortgage_rate employment_rate permit_new

1987-
0 63.965 241857 6.0 99.902813 9.2040 70.163085 1690.0
01-01

1987-
1 64.424 242005 6.2 99.875864 9.0825 70.289205 1689.0
02-01

1987-
2 64.735 242166 6.0 99.869734 9.0350 70.321678 1704.0
03-01

1987-
3 65.132 242338 6.0 99.882087 9.8325 70.499062 1601.0
04-01

1987-
4 65.565 242516 6.7 99.910371 10.5960 70.808308 1500.0
05-01

In [4]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425 entries, 0 to 424
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 425 non-null object
1 house_price_index 425 non-null float64
2 population 425 non-null int64
3 house_supply 425 non-null float64
4 gdp 422 non-null float64
5 mortgage_rate 422 non-null float64
6 employment_rate 422 non-null float64
7 permit_new 422 non-null float64
8 ppi_res 422 non-null float64
9 m3 422 non-null float64
10 cci 422 non-null float64
11 delinquency_rate 374 non-null float64
12 hcai 285 non-null float64
dtypes: float64(11), int64(1), object(1)
memory usage: 43.3+ KB

In [5]: df.shape

(425, 13)
Out[5]:

In [6]: df.describe()

Loading [MathJax]/extensions/Safe.js
Out[6]: house_price_index population house_supply gdp mortgage_rate employment_rate permit_n

count 425.000000 425.000000 425.000000 422.000000 422.000000 422.000000 422.0000

mean 135.887280 291988.251765 5.813412 99.955457 6.328301 70.759327 1346.9620

std 53.684802 28193.599715 1.691079 1.124453 2.209432 2.247253 394.7768

min 63.965000 241857.000000 3.300000 91.543057 2.684000 60.261014 513.0000

25% 81.620000 267829.000000 4.400000 99.559318 4.266250 69.317138 1077.2500

50% 139.727000 293857.000000 5.500000 99.977924 6.257250 71.221613 1362.0000

75% 174.802000 317156.000000 6.700000 100.556927 7.912000 72.281388 1614.7500

max 304.831000 332928.000000 12.200000 101.825852 11.260000 74.507436 2263.0000

In [8]: df['Date']=pd.to_datetime(df['Date'],format='%Y/%m/%d')

In [9]: df['DAY']=[x.day for x in df['Date']]


df['MONTH']=[x.month for x in df['Date']]
df['YEAR']=[x.year for x in df['Date']]
df.head(10)

Out[9]: Date house_price_index population house_supply gdp mortgage_rate employment_rate permit_new

1987-
0 63.965 241857 6.0 99.902813 9.2040 70.163085 1690
01-01

1987-
1 64.424 242005 6.2 99.875864 9.0825 70.289205 1689
02-01

1987-
2 64.735 242166 6.0 99.869734 9.0350 70.321678 1704
03-01

1987-
3 65.132 242338 6.0 99.882087 9.8325 70.499062 1601
04-01

1987-
4 65.565 242516 6.7 99.910371 10.5960 70.808308 1500
05-01

1987-
5 66.073 242706 6.9 99.954091 10.5375 70.585708 1522
06-01

1987-
6 66.508 242908 6.7 100.016021 10.2780 70.793518 1516
07-01

1987-
7 66.939 243118 6.8 100.098404 10.3300 70.999089 1511
08-01

1987-
8 67.331 243335 6.8 100.197931 10.8875 70.915448 1514
09-01

1987-
9 67.738 243543 6.8 100.299312 11.2600 71.106111 1447
10-01

Null Values Treatment

In [7]: df.isnull()

Loading [MathJax]/extensions/Safe.js
Out[7]: Date house_price_index population house_supply gdp mortgage_rate employment_rate permit_new p

0 False False False False False False False False

1 False False False False False False False False

2 False False False False False False False False

3 False False False False False False False False

4 False False False False False False False False

... ... ... ... ... ... ... ... ...

420 False False False False False False False False

421 False False False False False False False False

422 False False False False True True True True

423 False False False False True True True True

424 False False False False True True True True

425 rows × 13 columns

In [10]: df.isnull().sum()

Date 0
Out[10]:
house_price_index 0
population 0
house_supply 0
gdp 3
mortgage_rate 3
employment_rate 3
permit_new 3
ppi_res 3
m3 3
cci 3
delinquency_rate 51
hcai 140
DAY 0
MONTH 0
YEAR 0
dtype: int64

In [11]: df['delinquency_rate'].fillna(df['delinquency_rate'].median(),inplace=True)
df['hcai'].fillna(df['hcai'].median(),inplace=True)

In [12]: df.dropna(subset=['cci'], how='all',inplace=True)


df.dropna(subset=['m3'], how='all',inplace=True)
df.dropna(subset=['ppi_res'], how='all',inplace=True)
df.dropna(subset=['employment_rate'], how='all',inplace=True)
df.dropna(subset=['permit_new'], how='all',inplace=True)
df.dropna(subset=['gdp'], how='all',inplace=True)
df.dropna(subset=['mortgage_rate'], how='all',inplace=True)

In [13]: df.isnull().sum()

Loading [MathJax]/extensions/Safe.js
Date 0
Out[13]:
house_price_index 0
population 0
house_supply 0
gdp 0
mortgage_rate 0
employment_rate 0
permit_new 0
ppi_res 0
m3 0
cci 0
delinquency_rate 0
hcai 0
DAY 0
MONTH 0
YEAR 0
dtype: int64

In [14]: df.columns

Index(['Date', 'house_price_index', 'population', 'house_supply', 'gdp',


Out[14]:
'mortgage_rate', 'employment_rate', 'permit_new', 'ppi_res', 'm3',
'cci', 'delinquency_rate', 'hcai', 'DAY', 'MONTH', 'YEAR'],
dtype='object')

In [15]: cols=df[['house_price_index', 'population', 'house_supply', 'gdp',


'mortgage_rate', 'employment_rate', 'permit_new', 'ppi_res', 'm3',
'cci', 'delinquency_rate', 'hcai']]
cols.head()

Out[15]: house_price_index population house_supply gdp mortgage_rate employment_rate permit_new ppi_r

0 63.965 241857 6.0 99.902813 9.2040 70.163085 1690.0 100

1 64.424 242005 6.2 99.875864 9.0825 70.289205 1689.0 100

2 64.735 242166 6.0 99.869734 9.0350 70.321678 1704.0 100

3 65.132 242338 6.0 99.882087 9.8325 70.499062 1601.0 101

4 65.565 242516 6.7 99.910371 10.5960 70.808308 1500.0 101

Data Visualization
In [16]: for column in cols:
sns.distplot(cols[column])
plt.show()

Loading [MathJax]/extensions/Safe.js
Loading [MathJax]/extensions/Safe.js
Loading [MathJax]/extensions/Safe.js
Loading [MathJax]/extensions/Safe.js
Loading [MathJax]/extensions/Safe.js
Loading [MathJax]/extensions/Safe.js
In [17]: plt_1 = plt.figure(figsize=(10,10))
plt.xticks(rotation=90)
sns.heatmap(df.corr(),annot=True)

<AxesSubplot:>
Out[17]:

Loading [MathJax]/extensions/Safe.js
In [18]: data_set = df[['DAY', 'house_price_index']]
sns.scatterplot(x='DAY', y='house_price_index', data=data_set, color='purple')

<AxesSubplot:xlabel='DAY', ylabel='house_price_index'>
Out[18]:

Loading [MathJax]/extensions/Safe.js
In [19]: data_set = df[['YEAR', 'house_price_index']]
sns.scatterplot(x='YEAR', y='house_price_index', data=data_set, color='orange')

<AxesSubplot:xlabel='YEAR', ylabel='house_price_index'>
Out[19]:

In [20]: data_set = df[['MONTH', 'house_price_index']]


sns.scatterplot(x='MONTH', y='house_price_index', data=data_set, color='k')

<AxesSubplot:xlabel='MONTH', ylabel='house_price_index'>
Out[20]:
Loading [MathJax]/extensions/Safe.js
Analysis on a sample
In [21]: df1=df.sample(20)
df1

Loading [MathJax]/extensions/Safe.js
Out[21]: Date house_price_index population house_supply gdp mortgage_rate employment_rate permit_n

2003-
193 129.355 289606 4.5 98.375113 5.8425 71.520526 185
02-01

2017-
370 195.866 327699 4.8 100.113491 3.9220 70.280019 129
11-01

1988-
15 69.977 244528 6.4 100.590991 10.2020 71.529216 142
04-01

1996-
108 81.835 268258 6.4 99.256632 7.0300 72.337947 138
01-01

2017-
365 190.522 326743 5.3 99.752263 3.9040 70.103544 134
06-01

2004-
208 148.185 292872 3.8 99.683189 6.2700 71.131627 215
05-01

2002-
186 122.888 288051 4.2 98.940396 6.4850 71.748101 173
07-01

2000-
160 103.677 281996 4.4 101.736906 8.5150 74.079531 154
05-01

1996-
116 83.258 270433 5.2 99.628047 8.2300 73.168792 139
09-01

1996-
112 82.611 269247 5.9 99.524263 8.0700 72.710103 145
05-01

1989-
33 76.283 248174 6.9 101.280035 9.9475 72.569040 136
10-01

2002-
188 124.780 288554 3.9 98.726530 6.0925 72.076044 180
09-01

2021-
418 276.429 332598 6.2 100.094880 3.0675 70.587317 172
11-01

1993-
82 78.149 261550 4.8 99.680219 7.1550 71.460550 135
11-01

2021-
414 263.349 332192 6.0 99.493405 2.8680 69.664968 165
07-01

1988-
20 72.240 245579 6.5 100.831870 10.4800 71.812109 143
09-01

1990-
39 77.278 249436 8.3 101.290583 10.3700 72.323204 113
04-01

2009-
272 148.023 307826 7.8 98.070172 5.0575 66.881083 60
09-01

1994-
91 79.782 263871 6.1 100.317324 8.5125 72.097205 137
08-01

2013-
319 156.973 317397 5.5 99.928057 4.4560 67.419391 96
08-01

In [22]: plt.figure(figsize=(10,8))
sns.stripplot(data=df1,x='MONTH',y='house_supply')
plt.grid()

Loading [MathJax]/extensions/Safe.js
In [23]: sns.lineplot(data=df1,x='gdp',y='mortgage_rate')
plt.grid()
plt.show()

Loading [MathJax]/extensions/Safe.js
In [24]: sns.barplot(data=df1,x='house_supply',y='employment_rate')
plt.grid()
plt.show()

Splitting
In [25]: X = df.drop(['house_price_index','Date','DAY','MONTH','YEAR'],axis=1).values
X
Loading [MathJax]/extensions/Safe.js
array([[2.41857000e+05, 6.00000000e+00, 9.99028134e+01, ...,
Out[25]:
1.00462400e+02, 2.47000000e+00, 5.95800000e+00],
[2.42005000e+05, 6.20000000e+00, 9.98758644e+01, ...,
1.00494500e+02, 2.47000000e+00, 5.95800000e+00],
[2.42166000e+05, 6.00000000e+00, 9.98697339e+01, ...,
1.00572000e+02, 2.47000000e+00, 5.95800000e+00],
...,
[3.32640000e+05, 5.60000000e+00, 1.00120622e+02, ...,
9.77348900e+01, 2.33000000e+00, 5.95800000e+00],
[3.32684000e+05, 5.70000000e+00, 1.00091744e+02, ...,
9.74946700e+01, 2.13000000e+00, 5.95800000e+00],
[3.32750000e+05, 6.00000000e+00, 1.00034014e+02, ...,
9.71899600e+01, 2.13000000e+00, 5.95800000e+00]])

In [26]: y = df['house_price_index'].values
y

Loading [MathJax]/extensions/Safe.js
array([ 63.965, 64.424, 64.735, 65.132, 65.565, 66.073, 66.508,
Out[26]:
66.939, 67.331, 67.738, 68.107, 68.506, 68.859, 69.263,
69.639, 69.977, 70.426, 70.888, 71.354, 71.799, 72.24 ,
72.636, 73.072, 73.465, 73.947, 74.383, 74.778, 75.085,
75.306, 75.48 , 75.658, 75.836, 76.057, 76.283, 76.521,
76.705, 76.897, 77.053, 77.201, 77.278, 77.298, 77.258,
77.138, 77.009, 76.85 , 76.7 , 76.37 , 76.185, 75.916,
75.735, 75.57 , 75.567, 75.765, 75.993, 76.083, 76.11 ,
76.194, 76.075, 76.014, 76.056, 76.086, 76.155, 76.276,
76.346, 76.399, 76.332, 76.264, 76.23 , 76.239, 76.377,
76.559, 76.674, 76.784, 76.838, 76.868, 76.937, 77.037,
77.243, 77.429, 77.613, 77.795, 77.942, 78.149, 78.327,
78.592, 78.727, 78.857, 78.988, 79.223, 79.424, 79.596,
79.782, 79.919, 80.065, 80.15 , 80.297, 80.427, 80.529,
80.599, 80.661, 80.705, 80.786, 80.937, 81.111, 81.307,
81.483, 81.62 , 81.737, 81.835, 81.954, 82.194, 82.422,
82.611, 82.753, 82.929, 83.087, 83.258, 83.378, 83.553,
83.722, 83.956, 84.181, 84.453, 84.624, 84.862, 85.081,
85.333, 85.574, 85.851, 86.148, 86.633, 87.094, 87.616,
88.004, 88.443, 88.879, 89.365, 89.845, 90.311, 90.786,
91.26 , 91.719, 92.201, 92.713, 93.209, 93.672, 94.218,
94.785, 95.345, 95.976, 96.593, 97.221, 97.864, 98.524,
99.155, 99.846, 100.552, 101.339, 102.127, 102.922, 103.677,
104.424, 105.054, 105.767, 106.537, 107.382, 108.302, 109.14 ,
109.846, 110.5 , 111.108, 111.651, 112.163, 112.796, 113.491,
114.166, 114.811, 115.308, 115.855, 116.453, 117.143, 117.844,
118.687, 119.611, 120.724, 121.813, 122.888, 123.831, 124.78 ,
125.734, 126.669, 127.622, 128.461, 129.355, 130.148, 130.884,
131.735, 132.649, 133.776, 134.968, 136.294, 137.532, 138.794,
140.18 , 141.646, 143.192, 145.059, 146.592, 148.185, 149.85 ,
151.338, 152.633, 154.179, 155.751, 157.527, 159.33 , 161.288,
163.344, 165.812, 167.501, 169.351, 171.19 , 172.86 , 174.44 ,
176.437, 178.027, 179.681, 180.91 , 182.32 , 183.287, 184.364,
184.329, 184.156, 183.507, 183.067, 182.593, 182.799, 183.2 ,
183.611, 184.141, 184.518, 184.599, 184.15 , 183.011, 181.601,
180.254, 179.111, 178.117, 177.558, 176.624, 175.147, 174.341,
173.133, 171.542, 170.054, 168.337, 166.658, 165.017, 163.567,
161.989, 160.309, 158.329, 156.142, 153.618, 151.504, 150.012,
148.659, 147.949, 147.696, 148.09 , 148.409, 148.276, 148.023,
147.848, 148.133, 147.929, 147.395, 145.632, 145.86 , 146.401,
146.391, 145.717, 144.988, 143.912, 143.015, 142.525, 142.169,
142.061, 141.526, 140.356, 139.987, 140.011, 139.904, 139.86 ,
139.727, 139.306, 138.667, 137.954, 137.154, 136.676, 136.607,
136.529, 137.903, 139.155, 140.156, 141.029, 141.667, 142.277,
142.907, 143.6 , 144.585, 145.501, 146.835, 147.784, 149.965,
151.521, 152.854, 154.194, 155.606, 156.973, 158.234, 159.247,
160.075, 160.997, 161.948, 162.53 , 163.093, 163.4 , 163.666,
164.04 , 164.574, 165.219, 165.909, 166.646, 167.339, 168.058,
168.663, 169.138, 169.812, 170.312, 170.894, 171.437, 172.131,
172.948, 173.84 , 174.802, 175.747, 176.553, 177.302, 177.667,
178.193, 178.8 , 179.46 , 180.057, 180.848, 181.868, 182.837,
183.751, 184.74 , 185.69 , 186.793, 187.313, 188.032, 188.818,
189.707, 190.522, 191.499, 192.723, 193.786, 194.778, 195.866,
197.044, 198.201, 199.184, 200.038, 200.876, 201.646, 202.345,
203.015, 203.783, 204.366, 205.017, 205.464, 205.867, 206.266,
206.712, 207.202, 207.941, 208.58 , 208.947, 209.496, 210.231,
210.911, 211.598, 212.446, 213.434, 214.49 , 215.549, 216.602,
217.464, 217.689, 218.139, 219.702, 222.539, 225.793, 229.403,
232.673, 235.7 , 238.784, 241.845, 245.796, 250.094, 254.556,
259.249, 263.349, 267.028, 270.258, 273.154, 276.429, 280.19 ,
284.767, 290.371])

In [27]: from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
Loading [MathJax]/extensions/Safe.js
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((316, 11), (106, 11), (316,), (106,))


Out[27]:

Evaluation & Prediction Using Linear Regression


In [28]: from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9880181997171065
Out[28]:

In [29]: print(model.predict([[269527,6.0,99.582313,8.3200,72.829141,1429.0,135.5,3.7225,100.7675

[85.9362418]

Loading [MathJax]/extensions/Safe.js

You might also like