Housing Prices Notebook


The goal

The objective of this notebook is to predict the sales price for each house. I explored various questions during my analysis to identify
patterns within the data, leveraging Plotly to create interactive visualizations for deeper insights.

I encourage you to join me on this journey! Feel free to fork this notebook and share your insights and suggestions for further
improvement in the comments. Together, we can unlock the secrets to house price prediction. ^^

1. Import libraries and load data


# Import necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

%matplotlib inline
sns.set_context("paper", font_scale= 1.25)

# Import preprocessing packages


from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold

# Import models
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor, VotingRegressor

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Display floats rounded to two decimal places.


pd.set_option('display.float_format', lambda x: '{:0.2f}'.format(x))

print('Done! All packages loaded.')

Done! All packages loaded.

# Create a function to show the count and percentage of missing values in each column, and the column types.
def nulls_count(df):
    "Show the count and percentage of missing values in each column, and the column types."
    # Identify and count the missing values in each feature.
    missing_num= df.isna().sum()
    # Show only features with nulls.
    missing_num= missing_num[missing_num > 0].to_frame()
    # Calculate the percentage of null values.
    percentage= (missing_num / df.shape[0]) * 100

    # Create a DataFrame of null values.
    nulls_df= pd.DataFrame(
        {"Nulls_num": missing_num.iloc[:, -1],
         "percentage": percentage.iloc[:, -1],
         "col_type": df[missing_num.index].dtypes}
    ).sort_values("Nulls_num", ascending= False)

    # Show nulls count (SalePrice is only missing in the test part of the combined data, so exclude it).
    return nulls_df.drop(['SalePrice'], errors= 'ignore')

# Create a function to split the concatenated DataFrame into the train and test DataFrames.
def split_df(df):
    "Split the concatenated DataFrame back into the train and test DataFrames."
    train= df.iloc[:train_df.shape[0]]
    test= df.iloc[train_df.shape[0]:].drop("SalePrice", axis= 1)

    # Set dataframe names
    train.name= "Training set"
    test.name= "Test set"
    return train, test

def check_skewness_outliers(df):
    "Show the skewness and the number of outliers per numeric column."
    # Numeric column list
    numeric_cols= df.select_dtypes(include= 'number').columns
    # Compute feature skewness
    skewed= df.skew(numeric_only= True).to_frame().reset_index()
    skewed.columns= ['feature', 'skewness']

    # Identify the outliers and count them.
    outliers= []
    for col in numeric_cols:
        # Visualize column distribution.
        # sns.distplot(df[col], bins=20)
        # plt.show()

        # Count the outliers in each numeric column using the 1.5 * IQR rule.
        quantile_25= df[col].quantile(0.25)
        quantile_75= df[col].quantile(0.75)
        IQR= quantile_75 - quantile_25
        upper= quantile_75 + 1.5 * IQR
        lower= quantile_25 - 1.5 * IQR
        outlier_df= df[(df[col] > upper) | (df[col] < lower)]
        outliers.append(outlier_df[col].nunique())

    # Create a DataFrame to store the outlier count.
    outlier_df= skewed.assign(outliers_num= outliers).sort_values('skewness', ascending= False).reset_index(drop= True)
    # Show only features that have outliers.
    return outlier_df[outlier_df.iloc[:, 2] > 0]

# Load data
train_df= pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test_df= pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Concatenate datasets into one DataFrame for cleaning.


dfs= pd.concat([train_df, test_df]).drop("Id", axis= 1)

# Set dataframe names


train_df.name= "Training set"
test_df.name= "Test set"

# View a sample of 5 rows from the training set.


train_df.sample(5)

Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeatu

693 694 30 RL 60.00 5400 Pave NaN Reg Lvl AllPub ... 0 NaN NaN

927 928 60 RL NaN 9900 Pave NaN Reg Lvl AllPub ... 0 NaN GdPrv

1324 1325 20 RL 75.00 9986 Pave NaN Reg Lvl AllPub ... 0 NaN NaN

1019 1020 120 RL 43.00 3013 Pave NaN Reg Lvl AllPub ... 0 NaN NaN

389 390 60 RL 96.00 12474 Pave NaN Reg Lvl AllPub ... 0 NaN NaN

5 rows × 81 columns

# View combined dataset info


dfs.info()
<class 'pandas.core.frame.DataFrame'>
Index: 2919 entries, 0 to 1458
Data columns (total 80 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MSSubClass 2919 non-null int64
1 MSZoning 2915 non-null object
2 LotFrontage 2433 non-null float64
3 LotArea 2919 non-null int64
4 Street 2919 non-null object
5 Alley 198 non-null object
6 LotShape 2919 non-null object
7 LandContour 2919 non-null object
8 Utilities 2917 non-null object
9 LotConfig 2919 non-null object
10 LandSlope 2919 non-null object
11 Neighborhood 2919 non-null object
12 Condition1 2919 non-null object
13 Condition2 2919 non-null object
14 BldgType 2919 non-null object
15 HouseStyle 2919 non-null object
16 OverallQual 2919 non-null int64
17 OverallCond 2919 non-null int64
18 YearBuilt 2919 non-null int64
19 YearRemodAdd 2919 non-null int64
20 RoofStyle 2919 non-null object
21 RoofMatl 2919 non-null object
22 Exterior1st 2918 non-null object
23 Exterior2nd 2918 non-null object
24 MasVnrType 1153 non-null object
25 MasVnrArea 2896 non-null float64
26 ExterQual 2919 non-null object
27 ExterCond 2919 non-null object
28 Foundation 2919 non-null object
29 BsmtQual 2838 non-null object
30 BsmtCond 2837 non-null object
31 BsmtExposure 2837 non-null object
32 BsmtFinType1 2840 non-null object
33 BsmtFinSF1 2918 non-null float64
34 BsmtFinType2 2839 non-null object
35 BsmtFinSF2 2918 non-null float64
36 BsmtUnfSF 2918 non-null float64
37 TotalBsmtSF 2918 non-null float64
38 Heating 2919 non-null object
39 HeatingQC 2919 non-null object
40 CentralAir 2919 non-null object
41 Electrical 2918 non-null object
42 1stFlrSF 2919 non-null int64
43 2ndFlrSF 2919 non-null int64
44 LowQualFinSF 2919 non-null int64
45 GrLivArea 2919 non-null int64
46 BsmtFullBath 2917 non-null float64
47 BsmtHalfBath 2917 non-null float64
48 FullBath 2919 non-null int64
49 HalfBath 2919 non-null int64
50 BedroomAbvGr 2919 non-null int64
51 KitchenAbvGr 2919 non-null int64
52 KitchenQual 2918 non-null object
53 TotRmsAbvGrd 2919 non-null int64
54 Functional 2917 non-null object
55 Fireplaces 2919 non-null int64
56 FireplaceQu 1499 non-null object
57 GarageType 2762 non-null object
58 GarageYrBlt 2760 non-null float64
59 GarageFinish 2760 non-null object
60 GarageCars 2918 non-null float64
61 GarageArea 2918 non-null float64
62 GarageQual 2760 non-null object
63 GarageCond 2760 non-null object
64 PavedDrive 2919 non-null object
65 WoodDeckSF 2919 non-null int64
66 OpenPorchSF 2919 non-null int64
67 EnclosedPorch 2919 non-null int64
68 3SsnPorch 2919 non-null int64
69 ScreenPorch 2919 non-null int64
70 PoolArea 2919 non-null int64
71 PoolQC 10 non-null object
72 Fence 571 non-null object
73 MiscFeature 105 non-null object
74 MiscVal 2919 non-null int64
75 MoSold 2919 non-null int64
76 YrSold 2919 non-null int64
77 SaleType 2918 non-null object
78 SaleCondition 2919 non-null object
79 SalePrice 1460 non-null float64
dtypes: float64(12), int64(25), object(43)
memory usage: 1.8+ MB

# Show numeric features summary statistics, sorted by standard deviation in descending order.
train_df.describe().sort_values('std',axis=1, ascending= False)

SalePrice LotArea GrLivArea MiscVal BsmtFinSF1 BsmtUnfSF TotalBsmtSF 2ndFlrSF Id 1stFlrSF ... YrSold OverallCond

count 1460.00 1460.00 1460.00 1460.00 1460.00 1460.00 1460.00 1460.00 1460.00 1460.00 ... 1460.00 1460.00

mean 180921.20 10516.83 1515.46 43.49 443.64 567.24 1057.43 346.99 730.50 1162.63 ... 2007.82 5.58

std 79442.50 9981.26 525.48 496.12 456.10 441.87 438.71 436.53 421.61 386.59 ... 1.33 1.11

min 34900.00 1300.00 334.00 0.00 0.00 0.00 0.00 0.00 1.00 334.00 ... 2006.00 1.00

25% 129975.00 7553.50 1129.50 0.00 0.00 223.00 795.75 0.00 365.75 882.00 ... 2007.00 5.00

50% 163000.00 9478.50 1464.00 0.00 383.50 477.50 991.50 0.00 730.50 1087.00 ... 2008.00 5.00

75% 214000.00 11601.50 1776.75 0.00 712.25 808.00 1298.25 728.00 1095.25 1391.25 ... 2009.00 6.00

max 755000.00 215245.00 5642.00 15500.00 5644.00 2336.00 6110.00 2065.00 1460.00 4692.00 ... 2010.00 9.00

8 rows × 38 columns

# Show categorical features summary statistics, sorted by frequency.


train_df.describe(include="O").sort_values('freq', axis= 1, ascending= False)

Utilities Street Condition2 RoofMatl Heating LandSlope CentralAir Functional PavedDrive Electrical ... Exterior1st Exterior2nd MasV

count 1460 1460 1460 1460 1460 1460 1460 1460 1460 1459 ... 1460 1460

unique 2 2 8 8 6 3 2 7 3 5 ... 15 16

top AllPub Pave Norm CompShg GasA Gtl Y Typ Y SBrkr ... VinylSd VinylSd

freq 1459 1454 1445 1434 1428 1382 1365 1360 1340 1334 ... 515 504

4 rows × 43 columns

Target variable distribution


# Plot the target variable distribution (histogram & box plot) interactively. Hover over data points for details.
fig= make_subplots(rows= 1, cols= 2, subplot_titles=['Target variable summary statistics', 'Target variable distribution'])
fig.add_trace(go.Box(x= train_df['SalePrice'], name= 'Box plot'), row=1, col=1)
fig.add_trace(go.Histogram(x= train_df['SalePrice'], name= 'Histogram'), row=1, col=2)
fig.show()

[Figure: 'Target variable summary statistics' (box plot of SalePrice) and 'Target variable distribution' (histogram of SalePrice), with prices ranging from roughly 0 to 700k.]

2. Cleaning data

2.1 Handle missing values


First, we will handle missing values in order to understand the data comprehensively.

# Show number of nulls


nulls_count(dfs)

Nulls_num percentage col_type

PoolQC 2909 99.66 object

MiscFeature 2814 96.40 object

Alley 2721 93.22 object

Fence 2348 80.44 object

MasVnrType 1766 60.50 object

FireplaceQu 1420 48.65 object

LotFrontage 486 16.65 float64

GarageCond 159 5.45 object

GarageYrBlt 159 5.45 float64

GarageFinish 159 5.45 object

GarageQual 159 5.45 object

GarageType 157 5.38 object

BsmtExposure 82 2.81 object

BsmtCond 82 2.81 object

BsmtQual 81 2.77 object

BsmtFinType2 80 2.74 object

BsmtFinType1 79 2.71 object

MasVnrArea 23 0.79 float64

MSZoning 4 0.14 object

Functional 2 0.07 object

BsmtHalfBath 2 0.07 float64

BsmtFullBath 2 0.07 float64

Utilities 2 0.07 object

KitchenQual 1 0.03 object

TotalBsmtSF 1 0.03 float64

BsmtUnfSF 1 0.03 float64

GarageCars 1 0.03 float64

GarageArea 1 0.03 float64

BsmtFinSF2 1 0.03 float64

BsmtFinSF1 1 0.03 float64

Exterior2nd 1 0.03 object

Exterior1st 1 0.03 object

SaleType 1 0.03 object

Electrical 1 0.03 object

The data description file says that the null values in the following columns denote the absence of the feature (e.g., no pool, no alley access, no basement). Therefore, we will replace them with None.

# Columns list from data descriptions.


cols= ['Alley', 'FireplaceQu', 'Fence', 'PoolQC', 'MiscFeature', 'MasVnrType',
'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'GarageCond', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageType']
# Fill missing values with None.
for c in cols:
    dfs[c].fillna('None', inplace= True)

LotFrontage is the linear feet of street connected to the property in any place. It depends on the zoning rules of the neighborhood
and the location of the property. Therefore, we will impute the missing LotFrontage values by using the median LotFrontage value for
each Neighborhood and LotConfig.

# Fill Nulls in LotFrontage column.


dfs['LotFrontage']= dfs.groupby(['Neighborhood', 'LotConfig'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
dfs['LotFrontage']= dfs.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

For MasVnrArea, we fill the missing values with the median MasVnrArea value for each MasVnrType category.

# Create a dictionary of the median MasVnrArea values for each MasVnrType group.
masvnr_dict= dfs.groupby('MasVnrType')['MasVnrArea'].median().to_dict()
# Fill Nulls in MasVnrArea column.
dfs['MasVnrArea']= dfs['MasVnrArea'].fillna(dfs['MasVnrType'].map(masvnr_dict))

For MSZoning, we fill the missing values with the most frequent (mode) MSZoning value for each Neighborhood group.

# Fill the missing values in the MSZoning column with the most frequent MSZoning value for each Neighborhood group.
dfs['MSZoning']= dfs.groupby(['Neighborhood'])['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

For TotalBsmtSF, BsmtUnfSF, BsmtFullBath, BsmtHalfBath, BsmtFinSF1, and BsmtFinSF2, the missing values indicate that the property has no basement. Therefore, we set them to 0.

# Fill Nulls in columns by 0.


for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']:
    dfs[col].fillna(0, inplace= True)

For other categorical and numerical columns, we will impute missing values with the mode and median, respectively.

# Fill object columns by mode.


for col in ['Utilities', 'Functional', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType', 'Electrical']:
    dfs[col].fillna(dfs[col].mode()[0], inplace= True)

# Fill numeric columns by median.


for col in ['GarageCars', 'GarageArea']:
    dfs[col].fillna(dfs[col].median(), inplace= True)

# Check Nulls.
nulls_count(dfs)

Nulls_num percentage col_type

2.2 Duplicates
# Check duplicates
print(f"- There are {dfs.duplicated().sum()} duplicate rows in the dataset.")

- There are 0 duplicate rows in the dataset.

2.3 Outliers and Skewness


# View outliers
check_skewness_outliers(dfs)
feature skewness outliers_num

0 MiscVal 21.96 37

1 PoolArea 16.91 13

2 LotArea 12.83 124

3 LowQualFinSF 12.09 35

4 3SsnPorch 11.38 30

5 KitchenAbvGr 4.30 3

6 BsmtFinSF2 4.15 271

7 EnclosedPorch 4.01 182

8 ScreenPorch 3.95 120

9 BsmtHalfBath 3.93 2

10 MasVnrArea 2.61 140

11 OpenPorchSF 2.54 96

12 SalePrice 1.88 58

13 WoodDeckSF 1.84 57

14 1stFlrSF 1.47 41

15 LotFrontage 1.44 45

16 BsmtFinSF1 1.43 15

17 MSSubClass 1.38 4

18 GrLivArea 1.27 66

19 TotalBsmtSF 1.16 44

20 BsmtUnfSF 0.92 45

21 2ndFlrSF 0.86 8

22 TotRmsAbvGrd 0.76 5

23 Fireplaces 0.73 2

25 BsmtFullBath 0.63 1

26 OverallCond 0.57 5

27 BedroomAbvGr 0.33 4

28 GarageArea 0.24 40

29 OverallQual 0.20 1

31 FullBath 0.17 1

33 GarageCars -0.22 2

35 YearBuilt -0.60 5

Our analysis identified a significant number of outliers in most numeric features. To mitigate potential bias and improve model performance, we explored various outlier handling techniques. We found that standardization, which rescales each feature to zero mean and unit variance, yielded the best results based on the public leaderboard score (0.12274).
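One of the alternatives considered was capping extreme values at the IQR fences (winsorization) instead of keeping them as-is. The sketch below is purely illustrative, assuming a hypothetical cap_outliers_iqr helper applied to the numeric columns of dfs; it is not the code behind the final submission, which keeps the raw values and relies on standardization.

# Illustrative sketch only: IQR capping (winsorization) as an alternative to standardization.
# cap_outliers_iqr is a hypothetical helper, not part of the final pipeline.
def cap_outliers_iqr(df, cols, k= 1.5):
    capped= df.copy()
    for col in cols:
        q1, q3= capped[col].quantile(0.25), capped[col].quantile(0.75)
        iqr= q3 - q1
        # Clip values outside the IQR fences instead of dropping rows.
        capped[col]= capped[col].clip(q1 - k * iqr, q3 + k * iqr)
    return capped

# Example usage (illustrative): cap every numeric feature except the target.
# numeric_cols= dfs.select_dtypes('number').columns.drop('SalePrice')
# dfs_capped= cap_outliers_iqr(dfs, numeric_cols)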

2.4 Change feature types


There are many features in the dataset that are ordinal categories. It's better to encode them as numbers so that their ordering is preserved for the models.

cols1= ['BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond']


cols2= ['ExterQual', 'ExterCond', 'KitchenQual', 'HeatingQC']
cols3= ['BsmtFinType1', 'BsmtFinType2']
# Replace quality categories with ordinal numbers.
for col in cols1:
    dfs[col].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}, inplace= True)

for col in cols2:
    dfs[col].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}, inplace= True)

for col in cols3:
    dfs[col].replace({'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0}, inplace= True)

# Columns with ordinal categories.


dict1= {
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0},
    'BsmtExposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0},
    'CentralAir': {'Y': 1, 'N': 0},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'None': 0},
    'Alley': {'Pave': 2, 'Grvl': 1, 'None': 0},
    'LandSlope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
    'PoolQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'None': 0},
    'Street': {'Grvl': 0, 'Pave': 1},
    'LotShape': {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1},
    'Utilities': {'AllPub': 4, 'NoSewr': 3, 'NoSeWa': 2, 'ELO': 1},
    'MSSubClass': {20: 1, 30: 2, 40: 3, 45: 4, 50: 5, 60: 6, 70: 7, 75: 8, 80: 9, 85: 10, 90: 11, 120: 12, 150: 13, 160: 14, 180: 15, 190: 16}
}

# replacement
for col, values in dict1.items():
    dfs[col].replace(values, inplace= True)

After cleaning the data, we split it back into the training and test sets.

3. Exploring the data and finding patterns


# Separate the training set to visualize
train, _= split_df(dfs)

## Explore the distributions of the top 5 highest variance numeric features


high_variance_cols= train.var(numeric_only= True).sort_values(ascending= False)[0:5].index.to_list()
print("top 5 highest variance numeric features are: \n", high_variance_cols)
sns.pairplot(train, vars= high_variance_cols)

top 5 highest variance numeric features are:


['SalePrice', 'LotArea', 'GrLivArea', 'MiscVal', 'BsmtFinSF1']
<seaborn.axisgrid.PairGrid at 0x7a557f67b700>

# plot features correlations


corr= train.corr(numeric_only= True)
# use mask to avoid information duplicates
mask= np.triu(np.ones_like(corr, dtype= 'bool'))

plt.figure(figsize= (25, 15))


sns.heatmap(corr, mask= mask, cmap= 'coolwarm', linewidths= 0.01)

<Axes: >

• Which Neighborhood has the highest price and number of sales?

# Plot SalePrice dist per Neighborhood


plt.figure(figsize= (20,6))
sns.swarmplot(y= "SalePrice", x="Neighborhood", palette= "dark", data= train)
plt.title( "SalePrice distribution per Neighborhood", fontdict= {"fontsize": 16})
plt.xticks(rotation=45)
plt.show()

• Does LotConfig affect the price?

# Plot House price distribution by Lot configuration.


plt.figure(figsize= (10, 4))

sns.swarmplot(data= train, x='LotConfig', y= "SalePrice")

plt.title("House price distribution by Lot configuration.", fontdict= {'fontsize': 16})

Text(0.5, 1.0, 'House price distribution by Lot configuration.')


# plot the house price distribution by year built and sold.
plt.figure(figsize= (15, 6))

sns.swarmplot(data= train, x='YrSold', y= "SalePrice", hue= 'YearBuilt', palette= 'flare')

plt.title("House price distribution by year built and sold.", fontdict= {'fontsize': 16})

Text(0.5, 1.0, 'House price distribution by year built and sold.')

• Does Condition1 affect the price?

plt.figure(figsize= (16, 5))


sns.violinplot(data= train, x='Condition1', y= "SalePrice")

plt.title("House price distribution per Proximity to various conditions.", fontdict= {'fontsize': 16})

Text(0.5, 1.0, 'House price distribution per Proximity to various conditions.')


# Plot House price distribution by basement finished area.
fig, ax1= plt.subplots(1, 1, figsize= (16, 5))
sns.swarmplot(data= train, x='BsmtFinType1', y= "SalePrice", palette= 'flare')

plt.title("House price distribution per Rating of basement finished area.", fontdict= {'fontsize': 16})

Text(0.5, 1.0, 'House price distribution per Rating of basement finished area.')

• What is the price difference between dwelling types?

# plot House price distribution in each dwelling type


plt.figure(figsize= (20,6))
sns.violinplot(data= train, x='MSSubClass', y= "SalePrice", palette= 'flare')

plt.title("House price distribution in each dwelling type.", fontdict= {'fontsize': 16})

Text(0.5, 1.0, 'House price distribution in each dwelling type.')

# Plot the SalePrice average per MSZoning.


fig, ax1= plt.subplots(1, 1, figsize= (10, 5))
sns.barplot(data= train, x= 'MSZoning', y= "SalePrice")

plt.title("House price averages in each zoning classification.", fontdict= {'fontsize': 16})

Text(0.5, 1.0, 'House price averages in each zoning classification.')


# Plot SalePrice by Sale Condition.
plt.figure(figsize= (10,4))
sns.stripplot(y= "SalePrice", x="SaleCondition", data= train)
plt.title( "SalePrice distribution per Sale Condition", fontdict= {"fontsize": 16})

Text(0.5, 1.0, 'SalePrice distribution per Sale Condition')

# Plot SalePrice by Sale type.


plt.figure(figsize= (12,4))
sns.stripplot(y= "SalePrice", x="SaleType", data= train)
plt.title( "SalePrice distribution per Sale type", fontdict= {"fontsize": 16})

Text(0.5, 1.0, 'SalePrice distribution per Sale type')

4. Feature engineering
# One-hot encode categorical features (dummies).
dfs= pd.get_dummies(dfs, drop_first= True)

# Removing features with correlation coefficient above 0.95 to reduce multicollinearity and improve model stability.
corr= dfs.drop('SalePrice', axis=1).corr().abs()

mask= np.triu(np.ones_like(corr, dtype= 'bool'))

tri_df= corr.mask(mask)
to_drop= [c for c in tri_df.columns if any(tri_df[c] >0.95)]
print(f"we dropped {len(to_drop)} features.", to_drop)
dfs= dfs.drop(to_drop, axis= 1)

we dropped 5 features. ['Exterior1st_CemntBd', 'Exterior1st_MetalSd', 'Exterior1st_VinylSd', 'GarageType_None', 'SaleType_New']

## Drop low variance features to improve model performance and reduce overfitting.
# Set variance threshold
sel= VarianceThreshold(threshold= 0.005)

# Scale each feature by its mean so the variance threshold is comparable across features, then fit the selector.


sel.fit(dfs / dfs.mean())

# get features which passed the threshold


mask= sel.get_support()
dfs= dfs.loc[:, mask]

print("number of features in the reduced df= ", np.sum(mask))

number of features in the reduced df= 299

# get the Training and test sets


train, test= split_df(dfs)

train.sample(5)

MSSubClass LotFrontage LotArea Alley LotShape LandSlope OverallQual OverallCond MasVnrArea ExterQual ... SaleType_ConLD Sale

1048 1 100.00 21750 0 4 2 5 4 75.00 3 ... False

1381 1 80.00 12925 0 3 2 6 7 0.00 3 ... False

487 1 70.00 12243 0 3 2 5 6 0.00 3 ... False

404 6 63.00 10364 0 3 2 6 5 0.00 3 ... False

35 6 108.00 13418 0 4 2 8 5 132.00 4 ... False

5 rows × 299 columns

5. Modeling
# Store the training features and the target variable.
X= train.drop(['SalePrice'], axis=1).values
# Normalize the target variable by taking the logarithm.
y= np.log1p(train['SalePrice']).values

# Set the seed for reproducibility.


SEED= 616

# Split training set


X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.3, random_state= SEED)

# Create a shuffled K-Fold cross-validation splitter.
kf= KFold(n_splits=8, shuffle= True, random_state= SEED)

As noted above, many features have high variance, so we will standardize them to a common scale.

# Define the standard scaler


scaler= StandardScaler()
# Normalize data to make all features have the same scale.
X_train_scaled= scaler.fit_transform(X_train)
X_test_scaled= scaler.transform(X_test)

Evaluate which model to use.
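A quick way to compare candidate models before committing to one is cross-validation with the KFold splitter defined above. The snippet below is an assumed workflow using RMSE on the log target as the metric, not the original tuning runs.

# Assumed model-comparison sketch: score each candidate with the KFold splitter defined above.
candidates= {
    'lasso': LassoCV(cv= kf, random_state= SEED),
    'gbr': GradientBoostingRegressor(random_state= SEED),
}
for name, model in candidates.items():
    # cross_val_score returns negated RMSE, so flip the sign.
    rmse= -cross_val_score(model, X_train_scaled, y_train, cv= kf, scoring= 'neg_root_mean_squared_error')
    print(f"{name}: mean CV RMSE on log(SalePrice)= {rmse.mean():.3f} (+/- {rmse.std():.3f})")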


# Build a stochastic Gradient Boosting Regressor
sgbr= GradientBoostingRegressor(learning_rate= 0.1, max_depth= 3, min_samples_split= 35, n_estimators= 300, subsample

# fit model
sgbr.fit(X_train_scaled, y_train)

# Evaluate model score


sgbr.score(X_test_scaled, y_test)

0.921089316513142

# build Lasso model with best parameters


lasso= LassoCV(cv= kf, random_state= SEED)
# fit model
lasso.fit(X_train_scaled, y_train)

# Evaluate model score


lasso.score(X_test_scaled, y_test)

0.9020028424435507

# Create a list of base estimators


estimators= [('lasso', lasso),
('sgbr', sgbr)]

# Instantiate and fit the VotingRegressor model


voting = VotingRegressor(estimators= estimators, weights= [0.4, 0.6]).fit(X_train_scaled, y_train)

# Over/underfitting check
y_pred_train= voting.predict(X_train_scaled)
y_pred= voting.predict(X_test_scaled)
print(f"""R-squared train score:{voting.score(X_train_scaled, y_train)*100: 0.1f}%, test score: {voting.score(X_test_
and train average error= {mean_squared_error(y_train, y_pred_train, squared= False): 0.3f}, test average error=

R-squared train score: 96.3%, test score: 92.4%


and train average error= 0.077, test average error= 0.109
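Since the model is trained on log1p(SalePrice), these errors are on the log scale: a test RMSE of about 0.109 corresponds to a multiplicative factor of exp(0.109) ≈ 1.12, i.e. predictions typically off by roughly 11-12% of the sale price. A quick sanity check of that conversion:

# Translate the log-scale test RMSE into an approximate percentage error on the original prices.
test_rmse= mean_squared_error(y_test, y_pred, squared= False)
print(f"Typical multiplicative error: {np.exp(test_rmse):.3f} (~{(np.exp(test_rmse) - 1)*100:.0f}% of the sale price)")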

## Predict Test set


# Normalize Test set
test_scaled= scaler.transform(test)

# Convert the predicted sale prices back to the original scale.


test_pred= np.expm1(voting.predict(test_scaled))

# read sample_submission file to replace SalePrice values.


submission= pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
submission["SalePrice"]= test_pred
# show first rows
submission.head()

Id SalePrice

0 1461 116981.63

1 1462 163090.13

2 1463 182496.25

3 1464 185388.99

4 1465 198106.75

submission.to_csv("submission.csv", index= False)
