California Housing Dataset

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 3

Importing Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.datasets import fetch_california_housing
sns.set()
%matplotlib inline

A:\Anaconda\lib\site-packages\scipy\__init__.py:155: UserWarning: A
NumPy version >=1.18.5 and <1.25.0 is required for this version of
SciPy (detected version 1.26.3
warnings.warn(f"A NumPy version >={np_minversion} and
<{np_maxversion}"

Loading Data
# Fetch the California housing dataset as pandas objects (as_frame=True).
# download_if_missing=True allows a download when no local copy exists.
house = fetch_california_housing(
    data_home=None,
    download_if_missing=True,
    return_X_y=False,
    as_frame=True,
)
# Work on a copy of the feature frame so that adding the target column does
# not modify the frame held by the returned object (and does not write into
# what may be a slice of a larger frame).
df = house.data.copy()
df['MedHouseValue'] = house.target
# Preview the first rows (last expression of the cell, so it is displayed).
df.head()

Summary Statistics
# Per-column summary statistics of df (features plus 'MedHouseValue').
df.describe()

Visualizations for complete DataFrame


Histogram for all columns
# One histogram per column of df, 50 bins each, drawn on a 12x8 figure.
df.hist(figsize=(12, 8), bins=50)
plt.show()

MedInc VS MedHouseValue
# Scatter of 'MedInc' (x) against 'MedHouseValue' (y) over the full frame.
px.scatter(data_frame=df, x='MedInc', y='MedHouseValue')
Housing Value based on Location
# Plot every row at its coordinates: colour encodes 'MedHouseValue',
# marker size encodes 'Population'.
px.scatter(
    data_frame=df,
    x="Longitude",
    y="Latitude",
    size='Population',
    color='MedHouseValue',
)

import folium
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import branca.colormap as cm

# Assuming df is your DataFrame
#
# Build an interactive folium map with one circle per row of df, coloured by
# 'MedHouseValue' on the viridis scale, plus a matching colour legend.

# Normalize 'MedHouseValue' column to [0, 1] for color mapping.
# min/max are computed once and reused for the legend range below.
min_medhv = df['MedHouseValue'].min()
max_medhv = df['MedHouseValue'].max()
medhv_norm = (df['MedHouseValue'] - min_medhv) / (max_medhv - min_medhv)

# Create a colormap
colormap = plt.cm.viridis

# Create a LinearColormap for the legend, sampled from the same colormap and
# spanning the original (un-normalized) value range.
linear = cm.LinearColormap(
    [mcolors.rgb2hex(colormap(i)) for i in np.linspace(0, 1, colormap.N)],
    vmin=min_medhv,
    vmax=max_medhv,
    caption='Median House Value',  # Caption for the legend
)

# Create a map centered around California
map_california = folium.Map(location=[36.7783, -119.4179], zoom_start=6)

# Add points to the map with colors based on 'MedHouseValue'.
# The three columns are walked in lockstep, so each marker is paired with its
# own normalized value by position. Looking the value up with
# ``.iloc[index_label]`` would only line up when the index is exactly 0..n-1.
for lat, lon, norm_value in zip(df['Latitude'], df['Longitude'], medhv_norm):
    # Get color from colormap for this row's normalized value
    color = mcolors.rgb2hex(colormap(norm_value))

    folium.CircleMarker(
        [lat, lon],
        radius=5,  # Size of the marker
        fill=True,
        fill_color=color,
        color=None,
        fill_opacity=0.7,
    ).add_to(map_california)

# Add the colormap to the map as a legend
map_california.add_child(linear)
# Display the map
map_california

Visualizations for an arbitrary chunk of the DataFrame (rows 2000–2999)


# Take the rows at positions 2000 through 2999 of df and summarize them.
chunk = df.iloc[2000:3000]
chunk.describe()

MedInc VS MedHouseValue
# Scatter of 'MedInc' (x) against 'MedHouseValue' (y), restricted to chunk.
px.scatter(data_frame=chunk, x='MedInc', y='MedHouseValue')

AveRooms VS MedHouseValue
# Scatter of 'AveRooms' (x) against 'MedHouseValue' (y), restricted to chunk.
px.scatter(data_frame=chunk, x='AveRooms', y='MedHouseValue')

Heatmap
# Pairwise correlations between all columns of df, drawn as a heatmap with
# each cell annotated to two decimals.
cor = df.corr()
sns.heatmap(data=cor, fmt='.2f', annot=True)

Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Features: every column except the target and three excluded predictors.
# The assignment and its right-hand side are kept in one statement; a bare
# ``X =`` followed by a newline is a syntax error.
X = df.drop(columns=['MedHouseValue', 'Population', 'AveOccup', 'AveBedrms'])
# Target kept as a one-column DataFrame (double brackets).
Y = df[['MedHouseValue']]
# Hold out 40% of the rows for testing; fixed seed for a reproducible split.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.4, random_state=42
)

# Fit ordinary least squares on the training split and predict the held-out rows.
model = LinearRegression().fit(xtrain, ytrain)
pred = model.predict(xtest)

from sklearn.metrics import mean_squared_error, r2_score


# Score the predictions against the held-out targets and report the result.
r2 = r2_score(y_true=ytest, y_pred=pred)
print("R-squared:", r2)

You might also like