
四電子三甲41040106 吳中瑜 4-1 Simple Linear Regression

from google.colab import drive
drive.mount('/content/drive')  # basic packages and modules below

import numpy as np  # numpy: numerical array processing


import pandas as pd
import matplotlib.pyplot as plt  # plotting
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# The dataset module is no longer available (load_boston was removed from scikit-learn 1.2)
#from sklearn.datasets import load_boston
#boston = load_boston()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

folder="/content/drive/MyDrive/"


4-5 Consolidate the data into a DataFrame

boston = pd.read_csv(folder+"housing.csv")
boston.head()

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  PTRATIO       B  LSTAT  MEDV
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0     15.3  396.90   4.98  24.0
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0     17.8  396.90   9.14  21.6
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0     17.8  392.83   4.03  34.7
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0     18.7  394.63   2.94  33.4
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0     18.7  396.90   5.33  36.2


import seaborn as sns

# Configure seaborn's styling once; here we only set the figure width and height
sns.set(rc={'figure.figsize':(5,6)})

# Plot the distribution of the target variable MEDV (median house value)
sns.distplot(boston['MEDV'])  # note: distplot is deprecated in newer seaborn
plt.show()
# The y-axis shows density
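Since `sns.distplot` is deprecated in recent seaborn releases, here is an equivalent sketch using the current API (assuming seaborn ≥ 0.11):

# Histogram with a KDE overlay, the non-deprecated equivalent of distplot
sns.histplot(boston['MEDV'], kde=True, stat='density')
plt.show()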

Simple regression (a single variable)

sns.set(rc={'figure.figsize':(10,10)})
correlation_matrix = boston.corr().round(2)
# annot=True writes the numeric value into each cell
sns.heatmap(data=correlation_matrix, annot=True)
# Next, inspect the relationships between variables: the correlation
# coefficients show which feature variables are strongly related to the target
<Axes: >
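To read the heatmap numerically, a short sketch (same `boston` DataFrame as above) that ranks features by the strength of their correlation with MEDV:

# Rank features by absolute correlation with the target MEDV
corr_with_target = boston.corr()['MEDV'].drop('MEDV')
print(corr_with_target.abs().sort_values(ascending=False))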

4-10 Keep only correlation coefficients with absolute value ≥ 0.6 and show them as a heatmap

plt.figure(figsize=(8,6))
correlation_matrix[np.abs(correlation_matrix) < 0.6 ] = 0
sns.heatmap(correlation_matrix , annot=True, cmap='coolwarm');

4-11 Scatter plot of RM against the target, with alpha set to 0.5

# The DataFrame here is named boston (the textbook uses df)
# and the target column is MEDV (the textbook calls it target)
boston.plot(kind='scatter', x='RM', y='MEDV', alpha=0.5, figsize=(8,4));

4-1-4 Extract X and y from the data
4-12 Extract X and y

X = boston[['RM']]
y = boston[['MEDV']]

4-13 Split the data into train and test subsets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,
test_size=0.33,random_state=42)

4-14 Check the sizes of the split

print('Number of training samples: ', len(X_train))
print('Number of test samples: ', len(X_test))
print('Test-set fraction of all data: ', len(X_test)/len(X))

Number of training samples:  339
Number of test samples:  167
Test-set fraction of all data:  0.3300395256916996

4-15 Scatter plot of the training and test sets

import matplotlib.pyplot as plt

plt.figure(figsize=(10,4))
plt.scatter(X_train, y_train, color='blue', alpha=0.4, label='training set')
plt.scatter(X_test, y_test, color='red', alpha=0.4, label='test set')
plt.xlabel('number of rooms')
plt.ylabel('house price')
plt.legend()  # this ensures that the labels are shown
# The plot confirms the data is split into a training set and a test set.
# Eyeballing the trend, each additional room adds roughly 10 units of price;
# in other words we expect the slope b to be around 10 (a human estimate,
# not a machine learning result).

4-1-5 Building the regression model
4-16 Initialize the regression object

from sklearn.linear_model import LinearRegression


model = LinearRegression()

4-17 Train the regression model

model.fit(X_train, y_train)

LinearRegression()

4-18 Inspect the fitted coefficients

print(model.intercept_, model.coef_)
# Result: the intercept is about -34 and the coefficient about 9
# (close to the 10 we eyeballed earlier)

[-34.22235235] [[9.03907314]]
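To see what these two numbers mean geometrically, a quick sketch (a hedged addition, using the fitted `model` and the training data above) that overlays the fitted line on the scatter:

# Overlay the fitted line y = intercept + coef * RM on the training scatter
line_x = pd.DataFrame({'RM': np.linspace(X_train['RM'].min(), X_train['RM'].max(), 100)})
plt.figure(figsize=(8,4))
plt.scatter(X_train, y_train, alpha=0.3)
plt.plot(line_x, model.predict(line_x), c='r')
plt.xlabel('RM'); plt.ylabel('MEDV')
plt.show()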

4-19 Evaluate the trained model

y_pred = model.predict(X_test)

4-20 Scatter plot comparing predicted and actual values

plt.figure(figsize=(8,4))
plt.scatter(X_test, y_test, label='actual data')
plt.scatter(X_test, y_pred, c='r', label='predictions')
plt.legend()

<matplotlib.legend.Legend at 0x7f5024127ca0>

4-21 Scatter plot of the residuals

plt.figure(figsize=(6,3))
plt.scatter(X_test, y_test-y_pred)
plt.axhline(0, c='r', ls='--');

4-22 Histogram of the residual distribution

plt.figure(figsize=(4,5))
plt.hist(y_test - y_pred, bins=30);
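For a quick numeric companion to the histogram, a small sketch (same `y_test` and `y_pred` as above) summarizing the residuals; the mean should sit near zero if the model is unbiased:

# Summary statistics of the test-set residuals
residuals = y_test - y_pred
print(residuals.describe())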

4-23 Find the five rows with the largest residuals

df_test = pd.concat([X_test, y_test], axis=1)


df_test['y_pred'] = y_pred
df_test['error'] = df_test['MEDV'] - df_test['y_pred']
df_test['error_abs'] = np.abs(df_test['error'])
df_test.sort_values(by='error_abs', ascending=False, inplace=True)
df_test.head()

        RM  MEDV     y_pred      error  error_abs
365  3.561  27.5  -2.034213  29.534213  29.534213
371  6.216  50.0  21.964526  28.035474  28.035474
375  7.313  15.0  31.880390 -16.880390  16.880390
181  6.144  36.2  21.313713  14.886287  14.886287
436  6.461   9.6  24.179099 -14.579099  14.579099


4-24 Plot the residuals and the actual values, highlighting the five largest residuals

colors = ['red'] * 5 + ['blue'] * (len(df_test) - 5)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# First panel: residuals
ax = axes[0]
df_test.plot(kind='scatter', x='RM', y='error', c=colors, ax=ax)
for i in df_test.index[:5]:
    ax.text(x=df_test.loc[i, 'RM'] + 0.1, y=df_test.loc[i, 'error'] - 1, s=i)
    ax.vlines(x=df_test.loc[i, 'RM'], ymin=0, ymax=df_test.loc[i, 'error'], ls=':')
ax.axhline(0, c='r', ls='--')
ax.set_title('residual distribution')

# Second panel: actual vs. predicted values
ax = axes[1]
df_test.plot(kind='scatter', x='RM', y='MEDV', c=colors, ax=ax)
df_test.plot(kind='scatter', x='RM', y='y_pred', c='gray', ax=ax)
for i in df_test.index[:5]:
    ax.text(x=df_test.loc[i, 'RM'] + 0.1, y=df_test.loc[i, 'error'] - 1, s=i)
    ax.vlines(x=df_test.loc[i, 'RM'], ymin=df_test.loc[i, 'MEDV'], ymax=df_test.loc[i, 'y_pred'], ls=':')
ax.axhline(0, c='r', ls='--')
ax.set_title('actual value distribution')

Text(0.5, 1.0, 'actual value distribution')

4-1-6 Evaluating the overall prediction quality

4-25 Example

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))

Mean Squared Error: 39.091051114869956
Mean Absolute Error: 4.271512885857222
R2 Score: 0.4834590168919487
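As a sanity check on what these metrics mean, a minimal sketch computing them directly from their definitions with numpy (same `y_test` and `y_pred` as above):

# Recompute the three metrics from their definitions
err = np.ravel(y_test) - np.ravel(y_pred)
mse = np.mean(err**2)          # mean squared error
mae = np.mean(np.abs(err))     # mean absolute error
r2 = 1 - np.sum(err**2) / np.sum((np.ravel(y_test) - np.ravel(y_test).mean())**2)
print(mse, mae, r2)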

4-1-7 Using the model to make predictions

4-26 Predicting a house price with the model

model.predict([[6]])

array([[20.01208651]])

4-27 Reproduce the prediction from the estimated coefficients

model.intercept_+model.coef_*6

array([[20.01208651]])

4-28 A quadratic regression model

X_train['RM2'] = X_train['RM']**2
X_test['RM2'] = X_test['RM']**2
X_train.head()

        RM        RM2
478  6.185  38.254225
26   5.813  33.790969
7    6.172  38.093584
492  5.983  35.796289
108  6.474  41.912676


4-29 Regression prediction with the quadratic feature

model_2 = LinearRegression()
model_2.fit(X_train, y_train)
y_pred = model_2.predict(X_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))

Mean Squared Error: 31.473995415562957
Mean Absolute Error: 3.9790451133823814
R2 Score: 0.5841091996600494

4-30 Show the fitted model coefficients

model_2.intercept_, model_2.coef_

(array([56.79811638]), array([[-19.51754517, 2.21109792]]))

4-31 Plot the prediction results

plt.figure(figsize=(8,4))
plt.scatter(X_test.iloc[:,0], y_test, label='actual data')
plt.scatter(X_test.iloc[:,0], y_pred, c='r', label='predictions')
plt.legend()

<matplotlib.legend.Legend at 0x7f5026c3d210>

4-32 Polynomial feature transformer

# First drop the hand-made quadratic column
X_train.drop('RM2', axis=1, inplace=True)
X_test.drop('RM2', axis=1, inplace=True)
# The first five transformed rows match the hand-made quadratic column;
# the extra column of 1s can be ignored for now

from sklearn.preprocessing import PolynomialFeatures


polynomial = PolynomialFeatures(degree=2)
X_poly = polynomial.fit_transform(X_train)
X_poly[:5]

array([[ 1.      ,  6.185   , 38.254225],
       [ 1.      ,  5.813   , 33.790969],
       [ 1.      ,  6.172   , 38.093584],
       [ 1.      ,  5.983   , 35.796289],
       [ 1.      ,  6.474   , 41.912676]])
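The leading column of 1s is the bias term that `PolynomialFeatures` adds by default. Since `LinearRegression` fits its own intercept, a small sketch of dropping it with `include_bias=False`:

# include_bias=False omits the constant column of 1s
polynomial_nb = PolynomialFeatures(degree=2, include_bias=False)
print(polynomial_nb.fit_transform(X_train)[:5])  # just RM and RM^2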

4-33 Implement the polynomial transform with a pipeline

from sklearn.pipeline import make_pipeline


model_pl_2 = make_pipeline(PolynomialFeatures(degree=2),
LinearRegression())

model_pl_2.fit(X_train, y_train)
y_pred = model_pl_2.predict(X_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))

Mean Squared Error: 31.473995415562957
Mean Absolute Error: 3.9790451133823828
R2 Score: 0.5841091996600494
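A convenience of the pipeline is that its fitted steps stay accessible by name; a small sketch (same `model_pl_2` as above) reading the coefficients back out:

# make_pipeline names each step after its lowercased class name
lin = model_pl_2.named_steps['linearregression']
print(lin.intercept_, lin.coef_)  # should match model_2 from 4-30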
4-34 Behavior of higher-degree polynomial predictions

errors_train = []
errors_test = []
for order in range(1, 10):
    model_pl_o = make_pipeline(PolynomialFeatures(degree=order), LinearRegression())
    model_pl_o.fit(X_train, y_train)
    y_pred = model_pl_o.predict(X_train)
    errors_train.append(mean_squared_error(y_train, y_pred))
    y_pred = model_pl_o.predict(X_test)
    errors_test.append(mean_squared_error(y_test, y_pred))

plt.plot(range(1, 10), errors_train, marker='.', ls='--', label='training set')
plt.plot(range(1, 10), errors_test, marker='o', label='test set')
plt.legend()

<matplotlib.legend.Legend at 0x7f5026dabbe0>
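The training error keeps falling as the degree grows while the test error eventually turns back up, the classic overfitting pattern. A short follow-up (using the lists above) picks the degree with the lowest test error:

# Best polynomial degree = the one minimizing the test-set MSE
best_degree = int(np.argmin(errors_test)) + 1  # +1 because degrees start at 1
print('best degree:', best_degree, 'test MSE:', min(errors_test))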

Chapter Exercises
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the California housing dataset
california = fetch_california_housing()
X = pd.DataFrame(california.data, columns=california.feature_names)
y = california.target

# 1. Use 'MedInc' as the feature X and apply linear regression


X_medinc = X[['MedInc']]

# (1) Make predictions and plot the results
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_medinc, y, test_size=0.2, random_state=42)

# Initialize and train the model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Make predictions
y_pred = lin_reg.predict(X_test)

# Scatter plot of predicted vs. actual values
plt.scatter(X_test, y_test, color='blue', label='actual')
plt.scatter(X_test, y_pred, color='red', label='predicted')
plt.xlabel('MedInc')
plt.ylabel('House Value')
plt.legend()
plt.show()

# (2) Plot the residuals
residuals = y_test - y_pred
plt.scatter(X_test, residuals)
plt.axhline(y=0, color='red', linestyle='-')
plt.xlabel('MedInc')
plt.ylabel('residuals')
plt.show()

# (3) Analyze the results
print('Mean Squared Error (MSE):', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error (MAE):', mean_absolute_error(y_test, y_pred))
print('R² score:', r2_score(y_test, y_pred))

Mean Squared Error (MSE): 0.7091157771765549
Mean Absolute Error (MAE): 0.629908653009376
R² score: 0.45885918903846656

# 2. Predict the result when MedInc is 5

medinc_5 = np.array([[5]])
prediction = lin_reg.predict(medinc_5)
print('Prediction when MedInc is 5:', prediction)

Prediction when MedInc is 5: [2.54128976]
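The same value can be reproduced by hand from the fitted parameters, mirroring step 4-27 above:

# Manual check: prediction = intercept + coefficient * MedInc
print(lin_reg.intercept_ + lin_reg.coef_[0] * 5)  # ≈ 2.54128976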

# 3. Train and evaluate the model with different test-set sizes
# Test size 0.99
X_train_99, X_test_99, y_train_99, y_test_99 = train_test_split(X_medinc, y, test_size=0.99, random_state=42)
lin_reg_99 = LinearRegression()
lin_reg_99.fit(X_train_99, y_train_99)
y_pred_99 = lin_reg_99.predict(X_test_99)

print('Test size 0.99:')
print('Mean Squared Error (MSE):', mean_squared_error(y_test_99, y_pred_99))
print('Mean Absolute Error (MAE):', mean_absolute_error(y_test_99, y_pred_99))
print('R² score:', r2_score(y_test_99, y_pred_99))

# Test size 0.1
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_medinc, y, test_size=0.1, random_state=42)
lin_reg_1 = LinearRegression()
lin_reg_1.fit(X_train_1, y_train_1)
y_pred_1 = lin_reg_1.predict(X_test_1)

print('Test size 0.1:')
print('Mean Squared Error (MSE):', mean_squared_error(y_test_1, y_pred_1))
print('Mean Absolute Error (MAE):', mean_absolute_error(y_test_1, y_pred_1))
print('R² score:', r2_score(y_test_1, y_pred_1))

Test size 0.99:
Mean Squared Error (MSE): 0.7016528984188746
Mean Absolute Error (MAE): 0.6288707489450783
R² score: 0.4719862200671945
Test size 0.1:
Mean Squared Error (MSE): 0.7306836123487029
Mean Absolute Error (MAE): 0.6372079445770642
R² score: 0.4521007156901461
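To see the trend across more than two settings, a hedged sketch (same data and imports as above) that sweeps several test sizes:

# Sweep several test-set sizes and report the held-out R²
for ts in [0.1, 0.3, 0.5, 0.8, 0.99]:
    Xtr, Xte, ytr, yte = train_test_split(X_medinc, y, test_size=ts, random_state=42)
    reg = LinearRegression().fit(Xtr, ytr)
    print(f'test_size={ts}: R² = {r2_score(yte, reg.predict(Xte)):.4f}')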

# 4. Observation:
# In this particular run the 0.99 split actually scores slightly better, but a
# model trained on only 1% of the data is unreliable; in general a very large
# test size leaves too little data to train on, while a very small one makes
# the evaluation itself noisy.

# 5. Apply a degree-3 polynomial transform to 'MedInc' and predict
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X_medinc)

X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(X_poly, y, test_size=0.2, random_state=42)

lin_reg_poly = LinearRegression()
lin_reg_poly.fit(X_train_poly, y_train_poly)
y_pred_poly = lin_reg_poly.predict(X_test_poly)

# (1) Make predictions and plot the results
# (the identical random_state keeps the rows of X_test and X_test_poly aligned)
plt.scatter(X_test, y_test, color='blue', label='actual')
plt.scatter(X_test, y_pred_poly, color='red', label='predicted')
plt.xlabel('MedInc')
plt.ylabel('House Value')
plt.legend()
plt.show()

# (2) Plot the residuals
residuals_poly = y_test_poly - y_pred_poly
plt.scatter(X_test_poly[:, 1], residuals_poly)
plt.axhline(y=0, color='red', linestyle='-')
plt.xlabel('MedInc')
plt.ylabel('residuals')
plt.show()

# (3) Analyze the results
print('Mean Squared Error (MSE):', mean_squared_error(y_test_poly, y_pred_poly))
print('Mean Absolute Error (MAE):', mean_absolute_error(y_test_poly, y_pred_poly))
print('R² score:', r2_score(y_test_poly, y_pred_poly))
