Professional Documents
Culture Documents
CH4.ipynb - Colab
CH4.ipynb - Colab
# load_boston 資料模組已自 scikit-learn 1.2 起移除,無法再使用
#from sklearn.datasets import load_boston
#boston = load_boston()
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Base folder on the mounted Google Drive; file names are appended to this prefix.
folder="/content/drive/MyDrive/"
4-5將資料整合到DataFrame
# Read the housing CSV from the Drive folder and preview the first rows.
boston = pd.read_csv(f"{folder}housing.csv")
boston.head()
CRIM ZN INDUS CHAS NX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2
後續步驟:
toggle_off 查看建議的圖表
# Plot the distribution of the target column MEDV (house price).
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay drawn on
# a density scale is the documented replacement.
sns.histplot(boston['MEDV'], kde=True, stat='density')
plt.show()
# The y-axis of the figure is density.
簡單回歸(單一變數)
# Use a 10x10 canvas so every cell of the heatmap stays legible.
sns.set(rc={'figure.figsize': (10, 10)})
# Pairwise correlations between all columns, rounded to two decimals.
correlation_matrix = boston.corr().round(decimals=2)
# The heatmap shows how each pair of variables relates, e.g. which features
# correlate most strongly with the target.
# annot=True writes the numeric coefficient inside each cell.
sns.heatmap(data=correlation_matrix, annot=True)
<Axes: >
4-10列出所有相關係數高於0.6的值,並用heatmap來呈現
# Zero out, in place, every coefficient whose absolute value is below 0.6 so
# that only the strong correlations stand out in the heatmap.
plt.figure(figsize=(8, 6))
correlation_matrix[correlation_matrix.abs() < 0.6] = 0
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm');
4-11繪製RM和target的散布圖,並將透明度設為0.5
# The textbook's `df` is called `boston` here, and its `target` column is MEDV.
# Scatter RM against MEDV with 50% opacity so overlapping points remain visible.
boston.plot.scatter(x='RM', y='MEDV', alpha=0.5, figsize=(8, 4));
keyboard_arrow_down 4-1-4將資料整理出X和Y
4-12取出X和Y
# Feature matrix (RM only) and target (MEDV); the double brackets keep both as
# one-column DataFrames rather than Series.
X, y = boston[['RM']], boston[['MEDV']]
4-13將資料切割成train和test兩個子集
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as the test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
4-14檢視資料切割的筆數
# Report the row count of each subset and the test set's share of all rows.
print('訓練集的筆數: ', X_train.shape[0])
print('測試集的筆數: ', X_test.shape[0])
print('測試集所佔全部資料的百分比: ', X_test.shape[0] / X.shape[0])
訓練集的筆數: 339
測試集的筆數: 167
測試集所佔全部資料的百分比: 0.3300395256916996
4-15用散布圖描繪訓練集和測試集的資料
keyboard_arrow_down 4-1-5回歸模型建構
4-16初始回歸物件
4-17訓練回歸模型
# Create the linear-regression estimator here: no cell in the visible notebook
# defines `model`, so calling fit() on it alone would raise NameError.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
# Fit on the training split.
model.fit(X_train, y_train)
▾ LinearRegression
LinearRegression()
4-18檢視訓練後的係數
# Print the fitted intercept followed by the fitted coefficient.
print(model.intercept_, model.coef_)
# Observed result: the intercept is about -34 and the coefficient about 9
# (close to the value of roughly 10 estimated by eye).
[-34.22235235] [[9.03907314]]
4-19評估訓練模型的好壞
# Predict the target for the held-out test features.
y_pred = model.predict(X_test)
4-20繪製散布圖來觀察預測值與實際值的分布
# Open an 8x4-inch figure and add a legend.
# NOTE(review): no plotting calls appear between figure() and legend() in this
# export, so legend() has no labelled artists to show — confirm against the
# original notebook cell.
plt.figure(figsize=(8,4))
plt.legend()
<matplotlib.legend.Legend at 0x7f5024127ca0>
4-21繪製殘差的分散圖
# Residual plot: actual minus predicted on the test set, drawn against the
# test feature, with a dashed red zero line as the reference.
plt.figure(figsize=(6, 3))
plt.scatter(X_test, y_test - y_pred)
plt.axhline(0, color='r', linestyle='--');
4-22用直方圖觀察殘差的分布
# Histogram of the test-set residuals (actual minus predicted) in 30 bins.
plt.figure(figsize=(4,5))
plt.hist(y_test - y_pred, bins=30);
4-23找出殘差最大的前五筆資料
後續步驟:
toggle_off 查看建議的圖表
4-24繪製殘差和實際分布圖,並標示殘差最大的五個點
# Second chart
ax = axes[1]
# Actual prices (coloured by `colors`) and model predictions (gray) share the
# same axes.
df_test.plot(kind='scatter', x='RM', y='MEDV', c=colors, ax=ax)
df_test.plot(kind='scatter', x='RM', y='y_pred', c='gray', ax=ax)
# Annotate the first five rows of df_test: write each row's index next to the
# point and join its actual and predicted values with a dotted vertical line.
# NOTE(review): presumably df_test is sorted so these are the five largest
# residuals — confirm where df_test is built.
for i in df_test.index[:5]:
    ax.text(x=df_test.loc[i, 'RM'] + 0.1, y=df_test.loc[i, 'error'] - 1, s=i)
    ax.vlines(
        x=df_test.loc[i, 'RM'],
        ymin=df_test.loc[i, 'MEDV'],
        ymax=df_test.loc[i, 'y_pred'],
        ls=':',
    )
ax.axhline(0, c='r', ls='--')
ax.set_title('實際值分佈')
4-1-6整體預測結果的好壞評估
4-25範例
4-1-7使用模型來預測結果
4-26如何運算模型預測房價
# Predict the price for RM = 6. The input is a one-row DataFrame carrying the
# same column name the model was fitted with; a bare nested list makes
# scikit-learn warn that X does not have valid feature names.
model.predict(pd.DataFrame({'RM': [6]}))
array([[20.01208651]])
4-27用估算出的係數算出預測值
# Reproduce the prediction by hand: intercept + slope * RM, with RM = 6.
model.intercept_ + 6 * model.coef_
array([[20.01208651]])
4-28二次方的回歸模型
# Add the squared term RM2 to both splits. assign() returns a new DataFrame,
# so nothing is written into a frame that may be a slice of the original data
# (direct column assignment there can raise SettingWithCopyWarning).
X_train = X_train.assign(RM2=X_train['RM'] ** 2)
X_test = X_test.assign(RM2=X_test['RM'] ** 2)
X_train.head()
RM RM2
26 5.813 33.790969
7 6.172 38.093584
後續步驟:
toggle_off 查看建議的圖表
4-29回歸預測
# Fit a second linear model on the current training features and score it on
# the test split with MSE, MAE and R2.
model_2 = LinearRegression()
model_2.fit(X_train, y_train)
y_pred = model_2.predict(X_test)
# Output labels corrected from the misspelled "Squred" / "Absoult".
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))
4-30輸出模型預測的係數
# Display the fitted intercept and coefficients of model_2 as a pair.
(model_2.intercept_, model_2.coef_)
4-31將預測結果做出圖
# Open an 8x4-inch figure and add a legend.
# NOTE(review): the plotting calls that should sit between figure() and
# legend() are not present in this export — confirm against the original
# notebook cell.
plt.figure(figsize=(8,4))
plt.legend()
<matplotlib.legend.Legend at 0x7f5026c3d210>
4-32多次方轉換器
# Remove the manually added squared column first, in place, from both splits.
X_train.drop(columns=['RM2'], inplace=True)
X_test.drop(columns=['RM2'], inplace=True)
# Observation: the first five rows match the manually added squared term; the
# extra 1 that appears can be ignored.
# NOTE(review): the code that produces those rows is not visible in this export.
4-33用管道器實現多次方的轉換
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Build the pipeline before fitting: no cell in the visible notebook defines
# `model_pl_2`, so fit() alone would raise NameError.
# NOTE(review): degree=2 is inferred from the `_2` suffix and the quadratic
# model this section replaces — confirm against the original notebook.
model_pl_2 = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
model_pl_2.fit(X_train, y_train)
y_pred = model_pl_2.predict(X_test)
# Output labels corrected from the misspelled "Squred" / "Absoult".
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))
# Record training and test MSE for polynomial degrees 1 through 9.
errors_train = []
errors_test = []
for order in range(1, 10):
    # Expand the features to the given degree, then fit a linear model on them.
    model_pl_o = make_pipeline(PolynomialFeatures(degree=order), LinearRegression())
    model_pl_o.fit(X_train, y_train)
    # Error on the data the model was fitted on.
    y_pred = model_pl_o.predict(X_train)
    errors_train.append(mean_squared_error(y_train, y_pred))
    # Error on the held-out test data.
    y_pred = model_pl_o.predict(X_test)
    errors_test.append(mean_squared_error(y_test, y_pred))
<matplotlib.legend.Legend at 0x7f5026dabbe0>
keyboard_arrow_down 章末習題
# 導入必要的庫
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Load the California housing dataset into a DataFrame of features and an
# array of targets.
california = fetch_california_housing()
X = pd.DataFrame(california.data, columns=california.feature_names)
y = california.target
# Single-feature matrix consumed by the splits and the polynomial transform
# later in this exercise; it was referenced but never defined. The double
# brackets keep it two-dimensional, as scikit-learn estimators require.
X_medinc = X[['MedInc']]
# (1) Predict and plot the results
# Hold out 20% of the samples as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X_medinc, y, test_size=0.2, random_state=42
)
# Create and fit the model in one step; fit() returns the estimator itself.
lin_reg = LinearRegression().fit(X_train, y_train)
# Predict on the held-out features.
y_pred = lin_reg.predict(X_test)
# Scatter the actual values (blue) and the predictions (red) against MedInc.
plt.scatter(X_test, y_test, label='實際值', color='blue')
plt.scatter(X_test, y_pred, label='預測值', color='red')
plt.xlabel('MedInc')
plt.ylabel('House Value')
plt.legend()
plt.show()
# (2) Residual plot: actual minus predicted against MedInc, with a solid red
# zero line as the reference.
residuals = y_test - y_pred
plt.scatter(X_test, residuals)
plt.axhline(0, color='red', linestyle='-')
plt.xlabel('MedInc')
plt.ylabel('殘差')
plt.show()
# (3) Result analysis: report MSE, MAE and R2 on the test set.
print('均方誤差 (MSE):', mean_squared_error(y_test, y_pred))
print('平均絕對誤差 (MAE):', mean_absolute_error(y_test, y_pred))
print('R² 分數:', r2_score(y_test, y_pred))
均方誤差 (MSE): 0.7091157771765549
平均絕對誤差 (MAE): 0.629908653009376
R² 分數: 0.45885918903846656
# 3. Train and evaluate the model with different test-set sizes
# Test-set fraction 0.99
X_train_99, X_test_99, y_train_99, y_test_99 = train_test_split(
    X_medinc, y, test_size=0.99, random_state=42
)
lin_reg_99 = LinearRegression().fit(X_train_99, y_train_99)
y_pred_99 = lin_reg_99.predict(X_test_99)
print('測試集大小為 0.99:')
print('均方誤差 (MSE):', mean_squared_error(y_test_99, y_pred_99))
print('平均絕對誤差 (MAE):', mean_absolute_error(y_test_99, y_pred_99))
print('R² 分數:', r2_score(y_test_99, y_pred_99))
# Test-set fraction 0.1
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_medinc, y, test_size=0.1, random_state=42
)
lin_reg_1 = LinearRegression().fit(X_train_1, y_train_1)
y_pred_1 = lin_reg_1.predict(X_test_1)
print('測試集大小為 0.1:')
print('均方誤差 (MSE):', mean_squared_error(y_test_1, y_pred_1))
print('平均絕對誤差 (MAE):', mean_absolute_error(y_test_1, y_pred_1))
print('R² 分數:', r2_score(y_test_1, y_pred_1))
測試集大小為 0.99:
均方誤差 (MSE): 0.7016528984188746
平均絕對誤差 (MAE): 0.6288707489450783
R² 分數: 0.4719862200671945
測試集大小為 0.1:
均方誤差 (MSE): 0.7306836123487029
平均絕對誤差 (MAE): 0.6372079445770642
R² 分數: 0.4521007156901461
# 4. 觀察:
# 依上方輸出,測試集比例 0.99 的 MSE (0.7017) 與 R² (0.4720) 反而略優於測試集比例 0.1 (MSE 0.7307、R² 0.4521);
# 但兩者是在不同的測試樣本上評估,且差異很小,不能據此斷定較大的測試集會讓模型表現較好或較差。
# 5. Apply a degree-3 polynomial transform to 'MedInc' and predict
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X_medinc)
# Split the expanded features. This split was missing, which left the *_poly
# train/test names undefined. It uses the same test_size and random_state as
# the linear model's split in part (1), so both models see the same test rows.
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)
lin_reg_poly = LinearRegression()
lin_reg_poly.fit(X_train_poly, y_train_poly)
y_pred_poly = lin_reg_poly.predict(X_test_poly)
# (1) Predict and plot the results
# Plot against the polynomial split's own test data so the x values, actual
# values and predictions are guaranteed to refer to the same rows. Column 1 of
# the PolynomialFeatures output is the untransformed MedInc value (column 0 is
# the bias term).
plt.scatter(X_test_poly[:, 1], y_test_poly, color='blue', label='實際值')
plt.scatter(X_test_poly[:, 1], y_pred_poly, color='red', label='預測值')
plt.xlabel('MedInc')
plt.ylabel('House Value')
plt.legend()
plt.show()
# (2) Residual plot for the polynomial model: actual minus predicted against
# column 1 of the transformed test features, with a solid red zero line.
residuals_poly = y_test_poly - y_pred_poly
plt.scatter(X_test_poly[:, 1], residuals_poly)
plt.axhline(0, color='red', linestyle='-')
plt.xlabel('MedInc')
plt.ylabel('殘差')
plt.show()
# (3) Result analysis: report MSE, MAE and R2 for the polynomial model.
print('均方誤差 (MSE):', mean_squared_error(y_test_poly, y_pred_poly))
print('平均絕對誤差 (MAE):', mean_absolute_error(y_test_poly, y_pred_poly))
print('R² 分數:', r2_score(y_test_poly, y_pred_poly))