Regression and Eda

You might also like

Download as pdf or txt
Download as pdf or txt
You are on page 1of 47

regression-and-eda

May 3, 2024

[2]: import pandas as pd

[3]: df=pd.read_excel('C:\\Users\\lariy\\Downloads\\capstone\\final_data.xlsx')
df

[3]: Unnamed: 0 Unnamed: 1 Year Gold Prices (USD) Inflation \


0 NaN NaN 1980-01-04 588.00 13.549202
1 NaN NaN 1980-01-11 623.00 13.549202
2 NaN NaN 1980-01-18 835.00 13.549202
3 NaN NaN 1980-01-25 668.00 13.549202
4 NaN NaN 1980-02-01 676.50 13.549202
… … … … … …
2239 NaN NaN 2022-12-02 1784.75 NaN
2240 NaN NaN 2022-12-09 1796.15 NaN
2241 NaN NaN 2022-12-16 1792.55 NaN
2242 NaN NaN 2022-12-23 1800.70 NaN
2243 NaN NaN 2022-12-30 1813.75 NaN

Unemployment rate Interest rates Oil Prices (USD)


0 6.3 18.9 21.59
1 6.3 18.9 21.59
2 6.3 18.9 21.59
3 6.3 18.9 21.59
4 6.9 18.9 21.59
… … … …
2239 NaN 4.1 93.97
2240 NaN 4.1 93.97
2241 NaN 4.1 93.97
2242 NaN 4.1 93.97
2243 NaN 4.1 93.97

[2244 rows x 8 columns]

[4]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2244 entries, 0 to 2243
Data columns (total 8 columns):

1
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 0 non-null float64
1 Unnamed: 1 0 non-null float64
2 Year 2244 non-null datetime64[ns]
3 Gold Prices (USD) 2244 non-null float64
4 Inflation 2192 non-null float64
5 Unemployment rate 2136 non-null float64
6 Interest rates 2244 non-null float64
7 Oil Prices (USD) 2244 non-null float64
dtypes: datetime64[ns](1), float64(7)
memory usage: 140.4 KB

[7]: df.describe()

[7]: Unnamed: 0 Unnamed: 1 Year Gold Prices (USD) \


count 0.0 0.0 2244 2244.000000
mean NaN NaN 2001-07-02 12:00:00 738.579835
min NaN NaN 1980-01-04 00:00:00 253.800000
25% NaN NaN 1990-10-03 06:00:00 357.787500
50% NaN NaN 2001-07-02 12:00:00 427.575000
75% NaN NaN 2012-03-31 18:00:00 1215.312500
max NaN NaN 2022-12-30 00:00:00 2031.150000
std NaN NaN NaN 506.810257

Inflation Unemployment rate Interest rates Oil Prices (USD)


count 2192.000000 2136.000000 2244.000000 2244.000000
mean 3.180372 6.311798 4.276190 40.127362
min -0.355546 3.900000 0.070000 10.870000
25% 2.069337 5.300000 0.540000 16.540000
50% 2.931204 5.900000 4.160000 27.560000
75% 3.156842 7.800000 6.770000 59.690000
max 13.549202 10.800000 18.900000 95.990000
std 2.402141 1.459679 3.957089 27.380432

[27]: df.columns

[27]: Index(['Year', 'Gold Prices (USD)', 'Inflation', 'Unemployment rate',


'Interest rates', 'Oil Prices (USD)'],
dtype='object')

[8]: #Removing unnecessary columns like "unnamed:0" and "unnamed:1"


import pandas as pd

df = df.drop(columns=['Unnamed: 0', 'Unnamed: 1'])

print(df.head())

2
Year Gold Prices (USD) Inflation Unemployment rate Interest rates \
0 1980-01-04 588.0 13.549202 6.3 18.9
1 1980-01-11 623.0 13.549202 6.3 18.9
2 1980-01-18 835.0 13.549202 6.3 18.9
3 1980-01-25 668.0 13.549202 6.3 18.9
4 1980-02-01 676.5 13.549202 6.9 18.9

Oil Prices (USD)


0 21.59
1 21.59
2 21.59
3 21.59
4 21.59

[133]: # Visualize data using histograms


df['Gold Prices (USD)'].hist()

# Visualize data using box plots


df.boxplot(column='Gold Prices (USD)')

# Visualize data using scatter plots


df.plot.scatter(x='Year', y='Gold Prices (USD)')

[133]: <Axes: xlabel='Year', ylabel='Gold Prices (USD)'>

3
[10]: #Removing missing values from "inflation" and "unemployment rate"
# Assuming your dataframe is named 'df'

# Remove rows with missing values in both 'Inflation' and 'Unemployment rate'␣
↪columns

df = df.dropna(subset=['Inflation', 'Unemployment rate'])

# Display the modified dataframe


print(df.head())

Year Gold Prices (USD) Inflation Unemployment rate Interest rates \


0 1980-01-04 588.0 13.549202 6.3 18.9
1 1980-01-11 623.0 13.549202 6.3 18.9
2 1980-01-18 835.0 13.549202 6.3 18.9
3 1980-01-25 668.0 13.549202 6.3 18.9
4 1980-02-01 676.5 13.549202 6.9 18.9

Oil Prices (USD)


0 21.59

4
1 21.59
2 21.59
3 21.59
4 21.59

[12]: df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2136 entries, 0 to 2135
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Year 2136 non-null datetime64[ns]
1 Gold Prices (USD) 2136 non-null float64
2 Inflation 2136 non-null float64
3 Unemployment rate 2136 non-null float64
4 Interest rates 2136 non-null float64
5 Oil Prices (USD) 2136 non-null float64
dtypes: datetime64[ns](1), float64(5)
memory usage: 116.8 KB

[ ]: #missng values are removed , left us with 2136 rows"

[14]: #Visualising distrbution of each variable to understand distribution


import matplotlib.pyplot as plt
import seaborn as sns

# Assuming your dataframe is named 'df'

# Set the style for seaborn plots


sns.set(style="whitegrid")

# Plot the distribution of each variable


plt.figure(figsize=(12, 8))

# Plot for 'Gold Prices (USD)' variable


plt.subplot(3, 2, 2)
sns.histplot(df['Gold Prices (USD)'], kde=True)
plt.title('Distribution of Gold Prices (USD)')

# Plot for 'Inflation' variable


plt.subplot(3, 2, 3)
sns.histplot(df['Inflation'], kde=True)
plt.title('Distribution of Inflation')

# Plot for 'Unemployment rate' variable

5
plt.subplot(3, 2, 4)
sns.histplot(df['Unemployment rate'], kde=True)
plt.title('Distribution of Unemployment rate')

# Plot for 'Interest rates' variable


plt.subplot(3, 2, 5)
sns.histplot(df['Interest rates'], kde=True)
plt.title('Distribution of Interest rates')

# Plot for 'Oil Prices (USD)' variable


plt.subplot(3, 2, 6)
sns.histplot(df['Oil Prices (USD)'], kde=True)
plt.title('Distribution of Oil Prices (USD)')

plt.tight_layout()
plt.show()

[15]: import seaborn as sns


import matplotlib.pyplot as plt

# Assuming your dataframe is named 'df'

# Set the style for seaborn plots

6
sns.set(style="whitegrid")

# Plot the trend of each variable over time (year)


plt.figure(figsize=(12, 8))

# Plot for 'Gold Prices (USD)'


plt.subplot(3, 2, 1)
sns.lineplot(x='Year', y='Gold Prices (USD)', data=df)
plt.title('Trend of Gold Prices (USD)')

# Plot for 'Inflation'


plt.subplot(3, 2, 2)
sns.lineplot(x='Year', y='Inflation', data=df)
plt.title('Trend of Inflation')

# Plot for 'Unemployment rate'


plt.subplot(3, 2, 3)
sns.lineplot(x='Year', y='Unemployment rate', data=df)
plt.title('Trend of Unemployment rate')

# Plot for 'Interest rates'


plt.subplot(3, 2, 4)
sns.lineplot(x='Year', y='Interest rates', data=df)
plt.title('Trend of Interest rates')

# Plot for 'Oil Prices (USD)'


plt.subplot(3, 2, 5)
sns.lineplot(x='Year', y='Oil Prices (USD)', data=df)
plt.title('Trend of Oil Prices (USD)')

plt.tight_layout()
plt.show()

7
[17]: import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise relationships between variables


sns.pairplot(df, vars=['Gold Prices (USD)', 'Inflation', 'Unemployment rate',␣
↪'Interest rates', 'Oil Prices (USD)'], kind='scatter')

plt.show()

8
[18]: import seaborn as sns
import matplotlib.pyplot as plt

# Assuming your dataframe is named 'df' with the relevant columns


# Compute the correlation matrix
correlation_matrix = df.corr()

# Plot the correlation matrix as a heatmap


plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f",␣
↪square=True)

plt.title('Correlation Matrix between Variables')


plt.show()

9
[ ]: '''it can be seen that relationship between inflation and gold prices is␣
↪positive theoretically , but according to the correlation matrix it is
negative.It maybe due to external factors like govt policies that the data is␣
↪behaving like this. Also this is not so strong relationship'''

[19]: #OUTLIER DETECTION


import pandas as pd

# Assuming your dataframe is named 'df' with the relevant variables

# Define a function to detect outliers using z-score


def detect_outliers_zscore(data, threshold=3):
z_scores = (data - data.mean()) / data.std()
return (z_scores > threshold) | (z_scores < -threshold)

# Define a function to detect outliers using IQR

10
def detect_outliers_iqr(data, threshold=1.5):
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - threshold * iqr
upper_bound = q3 + threshold * iqr
return (data < lower_bound) | (data > upper_bound)

# Iterate over each column in the dataframe to detect outliers


outliers = {}
for column in df.columns:
outliers[column] = {
'zscore_outliers': detect_outliers_zscore(df[column]),
'iqr_outliers': detect_outliers_iqr(df[column])
}

# Display outliers for each variable


for column, values in outliers.items():
print(f"Outliers in {column}:")
print("Outliers:")
print(values['zscore_outliers'])
print("IQR Outliers:")
print(values['iqr_outliers'])
print()

Outliers in Year:
Outliers:
0 False
1 False
2 False
3 False
4 False

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Year, Length: 2136, dtype: bool
IQR Outliers:
0 False
1 False
2 False
3 False
4 False

2131 False

11
2132 False
2133 False
2134 False
2135 False
Name: Year, Length: 2136, dtype: bool

Outliers in Gold Prices (USD):


Outliers:
0 False
1 False
2 False
3 False
4 False

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Gold Prices (USD), Length: 2136, dtype: bool
IQR Outliers:
0 False
1 False
2 False
3 False
4 False

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Gold Prices (USD), Length: 2136, dtype: bool

Outliers in Inflation:
Outliers:
0 True
1 True
2 True
3 True
4 True

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Inflation, Length: 2136, dtype: bool

12
IQR Outliers:
0 True
1 True
2 True
3 True
4 True

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Inflation, Length: 2136, dtype: bool

Outliers in Unemployment rate:


Outliers:
0 False
1 False
2 False
3 False
4 False

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Unemployment rate, Length: 2136, dtype: bool
IQR Outliers:
0 False
1 False
2 False
3 False
4 False

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Unemployment rate, Length: 2136, dtype: bool

Outliers in Interest rates:


Outliers:
0 True
1 True
2 True
3 True

13
4 True

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Interest rates, Length: 2136, dtype: bool
IQR Outliers:
0 True
1 True
2 True
3 True
4 True

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Interest rates, Length: 2136, dtype: bool

Outliers in Oil Prices (USD):


Outliers:
0 False
1 False
2 False
3 False
4 False

2131 False
2132 False
2133 False
2134 False
2135 False
Name: Oil Prices (USD), Length: 2136, dtype: bool
IQR Outliers:
0 False
1 False
2 False
3 False
4 False

2131 False
2132 False
2133 False
2134 False
2135 False

14
Name: Oil Prices (USD), Length: 2136, dtype: bool

[21]: import seaborn as sns


import matplotlib.pyplot as plt

# Assuming your dataframe is named 'df' with the relevant variables

# Set the figure size


plt.figure(figsize=(12, 8))

# Iterate over each column in the dataframe and plot box plots with outliers
for i, column in enumerate(df.columns):
plt.subplot(len(df.columns)//2, 2, i+1)
sns.boxplot(x=df[column], showfliers=True)
plt.title(f'Box Plot of {column}')

# Adjust layout
plt.tight_layout()
plt.show()

[20]: import pandas as pd

# Assuming your dataframe is named 'df' with the relevant variables

15
# Define a function to detect outliers using z-score
def detect_outliers_zscore(data, threshold=3):
z_scores = (data - data.mean()) / data.std()
return (z_scores > threshold) | (z_scores < -threshold)

# Define a function to detect outliers using IQR


def detect_outliers_iqr(data, threshold=1.5):
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - threshold * iqr
upper_bound = q3 + threshold * iqr
return (data < lower_bound) | (data > upper_bound)

# Initialize a dictionary to store the count of outliers for each variable


outlier_counts = {}

# Iterate over each column in the dataframe to count outliers


for column in df.columns:
# Count outliers using z-score method
zscore_outliers_count = detect_outliers_zscore(df[column]).sum()

# Count outliers using IQR method


iqr_outliers_count = detect_outliers_iqr(df[column]).sum()

# Store the counts in the dictionary


outlier_counts[column] = {
'zscore_outliers_count': zscore_outliers_count,
'iqr_outliers_count': iqr_outliers_count
}

# Display the count of outliers for each variable


for column, values in outlier_counts.items():
print(f"Outlier counts in {column}:")
print("Z-Score Outliers Count:", values['zscore_outliers_count'])
print("IQR Outliers Count:", values['iqr_outliers_count'])
print()

Outlier counts in Year:


Z-Score Outliers Count: 0
IQR Outliers Count: 0

Outlier counts in Gold Prices (USD):


Z-Score Outliers Count: 0
IQR Outliers Count: 0

16
Outlier counts in Inflation:
Z-Score Outliers Count: 104
IQR Outliers Count: 261

Outlier counts in Unemployment rate:


Z-Score Outliers Count: 9
IQR Outliers Count: 0

Outlier counts in Interest rates:


Z-Score Outliers Count: 52
IQR Outliers Count: 52

Outlier counts in Oil Prices (USD):


Z-Score Outliers Count: 0
IQR Outliers Count: 0

[22]: # Define a function to detect outliers using z-score


def detect_outliers_zscore(data, threshold=3):
z_scores = (data - data.mean()) / data.std()
return (z_scores > threshold) | (z_scores < -threshold)

# Define a function to detect outliers using IQR


def detect_outliers_iqr(data, threshold=1.5):
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - threshold * iqr
upper_bound = q3 + threshold * iqr
return (data < lower_bound) | (data > upper_bound)

# Initialize a dictionary to store the outliers for each variable


outliers = {}

# Iterate over each column in the dataframe to detect and store outliers
for column in df.columns:
# Detect outliers using z-score method
zscore_outliers = df[column][detect_outliers_zscore(df[column])]

# Detect outliers using IQR method


iqr_outliers = df[column][detect_outliers_iqr(df[column])]

# Store the outliers in the dictionary


outliers[column] = {
'zscore_outliers': zscore_outliers,
'iqr_outliers': iqr_outliers
}

17
# Display the outlier values for each variable
for column, values in outliers.items():
print(f"Outliers in {column}:")
print("Z-Score Outliers:")
print(values['zscore_outliers'])
print("IQR Outliers:")
print(values['iqr_outliers'])
print()

Outliers in Year:
Z-Score Outliers:
Series([], Name: Year, dtype: datetime64[ns])
IQR Outliers:
Series([], Name: Year, dtype: datetime64[ns])

Outliers in Gold Prices (USD):


Z-Score Outliers:
Series([], Name: Gold Prices (USD), dtype: float64)
IQR Outliers:
Series([], Name: Gold Prices (USD), dtype: float64)

Outliers in Inflation:
Z-Score Outliers:
0 13.549202
1 13.549202
2 13.549202
3 13.549202
4 13.549202

99 10.334715
100 10.334715
101 10.334715
102 10.334715
103 10.334715
Name: Inflation, Length: 104, dtype: float64
IQR Outliers:
0 13.549202
1 13.549202
2 13.549202
3 13.549202
4 13.549202

1821 0.118627
1822 0.118627
1823 0.118627
1824 0.118627

18
1825 0.118627
Name: Inflation, Length: 261, dtype: float64

Outliers in Unemployment rate:


Z-Score Outliers:
148 10.8
149 10.8
150 10.8
151 10.8
152 10.8
153 10.8
154 10.8
155 10.8
156 10.8
Name: Unemployment rate, dtype: float64
IQR Outliers:
Series([], Name: Unemployment rate, dtype: float64)

Outliers in Interest rates:


Z-Score Outliers:
0 18.9
1 18.9
2 18.9
3 18.9
4 18.9
5 18.9
6 18.9
7 18.9
8 18.9
9 18.9
10 18.9
11 18.9
12 18.9
13 18.9
14 18.9
15 18.9
16 18.9
17 18.9
18 18.9
19 18.9
20 18.9
21 18.9
22 18.9
23 18.9
24 18.9
25 18.9
26 18.9
27 18.9

19
28 18.9
29 18.9
30 18.9
31 18.9
32 18.9
33 18.9
34 18.9
35 18.9
36 18.9
37 18.9
38 18.9
39 18.9
40 18.9
41 18.9
42 18.9
43 18.9
44 18.9
45 18.9
46 18.9
47 18.9
48 18.9
49 18.9
50 18.9
51 18.9
Name: Interest rates, dtype: float64
IQR Outliers:
0 18.9
1 18.9
2 18.9
3 18.9
4 18.9
5 18.9
6 18.9
7 18.9
8 18.9
9 18.9
10 18.9
11 18.9
12 18.9
13 18.9
14 18.9
15 18.9
16 18.9
17 18.9
18 18.9
19 18.9
20 18.9
21 18.9

20
22 18.9
23 18.9
24 18.9
25 18.9
26 18.9
27 18.9
28 18.9
29 18.9
30 18.9
31 18.9
32 18.9
33 18.9
34 18.9
35 18.9
36 18.9
37 18.9
38 18.9
39 18.9
40 18.9
41 18.9
42 18.9
43 18.9
44 18.9
45 18.9
46 18.9
47 18.9
48 18.9
49 18.9
50 18.9
51 18.9
Name: Interest rates, dtype: float64

Outliers in Oil Prices (USD):


Z-Score Outliers:
Series([], Name: Oil Prices (USD), dtype: float64)
IQR Outliers:
Series([], Name: Oil Prices (USD), dtype: float64)

[ ]: '''It can be seen that there are many outliers but share historical event ,␣
↪since the outliers are considered meaningful reflections of historical

events in U.S. history,


it may be appropriate to retain them in the dataset and proceed with the␣
↪analysis.'''

[23]: #Finding multicollinearity


import pandas as pd

21
# Assuming your dataframe is named 'df' with the independent variables as␣
↪columns

# Calculate the correlation matrix


correlation_matrix = df.corr()

# Display the correlation matrix as a table


print("Correlation Matrix:")
print(correlation_matrix)

Correlation Matrix:
Year Gold Prices (USD) Inflation Unemployment rate \
Year 1.000000 0.775752 -0.532349 -0.501394
Gold Prices (USD) 0.775752 1.000000 -0.205732 -0.014243
Inflation -0.532349 -0.205732 1.000000 0.277274
Unemployment rate -0.501394 -0.014243 0.277274 1.000000
Interest rates -0.841198 -0.560575 0.772618 0.399629
Oil Prices (USD) 0.678407 0.772340 -0.298871 0.036252

Interest rates Oil Prices (USD)


Year -0.841198 0.678407
Gold Prices (USD) -0.560575 0.772340
Inflation 0.772618 -0.298871
Unemployment rate 0.399629 0.036252
Interest rates 1.000000 -0.550246
Oil Prices (USD) -0.550246 1.000000

[24]: import pandas as pd

# Assuming your dataframe is named 'df' with the independent variables as␣
↪columns

# Calculate the correlation matrix


correlation_matrix = df.corr()

# Set the threshold values for correlation


threshold_min = 0.7
threshold_max = 0.9

# Initialize a list to store pairs of variables with high correlation


high_correlation_pairs = []

# Iterate through the correlation matrix to identify pairs with correlation␣


↪between the thresholds

for i in range(len(correlation_matrix.columns)):
for j in range(i+1, len(correlation_matrix.columns)):

22
if abs(correlation_matrix.iloc[i, j]) >= threshold_min and␣
↪abs(correlation_matrix.iloc[i, j]) <= threshold_max:
high_correlation_pairs.append((correlation_matrix.columns[i],␣
↪correlation_matrix.columns[j], correlation_matrix.iloc[i, j]))

# Display the pairs of variables with high correlation


print(f"Pairs of variables with correlation between {threshold_min} and␣
↪{threshold_max}:")

for pair in high_correlation_pairs:


print(f"{pair[0]} - {pair[1]}: {pair[2]}")

Pairs of variables with correlation between 0.7 and 0.9:


Year - Gold Prices (USD): 0.7757519467104836
Year - Interest rates: -0.8411983318709919
Gold Prices (USD) - Oil Prices (USD): 0.7723398490361068
Inflation - Interest rates: 0.7726182398554902

[ ]: #PREPROCESSING IS DONE

[31]: # Save the DataFrame with Year and Gold Prices to a CSV file (time series)
df.to_csv("data_revised.csv", index=False)

[ ]: '''Since there is multicollinearity between "Gold Prices (USD) - Oil Prices␣


↪(USD)" and "Inflation - Interest rates" , we will use
PCA '''

[43]: #PCA
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

#Since , PCA does not require time variable , we will drop "Year"
data = df.drop(columns=['Year'])

[44]: data

[44]: Gold Prices (USD) Inflation Unemployment rate Interest rates \


0 588.0 13.549202 6.3 18.90
1 623.0 13.549202 6.3 18.90
2 835.0 13.549202 6.3 18.90
3 668.0 13.549202 6.3 18.90
4 676.5 13.549202 6.9 18.90
… … … … …
2131 1940.8 4.697859 4.6 0.09
2132 1890.9 4.697859 4.6 0.09
2133 1875.7 4.697859 4.6 0.09
2134 1779.3 4.697859 4.6 0.09

23
2135 1843.0 4.697859 4.6 0.09

Oil Prices (USD)


0 21.59
1 21.59
2 21.59
3 21.59
4 21.59
… …
2131 36.86
2132 36.86
2133 36.86
2134 36.86
2135 36.86

[2136 rows x 5 columns]

[46]: # Standardize the features


scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)
print(scaled_data)

[[-0.21158217 4.5466568 -0.00808432 3.63105777 -0.62875152]


[-0.13510079 4.5466568 -0.00808432 3.63105777 -0.62875152]
[ 0.32815787 4.5466568 -0.00808432 3.63105777 -0.62875152]

[ 2.60227714 0.71046548 -1.17299696 -1.07620925 -0.05013795]
[ 2.39162556 0.71046548 -1.17299696 -1.07620925 -0.05013795]
[ 2.53082168 0.71046548 -1.17299696 -1.07620925 -0.05013795]]

[47]: scaled_data

[47]: array([[-0.21158217, 4.5466568 , -0.00808432, 3.63105777, -0.62875152],


[-0.13510079, 4.5466568 , -0.00808432, 3.63105777, -0.62875152],
[ 0.32815787, 4.5466568 , -0.00808432, 3.63105777, -0.62875152],
…,
[ 2.60227714, 0.71046548, -1.17299696, -1.07620925, -0.05013795],
[ 2.39162556, 0.71046548, -1.17299696, -1.07620925, -0.05013795],
[ 2.53082168, 0.71046548, -1.17299696, -1.07620925, -0.05013795]])

[48]: # Perform PCA


pca = PCA()
principal_components = pca.fit_transform(scaled_features)

[49]: explained_variance_ratio = pca.explained_variance_ratio_


print("Explained Variance Ratio:")
print(explained_variance_ratio)

24
Explained Variance Ratio:
[0.53362372 0.26151325 0.13667759 0.0456746 0.02251084]

[50]: # Cumulative explained variance


cumulative_explained_variance = explained_variance_ratio.cumsum()
print("Cumulative Explained Variance:")
print(cumulative_explained_variance)

Cumulative Explained Variance:


[0.53362372 0.79513696 0.93181456 0.97748916 1. ]

[52]: import matplotlib.pyplot as plt

# Plot the scree plot


plt.figure(figsize=(10, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio,␣
↪marker='o', linestyle='-')

plt.title('Scree Plot')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)

# Find the elbow point


# You can visually identify the elbow point or use a threshold to find the␣
↪point programmatically

threshold = 0.05 # Example threshold for change in explained variance ratio


for i in range(1, len(explained_variance_ratio)):
if explained_variance_ratio[i] - explained_variance_ratio[i-1] < threshold:
elbow_point = i
break

# Mark the elbow point on the plot


plt.axvline(x=elbow_point, color='red', linestyle='--', label='Elbow Point')

plt.legend()
plt.show()

print("Elbow Point (k):", elbow_point)

25
Elbow Point (k): 1

[53]: '''However, if the scree plot exhibits a downward slope and there's no clear␣
↪elbow point,

it may imply that PCA is not suitable for dimensionality reduction in this␣
↪dataset,

or that the variables are not linearly related.'''

[ ]: ##MODEL BUILDING
#Since,we have multicollinearity in data , it is better to use Lasso␣
↪Regression . But for exploration purpose ,we are performing both

#Lasso and Linear Regression and to check the accuracy.

[ ]: ---------------------------------------------------------------------------------------------

[60]: #LINEAR REGRESSION


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

[62]: X = df[['Gold Prices (USD)']] # Independent variable


y = df['Gold Prices (USD)'] # Dependent variable

26
[67]: # Assuming X_train, X_test, y_train, y_test are your training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,␣
↪random_state=42)

[127]: df.dtypes

[127]: Year datetime64[ns]


Gold Prices (USD) float64
Inflation float64
Unemployment rate float64
Interest rates float64
Oil Prices (USD) float64
dtype: object

[68]: linear_reg = LinearRegression()


linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)
rmse_linear = mean_squared_error(y_test, y_pred_linear, squared=False)
print("Linear Regression RMSE:", rmse_linear)

Linear Regression RMSE: 1.2366982197576302e-13


C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is
deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean
squared error, use the function'root_mean_squared_error'.
warnings.warn(

[135]: #Accuracy
from sklearn.metrics import r2_score
r2_linear = r2_score(y_test, y_pred_linear)
print("Linear Regression R-squared:", r2_linear)

---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[135], line 3
1 #Accuracy
2 from sklearn.metrics import r2_score
----> 3 r2_linear = r2_score(y_test, y_pred_linear)
4 print("Linear Regression R-squared:", r2_linear)

File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation
↪py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)

207 try:
208 with config_context(
209 skip_parameter_validation=(
210 prefer_skip_nested_validation or global_skip_validation

27
211 )
212 ):
--> 213 return func(*args, **kwargs)
214 except InvalidParameterError as e:
215 # When the function is just a wrapper around an estimator, we allow
216 # the function to delegate validation to the estimator, but we␣
↪replace

217 # the name of the estimator by the name of the function in the error
218 # message to avoid confusion.
219 msg = re.sub(
220 r"parameter of \w+ must be",
221 f"parameter of {func.__qualname__} must be",
222 str(e),
223 )

File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.
↪py:1180, in r2_score(y_true, y_pred, sample_weight, multioutput, force_finite)

1039 @validate_params(
1040 {
1041 "y_true": ["array-like"],
(…)
1059 force_finite=True,
1060 ):
1061 """:math:`R^2` (coefficient of determination) regression score␣
↪function.

1062
1063 Best possible score is 1.0 and it can be negative (because the
(…)
1178 -inf
1179 """
-> 1180 y_type, y_true, y_pred, multioutput = _check_reg_targets(
1181 y_true, y_pred, multioutput
1182 )
1183 check_consistent_length(y_true, y_pred, sample_weight)
1185 if _num_samples(y_pred) < 2:

File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.
↪py:102, in _check_reg_targets(y_true, y_pred, multioutput, dtype)

68 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):


69 """Check that y_true and y_pred belong to the same regression task.
70
71 Parameters
(…)
100 correct keyword.
101 """
--> 102 check_consistent_length(y_true, y_pred)

28
103 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
104 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.
↪py:457, in check_consistent_length(*arrays)

455 uniques = np.unique(lengths)


456 if len(uniques) > 1:
--> 457 raise ValueError(
458 "Found input variables with inconsistent numbers of samples: %r"
459 % [int(l) for l in lengths]
460 )

ValueError: Found input variables with inconsistent numbers of samples: [428,␣


↪855]

[ ]: ----------------------------------------------------------------------------------------------

[72]: #LASSO REGRESSION


from sklearn.model_selection import GridSearchCV

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
lasso_reg = Lasso()
parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
lasso_grid = GridSearchCV(lasso_reg, parameters,␣
↪scoring='neg_mean_squared_error', cv=5)

lasso_grid.fit(X_train, y_train)
y_pred_lasso = lasso_grid.predict(X_test)
rmse_lasso = mean_squared_error(y_test, y_pred_lasso, squared=False)
print("Lasso Regression RMSE:", rmse_lasso)

Lasso Regression RMSE: 2.2337202089187744e-06


C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is
deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean
squared error, use the function'root_mean_squared_error'.
warnings.warn(

[120]:

---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[120], line 1
----> 1 r2_lasso = r2_score(y_test, y_pred_lasso)

29
2 print("Lasso Regression R-squared:", r2_lasso)

File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation
↪py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)

207 try:
208 with config_context(
209 skip_parameter_validation=(
210 prefer_skip_nested_validation or global_skip_validation
211 )
212 ):
--> 213 return func(*args, **kwargs)
214 except InvalidParameterError as e:
215 # When the function is just a wrapper around an estimator, we allow
216 # the function to delegate validation to the estimator, but we␣
↪replace

217 # the name of the estimator by the name of the function in the error
218 # message to avoid confusion.
219 msg = re.sub(
220 r"parameter of \w+ must be",
221 f"parameter of {func.__qualname__} must be",
222 str(e),
223 )

File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.
↪py:1180, in r2_score(y_true, y_pred, sample_weight, multioutput, force_finite)

1039 @validate_params(
1040 {
1041 "y_true": ["array-like"],
(…)
1059 force_finite=True,
1060 ):
1061 """:math:`R^2` (coefficient of determination) regression score␣
↪function.

1062
1063 Best possible score is 1.0 and it can be negative (because the
(…)
1178 -inf
1179 """
-> 1180 y_type, y_true, y_pred, multioutput = _check_reg_targets(
1181 y_true, y_pred, multioutput
1182 )
1183 check_consistent_length(y_true, y_pred, sample_weight)
1185 if _num_samples(y_pred) < 2:

30
File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.
↪py:102, in _check_reg_targets(y_true, y_pred, multioutput, dtype)

68 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):


69 """Check that y_true and y_pred belong to the same regression task.
70
71 Parameters
(…)
100 correct keyword.
101 """
--> 102 check_consistent_length(y_true, y_pred)
103 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
104 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.
↪py:457, in check_consistent_length(*arrays)

455 uniques = np.unique(lengths)


456 if len(uniques) > 1:
--> 457 raise ValueError(
458 "Found input variables with inconsistent numbers of samples: %r"
459 % [int(l) for l in lengths]
460 )

ValueError: Found input variables with inconsistent numbers of samples: [428,␣


↪855]

[ ]: ----------------------------------------------------------------------------------------------

[ ]: #CROSS VALIDATION

[118]: from sklearn.model_selection import train_test_split, cross_val_score


from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,␣
↪random_state=42)

# Train the Lasso Regression model


lasso = Lasso(alpha=0.1) # You can adjust the alpha parameter as needed
lasso.fit(X_train, y_train) # Fit the model to the training data

# Evaluate the model's performance on training and testing sets


train_preds = lasso.predict(X_train)

31
test_preds = lasso.predict(X_test)

train_mse = mean_squared_error(y_train, train_preds)


test_mse = mean_squared_error(y_test, test_preds)

print("Training MSE:", train_mse)


print("Testing MSE:", test_mse)

# Perform cross-validation
cv_scores = cross_val_score(lasso, X_train, y_train, cv=5,␣
↪scoring='neg_mean_squared_error')

cv_rmse = np.sqrt(-cv_scores)

print("Cross-validation RMSE scores:", cv_rmse)


print("Mean CV RMSE:", np.mean(cv_rmse))

# Calculate the difference between training and testing accuracies


accuracy_diff = np.abs(train_mse - test_mse)
print("Difference between Training and Testing MSE:", accuracy_diff)

Training MSE: 4.776072211470752e-08


Testing MSE: 4.781923557039156e-08
Cross-validation RMSE scores: [0.00022782 0.00022004 0.00019805 0.0002186
0.00023008]
Mean CV RMSE: 0.00021891751522405177
Difference between Training and Testing MSE: 5.851345568403972e-11

[119]: lasso_overall_r2 = lasso.score(X, y)


print("Overall R-squared:", lasso_overall_r2)

Overall R-squared: 0.9999999999997718

[74]: import matplotlib.pyplot as plt

# Plot actual vs. predicted values


plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_linear, color='blue', label='Actual vs. Predicted')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--',␣
↪lw=2, color='red', label='Perfect Prediction')

plt.xlabel('Actual Gold Prices')


plt.ylabel('Predicted Gold Prices')
plt.title('Actual vs. Predicted Gold Prices (Linear Regression)')
plt.legend()
plt.grid(True)
plt.show()

C:\Users\lariy\AppData\Local\Temp\ipykernel_17032\3438038971.py:6: UserWarning:
color is redundantly defined by the 'color' keyword argument and the fmt string

32
"k--" (-> color='k'). The keyword argument will take precedence.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--',
lw=2, color='red', label='Perfect Prediction')

[ ]: #CALCULATING RESIDUALS

[ ]: #checking assumptions

[78]: from sklearn.linear_model import LinearRegression


import numpy as np

# Assuming X is your independent variable and y is your dependent variable


# X and y should be numpy arrays or pandas DataFrames/Series
# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Get the predicted values


predicted_values = model.predict(X)

# Calculate residuals
residuals = y - predicted_values

# Now 'residuals' contains the residuals from the regression model

33
[79]: from scipy.stats import shapiro

# Assuming 'residuals' is a variable containing the residuals from your␣


↪regression model

shapiro_test_statistic, shapiro_p_value = shapiro(residuals)

print("Shapiro-Wilk Test:")
print("Test Statistic:", shapiro_test_statistic)
print("p-value:", shapiro_p_value)

if shapiro_p_value > 0.05:


print("p-value > 0.05. Fail to reject the null hypothesis. Residuals are␣
↪normally distributed.")

else:
print("p-value <= 0.05. Reject the null hypothesis. Residuals are not␣
↪normally distributed.")

Shapiro-Wilk Test:
Test Statistic: 0.5694072286447804
p-value: 2.730521207845291e-58
p-value <= 0.05. Reject the null hypothesis. Residuals are not normally
distributed.

[81]: from statsmodels.stats.diagnostic import het_breuschpagan


import statsmodels.api as sm

# Assuming 'X' is your independent variable(s) and 'y' is your dependent␣


↪variable

# Fit the regression model


X = sm.add_constant(X) # Add constant if necessary
model = sm.OLS(y, X)
results = model.fit()

# Perform the Breusch-Pagan test


bp_test_statistic, bp_p_value, _, _ = het_breuschpagan(results.resid, X)
print("\nBreusch-Pagan Test:")
print("Test Statistic:", bp_test_statistic)
print("p-value:", bp_p_value)

Breusch-Pagan Test:
Test Statistic: 1100.672462731512
p-value: 2.3590049793082233e-241

[ ]: #APPLYING LOG TRANSFORMATION TO GET UNIFORM DISTRIBUTED RESIDUALS

34
[82]: import numpy as np
from scipy.stats import skew

# Calculate skewness for each column


skewness = data.apply(skew)

# Select columns with skewness greater than a threshold (e.g., 0.5)


skewed_cols = skewness[skewness > 0.5].index

# Apply log transformation to skewed columns


data[skewed_cols] = np.log1p(data[skewed_cols])

# Check the distributions after transformation

[97]: #Visualising distrbution of each variable to understand distribution


import matplotlib.pyplot as plt
import seaborn as sns

# Assuming your dataframe is named 'df'

# Set the style for seaborn plots


sns.set(style="whitegrid")

# Plot the distribution of each variable


plt.figure(figsize=(12, 8))

# Plot for 'Gold Prices (USD)' variable


plt.subplot(3, 2, 2)
sns.histplot(data['Gold Prices (USD)'], kde=True)
plt.title('Distribution of Gold Prices (USD)')

# Plot for 'Inflation' variable


plt.subplot(3, 2, 3)
sns.histplot(data['Inflation'], kde=True)
plt.title('Distribution of Inflation')

# Plot for 'Unemployment rate' variable


plt.subplot(3, 2, 4)
sns.histplot(df['Unemployment rate'], kde=True)
plt.title('Distribution of Unemployment rate')

# Plot for 'Interest rates' variable


plt.subplot(3, 2, 5)
sns.histplot(data['Interest rates'], kde=True)
plt.title('Distribution of Interest rates')

35
# Plot for 'Oil Prices (USD)' variable
plt.subplot(3, 2, 6)
sns.histplot(data['Oil Prices (USD)'], kde=True)
plt.title('Distribution of Oil Prices (USD)')

plt.tight_layout()
plt.show()

[92]: residuals_1= np.log(y) - np.log(predicted_values)

[ ]: #AGAIN running tests

[93]: from scipy.stats import shapiro

# Assuming 'log_residuals' is a variable containing the residuals after␣


↪applying the logarithmic transformation

shapiro_test_statistic, shapiro_p_value = shapiro(residuals_1)

print("Shapiro-Wilk Test:")
print("Test Statistic:", shapiro_test_statistic)
print("p-value:", shapiro_p_value)

if shapiro_p_value > 0.05:

36
print("p-value > 0.05. Fail to reject the null hypothesis. Residuals are␣
↪normally distributed.")
else:
print("p-value <= 0.05. Reject the null hypothesis. Residuals are not␣
↪normally distributed.")

Shapiro-Wilk Test:
Test Statistic: 0.45959720762702283
p-value: 2.042106007386736e-62
p-value <= 0.05. Reject the null hypothesis. Residuals are not normally
distributed.

[96]: from statsmodels.regression.linear_model import OLS


from statsmodels.stats.diagnostic import het_breuschpagan

# Fit the OLS regression model with the transformed data


model = OLS(y, X).fit()

# Compute the residuals


residuals = model.resid

# Perform the Breusch-Pagan test


bp_test_statistic, bp_p_value, _, _ = het_breuschpagan(residuals, X)

# Print the results


print("\nBreusch-Pagan Test:")
print("Test Statistic:", bp_test_statistic)
print("p-value:", bp_p_value)

Breusch-Pagan Test:
Test Statistic: 1100.672462731512
p-value: 2.3590049793082233e-241

[ ]: #RESULTS ARE THE SAME

[ ]: ------------------------------------------------------------------

[98]: #Weighted least square model


import numpy as np
import statsmodels.api as sm

# Fit a standard linear regression model


X = df[['Gold Prices (USD)']] # Independent variable
y = df['Gold Prices (USD)'] # Dependent variable

X = sm.add_constant(X) # Add constant if needed

37
model = sm.OLS(y, X).fit()

# Calculate residuals
residuals = model.resid

# Calculate weights based on the inverse of the variance of residuals


weights = 1.0 / np.var(residuals)

# Fit a weighted least squares regression model


weighted_model = sm.WLS(y, X, weights=weights).fit()

# Print summary of the weighted model


print(weighted_model.summary())

WLS Regression Results


==============================================================================
Dep. Variable: Gold Prices (USD) R-squared: 1.000
Model: WLS Adj. R-squared: 1.000
Method: Least Squares F-statistic: 3.375e+33
Date: Thu, 02 May 2024 Prob (F-statistic): 0.00
Time: 21:40:04 Log-Likelihood: 58148.
No. Observations: 2136 AIC: -1.163e+05
Df Residuals: 2134 BIC: -1.163e+05
Df Model: 1
Covariance Type: nonrobust
================================================================================
=====
coef std err t P>|t| [0.025
0.975]
--------------------------------------------------------------------------------
-----
const -1.705e-13 1.42e-14 -12.028 0.000 -1.98e-13
-1.43e-13
Gold Prices (USD) 1.0000 1.72e-17 5.81e+16 0.000 1.000
1.000
==============================================================================
Omnibus: 384.389 Durbin-Watson: 0.003
Prob(Omnibus): 0.000 Jarque-Bera (JB): 620.132
Skew: 1.302 Prob(JB): 2.19e-135
Kurtosis: 3.430 Cond. No. 1.48e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly
specified.
[2] The condition number is large, 1.48e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

38
[ ]: #CROSS VALIDATION

[99]: ##NEed to perform CROSS VALIDATION


from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Assuming X_train and y_train are your training data and labels
# Instantiate the model
model = LinearRegression()

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5) # You can adjust␣
↪the number of folds (cv) as needed

# Print the cross-validation scores


print("Cross-validation scores:", cv_scores)

# Calculate the mean and standard deviation of the cross-validation scores


mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()
print("Mean CV score:", mean_cv_score)
print("Standard deviation of CV scores:", std_cv_score)

Cross-validation scores: [1. 1. 1. 1. 1.]


Mean CV score: 1.0
Standard deviation of CV scores: 0.0

[102]: from sklearn.model_selection import train_test_split


from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Split the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,␣
↪random_state=42)

# Fit the model on the training data


model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the training data


y_train_pred = model.predict(X_train)
training_score = r2_score(y_train, y_train_pred)

# Evaluate the model on the testing data


y_test_pred = model.predict(X_test)
testing_score = r2_score(y_test, y_test_pred)

39
print("Training R-squared:", training_score)
print("Testing R-squared:", testing_score)

Training R-squared: 1.0


Testing R-squared: 1.0

[ ]: ----------------------------------------------------------------------------------------------

[ ]: #TRYING MODELS

[ ]: #DECISION TREE

[103]: from sklearn.tree import DecisionTreeRegressor


from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Assuming X contains your independent variable(s) and y contains your␣


↪dependent variable

# Split the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,␣
↪random_state=42)

# Initialize the Decision Tree Regressor


tree_regressor = DecisionTreeRegressor(random_state=42)

# Fit the model on the training data


tree_regressor.fit(X_train, y_train)

# Make predictions on the testing data


y_pred_train = tree_regressor.predict(X_train)
y_pred_test = tree_regressor.predict(X_test)

# Evaluate the model


train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

print("Training R-squared:", train_r2)


print("Testing R-squared:", test_r2)

Training R-squared: 1.0


Testing R-squared: 0.9999932907263452

[105]: y_pred_dt = tree_regressor.predict(X_test)

# Calculate Mean Squared Error (MSE)


mse_dt = mean_squared_error(y_test, y_pred_dt)

40
# Calculate Root Mean Squared Error (RMSE)
rmse_dt = np.sqrt(mse_dt)

print("Decision Tree RMSE:", rmse_dt)

Decision Tree RMSE: 1.1855782531712218

[ ]: ----------------------------------------------------------------------------------------------

[ ]: #RANDOM FOREST

[107]: from sklearn.model_selection import train_test_split


from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Split the data into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,␣
↪random_state=42)

# Initialize and fit the Random Forest Regression model


rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on training and testing data


y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# Calculate R-squared for training and testing data


r2_train_rf = r2_score(y_train, y_train_pred_rf)
r2_test_rf = r2_score(y_test, y_test_pred_rf)

print("Random Forest Regression")


print("Training R-squared:", r2_train_rf)
print("Testing R-squared:", r2_test_rf)

Random Forest Regression


Training R-squared: 0.9999980259321656
Testing R-squared: 0.999994285557632

[ ]: ----------------------------------------------------------------------------------------------

[ ]: #GRADIENT BOOSTING

[108]: # Initialize and fit the Gradient Boosting Regression model


gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

41
# Make predictions on training and testing data
y_train_pred_gb = gb_model.predict(X_train)
y_test_pred_gb = gb_model.predict(X_test)

# Calculate R-squared for training and testing data


r2_train_gb = r2_score(y_train, y_train_pred_gb)
r2_test_gb = r2_score(y_test, y_test_pred_gb)

print("\nGradient Boosting Regression")


print("Training R-squared:", r2_train_gb)
print("Testing R-squared:", r2_test_gb)

Gradient Boosting Regression


Training R-squared: 0.9999754562135037
Testing R-squared: 0.9999573703979345

[112]: pip install xgboost

Requirement already satisfied: xgboost in


c:\users\lariy\appdata\local\programs\python\python310\lib\site-packages
(2.0.3)Note: you may need to restart the kernel to use updated packages.

Requirement already satisfied: scipy in


c:\users\lariy\appdata\local\programs\python\python310\lib\site-packages (from
xgboost) (1.12.0)
Requirement already satisfied: numpy in
c:\users\lariy\appdata\local\programs\python\python310\lib\site-packages (from
xgboost) (1.26.2)

[notice] A new release of pip available: 22.2.2 -> 24.0


[notice] To update, run: python.exe -m pip install --upgrade pip

[115]: #Cross validation for Linear Regression


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming X_train, X_test, y_train, y_test are your training and testing data

# 1. Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

42
# Calculate RMSE
y_pred_train_lr = linear_reg.predict(X_train)
rmse_train_lr = np.sqrt(mean_squared_error(y_train, y_pred_train_lr))
y_pred_test_lr = linear_reg.predict(X_test)
rmse_test_lr = np.sqrt(mean_squared_error(y_test, y_pred_test_lr))

# Cross-validation
cv_scores_lr = cross_val_score(linear_reg, X_train, y_train, cv=5,␣
↪scoring='neg_mean_squared_error')

cv_rmse_lr = np.sqrt(-cv_scores_lr)

# Print results
print("Linear Regression:")
print("Training RMSE:", rmse_train_lr)
print("Testing RMSE:", rmse_test_lr)
print("Mean CV RMSE:", cv_rmse_lr.mean())
print("Std CV RMSE:", cv_rmse_lr.std())

Linear Regression:
Training RMSE: 2.2829534657510607e-13
Testing RMSE: 2.3371014992128577e-13
Mean CV RMSE: 2.4273458808647473e-13
Std CV RMSE: 9.99530028073763e-14

[124]:

C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\model_selection\_validation.py:547: FitFailedWarning:
170 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to
nan.
If these failures are not expected, you can try to debug them by setting
error_score='raise'.

Below are more details about the failures:


--------------------------------------------------------------------------------
170 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\base.py", line 1467, in wrapper
estimator._validate_params()
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\base.py", line 666, in _validate_params

43
validate_parameter_constraints(
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\utils\_param_validation.py", line 95, in
validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features'
parameter of RandomForestRegressor must be an int in the range [1, inf), a float
in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto'
instead.

warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\model_selection\_search.py:1051: UserWarning: One or more of
the test scores are non-finite: [ nan -58.93907181 -37.90943391
-39.08803166 -43.19203625
-41.3236575 nan nan nan -57.15259269
-9.60570762 -40.55572985 nan -58.59893245 nan
-73.39584413 -18.16182559 nan -11.13065831 -49.6359911
nan -48.98262037 -11.31604215 -11.67992611 -48.91244679
-15.17315845 -27.45118465 nan -38.86970611 nan
-46.46008978 -76.8895055 -58.00598485 nan nan
-44.38463655 -11.17309129 nan -47.82243001 -43.27158338
-74.06200261 nan nan -17.23503776 -25.69643983
-60.15686948 nan -79.96635582 -27.44276843 -62.70058757
-58.44250228 nan -64.96921744 nan -63.50868913
-64.67395609 -70.42213274 -50.12022647 nan -27.87982487
nan -71.32384802 -25.74420683 nan nan
-73.91215454 -62.70968467 -38.69771057 -71.87860269 nan
-47.75998313 nan -57.17101105 -69.30109927 -27.04269829
nan -21.32999161 nan -11.68845447 -71.50186947
-58.46591787 -63.41533483 nan -25.82496422 -54.44954262
nan -25.74575808 -33.52783331 -32.28189951 nan
-71.64371947 -42.34704253 nan -4.80991776 nan
nan -25.35147124 -79.21123837 nan nan]
warnings.warn(
Best Hyperparameters: {'max_depth': 30, 'max_features': 'log2',
'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 123}
Training RMSE: 0.7535148520604379
Testing RMSE: 1.1135615099471048
C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is
deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean
squared error, use the function'root_mean_squared_error'.
warnings.warn(
C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is
deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean

44
squared error, use the function'root_mean_squared_error'.
warnings.warn(

[125]: #hyperparameter tunning


from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

# Define the hyperparameters grid


param_dist = {
'n_estimators': randint(50, 200),
'max_depth': [None, 10, 20, 30],
'min_samples_split': randint(2, 20),
'min_samples_leaf': randint(1, 10),
'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize the RandomForestRegressor


rf_regressor = RandomForestRegressor()

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_regressor,␣
↪param_distributions=param_dist, n_iter=100, cv=5,␣

↪scoring='neg_mean_squared_error', random_state=42)

random_search.fit(X_train, y_train)

# Get the best hyperparameters


best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Initialize the RandomForestRegressor with the best hyperparameters


best_rf_regressor = RandomForestRegressor(**best_params)

# Fit the model with the best hyperparameters


best_rf_regressor.fit(X_train, y_train)

# Evaluate the model


train_rmse = mean_squared_error(y_train, best_rf_regressor.predict(X_train),␣
↪squared=False)

test_rmse = mean_squared_error(y_test, best_rf_regressor.predict(X_test),␣


↪squared=False)

print("Training RMSE:", train_rmse)


print("Testing RMSE:", test_rmse)

C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\model_selection\_validation.py:547: FitFailedWarning:

45
170 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to
nan.
If these failures are not expected, you can try to debug them by setting
error_score='raise'.

Below are more details about the failures:


--------------------------------------------------------------------------------
170 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\base.py", line 1467, in wrapper
estimator._validate_params()
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\base.py", line 666, in _validate_params
validate_parameter_constraints(
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\utils\_param_validation.py", line 95, in
validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features'
parameter of RandomForestRegressor must be an int in the range [1, inf), a float
in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto'
instead.

warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\model_selection\_search.py:1051: UserWarning: One or more of
the test scores are non-finite: [ nan -58.46487472 -37.58114991
-42.56400437 -43.44029062
-42.10161918 nan nan nan -57.94417192
-9.16763144 -36.80911199 nan -61.16860533 nan
-72.77088008 -17.58202262 nan -11.41052383 -47.83249256
nan -49.86612921 -11.77792633 -12.80862819 -48.15975327
-16.30022891 -26.89035096 nan -36.89072673 nan
-50.18467986 -75.53246766 -60.59819993 nan nan
-43.91990928 -11.54445493 nan -49.10973958 -45.45312132
-71.05456128 nan nan -16.48893719 -25.75054451
-60.00189991 nan -70.68815294 -27.23028137 -64.0287326
-56.70982844 nan -63.52681669 nan -63.1903081
-61.92191672 -70.84075316 -54.15822214 nan -27.03075124
nan -72.24201098 -27.96395719 nan nan
-73.31688285 -59.45094749 -37.26006078 -71.19256996 nan
-49.74931541 nan -56.2931076 -70.51918141 -26.51741668
nan -21.61383175 nan -11.67718781 -72.78182952

46
-57.23201728 -62.8046166 nan -27.12209161 -51.13025851
nan -27.13044707 -32.43912226 -32.90987822 nan
-69.76462899 -42.51182264 nan -4.74511793 nan
nan -26.98026314 -77.91433524 nan nan]
warnings.warn(
Best Hyperparameters: {'max_depth': 30, 'max_features': 'log2',
'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 123}
Training RMSE: 0.6656633581584913
Testing RMSE: 1.1766919161273477
C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is
deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean
squared error, use the function'root_mean_squared_error'.
warnings.warn(
C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is
deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean
squared error, use the function'root_mean_squared_error'.
warnings.warn(

47

You might also like