Regression and Eda

regression-and-eda
May 3, 2024
[2]: import pandas as pd
[3]: df=pd.read_excel('C:\\Users\\lariy\\Downloads\\capstone\\final_data.xlsx')
df
[3]: Unnamed: 0 Unnamed: 1 Year Gold Prices (USD) Inflation \

0 NaN NaN 1980-01-04 588.00 13.549202
1 NaN NaN 1980-01-11 623.00 13.549202
2 NaN NaN 1980-01-18 835.00 13.549202
3 NaN NaN 1980-01-25 668.00 13.549202
4 NaN NaN 1980-02-01 676.50 13.549202
… … … … … …
2239 NaN NaN 2022-12-02 1784.75 NaN
2240 NaN NaN 2022-12-09 1796.15 NaN
2241 NaN NaN 2022-12-16 1792.55 NaN
2242 NaN NaN 2022-12-23 1800.70 NaN
2243 NaN NaN 2022-12-30 1813.75 NaN
Unemployment rate Interest rates Oil Prices (USD)

0 6.3 18.9 21.59
1 6.3 18.9 21.59
2 6.3 18.9 21.59
3 6.3 18.9 21.59
4 6.9 18.9 21.59
… … … …
2239 NaN 4.1 93.97
2240 NaN 4.1 93.97
2241 NaN 4.1 93.97
2242 NaN 4.1 93.97
2243 NaN 4.1 93.97
[2244 rows x 8 columns]
[4]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2244 entries, 0 to 2243
Data columns (total 8 columns):
1
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 0 non-null float64
1 Unnamed: 1 0 non-null float64
2 Year 2244 non-null datetime64[ns]
3 Gold Prices (USD) 2244 non-null float64
4 Inflation 2192 non-null float64
5 Unemployment rate 2136 non-null float64
6 Interest rates 2244 non-null float64
7 Oil Prices (USD) 2244 non-null float64
dtypes: datetime64[ns](1), float64(7)
memory usage: 140.4 KB
[7]: df.describe()
[7]: Unnamed: 0 Unnamed: 1 Year Gold Prices (USD) \

count 0.0 0.0 2244 2244.000000
mean NaN NaN 2001-07-02 12:00:00 738.579835
min NaN NaN 1980-01-04 00:00:00 253.800000
25% NaN NaN 1990-10-03 06:00:00 357.787500
50% NaN NaN 2001-07-02 12:00:00 427.575000
75% NaN NaN 2012-03-31 18:00:00 1215.312500
max NaN NaN 2022-12-30 00:00:00 2031.150000
std NaN NaN NaN 506.810257
Inflation Unemployment rate Interest rates Oil Prices (USD)

count 2192.000000 2136.000000 2244.000000 2244.000000
mean 3.180372 6.311798 4.276190 40.127362
min -0.355546 3.900000 0.070000 10.870000
25% 2.069337 5.300000 0.540000 16.540000
50% 2.931204 5.900000 4.160000 27.560000
75% 3.156842 7.800000 6.770000 59.690000
max 13.549202 10.800000 18.900000 95.990000
std 2.402141 1.459679 3.957089 27.380432
[27]: df.columns
[27]: Index(['Year', 'Gold Prices (USD)', 'Inflation', 'Unemployment rate',

'Interest rates', 'Oil Prices (USD)'],
dtype='object')
[8]: #Removing unnecessary columns like "unnamed:0" and "unnamed:1"

import pandas as pd
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 1'])
print(df.head())
2
Year Gold Prices (USD) Inflation Unemployment rate Interest rates \
0 1980-01-04 588.0 13.549202 6.3 18.9
1 1980-01-11 623.0 13.549202 6.3 18.9
2 1980-01-18 835.0 13.549202 6.3 18.9
3 1980-01-25 668.0 13.549202 6.3 18.9
4 1980-02-01 676.5 13.549202 6.9 18.9
Oil Prices (USD)

0 21.59
1 21.59
2 21.59
3 21.59
4 21.59
[133]: # Visualize data using histograms

df['Gold Prices (USD)'].hist()
# Visualize data using box plots

df.boxplot(column='Gold Prices (USD)')
# Visualize data using scatter plots

df.plot.scatter(x='Year', y='Gold Prices (USD)')
[133]: <Axes: xlabel='Year', ylabel='Gold Prices (USD)'>
3
[10]: #Removing missing values from "inflation" and "unemployment rate"
# Assuming your dataframe is named 'df'
# Remove rows with missing values in both 'Inflation' and 'Unemployment rate'␣
↪columns
df = df.dropna(subset=['Inflation', 'Unemployment rate'])
# Display the modified dataframe

print(df.head())
Year Gold Prices (USD) Inflation Unemployment rate Interest rates \

0 1980-01-04 588.0 13.549202 6.3 18.9
1 1980-01-11 623.0 13.549202 6.3 18.9
2 1980-01-18 835.0 13.549202 6.3 18.9
3 1980-01-25 668.0 13.549202 6.3 18.9
4 1980-02-01 676.5 13.549202 6.9 18.9
Oil Prices (USD)

0 21.59
4
1 21.59
2 21.59
3 21.59
4 21.59
[12]: df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 2136 entries, 0 to 2135
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Year 2136 non-null datetime64[ns]
1 Gold Prices (USD) 2136 non-null float64
2 Inflation 2136 non-null float64
3 Unemployment rate 2136 non-null float64
4 Interest rates 2136 non-null float64
5 Oil Prices (USD) 2136 non-null float64
dtypes: datetime64[ns](1), float64(5)
memory usage: 116.8 KB
[ ]: #missng values are removed , left us with 2136 rows"
[14]: #Visualising distrbution of each variable to understand distribution

import matplotlib.pyplot as plt
import seaborn as sns
# Set the style for seaborn plots

sns.set(style="whitegrid")
# Plot the distribution of each variable

plt.figure(figsize=(12, 8))
# Plot for 'Gold Prices (USD)' variable

plt.subplot(3, 2, 2)
sns.histplot(df['Gold Prices (USD)'], kde=True)
plt.title('Distribution of Gold Prices (USD)')
# Plot for 'Inflation' variable

sns.histplot(df['Inflation'], kde=True)
plt.title('Distribution of Inflation')
# Plot for 'Unemployment rate' variable
5
sns.histplot(df['Unemployment rate'], kde=True)
plt.title('Distribution of Unemployment rate')
# Plot for 'Interest rates' variable

sns.histplot(df['Interest rates'], kde=True)
plt.title('Distribution of Interest rates')
# Plot for 'Oil Prices (USD)' variable

sns.histplot(df['Oil Prices (USD)'], kde=True)
plt.title('Distribution of Oil Prices (USD)')
plt.tight_layout()
plt.show()
[15]: import seaborn as sns

6
# Plot the trend of each variable over time (year)

# Plot for 'Gold Prices (USD)'

sns.lineplot(x='Year', y='Gold Prices (USD)', data=df)
plt.title('Trend of Gold Prices (USD)')
# Plot for 'Inflation'

sns.lineplot(x='Year', y='Inflation', data=df)
plt.title('Trend of Inflation')
# Plot for 'Unemployment rate'

sns.lineplot(x='Year', y='Unemployment rate', data=df)
plt.title('Trend of Unemployment rate')
# Plot for 'Interest rates'

sns.lineplot(x='Year', y='Interest rates', data=df)
plt.title('Trend of Interest rates')
# Plot for 'Oil Prices (USD)'

sns.lineplot(x='Year', y='Oil Prices (USD)', data=df)
plt.title('Trend of Oil Prices (USD)')
plt.tight_layout()
plt.show()
7
# Pairwise relationships between variables

sns.pairplot(df, vars=['Gold Prices (USD)', 'Inflation', 'Unemployment rate',␣
↪'Interest rates', 'Oil Prices (USD)'], kind='scatter')
plt.show()
8
# Assuming your dataframe is named 'df' with the relevant columns

# Compute the correlation matrix
correlation_matrix = df.corr()
# Plot the correlation matrix as a heatmap

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f",␣
↪square=True)
plt.title('Correlation Matrix between Variables')

plt.show()
9
[ ]: '''it can be seen that relationship between inflation and gold prices is␣
↪positive theoretically , but according to the correlation matrix it is
negative.It maybe due to external factors like govt policies that the data is␣
↪behaving like this. Also this is not so strong relationship'''
[19]: #OUTLIER DETECTION

import pandas as pd
# Assuming your dataframe is named 'df' with the relevant variables
# Define a function to detect outliers using z-score

def detect_outliers_zscore(data, threshold=3):
z_scores = (data - data.mean()) / data.std()
return (z_scores > threshold) | (z_scores < -threshold)
# Define a function to detect outliers using IQR
10
def detect_outliers_iqr(data, threshold=1.5):
q1 = data.quantile(0.25)
iqr = q3 - q1
lower_bound = q1 - threshold * iqr
upper_bound = q3 + threshold * iqr
return (data < lower_bound) | (data > upper_bound)
# Iterate over each column in the dataframe to detect outliers

outliers = {}
for column in df.columns:
outliers[column] = {
'zscore_outliers': detect_outliers_zscore(df[column]),
'iqr_outliers': detect_outliers_iqr(df[column])
}
# Display outliers for each variable

for column, values in outliers.items():
print(f"Outliers in {column}:")
print("Outliers:")
print(values['zscore_outliers'])
print("IQR Outliers:")
print(values['iqr_outliers'])
print()
Outliers in Year:
Outliers:
0 False
1 False
2 False
3 False
4 False
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Year, Length: 2136, dtype: bool
IQR Outliers:
0 False
1 False
2 False
3 False
4 False
…
2131 False
11
2132 False
2133 False
2134 False
2135 False
Name: Year, Length: 2136, dtype: bool
Outliers in Gold Prices (USD):

Outliers:
0 False
1 False
2 False
3 False
4 False
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Gold Prices (USD), Length: 2136, dtype: bool
IQR Outliers:
0 False
1 False
2 False
3 False
4 False
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Gold Prices (USD), Length: 2136, dtype: bool
Outliers in Inflation:
Outliers:
0 True
1 True
2 True
3 True
4 True
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Inflation, Length: 2136, dtype: bool
12
IQR Outliers:
0 True
1 True
2 True
3 True
4 True
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Inflation, Length: 2136, dtype: bool
Outliers in Unemployment rate:

Outliers:
0 False
1 False
2 False
3 False
4 False
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Unemployment rate, Length: 2136, dtype: bool
IQR Outliers:
0 False
1 False
2 False
3 False
4 False
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Unemployment rate, Length: 2136, dtype: bool
Outliers in Interest rates:

Outliers:
0 True
1 True
2 True
3 True
13
4 True
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Interest rates, Length: 2136, dtype: bool
IQR Outliers:
0 True
1 True
2 True
3 True
4 True
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Interest rates, Length: 2136, dtype: bool
Outliers in Oil Prices (USD):

Outliers:
0 False
1 False
2 False
3 False
4 False
…
2131 False
2132 False
2133 False
2134 False
2135 False
Name: Oil Prices (USD), Length: 2136, dtype: bool
IQR Outliers:
0 False
1 False
2 False
3 False
4 False
…
2131 False
2132 False
2133 False
2134 False
2135 False
14
Name: Oil Prices (USD), Length: 2136, dtype: bool

# Set the figure size

# Iterate over each column in the dataframe and plot box plots with outliers
for i, column in enumerate(df.columns):
plt.subplot(len(df.columns)//2, 2, i+1)
sns.boxplot(x=df[column], showfliers=True)
plt.title(f'Box Plot of {column}')
# Adjust layout
plt.tight_layout()
plt.show()
15
# Define a function to detect outliers using z-score

iqr = q3 - q1
# Initialize a dictionary to store the count of outliers for each variable

outlier_counts = {}
# Iterate over each column in the dataframe to count outliers

# Count outliers using z-score method
zscore_outliers_count = detect_outliers_zscore(df[column]).sum()
# Count outliers using IQR method

iqr_outliers_count = detect_outliers_iqr(df[column]).sum()
# Store the counts in the dictionary

outlier_counts[column] = {
'zscore_outliers_count': zscore_outliers_count,
'iqr_outliers_count': iqr_outliers_count
}
# Display the count of outliers for each variable

for column, values in outlier_counts.items():
print(f"Outlier counts in {column}:")
print("Z-Score Outliers Count:", values['zscore_outliers_count'])
print("IQR Outliers Count:", values['iqr_outliers_count'])
print()
Outlier counts in Year:

Z-Score Outliers Count: 0
IQR Outliers Count: 0
Outlier counts in Gold Prices (USD):

16
Outlier counts in Inflation:
Outlier counts in Unemployment rate:

Outlier counts in Interest rates:

Outlier counts in Oil Prices (USD):

[22]: # Define a function to detect outliers using z-score


iqr = q3 - q1
# Initialize a dictionary to store the outliers for each variable

outliers = {}
# Iterate over each column in the dataframe to detect and store outliers
# Detect outliers using z-score method
zscore_outliers = df[column][detect_outliers_zscore(df[column])]
# Detect outliers using IQR method

iqr_outliers = df[column][detect_outliers_iqr(df[column])]
# Store the outliers in the dictionary

outliers[column] = {
'zscore_outliers': zscore_outliers,
'iqr_outliers': iqr_outliers
}
17
# Display the outlier values for each variable
for column, values in outliers.items():
print(f"Outliers in {column}:")
print("Z-Score Outliers:")
print(values['zscore_outliers'])
print("IQR Outliers:")
print(values['iqr_outliers'])
print()
Outliers in Year:
Z-Score Outliers:
Series([], Name: Year, dtype: datetime64[ns])
IQR Outliers:
Series([], Name: Year, dtype: datetime64[ns])
Outliers in Gold Prices (USD):

Z-Score Outliers:
Series([], Name: Gold Prices (USD), dtype: float64)
IQR Outliers:
Series([], Name: Gold Prices (USD), dtype: float64)
Outliers in Inflation:
Z-Score Outliers:
0 13.549202
1 13.549202
2 13.549202
3 13.549202
4 13.549202
…
99 10.334715
100 10.334715
101 10.334715
102 10.334715
103 10.334715
Name: Inflation, Length: 104, dtype: float64
IQR Outliers:
0 13.549202
1 13.549202
2 13.549202
3 13.549202
4 13.549202
…
1821 0.118627
1822 0.118627
1823 0.118627
1824 0.118627
18
1825 0.118627
Name: Inflation, Length: 261, dtype: float64
Outliers in Unemployment rate:

Z-Score Outliers:
148 10.8
149 10.8
150 10.8
151 10.8
152 10.8
153 10.8
154 10.8
155 10.8
156 10.8
Name: Unemployment rate, dtype: float64
IQR Outliers:
Series([], Name: Unemployment rate, dtype: float64)
Outliers in Interest rates:

Z-Score Outliers:
0 18.9
1 18.9
2 18.9
3 18.9
4 18.9
5 18.9
6 18.9
7 18.9
8 18.9
9 18.9
10 18.9
11 18.9
12 18.9
13 18.9
14 18.9
15 18.9
16 18.9
17 18.9
18 18.9
19 18.9
20 18.9
21 18.9
22 18.9
23 18.9
24 18.9
25 18.9
26 18.9
27 18.9
19
28 18.9
29 18.9
30 18.9
31 18.9
32 18.9
33 18.9
34 18.9
35 18.9
36 18.9
37 18.9
38 18.9
39 18.9
40 18.9
41 18.9
42 18.9
43 18.9
44 18.9
45 18.9
46 18.9
47 18.9
48 18.9
49 18.9
50 18.9
51 18.9
Name: Interest rates, dtype: float64
IQR Outliers:
0 18.9
1 18.9
2 18.9
3 18.9
4 18.9
5 18.9
6 18.9
7 18.9
8 18.9
9 18.9
10 18.9
11 18.9
12 18.9
13 18.9
14 18.9
15 18.9
16 18.9
17 18.9
18 18.9
19 18.9
20 18.9
21 18.9
20
22 18.9
23 18.9
24 18.9
25 18.9
26 18.9
27 18.9
28 18.9
29 18.9
30 18.9
31 18.9
32 18.9
33 18.9
34 18.9
35 18.9
36 18.9
37 18.9
38 18.9
39 18.9
40 18.9
41 18.9
42 18.9
43 18.9
44 18.9
45 18.9
46 18.9
47 18.9
48 18.9
49 18.9
50 18.9
51 18.9
Name: Interest rates, dtype: float64
Outliers in Oil Prices (USD):

Z-Score Outliers:
Series([], Name: Oil Prices (USD), dtype: float64)
IQR Outliers:
Series([], Name: Oil Prices (USD), dtype: float64)
[ ]: '''It can be seen that there are many outliers but share historical event ,␣
↪since the outliers are considered meaningful reflections of historical
events in U.S. history,

it may be appropriate to retain them in the dataset and proceed with the␣
↪analysis.'''
[23]: #Finding multicollinearity

import pandas as pd
21
# Assuming your dataframe is named 'df' with the independent variables as␣
↪columns
# Calculate the correlation matrix

# Display the correlation matrix as a table

print("Correlation Matrix:")
print(correlation_matrix)
Correlation Matrix:
Year Gold Prices (USD) Inflation Unemployment rate \
Year 1.000000 0.775752 -0.532349 -0.501394
Gold Prices (USD) 0.775752 1.000000 -0.205732 -0.014243
Inflation -0.532349 -0.205732 1.000000 0.277274
Unemployment rate -0.501394 -0.014243 0.277274 1.000000
Interest rates -0.841198 -0.560575 0.772618 0.399629
Oil Prices (USD) 0.678407 0.772340 -0.298871 0.036252
Interest rates Oil Prices (USD)

Year -0.841198 0.678407
Gold Prices (USD) -0.560575 0.772340
Inflation 0.772618 -0.298871
Unemployment rate 0.399629 0.036252
Interest rates 1.000000 -0.550246
Oil Prices (USD) -0.550246 1.000000
# Assuming your dataframe is named 'df' with the independent variables as␣
↪columns
# Calculate the correlation matrix

# Set the threshold values for correlation

threshold_min = 0.7
threshold_max = 0.9
# Initialize a list to store pairs of variables with high correlation

high_correlation_pairs = []
# Iterate through the correlation matrix to identify pairs with correlation␣

↪between the thresholds
for i in range(len(correlation_matrix.columns)):
for j in range(i+1, len(correlation_matrix.columns)):
22
if abs(correlation_matrix.iloc[i, j]) >= threshold_min and␣
↪abs(correlation_matrix.iloc[i, j]) <= threshold_max:
high_correlation_pairs.append((correlation_matrix.columns[i],␣
↪correlation_matrix.columns[j], correlation_matrix.iloc[i, j]))
# Display the pairs of variables with high correlation

print(f"Pairs of variables with correlation between {threshold_min} and␣
↪{threshold_max}:")
for pair in high_correlation_pairs:

print(f"{pair[0]} - {pair[1]}: {pair[2]}")
Pairs of variables with correlation between 0.7 and 0.9:

Year - Gold Prices (USD): 0.7757519467104836
Year - Interest rates: -0.8411983318709919
Gold Prices (USD) - Oil Prices (USD): 0.7723398490361068
Inflation - Interest rates: 0.7726182398554902
[ ]: #PREPROCESSING IS DONE
[31]: # Save the DataFrame with Year and Gold Prices to a CSV file (time series)
df.to_csv("data_revised.csv", index=False)
[ ]: '''Since there is multicollinearity between "Gold Prices (USD) - Oil Prices␣

↪(USD)" and "Inflation - Interest rates" , we will use
PCA '''
[43]: #PCA
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#Since , PCA does not require time variable , we will drop "Year"
data = df.drop(columns=['Year'])
[44]: data
[44]: Gold Prices (USD) Inflation Unemployment rate Interest rates \

0 588.0 13.549202 6.3 18.90
1 623.0 13.549202 6.3 18.90
2 835.0 13.549202 6.3 18.90
3 668.0 13.549202 6.3 18.90
4 676.5 13.549202 6.9 18.90
… … … … …
2131 1940.8 4.697859 4.6 0.09
2132 1890.9 4.697859 4.6 0.09
2133 1875.7 4.697859 4.6 0.09
2134 1779.3 4.697859 4.6 0.09
23
2135 1843.0 4.697859 4.6 0.09
Oil Prices (USD)

0 21.59
1 21.59
2 21.59
3 21.59
4 21.59
… …
2131 36.86
2132 36.86
2133 36.86
2134 36.86
2135 36.86
[2136 rows x 5 columns]
[46]: # Standardize the features

scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)
print(scaled_data)
[[-0.21158217 4.5466568 -0.00808432 3.63105777 -0.62875152]

[-0.13510079 4.5466568 -0.00808432 3.63105777 -0.62875152]
[ 0.32815787 4.5466568 -0.00808432 3.63105777 -0.62875152]
…
[ 2.60227714 0.71046548 -1.17299696 -1.07620925 -0.05013795]
[ 2.39162556 0.71046548 -1.17299696 -1.07620925 -0.05013795]
[ 2.53082168 0.71046548 -1.17299696 -1.07620925 -0.05013795]]
[47]: scaled_data
[47]: array([[-0.21158217, 4.5466568 , -0.00808432, 3.63105777, -0.62875152],

[-0.13510079, 4.5466568 , -0.00808432, 3.63105777, -0.62875152],
[ 0.32815787, 4.5466568 , -0.00808432, 3.63105777, -0.62875152],
…,
[ 2.60227714, 0.71046548, -1.17299696, -1.07620925, -0.05013795],
[ 2.39162556, 0.71046548, -1.17299696, -1.07620925, -0.05013795],
[ 2.53082168, 0.71046548, -1.17299696, -1.07620925, -0.05013795]])
[48]: # Perform PCA

pca = PCA()
principal_components = pca.fit_transform(scaled_features)
[49]: explained_variance_ratio = pca.explained_variance_ratio_

print("Explained Variance Ratio:")
print(explained_variance_ratio)
24
Explained Variance Ratio:
[0.53362372 0.26151325 0.13667759 0.0456746 0.02251084]
[50]: # Cumulative explained variance

cumulative_explained_variance = explained_variance_ratio.cumsum()
print("Cumulative Explained Variance:")
print(cumulative_explained_variance)
Cumulative Explained Variance:

[0.53362372 0.79513696 0.93181456 0.97748916 1. ]
[52]: import matplotlib.pyplot as plt
# Plot the scree plot

plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio,␣
↪marker='o', linestyle='-')
plt.title('Scree Plot')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)
# Find the elbow point

# You can visually identify the elbow point or use a threshold to find the␣
↪point programmatically
threshold = 0.05 # Example threshold for change in explained variance ratio

for i in range(1, len(explained_variance_ratio)):
if explained_variance_ratio[i] - explained_variance_ratio[i-1] < threshold:
elbow_point = i
break
# Mark the elbow point on the plot

plt.axvline(x=elbow_point, color='red', linestyle='--', label='Elbow Point')
plt.legend()
plt.show()
print("Elbow Point (k):", elbow_point)
25
Elbow Point (k): 1
[53]: '''However, if the scree plot exhibits a downward slope and there's no clear␣
↪elbow point,
it may imply that PCA is not suitable for dimensionality reduction in this␣
↪dataset,
or that the variables are not linearly related.'''
[ ]: ##MODEL BUILDING
#Since,we have multicollinearity in data , it is better to use Lasso␣
↪Regression . But for exploration purpose ,we are performing both
#Lasso and Linear Regression and to check the accuracy.
[ ]: ---------------------------------------------------------------------------------------------
[60]: #LINEAR REGRESSION

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
[62]: X = df[['Gold Prices (USD)']] # Independent variable

y = df['Gold Prices (USD)'] # Dependent variable
26
[67]: # Assuming X_train, X_test, y_train, y_test are your training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,␣
↪random_state=42)
[127]: df.dtypes
[127]: Year datetime64[ns]

Gold Prices (USD) float64
Inflation float64
Unemployment rate float64
Interest rates float64
Oil Prices (USD) float64
dtype: object
[68]: linear_reg = LinearRegression()

linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)
rmse_linear = mean_squared_error(y_test, y_pred_linear, squared=False)
print("Linear Regression RMSE:", rmse_linear)
Linear Regression RMSE: 1.2366982197576302e-13

C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\metrics\_regression.py:483: FutureWarning: 'squared' is
deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean
squared error, use the function'root_mean_squared_error'.
warnings.warn(
[135]: #Accuracy
from sklearn.metrics import r2_score
r2_linear = r2_score(y_test, y_pred_linear)
print("Linear Regression R-squared:", r2_linear)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[135], line 3
1 #Accuracy
2 from sklearn.metrics import r2_score
----> 3 r2_linear = r2_score(y_test, y_pred_linear)
4 print("Linear Regression R-squared:", r2_linear)
File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation
↪py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
207 try:
208 with config_context(
209 skip_parameter_validation=(
210 prefer_skip_nested_validation or global_skip_validation
27
211 )
212 ):
--> 213 return func(*args, **kwargs)
214 except InvalidParameterError as e:
215 # When the function is just a wrapper around an estimator, we allow
216 # the function to delegate validation to the estimator, but we␣
↪replace
217 # the name of the estimator by the name of the function in the error
218 # message to avoid confusion.
219 msg = re.sub(
220 r"parameter of \w+ must be",
221 f"parameter of {func.__qualname__} must be",
222 str(e),
223 )
File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.
↪py:1180, in r2_score(y_true, y_pred, sample_weight, multioutput, force_finite)
1039 @validate_params(
1040 {
1041 "y_true": ["array-like"],
(…)
1059 force_finite=True,
1060 ):
1061 """:math:`R^2` (coefficient of determination) regression score␣
↪function.
1062
1063 Best possible score is 1.0 and it can be negative (because the
(…)
1178 -inf
1179 """
-> 1180 y_type, y_true, y_pred, multioutput = _check_reg_targets(
1181 y_true, y_pred, multioutput
1182 )
1183 check_consistent_length(y_true, y_pred, sample_weight)
1185 if _num_samples(y_pred) < 2:
File␣
↪py:102, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
68 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):

69 """Check that y_true and y_pred belong to the same regression task.
70
71 Parameters
(…)
100 correct keyword.
101 """
--> 102 check_consistent_length(y_true, y_pred)
28
103 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
104 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.
↪py:457, in check_consistent_length(*arrays)
455 uniques = np.unique(lengths)

456 if len(uniques) > 1:
--> 457 raise ValueError(
458 "Found input variables with inconsistent numbers of samples: %r"
459 % [int(l) for l in lengths]
460 )
ValueError: Found input variables with inconsistent numbers of samples: [428,␣

↪855]
[ ]: ----------------------------------------------------------------------------------------------
[72]: #LASSO REGRESSION

from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso
lasso_reg = Lasso()
parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
lasso_grid = GridSearchCV(lasso_reg, parameters,␣
↪scoring='neg_mean_squared_error', cv=5)
lasso_grid.fit(X_train, y_train)
y_pred_lasso = lasso_grid.predict(X_test)
rmse_lasso = mean_squared_error(y_test, y_pred_lasso, squared=False)
print("Lasso Regression RMSE:", rmse_lasso)
Lasso Regression RMSE: 2.2337202089187744e-06

warnings.warn(
[120]:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[120], line 1
----> 1 r2_lasso = r2_score(y_test, y_pred_lasso)
29
2 print("Lasso Regression R-squared:", r2_lasso)
File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation
↪py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
207 try:
208 with config_context(
209 skip_parameter_validation=(
210 prefer_skip_nested_validation or global_skip_validation
211 )
212 ):
--> 213 return func(*args, **kwargs)
214 except InvalidParameterError as e:
215 # When the function is just a wrapper around an estimator, we allow
216 # the function to delegate validation to the estimator, but we␣
↪replace
217 # the name of the estimator by the name of the function in the error
218 # message to avoid confusion.
219 msg = re.sub(
220 r"parameter of \w+ must be",
221 f"parameter of {func.__qualname__} must be",
222 str(e),
223 )
File␣
↪py:1180, in r2_score(y_true, y_pred, sample_weight, multioutput, force_finite)
1039 @validate_params(
1040 {
1041 "y_true": ["array-like"],
(…)
1059 force_finite=True,
1060 ):
1061 """:math:`R^2` (coefficient of determination) regression score␣
↪function.
1062
1063 Best possible score is 1.0 and it can be negative (because the
(…)
1178 -inf
1179 """
-> 1180 y_type, y_true, y_pred, multioutput = _check_reg_targets(
1181 y_true, y_pred, multioutput
1182 )
1183 check_consistent_length(y_true, y_pred, sample_weight)
1185 if _num_samples(y_pred) < 2:
30
File␣
↪py:102, in _check_reg_targets(y_true, y_pred, multioutput, dtype)
68 def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"):

69 """Check that y_true and y_pred belong to the same regression task.
70
71 Parameters
(…)
100 correct keyword.
101 """
--> 102 check_consistent_length(y_true, y_pred)
103 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
104 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
File␣
↪~\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.
↪py:457, in check_consistent_length(*arrays)
455 uniques = np.unique(lengths)

456 if len(uniques) > 1:
--> 457 raise ValueError(
458 "Found input variables with inconsistent numbers of samples: %r"
459 % [int(l) for l in lengths]
460 )
ValueError: Found input variables with inconsistent numbers of samples: [428,␣

↪855]
[ ]: ----------------------------------------------------------------------------------------------
[ ]: #CROSS VALIDATION
[118]: from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import Lasso
import numpy as np
# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Split the data into training and testing sets
↪random_state=42)
# Train the Lasso Regression model

lasso = Lasso(alpha=0.1) # You can adjust the alpha parameter as needed
lasso.fit(X_train, y_train) # Fit the model to the training data
# Evaluate the model's performance on training and testing sets

train_preds = lasso.predict(X_train)
31
test_preds = lasso.predict(X_test)
train_mse = mean_squared_error(y_train, train_preds)

test_mse = mean_squared_error(y_test, test_preds)
print("Training MSE:", train_mse)

print("Testing MSE:", test_mse)
# Perform cross-validation
cv_scores = cross_val_score(lasso, X_train, y_train, cv=5,␣
↪scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print("Cross-validation RMSE scores:", cv_rmse)

print("Mean CV RMSE:", np.mean(cv_rmse))
# Calculate the difference between training and testing accuracies

accuracy_diff = np.abs(train_mse - test_mse)
print("Difference between Training and Testing MSE:", accuracy_diff)
Training MSE: 4.776072211470752e-08

Testing MSE: 4.781923557039156e-08
Cross-validation RMSE scores: [0.00022782 0.00022004 0.00019805 0.0002186
0.00023008]
Mean CV RMSE: 0.00021891751522405177
Difference between Training and Testing MSE: 5.851345568403972e-11
[119]: lasso_overall_r2 = lasso.score(X, y)

print("Overall R-squared:", lasso_overall_r2)
Overall R-squared: 0.9999999999997718
[74]: import matplotlib.pyplot as plt
# Plot actual vs. predicted values

plt.scatter(y_test, y_pred_linear, color='blue', label='Actual vs. Predicted')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--',␣
↪lw=2, color='red', label='Perfect Prediction')
plt.xlabel('Actual Gold Prices')

plt.ylabel('Predicted Gold Prices')
plt.title('Actual vs. Predicted Gold Prices (Linear Regression)')
plt.legend()
plt.grid(True)
plt.show()
C:\Users\lariy\AppData\Local\Temp\ipykernel_17032\3438038971.py:6: UserWarning:
color is redundantly defined by the 'color' keyword argument and the fmt string
32
"k--" (-> color='k'). The keyword argument will take precedence.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--',
lw=2, color='red', label='Perfect Prediction')
[ ]: #CALCULATING RESIDUALS
[ ]: #checking assumptions
[78]: from sklearn.linear_model import LinearRegression

import numpy as np
# Assuming X is your independent variable and y is your dependent variable

# X and y should be numpy arrays or pandas DataFrames/Series
# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)
# Get the predicted values

predicted_values = model.predict(X)
# Calculate residuals
residuals = y - predicted_values
# Now 'residuals' contains the residuals from the regression model
33
[79]: from scipy.stats import shapiro
# Assuming 'residuals' is a variable containing the residuals from your␣

↪regression model
shapiro_test_statistic, shapiro_p_value = shapiro(residuals)
print("Shapiro-Wilk Test:")
print("Test Statistic:", shapiro_test_statistic)
print("p-value:", shapiro_p_value)
if shapiro_p_value > 0.05:

print("p-value > 0.05. Fail to reject the null hypothesis. Residuals are␣
↪normally distributed.")
else:
print("p-value <= 0.05. Reject the null hypothesis. Residuals are not␣
Shapiro-Wilk Test:
Test Statistic: 0.5694072286447804
p-value: 2.730521207845291e-58
p-value <= 0.05. Reject the null hypothesis. Residuals are not normally
distributed.
[81]: from statsmodels.stats.diagnostic import het_breuschpagan

import statsmodels.api as sm
# Assuming 'X' is your independent variable(s) and 'y' is your dependent␣

↪variable
# Fit the regression model

X = sm.add_constant(X) # Add constant if necessary
model = sm.OLS(y, X)
results = model.fit()
# Perform the Breusch-Pagan test

bp_test_statistic, bp_p_value, _, _ = het_breuschpagan(results.resid, X)
print("\nBreusch-Pagan Test:")
print("Test Statistic:", bp_test_statistic)
print("p-value:", bp_p_value)
Breusch-Pagan Test:
Test Statistic: 1100.672462731512
p-value: 2.3590049793082233e-241
[ ]: #APPLYING LOG TRANSFORMATION TO GET UNIFORM DISTRIBUTED RESIDUALS
34
[82]: import numpy as np
from scipy.stats import skew
# Calculate skewness for each column

skewness = data.apply(skew)
# Select columns with skewness greater than a threshold (e.g., 0.5)

skewed_cols = skewness[skewness > 0.5].index
# Apply log transformation to skewed columns

data[skewed_cols] = np.log1p(data[skewed_cols])
# Check the distributions after transformation
[97]: #Visualising distrbution of each variable to understand distribution

import seaborn as sns

# Plot the distribution of each variable

# Plot for 'Gold Prices (USD)' variable

sns.histplot(data['Gold Prices (USD)'], kde=True)
plt.title('Distribution of Gold Prices (USD)')
# Plot for 'Inflation' variable

sns.histplot(data['Inflation'], kde=True)
plt.title('Distribution of Inflation')
# Plot for 'Unemployment rate' variable

sns.histplot(df['Unemployment rate'], kde=True)
plt.title('Distribution of Unemployment rate')
# Plot for 'Interest rates' variable

sns.histplot(data['Interest rates'], kde=True)
plt.title('Distribution of Interest rates')
35
# Plot for 'Oil Prices (USD)' variable
sns.histplot(data['Oil Prices (USD)'], kde=True)
plt.title('Distribution of Oil Prices (USD)')
plt.tight_layout()
plt.show()
[92]: residuals_1= np.log(y) - np.log(predicted_values)
[ ]: #AGAIN running tests
[93]: from scipy.stats import shapiro
# Assuming 'log_residuals' is a variable containing the residuals after␣

↪applying the logarithmic transformation
shapiro_test_statistic, shapiro_p_value = shapiro(residuals_1)
print("Shapiro-Wilk Test:")
print("Test Statistic:", shapiro_test_statistic)
print("p-value:", shapiro_p_value)
if shapiro_p_value > 0.05:
36
print("p-value > 0.05. Fail to reject the null hypothesis. Residuals are␣
else:
print("p-value <= 0.05. Reject the null hypothesis. Residuals are not␣
Shapiro-Wilk Test:
Test Statistic: 0.45959720762702283
p-value: 2.042106007386736e-62
p-value <= 0.05. Reject the null hypothesis. Residuals are not normally
distributed.
[96]: from statsmodels.regression.linear_model import OLS

from statsmodels.stats.diagnostic import het_breuschpagan
# Fit the OLS regression model with the transformed data

model = OLS(y, X).fit()
# Compute the residuals

residuals = model.resid
# Perform the Breusch-Pagan test

bp_test_statistic, bp_p_value, _, _ = het_breuschpagan(residuals, X)
# Print the results

print("\nBreusch-Pagan Test:")
print("Test Statistic:", bp_test_statistic)
print("p-value:", bp_p_value)
Breusch-Pagan Test:
Test Statistic: 1100.672462731512
p-value: 2.3590049793082233e-241
[ ]: #RESULTS ARE THE SAME
[ ]: ------------------------------------------------------------------
[98]: #Weighted least square model

import numpy as np
import statsmodels.api as sm
# Fit a standard linear regression model

X = df[['Gold Prices (USD)']] # Independent variable
y = df['Gold Prices (USD)'] # Dependent variable
X = sm.add_constant(X) # Add constant if needed
37
model = sm.OLS(y, X).fit()
# Calculate residuals
residuals = model.resid
# Calculate weights based on the inverse of the variance of residuals

weights = 1.0 / np.var(residuals)
# Fit a weighted least squares regression model

weighted_model = sm.WLS(y, X, weights=weights).fit()
# Print summary of the weighted model

print(weighted_model.summary())
WLS Regression Results

==============================================================================
Dep. Variable: Gold Prices (USD) R-squared: 1.000
Model: WLS Adj. R-squared: 1.000
Method: Least Squares F-statistic: 3.375e+33
Date: Thu, 02 May 2024 Prob (F-statistic): 0.00
Time: 21:40:04 Log-Likelihood: 58148.
No. Observations: 2136 AIC: -1.163e+05
Df Residuals: 2134 BIC: -1.163e+05
Df Model: 1
Covariance Type: nonrobust
================================================================================
=====
coef std err t P>|t| [0.025
0.975]
--------------------------------------------------------------------------------
-----
const -1.705e-13 1.42e-14 -12.028 0.000 -1.98e-13
-1.43e-13
Gold Prices (USD) 1.0000 1.72e-17 5.81e+16 0.000 1.000
1.000
==============================================================================
Omnibus: 384.389 Durbin-Watson: 0.003
Prob(Omnibus): 0.000 Jarque-Bera (JB): 620.132
Skew: 1.302 Prob(JB): 2.19e-135
Kurtosis: 3.430 Cond. No. 1.48e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly
specified.
[2] The condition number is large, 1.48e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
38
[ ]: #CROSS VALIDATION
[99]: ##NEed to perform CROSS VALIDATION

from sklearn.model_selection import cross_val_score
# Assuming X_train and y_train are your training data and labels
# Instantiate the model
# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5) # You can adjust␣
↪the number of folds (cv) as needed
# Print the cross-validation scores

print("Cross-validation scores:", cv_scores)
# Calculate the mean and standard deviation of the cross-validation scores

mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()
print("Mean CV score:", mean_cv_score)
print("Standard deviation of CV scores:", std_cv_score)
Cross-validation scores: [1. 1. 1. 1. 1.]

Mean CV score: 1.0
Standard deviation of CV scores: 0.0
[102]: from sklearn.model_selection import train_test_split


↪random_state=42)
# Fit the model on the training data

model.fit(X_train, y_train)
# Evaluate the model on the training data

y_train_pred = model.predict(X_train)
training_score = r2_score(y_train, y_train_pred)
# Evaluate the model on the testing data

y_test_pred = model.predict(X_test)
testing_score = r2_score(y_test, y_test_pred)
39
print("Training R-squared:", training_score)
print("Testing R-squared:", testing_score)
Training R-squared: 1.0

Testing R-squared: 1.0
[ ]: ----------------------------------------------------------------------------------------------
[ ]: #TRYING MODELS
[ ]: #DECISION TREE
[103]: from sklearn.tree import DecisionTreeRegressor

# Assuming X contains your independent variable(s) and y contains your␣

↪dependent variable

↪random_state=42)
# Initialize the Decision Tree Regressor

tree_regressor = DecisionTreeRegressor(random_state=42)
# Fit the model on the training data

tree_regressor.fit(X_train, y_train)
# Make predictions on the testing data

y_pred_train = tree_regressor.predict(X_train)
y_pred_test = tree_regressor.predict(X_test)
# Evaluate the model

train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
print("Training R-squared:", train_r2)

print("Testing R-squared:", test_r2)

[105]: y_pred_dt = tree_regressor.predict(X_test)
# Calculate Mean Squared Error (MSE)

mse_dt = mean_squared_error(y_test, y_pred_dt)
40
# Calculate Root Mean Squared Error (RMSE)
rmse_dt = np.sqrt(mse_dt)
print("Decision Tree RMSE:", rmse_dt)
Decision Tree RMSE: 1.1855782531712218
[ ]: ----------------------------------------------------------------------------------------------
[ ]: #RANDOM FOREST
[107]: from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error

↪random_state=42)
# Initialize and fit the Random Forest Regression model

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
# Make predictions on training and testing data

y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)
# Calculate R-squared for training and testing data

r2_train_rf = r2_score(y_train, y_train_pred_rf)
r2_test_rf = r2_score(y_test, y_test_pred_rf)
print("Random Forest Regression")

print("Training R-squared:", r2_train_rf)
print("Testing R-squared:", r2_test_rf)
Random Forest Regression

[ ]: ----------------------------------------------------------------------------------------------
[ ]: #GRADIENT BOOSTING
[108]: # Initialize and fit the Gradient Boosting Regression model

gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
41
# Make predictions on training and testing data
y_train_pred_gb = gb_model.predict(X_train)
y_test_pred_gb = gb_model.predict(X_test)
# Calculate R-squared for training and testing data

r2_train_gb = r2_score(y_train, y_train_pred_gb)
r2_test_gb = r2_score(y_test, y_test_pred_gb)
print("\nGradient Boosting Regression")

print("Training R-squared:", r2_train_gb)
print("Testing R-squared:", r2_test_gb)
Gradient Boosting Regression

[112]: pip install xgboost
Requirement already satisfied: xgboost in

c:\users\lariy\appdata\local\programs\python\python310\lib\site-packages
(2.0.3)Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: scipy in

c:\users\lariy\appdata\local\programs\python\python310\lib\site-packages (from
xgboost) (1.12.0)
Requirement already satisfied: numpy in
c:\users\lariy\appdata\local\programs\python\python310\lib\site-packages (from
xgboost) (1.26.2)
[notice] A new release of pip available: 22.2.2 -> 24.0

[notice] To update, run: python.exe -m pip install --upgrade pip
[115]: #Cross validation for Linear Regression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import numpy as np
# Assuming X_train, X_test, y_train, y_test are your training and testing data
# 1. Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
42
# Calculate RMSE
y_pred_train_lr = linear_reg.predict(X_train)
rmse_train_lr = np.sqrt(mean_squared_error(y_train, y_pred_train_lr))
y_pred_test_lr = linear_reg.predict(X_test)
rmse_test_lr = np.sqrt(mean_squared_error(y_test, y_pred_test_lr))
# Cross-validation
cv_scores_lr = cross_val_score(linear_reg, X_train, y_train, cv=5,␣
↪scoring='neg_mean_squared_error')
cv_rmse_lr = np.sqrt(-cv_scores_lr)
# Print results
print("Linear Regression:")
print("Training RMSE:", rmse_train_lr)
print("Testing RMSE:", rmse_test_lr)
print("Mean CV RMSE:", cv_rmse_lr.mean())
print("Std CV RMSE:", cv_rmse_lr.std())
Linear Regression:
Training RMSE: 2.2829534657510607e-13
Testing RMSE: 2.3371014992128577e-13
Mean CV RMSE: 2.4273458808647473e-13
Std CV RMSE: 9.99530028073763e-14
[124]:
packages\sklearn\model_selection\_validation.py:547: FitFailedWarning:
170 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to
nan.
If these failures are not expected, you can try to debug them by setting
error_score='raise'.
Below are more details about the failures:

--------------------------------------------------------------------------------
170 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\lariy\AppData\Local\Programs\Python\Python310\lib\site-
packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
packages\sklearn\base.py", line 1467, in wrapper
estimator._validate_params()
packages\sklearn\base.py", line 666, in _validate_params
43
validate_parameter_constraints(
packages\sklearn\utils\_param_validation.py", line 95, in
validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features'
parameter of RandomForestRegressor must be an int in the range [1, inf), a float
in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto'
instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
packages\sklearn\model_selection\_search.py:1051: UserWarning: One or more of
the test scores are non-finite: [ nan -58.93907181 -37.90943391
-39.08803166 -43.19203625
-41.3236575 nan nan nan -57.15259269
-9.60570762 -40.55572985 nan -58.59893245 nan
-73.39584413 -18.16182559 nan -11.13065831 -49.6359911
nan -48.98262037 -11.31604215 -11.67992611 -48.91244679
-15.17315845 -27.45118465 nan -38.86970611 nan
-46.46008978 -76.8895055 -58.00598485 nan nan
-44.38463655 -11.17309129 nan -47.82243001 -43.27158338
-74.06200261 nan nan -17.23503776 -25.69643983
-60.15686948 nan -79.96635582 -27.44276843 -62.70058757
-58.44250228 nan -64.96921744 nan -63.50868913
-64.67395609 -70.42213274 -50.12022647 nan -27.87982487
nan -71.32384802 -25.74420683 nan nan
-73.91215454 -62.70968467 -38.69771057 -71.87860269 nan
-47.75998313 nan -57.17101105 -69.30109927 -27.04269829
nan -21.32999161 nan -11.68845447 -71.50186947
-58.46591787 -63.41533483 nan -25.82496422 -54.44954262
nan -25.74575808 -33.52783331 -32.28189951 nan
-71.64371947 -42.34704253 nan -4.80991776 nan
nan -25.35147124 -79.21123837 nan nan]
warnings.warn(
Best Hyperparameters: {'max_depth': 30, 'max_features': 'log2',
'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 123}
Training RMSE: 0.7535148520604379
Testing RMSE: 1.1135615099471048
warnings.warn(
44
warnings.warn(
[125]: #hyperparameter tunning

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
# Define the hyperparameters grid

param_dist = {
'n_estimators': randint(50, 200),
'max_depth': [None, 10, 20, 30],
'min_samples_split': randint(2, 20),
'min_samples_leaf': randint(1, 10),
'max_features': ['auto', 'sqrt', 'log2']
}
# Initialize the RandomForestRegressor

rf_regressor = RandomForestRegressor()
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_regressor,␣
↪param_distributions=param_dist, n_iter=100, cv=5,␣
↪scoring='neg_mean_squared_error', random_state=42)
random_search.fit(X_train, y_train)
# Get the best hyperparameters

best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)
# Initialize the RandomForestRegressor with the best hyperparameters

best_rf_regressor = RandomForestRegressor(**best_params)
# Fit the model with the best hyperparameters

best_rf_regressor.fit(X_train, y_train)
# Evaluate the model

train_rmse = mean_squared_error(y_train, best_rf_regressor.predict(X_train),␣
↪squared=False)
test_rmse = mean_squared_error(y_test, best_rf_regressor.predict(X_test),␣

↪squared=False)
print("Training RMSE:", train_rmse)

print("Testing RMSE:", test_rmse)
packages\sklearn\model_selection\_validation.py:547: FitFailedWarning:
45
170 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to
nan.
If these failures are not expected, you can try to debug them by setting
error_score='raise'.
Below are more details about the failures:

--------------------------------------------------------------------------------
170 fits failed with the following error:
Traceback (most recent call last):
packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
packages\sklearn\base.py", line 1467, in wrapper
estimator._validate_params()
packages\sklearn\base.py", line 666, in _validate_params
validate_parameter_constraints(
packages\sklearn\utils\_param_validation.py", line 95, in
validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features'
parameter of RandomForestRegressor must be an int in the range [1, inf), a float
in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto'
instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
packages\sklearn\model_selection\_search.py:1051: UserWarning: One or more of
the test scores are non-finite: [ nan -58.46487472 -37.58114991
-42.56400437 -43.44029062
-42.10161918 nan nan nan -57.94417192
-9.16763144 -36.80911199 nan -61.16860533 nan
-72.77088008 -17.58202262 nan -11.41052383 -47.83249256
nan -49.86612921 -11.77792633 -12.80862819 -48.15975327
-16.30022891 -26.89035096 nan -36.89072673 nan
-50.18467986 -75.53246766 -60.59819993 nan nan
-43.91990928 -11.54445493 nan -49.10973958 -45.45312132
-71.05456128 nan nan -16.48893719 -25.75054451
-60.00189991 nan -70.68815294 -27.23028137 -64.0287326
-56.70982844 nan -63.52681669 nan -63.1903081
-61.92191672 -70.84075316 -54.15822214 nan -27.03075124
nan -72.24201098 -27.96395719 nan nan
-73.31688285 -59.45094749 -37.26006078 -71.19256996 nan
-49.74931541 nan -56.2931076 -70.51918141 -26.51741668
nan -21.61383175 nan -11.67718781 -72.78182952
46
-57.23201728 -62.8046166 nan -27.12209161 -51.13025851
nan -27.13044707 -32.43912226 -32.90987822 nan
-69.76462899 -42.51182264 nan -4.74511793 nan
nan -26.98026314 -77.91433524 nan nan]
warnings.warn(
Best Hyperparameters: {'max_depth': 30, 'max_features': 'log2',
'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 123}
Training RMSE: 0.6656633581584913
Testing RMSE: 1.1766919161273477
warnings.warn(
warnings.warn(
47

Regression and Eda

Uploaded by

Copyright:

Available Formats

You might also like

Regression and Eda

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Regression and Eda

Uploaded by

Copyright:

Available Formats

regression-and-eda

[2]: import pandas as pd

[3]: Unnamed: 0 Unnamed: 1 Year Gold Prices (USD) Inflation \

Unemployment rate Interest rates Oil Prices (USD)

[2244 rows x 8 columns]

[7]: Unnamed: 0 Unnamed: 1 Year Gold Prices (USD) \

Inflation Unemployment rate Interest rates Oil Prices (USD)

[27]: Index(['Year', 'Gold Prices (USD)', 'Inflation', 'Unemployment rate',

[8]: #Removing unnecessary columns like "unnamed:0" and "unnamed:1"

df = df.drop(columns=['Unnamed: 0', 'Unnamed: 1'])

Oil Prices (USD)

[133]: # Visualize data using histograms

# Visualize data using box plots

# Visualize data using scatter plots

[133]: <Axes: xlabel='Year', ylabel='Gold Prices (USD)'>

df = df.dropna(subset=['Inflation', 'Unemployment rate'])

# Display the modified dataframe

Year Gold Prices (USD) Inflation Unemployment rate Interest rates \

Oil Prices (USD)

[ ]: #missng values are removed , left us with 2136 rows"

[14]: #Visualising distrbution of each variable to understand distribution

# Assuming your dataframe is named 'df'

# Set the style for seaborn plots

# Plot the distribution of each variable

# Plot for 'Gold Prices (USD)' variable

# Plot for 'Inflation' variable

# Plot for 'Unemployment rate' variable

# Plot for 'Interest rates' variable

# Plot for 'Oil Prices (USD)' variable

[15]: import seaborn as sns

# Assuming your dataframe is named 'df'

# Set the style for seaborn plots

# Plot the trend of each variable over time (year)

# Plot for 'Gold Prices (USD)'

# Plot for 'Inflation'

# Plot for 'Unemployment rate'

# Plot for 'Interest rates'

# Plot for 'Oil Prices (USD)'

# Pairwise relationships between variables

# Assuming your dataframe is named 'df' with the relevant columns

# Plot the correlation matrix as a heatmap

plt.title('Correlation Matrix between Variables')

[19]: #OUTLIER DETECTION

# Assuming your dataframe is named 'df' with the relevant variables

# Define a function to detect outliers using z-score

# Define a function to detect outliers using IQR

# Iterate over each column in the dataframe to detect outliers

# Display outliers for each variable

Outliers in Gold Prices (USD):

Outliers in Unemployment rate:

Outliers in Interest rates:

Outliers in Oil Prices (USD):

[21]: import seaborn as sns

# Assuming your dataframe is named 'df' with the relevant variables

# Set the figure size

[20]: import pandas as pd

# Assuming your dataframe is named 'df' with the relevant variables

# Define a function to detect outliers using IQR