ARIMA: Prediction and Forecasting


In [1]: import pandas as pd

data = pd.read_excel('Downloads/Daily_temp.xlsx', sheet_name='Sheet1')

In [3]: data['Date'] = pd.to_datetime(data['Date'])

In [5]: data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650 entries, 0 to 3649
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Date    3650 non-null   datetime64[ns]
 1   Temp    3650 non-null   float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 57.2 KB

In [6]: # Set Date as the index
d = data.set_index('Date')

In [7]: d.head(3)

Out[7]:
            Temp
Date
1981-01-01  20.7
1981-01-02  17.9
1981-01-03  18.8
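One practical note: `set_index` alone usually leaves the DatetimeIndex with `freq=None`, and statsmodels then warns that no frequency information was provided. For gap-free daily data, `asfreq('D')` attaches it. A small sketch (the three dates are taken from the head above):

```python
import pandas as pd

# After set_index('Date'), the DatetimeIndex typically has freq=None.
# For daily data with no gaps, asfreq('D') attaches the frequency, which
# statsmodels uses when building forecast indexes.
df = pd.DataFrame(
    {'Temp': [20.7, 17.9, 18.8]},
    index=pd.to_datetime(['1981-01-01', '1981-01-02', '1981-01-03']),
)
df.index.name = 'Date'
df = df.asfreq('D')     # sets index.freq to daily
print(df.index.freqstr)  # -> 'D'
```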

In [8]: from matplotlib import pyplot as plt

In [12]: d.plot(figsize=(15, 6))
plt.title("Daily Temperature", color='green')
plt.xlabel('Year', color='red')
plt.ylabel('Temp', color='red')
plt.show()

In [13]: from arch.unitroot import ADF

In [14]: adf = ADF(d, lags=20)
print(adf.summary())

   Augmented Dickey-Fuller Results
=====================================
Test Statistic                 -4.445
P-value                         0.000
Lags                               20
-------------------------------------
Trend: Constant
Critical Values: -3.43 (1%), -2.86 (5%), -2.57 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary.

In [15]: from arch.unitroot import PhillipsPerron as pp

In [16]: ppt = pp(d, lags=50)
print(ppt.summary())

     Phillips-Perron Test (Z-tau)
=====================================
Test Statistic                -37.280
P-value                         0.000
Lags                               50
-------------------------------------
Trend: Constant
Critical Values: -3.43 (1%), -2.86 (5%), -2.57 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary.

In [18]: from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [19]: plot_acf(d, lags=20)
plt.show()

In [20]: plot_pacf(d, lags=20, method='ols')
plt.show()

In [21]: from statsmodels.tsa.ar_model import AutoReg

In [22]: a = d.values

In [26]: Model_AR8 = AutoReg(a, lags=8).fit()

In [27]: print(Model_AR8.summary())

                            AutoReg Model Results
==============================================================================
Dep. Variable:                      y   No. Observations:                 3650
Model:                     AutoReg(8)   Log Likelihood               -8406.781
Method:               Conditional MLE   S.D. of innovations              2.434
Date:                Wed, 06 Mar 2024   AIC                          16833.563
Time:                        22:46:41   BIC                          16895.566
Sample:                             8   HQIC                         16855.646
                                 3650
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0333      0.140      7.379      0.000       0.759       1.308
y.L1           0.6184      0.017     37.428      0.000       0.586       0.651
y.L2          -0.0783      0.019     -4.033      0.000      -0.116      -0.040
y.L3           0.0690      0.019      3.549      0.000       0.031       0.107
y.L4           0.0574      0.019      2.951      0.003       0.019       0.096
y.L5           0.0602      0.019      3.094      0.002       0.022       0.098
y.L6           0.0493      0.019      2.538      0.011       0.011       0.087
y.L7           0.0562      0.019      2.894      0.004       0.018       0.094
y.L8           0.0750      0.017      4.541      0.000       0.043       0.107
                                    Roots
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            1.0347           -0.0000j            1.0347           -0.0000
AR.2            0.8986           -0.9252j            1.2897           -0.1273
AR.3            0.8986           +0.9252j            1.2897            0.1273
AR.4            0.0699           -1.4139j            1.4157           -0.2421
AR.5            0.0699           +1.4139j            1.4157            0.2421
AR.6           -1.6176           -0.0000j            1.6176           -0.5000
AR.7           -1.0516           -1.1332j            1.5460           -0.3691
AR.8           -1.0516           +1.1332j            1.5460            0.3691
-----------------------------------------------------------------------------

In [29]: Model_AR8.plot_diagnostics(figsize=(15, 12))
plt.show()

In [30]: residual = Model_AR8.resid

In [31]: plot_acf(residual, lags=20)
plt.show()

In [32]: plot_pacf(residual, lags=20, method='ols')
plt.show()

In [34]: from statsmodels.stats.diagnostic import acorr_ljungbox as gb

In [35]: gb(residual, lags=10, return_df=True)

Out[35]:
      lb_stat  lb_pvalue
1    0.120086   0.728941
2    0.492914   0.781565
3    1.286879   0.732251
4    3.055532   0.548576
5    6.022535   0.304032
6   10.872029   0.092413
7   14.968504   0.036405
8   38.589406   0.000006
9   39.281441   0.000010
10  39.826915   0.000018

In [37]: from statsmodels.tsa.arima.model import ARIMA

In [40]: Model = ARIMA(a, order=(3, 0, 1)).fit()

In [41]: print(Model.summary())

                               SARIMAX Results
==============================================================================
Dep. Variable:                      y   No. Observations:                 3650
Model:                 ARIMA(3, 0, 1)   Log Likelihood               -8382.800
Date:                Wed, 06 Mar 2024   AIC                          16777.600
Time:                        22:53:01   BIC                          16814.815
Sample:                             0   HQIC                         16790.853
                               - 3650
Covariance Type:                  opg
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         11.4717      0.801     14.322      0.000       9.902      13.042
ar.L1          1.4851      0.018     80.334      0.000       1.449       1.521
ar.L2         -0.6204      0.027    -23.367      0.000      -0.672      -0.568
ar.L3          0.1304      0.018      7.386      0.000       0.096       0.165
ma.L1         -0.8928      0.012    -76.046      0.000      -0.916      -0.870
sigma2         5.7833      0.128     45.330      0.000       5.533       6.033
===================================================================================
Ljung-Box (L1) (Q):                   0.01   Jarque-Bera (JB):                16.70
Prob(Q):                              0.94   Prob(JB):                         0.00
Heteroskedasticity (H):               0.86   Skew:                             0.08
Prob(H) (two-sided):                  0.01   Kurtosis:                         3.28
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

In [42]: res = Model.resid

In [45]: Model.plot_diagnostics(figsize=(15, 12))
plt.show()

In [49]: plot_pacf(res, lags=20, method='ols')
plt.show()

In [50]: gb(res, lags=10, return_df=True)

Out[50]:
     lb_stat  lb_pvalue
1   0.006790   0.934329
2   0.008225   0.995896
3   0.009125   0.999769
4   0.082014   0.999182
5   0.094185   0.999860
6   0.310555   0.999444
7   0.577923   0.999108
8   0.673870   0.999589
9   0.916520   0.999607
10  2.172116   0.994843

In [55]: # Computing R-squared
import numpy as np
actual_mean = np.mean(a)
tss = np.sum((a - actual_mean)**2)

In [56]: rss=np.sum(res**2)

In [59]: rsq = 1 - (rss/tss)
print('R-Squared:', rsq)

R-Squared: 0.6501153054406361
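The computation above can be wrapped in a small reusable function (the function name is an illustrative choice):

```python
import numpy as np

# R^2 = 1 - RSS/TSS, with TSS taken around the series mean,
# exactly as computed cell by cell above.
def r_squared(actual, resid):
    actual = np.asarray(actual, dtype=float)
    resid = np.asarray(resid, dtype=float)
    tss = np.sum((actual - actual.mean()) ** 2)
    rss = np.sum(resid ** 2)
    return 1.0 - rss / tss

# Tiny check: zero residuals give R^2 of exactly 1.
print(r_squared([1.0, 2.0, 3.0], [0.0, 0.0, 0.0]))  # -> 1.0
```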

In [51]: # Model comparison: choose the better model and use it for prediction

In [53]: len(a)

Out[53]: 3650

In [69]: # Hold out the last 20 observations: train has 3630, test has 20
train = a[:len(a)-20]

In [71]: len(train)

Out[71]: 3630

In [68]: test=a[len(a)-20:]

In [70]: len(test)

Out[70]: 20

In [72]: predict = Model.predict(start=len(train), end=len(d)-1, dynamic=False)

In [73]: # Get the predicted values
predict

Out[73]: array([12.98550599, 14.48991393, 12.50693103, 13.37909663, 13.66951836,
       13.63231113, 13.77002824, 15.68759928, 13.91063827, 14.71991477,
       13.31994904, 13.64742264, 13.99256948, 11.56942667, 13.63452401,
       14.17770342, 13.63062571, 13.49909431, 13.49237402, 14.79938027])

In [74]: plt.plot(predict)
plt.plot(test, color='red')
plt.show()

In [76]: from sklearn.metrics import mean_squared_error as mse

In [78]: rmse = np.sqrt(mse(test,predict))

In [79]: rmse  # On average, the predicted temperatures differ from the actual
# values by about 1.66 degrees Celsius (depending on the scale used)

Out[79]: 1.6598401226331614
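RMSE can also be computed directly with NumPy. One caveat worth keeping in mind: `Model` was fit on the full series in In [40], so the values compared against `test` are in-sample predictions rather than forecasts from a model refit on `train` alone. A minimal sketch of RMSE and MAE (the function names are illustrative):

```python
import numpy as np

# RMSE as computed via sklearn above, plus MAE as a scale-matched companion.
def rmse(actual, predicted):
    actual, predicted = np.asarray(actual), np.asarray(predicted)
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

def mae(actual, predicted):
    actual, predicted = np.asarray(actual), np.asarray(predicted)
    return float(np.mean(np.abs(actual - predicted)))

print(rmse([3.0, 5.0], [1.0, 5.0]))  # sqrt((4+0)/2) ≈ 1.414
print(mae([3.0, 5.0], [1.0, 5.0]))   # (2+0)/2 = 1.0
```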

In [80]: # Forecast
pred_t = Model.get_forecast(steps=15)

In [84]: fc = pred_t.predicted_mean

In [85]: fc

Out[85]: array([12.98912502, 13.32829955, 13.48681285, 13.51038796, 13.49127239,
       13.46892017, 13.4506564 , 13.43490765, 13.41993572, 13.4050901 ,
       13.39027797, 13.37553849, 13.36090258, 13.34637979, 13.33197021])

In [97]: # We need it in tabular form: Date and forecasted value as columns
n = 15
fcast_index = pd.date_range(start=d.index[-1], periods=n+1, freq=d.index.freq)

In [89]: d.index[-1]

Out[89]: Timestamp('1990-12-31 00:00:00')

In [98]: fcast_index

Out[98]: DatetimeIndex(['1990-12-31', '1991-01-01', '1991-01-02', '1991-01-03',
               '1991-01-04', '1991-01-05', '1991-01-06', '1991-01-07',
               '1991-01-08', '1991-01-09', '1991-01-10', '1991-01-11',
               '1991-01-12', '1991-01-13', '1991-01-14', '1991-01-15'],
              dtype='datetime64[ns]', freq='D')
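`pd.date_range` includes its start point, which is why the call above asks for `periods=n+1` starting at the last observed date and then drops the first entry. A compact sketch with the last date hard-coded from the output above:

```python
import pandas as pd

# Build forecast labels from the last observed date. date_range includes
# its start point, so ask for periods=n+1 and drop the first entry.
last_date = pd.Timestamp('1990-12-31')  # d.index[-1] from the output above
n = 15
idx = pd.date_range(start=last_date, periods=n + 1, freq='D')[1:]
print(idx[0], idx[-1], len(idx))
```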

In [100]: # DataFrame
Future_temp = pd.DataFrame({'Date': fcast_index[1:], 'Forecasted Temp': fc})

In [101]: Future_temp

Out[101]:
         Date  Forecasted Temp
0  1991-01-01        12.989125
1  1991-01-02        13.328300
2  1991-01-03        13.486813
3  1991-01-04        13.510388
4  1991-01-05        13.491272
5  1991-01-06        13.468920
6  1991-01-07        13.450656
7  1991-01-08        13.434908
8  1991-01-09        13.419936
9  1991-01-10        13.405090
10 1991-01-11        13.390278
11 1991-01-12        13.375538
12 1991-01-13        13.360903
13 1991-01-14        13.346380
14 1991-01-15        13.331970

In [103]: # Set Date as the index
Future_temp.set_index('Date', inplace=True)

In [104]: Future_temp

Out[104]:
            Forecasted Temp
Date
1991-01-01        12.989125
1991-01-02        13.328300
1991-01-03        13.486813
1991-01-04        13.510388
1991-01-05        13.491272
1991-01-06        13.468920
1991-01-07        13.450656
1991-01-08        13.434908
1991-01-09        13.419936
1991-01-10        13.405090
1991-01-11        13.390278
1991-01-12        13.375538
1991-01-13        13.360903
1991-01-14        13.346380
1991-01-15        13.331970

In [107]: Future_temp.plot(figsize=(10, 5))
plt.show()
