
In [2]: import warnings

warnings.filterwarnings('ignore')
import pandas as pd
from pandas import Series,DataFrame
from pandas.plotting import lag_plot
import numpy as np
import seaborn as sns
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt


import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [3]: import pandas_datareader.data as web


import datetime
listofcomp=['AAPL','GOOG','MSFT','AMZN']
end = datetime.datetime.now()
start = datetime.datetime(end.year - 1,end.month,end.day)

for x in listofcomp:
    globals()[x] = web.DataReader(x, 'yahoo', start, end)

In [4]: AAPL.describe()

Out[4]:
             High         Low        Open       Close        Volume   Adj Close
count  253.000000  253.000000  253.000000  253.000000  2.530000e+02  253.000000
mean   140.128953  137.296324  138.672016  138.780712  9.128900e+07  138.364038
std     12.584228   12.366709   12.369123   12.528156  2.937889e+07   12.701024
min    120.400002  116.209999  119.029999  116.360001  4.100000e+07  115.819870
25%    129.720001  126.809998  128.410004  127.900002  6.990710e+07  127.306313
50%    137.339996  134.589996  136.279999  136.690002  8.722280e+07  135.927673
75%    149.169998  146.470001  148.270004  148.119995  1.068203e+08  147.919998
max    182.130005  175.529999  181.119995  179.449997  1.925415e+08  179.449997


In [5]: AAPL.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 253 entries, 2020-12-14 to 2021-12-14
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 High 253 non-null float64
1 Low 253 non-null float64
2 Open 253 non-null float64
3 Close 253 non-null float64
4 Volume 253 non-null float64
5 Adj Close 253 non-null float64
dtypes: float64(6)
memory usage: 13.8 KB

In [6]: AAPL['Adj Close'].plot(legend=True,figsize=(10,4))

Out[6]: <AxesSubplot:xlabel='Date'>

In [7]: AAPL['Volume'].plot(legend=True,figsize=(10,4))

Out[7]: <AxesSubplot:xlabel='Date'>
In [8]: ma_day = [10, 20, 50]
        for ma in ma_day:
            column_name = 'MA for %s days' % (str(ma))
            # rolling mean of the adjusted close over the chosen window
            AAPL[column_name] = pd.Series(AAPL['Adj Close']).rolling(window=ma).mean()

In [9]: AAPL[['Adj Close','MA for 10 days','MA for 20 days','MA for 50 days']].plot(figsize=(10,4))

Out[9]: <AxesSubplot:xlabel='Date'>

In [10]: AAPL['Daily Return'] =AAPL['Adj Close'].pct_change()

AAPL['Daily Return'].plot(figsize=(10,4), legend=True, linestyle='--', marker='o')

Out[10]: <AxesSubplot:xlabel='Date'>
In [11]: AAPL['Daily Return'].hist(color='purple')

Out[11]: <AxesSubplot:>

In [12]: sns.distplot(AAPL['Daily Return'].dropna(), bins=100, color='purple')

Out[12]: <AxesSubplot:xlabel='Daily Return', ylabel='Density'>

In [13]: closing_df = web.DataReader(['AAPL','GOOG','MSFT','AMZN'],'yahoo',start,end)['Adj Close']

In [14]: closing_df = web.DataReader(listofcomp,'yahoo',start,end)['Adj Close']


In [15]: closing_df.head()

Out[15]:
Symbols           AAPL         GOOG        MSFT         AMZN
Date
2020-12-14  121.033844  1760.060059  212.419434  3156.969971
2020-12-15  127.096466  1767.770020  212.350021  3165.120117
2020-12-16  127.026901  1763.000000  217.457199  3240.959961
2020-12-17  127.911446  1747.900024  217.596039  3236.080078
2020-12-18  125.883957  1731.010010  216.772949  3201.649902

In [16]: tech_rets = closing_df.pct_change()

In [17]: tech_rets.head()

Out[17]:
Symbols         AAPL      GOOG      MSFT      AMZN
Date
2020-12-14       NaN       NaN       NaN       NaN
2020-12-15  0.050090  0.004381 -0.000327  0.002582
2020-12-16 -0.000547 -0.002698  0.024051  0.023961
2020-12-17  0.006963 -0.008565  0.000638 -0.001506
2020-12-18 -0.015851 -0.009663 -0.003783 -0.010639


In [18]: sns.jointplot('GOOG','GOOG',tech_rets,kind='scatter',color='seagreen')

Out[18]: <seaborn.axisgrid.JointGrid at 0x7fa297f34ac0>


In [19]: sns.jointplot('GOOG','MSFT',tech_rets,kind='scatter')

Out[19]: <seaborn.axisgrid.JointGrid at 0x7fa297721130>

In [20]: sns.pairplot(tech_rets.dropna())

Out[20]: <seaborn.axisgrid.PairGrid at 0x7fa2984b9c70>


In [21]:
returns_fig = sns.PairGrid(tech_rets.dropna())

returns_fig.map_upper(plt.scatter,color='purple')

returns_fig.map_lower(sns.kdeplot,cmap='cool_d')

returns_fig.map_diag(plt.hist,bins=30)

Out[21]: <seaborn.axisgrid.PairGrid at 0x7fa298eb8ac0>

We can also analyze the correlation of the closing prices using this exact same
technique.
In [22]: returns_fig = sns.PairGrid(closing_df)

returns_fig.map_upper(plt.scatter,color='purple')

returns_fig.map_lower(sns.kdeplot,cmap='cool_d')

returns_fig.map_diag(plt.hist,bins=30)

Out[22]: <seaborn.axisgrid.PairGrid at 0x7fa298fe1490>

Reference: https://www.hackerearth.com/blog/developers/data-visualization-techniques/ (all of these plots are explained well there).

In [23]: rets = tech_rets.dropna()


In [24]: rets = tech_rets.dropna()
         area = np.pi*20
         plt.scatter(rets.mean(), rets.std(), alpha=0.5, s=area)
         plt.ylim([0.01, 0.040])
         plt.xlim([-0.003, 0.004])
         plt.xlabel('Expected returns')
         plt.ylabel('Risk')

         # label each point with its ticker symbol
         for label, x, y in zip(rets.columns, rets.mean(), rets.std()):
             plt.annotate(
                 label,
                 xy=(x, y), xytext=(50, 50),
                 textcoords='offset points', ha='right', va='bottom',
                 arrowprops=dict(arrowstyle='-', connectionstyle='arc3,rad=-0.3'))

Value at Risk

We can treat value at risk (VaR) as the amount of money we could expect to lose (i.e. put
at risk) at a given confidence level. There are several methods we can use to estimate
value at risk.

Value at risk using the "bootstrap" method

For this method we will calculate the empirical quantiles from a histogram of daily
returns.
In [25]: sns.distplot(AAPL['Daily Return'].dropna(),bins=100,color='purple')

Out[25]: <AxesSubplot:xlabel='Daily Return', ylabel='Density'>

In [26]: rets['AAPL'].quantile(0.05)

Out[26]: -0.025191669250715143
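The 0.05 empirical quantile of daily returns is about -0.025, meaning that with 95% confidence our worst daily loss should not exceed roughly 2.5% of the investment. As a rough sketch (the $1,000,000 position size below is purely hypothetical, not taken from the notebook), the quantile can be translated into a dollar VaR figure:

position = 1_000_000                              # hypothetical amount invested in AAPL
var_95 = position * rets['AAPL'].quantile(0.05)   # 5% quantile of daily returns
print('1-day 95%% VaR: $%.2f' % abs(var_95))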

In [27]: days = 365                      # time horizon for the simulation
         dt = 1/days                     # time step for each simulated day
         mu = rets.mean()['GOOG']        # drift: average daily return of Google
         sigma = rets.std()['GOOG']      # volatility: std of Google's daily returns

In [28]: def stock_monte_carlo(start_price, days, mu, sigma):
             # simulate a single price path from daily drift plus random shocks
             price = np.zeros(days)
             price[0] = start_price
             shock = np.zeros(days)
             drift = np.zeros(days)

             for x in range(1, days):
                 shock[x] = np.random.normal(loc=mu*dt, scale=sigma*np.sqrt(dt))
                 drift[x] = mu*dt
                 price[x] = price[x-1] + (price[x-1]*(drift[x] + shock[x]))
             return price
In [29]: start_price = 569.85
         for run in range(100):
             plt.plot(stock_monte_carlo(start_price, days, mu, sigma))
         plt.xlabel('Days')
         plt.ylabel('Price')
         plt.title('Monte Carlo Analysis for Google')

Out[29]: Text(0.5, 1.0, 'Monte Carlo Analysis for Google')

In [30]: runs = 1000
         simulations = np.zeros(runs)
         np.set_printoptions(threshold=5)
         for run in range(runs):
             # keep only the final simulated price from each run
             simulations[run] = np.round(stock_monte_carlo(start_price, days, mu, sigma)[days-1], 2)
In [31]: q = np.percentile(simulations, 1)
         plt.hist(simulations, bins=30)
         plt.figtext(0.6, 0.8, s='Start price: $%.2f' % start_price)
         plt.figtext(0.6, 0.7, 'Mean final price: $%.2f' % simulations.mean())
         plt.figtext(0.6, 0.6, 'VaR(0.99): $%.2f' % (start_price - q,))
         plt.figtext(0.15, 0.6, 'q(0.99): $%.2f' % q)

         plt.axvline(x=q, linewidth=4, color='r')

         plt.title('Final price distribution for Google stock after %s days' % days)

In [32]: listofcomp = ['AAPL','GOOG','MSFT','AMZN']
         end = datetime.datetime.now()
         start = datetime.datetime(end.year - 5, end.month, end.day)

         for x in listofcomp:
             globals()[x] = web.DataReader(x, 'yahoo', start, end)

In [33]: AAPL.describe()

Out[33]:
              High          Low         Open        Close        Volume    Adj Close
count  1259.000000  1259.000000  1259.000000  1259.000000  1.259000e+03  1259.000000
mean     74.067123    72.510326    73.268038    73.327927  1.210569e+08    72.131347
std      40.132350    39.165632    39.650271    39.671464  5.612718e+07    40.151178
min      29.049999    28.690001    28.760000    28.797501  4.100000e+07    27.106562
25%      43.148750    42.422499    42.778749    42.768751  8.388080e+07    41.179798
50%      53.794998    52.767502    53.297501    53.259998  1.062040e+08    51.864834
75%     116.065002   112.840000   114.670002   114.965000  1.409695e+08   114.063820
max     182.130005   175.529999   181.119995   179.449997  4.479400e+08   179.449997


In [34]: AAPL.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1259 entries, 2016-12-14 to 2021-12-14
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 High 1259 non-null float64
1 Low 1259 non-null float64
2 Open 1259 non-null float64
3 Close 1259 non-null float64
4 Volume 1259 non-null float64
5 Adj Close 1259 non-null float64
dtypes: float64(6)
memory usage: 68.9 KB

In [35]: AAPL['Adj Close'].plot(legend=True,figsize=(10,4))


plt.title('APPLE STOCK PRICES')

Out[35]: Text(0.5, 1.0, 'APPLE STOCK PRICES')


In [36]: train_data, test_data = AAPL[0:int(len(AAPL)*0.8)], AAPL[int(len(AAPL)*0.8):]
         plt.figure(figsize=(14,7))
         plt.title('Apple Stock Prices')
         plt.xlabel('Dates')
         plt.ylabel('Prices')
         plt.plot(AAPL['Close'], 'blue', label='Training Data')
         plt.plot(test_data['Close'], 'green', label='Testing Data')
         plt.legend()

Out[36]: <matplotlib.legend.Legend at 0x7fa29b4703d0>


ARIMA

ARIMA (AutoRegressive Integrated Moving Average) is a forecasting algorithm based on
the idea that the information in the past values of a time series can, on its own, be used to
predict future values.

ARIMA models explain a time series based on its own past values, that is, its own lags
and the lagged forecast errors.

An ARIMA model is characterized by 3 terms (p, d, q): p is the order of the AR term, d is
the number of differencing steps required to make the time series stationary, and q is the
order of the MA term.

As the d term suggests, any non-seasonal time series that can be made stationary by
differencing (and is not simply white noise) can be modeled with an ARIMA model.
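As a minimal sketch (using the same statsmodels ARIMA class imported at the top, and an illustrative order rather than one already justified by the analysis), the three terms are simply passed as the order tuple when fitting:

# illustrative only: fit an ARIMA with p=2, d=1, q=1 on the closing prices
model = ARIMA(AAPL['Close'].values, order=(2, 1, 1))
model_fit = model.fit(disp=0)          # disp=0 silences optimizer output
print(model_fit.summary())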

STATIONARITY

A stationary time series is one whose properties do not depend on the time at which the
series is observed. Thus, time series with trends, or with seasonality, are not stationary —
the trend and seasonality will affect the value of the time series at different times.

To difference the series, subtract the previous value from the current value. If differencing
once does not yield a stationary series, we may need to difference multiple times.

The minimum number of differencing operations needed to make the series stationary is
the value of d that we supply to our ARIMA model.
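A minimal sketch of differencing with pandas (using the same AAPL closing-price column as the rest of the notebook):

# first-order differencing: today's close minus yesterday's close
diff1 = AAPL['Close'].diff().dropna()

# second-order differencing, in case one round is not enough
diff2 = AAPL['Close'].diff().diff().dropna()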

ADF TEST

We'll use the Augmented Dickey-Fuller (ADF) test to check if the price series is
stationary.

The null hypothesis of the ADF test is that the time series is non-stationary. So, if the p-
value of the test is less than the significance level (0.05) then we can reject the null
hypothesis and infer that the time series is indeed stationary.

So, in our case, if the p-value > 0.05 we'll need to find the order of differencing.
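The decision rule above can also be expressed directly in code (a small sketch using the same adfuller function that is called in the next cell):

from statsmodels.tsa.stattools import adfuller

p_value = adfuller(AAPL['Close'].dropna())[1]
if p_value < 0.05:
    print('p = %.4f < 0.05: reject the null, the series looks stationary' % p_value)
else:
    print('p = %.4f >= 0.05: cannot reject the null, difference the series' % p_value)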

In [37]: #Check if price series is stationary


from statsmodels.tsa.stattools import adfuller

result = adfuller(AAPL.Close.dropna())
print(f"ADF Statistic: {result[0]}")
print(f"p-value: {result[1]}")

ADF Statistic: 1.130600379322786


p-value: 0.9954695061218785

Since the p-value > 0.05, the time series is not stationary.


In [38]: pip install pmdarima

Requirement already satisfied: pmdarima in ./opt/anaconda3/lib/pyt


hon3.8/site-packages (1.8.3)
Requirement already satisfied: scikit-learn>=0.22 in ./opt/anacond
a3/lib/python3.8/site-packages (from pmdarima) (0.24.1)
Requirement already satisfied: Cython!=0.29.18,>=0.29 in ./opt/ana
conda3/lib/python3.8/site-packages (from pmdarima) (0.29.23)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in ./op
t/anaconda3/lib/python3.8/site-packages (from pmdarima) (52.0.0.po
st20210125)
Requirement already satisfied: joblib>=0.11 in ./opt/anaconda3/lib
/python3.8/site-packages (from pmdarima) (1.0.1)
Requirement already satisfied: urllib3 in ./opt/anaconda3/lib/pyth
on3.8/site-packages (from pmdarima) (1.26.4)
Requirement already satisfied: scipy>=1.3.2 in ./opt/anaconda3/lib
/python3.8/site-packages (from pmdarima) (1.6.2)
Requirement already satisfied: pandas>=0.19 in ./opt/anaconda3/lib
/python3.8/site-packages (from pmdarima) (1.2.4)
Requirement already satisfied: statsmodels!=0.12.0,>=0.11 in ./opt
/anaconda3/lib/python3.8/site-packages (from pmdarima) (0.12.2)
Requirement already satisfied: numpy>=1.19.3 in ./opt/anaconda3/li
b/python3.8/site-packages (from pmdarima) (1.20.1)
Requirement already satisfied: python-dateutil>=2.7.3 in ./opt/ana
conda3/lib/python3.8/site-packages (from pandas>=0.19->pmdarima) (
2.8.1)
Requirement already satisfied: pytz>=2017.3 in ./opt/anaconda3/lib
/python3.8/site-packages (from pandas>=0.19->pmdarima) (2021.1)
Requirement already satisfied: six>=1.5 in ./opt/anaconda3/lib/pyt
hon3.8/site-packages (from python-dateutil>=2.7.3->pandas>=0.19->p
mdarima) (1.15.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in ./opt/anaco
nda3/lib/python3.8/site-packages (from scikit-learn>=0.22->pmdarim
a) (2.1.0)
Requirement already satisfied: patsy>=0.5 in ./opt/anaconda3/lib/p
ython3.8/site-packages (from statsmodels!=0.12.0,>=0.11->pmdarima)
(0.5.1)
Note: you may need to restart the kernel to use updated packages.

In [39]: from pmdarima.arima.utils import ndiffs


ndiffs(AAPL.Close, test = "adf")

Out[39]: 1
Therefore the value of d is 1.

p is the order of the Auto Regressive (AR) term. It refers to the number of lags to be used
as predictors.

We can find out the required number of AR terms by inspecting the Partial Autocorrelation
(PACF) plot.

The partial autocorrelation at lag k is the correlation between the series and its k-th lag,
after removing the effect of the intermediate lags.

In [40]: from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

diff = AAPL.Close.diff().dropna()

fig, (ax1, ax2) = plt.subplots(1,2, figsize=(16,4))

ax1.plot(diff)
ax1.set_title('Difference once')
ax2.set_ylim(0,1)
plot_pacf(diff, ax=ax2);

q is the order of the Moving Average (MA) term. It refers to the number of lagged forecast
errors that should go into the ARIMA model.

We can look at the ACF plot for the number of MA terms.


In [41]: diff = AAPL.Close.diff().dropna()

fig, (ax1, ax2) = plt.subplots(1,2, figsize = (16,4))

ax1.plot(diff)
ax1.set_title('Difference once')
ax2.set_ylim(0,1)
plot_acf(diff, ax=ax2);

In [42]: dataset = AAPL.copy()


dataset= dataset[['Close']]
from matplotlib import pyplot
pyplot.figure()
pyplot.subplot(211)
plot_acf(dataset, ax=pyplot.gca(), lags=10)
pyplot.subplot(212)
plot_pacf(dataset, ax=pyplot.gca(), lags=10)
pyplot.show()

In order to evaluate the ARIMA model, I decided to use two different error functions:
Mean Squared Error (MSE) and Symmetric Mean Absolute Percentage Error (SMAPE).
SMAPE is commonly used as an accuracy measure based on relative errors.

SMAPE is not currently available in scikit-learn as a loss function, so I first had to
implement it myself.
In [43]: def smape_kun(y_true, y_pred):
             return np.mean((np.abs(y_pred - y_true) * 200 / (np.abs(y_pred) + np.abs(y_true))))

In [44]: train_ar = train_data['Close'].values
         test_ar = test_data['Close'].values
         history = [x for x in train_ar]
         print(type(history))
         predictions = list()
         # walk-forward validation: refit the model at each step on all data seen so far
         for t in range(len(test_ar)):
             model = ARIMA(history, order=(2,1,1))
             model_fit = model.fit(disp=0)
             output = model_fit.forecast()
             yhat = output[0]
             predictions.append(yhat)
             obs = test_ar[t]
             history.append(obs)
         error = mean_squared_error(test_ar, predictions)
         print('Testing Mean Squared Error: %.3f' % error)
         error2 = smape_kun(test_ar, predictions)
         print('Symmetric mean absolute percentage error: %.3f' % error2)

<class 'list'>
Testing Mean Squared Error: 4.879
Symmetric mean absolute percentage error: 9.893

SMAPE is a commonly used loss function for time series problems and can therefore
provide a more reliable measure of accuracy. The relatively low SMAPE here suggests
that the model fits the data well.

In [45]: print(model_fit.summary())

                             ARIMA Model Results
==============================================================================
Dep. Variable:                    D.y   No. Observations:                 1257
Model:                 ARIMA(2, 1, 1)   Log Likelihood               -2368.785
Method:                       css-mle   S.D. of innovations              1.593
Date:                Wed, 15 Dec 2021   AIC                           4747.571
Time:                        02:20:05   BIC                           4773.253
Sample:                             1   HQIC                          4757.223

==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
In [46]: residuals = pd.DataFrame(model_fit.resid)
residuals.plot()

Out[46]: <AxesSubplot:>

In [47]: residuals.plot(kind='kde')

Out[47]: <AxesSubplot:ylabel='Density'>
In [48]: residuals.describe()

Out[48]:
                 0
count  1257.000000
mean      0.000004
std       1.593496
min     -10.996909
25%      -0.521333
50%      -0.043387
75%       0.538770
max       9.974758

In [49]: plt.figure(figsize=(14,7))
         plt.plot(AAPL['Close'], color='blue', label='Training Data')
         plt.plot(test_data.index, predictions, color='green', marker='o', linestyle='dashed', label='Predicted Price')
         plt.plot(test_data.index, test_data['Close'], color='red', label='Actual Price')
         plt.title('Apple Prices Prediction')
         plt.xlabel('Years')
         plt.ylabel('Prices')
         plt.legend()

Out[49]: <matplotlib.legend.Legend at 0x7fa280e007f0>


In [50]: model_fit.plot_predict(
start=1,
end=60,
dynamic=False,
);

In [51]: plt.figure(figsize=(14,7))
         plt.plot(test_data.index, predictions, color='green', marker='o', linestyle='dashed', label='Predicted Price')
         plt.plot(test_data.index, test_data['Close'], color='red', label='Actual Price')
         plt.title('APPLE Stock Prices Prediction')
         plt.xlabel('Years')
         plt.ylabel('Prices')
         plt.legend()

Out[51]: <matplotlib.legend.Legend at 0x7fa2819fea00>


The above image is a zoomed-in version. From it we can see how closely the two curves
follow each other; however, the predicted price looks like a "noisy" version of the actual
price.

Overall, this ARIMA analysis led to appreciable results. The model offered good prediction
accuracy and was relatively fast to train compared to alternatives such as RNNs
(Recurrent Neural Networks).

In [ ]:
