

Another way of expressing the features (plots)

By: Dr. Eldirdiri Fadol Ibrahim Fadol

Scientific Research Center Founder & UP Initiative

Private Scientific Researcher

Scientific Data Analyst - Sudan

27th August 2020

In [41]: import pandas as pd


import matplotlib.pyplot as plt # For plotting graphs
from sklearn.metrics import mean_squared_error as MSE
from math import sqrt
from statsmodels.tsa.api import SimpleExpSmoothing, Holt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller # for Dickey Fuller test
from statsmodels.tsa.stattools import acf, pacf # for p,q in Arima Model
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
import numpy as np

# Load Dataset
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
data = test   # second name bound to the test set; reused in the last cell below
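A quick sanity check of the loaded frames is useful before splitting; the sketch below only assumes the 'Datetime' and 'Count' columns that the rest of the notebook relies on.

# Inspect shapes and the first rows (sketch; not part of the original run)
print(train.shape, test.shape)
print(train.head())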

In [42]: # Parse the Datetime column into pandas datetime objects


train['Datetime'] = pd.to_datetime(train.Datetime,format='%d-%m-%Y %H:%M')
test['Datetime'] = pd.to_datetime(test.Datetime,format='%d-%m-%Y %H:%M')

In [43]: valid = train.iloc[16056:18287, :]


train = train.iloc[0:16055, :]
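To confirm what the index-based split covers, the date range of each slice can be printed (a small sketch, not part of the original run):

print('train:', train.Datetime.min(), '->', train.Datetime.max(), '|', len(train), 'rows')
print('valid:', valid.Datetime.min(), '->', valid.Datetime.max(), '|', len(valid), 'rows')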

In [44]: # Visualize the train-validation data split


plt.figure(figsize=(40,20))
plt.plot(train.Datetime, train['Count'], label='train')
plt.plot(valid.Datetime, valid['Count'], label='validation')
plt.xlabel("Datetime")
plt.ylabel("Passenger count")
plt.legend(loc='best')
plt.show()

In [45]: # Naive method to predict time series


y_hat = valid.copy()
# Assume that all next values will be the same as last observed value
y_hat['Count'] = train['Count'][len(train)-1]
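The naive forecast repeats the last observed training value for every step of the validation horizon, i.e. y_hat(T+h) = y(T) for all h. An equivalent position-based lookup of that value is shown below as a sketch.

# Equivalent position-based lookup of the last observed value (sketch)
y_hat['Count'] = train['Count'].iloc[-1]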

In [46]: # Visualize Naive method predictions


plt.figure(figsize=(40,20))
plt.plot(train.Datetime, train['Count'], label='train')
plt.plot(valid.Datetime, valid['Count'], label='validation')
plt.plot(y_hat.Datetime, y_hat['Count'], label='Naive Forecast')
plt.xlabel('Datetime')
plt.ylabel('Passenger count')
plt.legend(loc='best')
plt.show()

In [47]: rmse = pd.DataFrame(columns=['Method', 'RMSE'])

In [48]: # Calculate RMSE for Naive method


rmse.loc[len(rmse)]="Naive", sqrt(MSE(valid.Count, y_hat.Count))
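The RMSE recorded above is just the square root of scikit-learn's mean squared error; computing it by hand makes the metric explicit (a sketch using the NumPy import from the first cell).

# RMSE by hand: square root of the mean squared difference (sketch)
manual_rmse = np.sqrt(np.mean((valid['Count'].values - y_hat['Count'].values) ** 2))
print(manual_rmse)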

In [49]: # Moving Average Method to predict time series

# last 10 days
y_hat['Count'] = train['Count'].rolling(10).mean().iloc[-1]
# Calculate RMSE for Moving average 10 days
rmse.loc[len(rmse)]="Moving Average 10D", sqrt(MSE(valid.Count, y_hat.Count))

# last 20 days
y_hat['Count'] = train['Count'].rolling(20).mean().iloc[-1]
# Calculate RMSE for Moving average 20 days
rmse.loc[len(rmse)]="Moving Average 20D", sqrt(MSE(valid.Count, y_hat.Count))

# last 50 days
y_hat['Count'] = train['Count'].rolling(50).mean().iloc[-1]
# Calculate RMSE for Moving average 50 days
rmse.loc[len(rmse)]="Moving Average 50D", sqrt(MSE(valid.Count, y_hat.Count))

# The RMSE for the 10-day window is lower than for the 20- and 50-day windows,
# i.e. the predictions get weaker as the moving-average window grows
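# To back up the comparison above, the moving-average rows recorded so far
# can be printed directly (a sketch, not part of the original run):
print(rmse[rmse['Method'].str.startswith('Moving Average')])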

# Visualize Moving Average predictions with window size of 10 days


plt.figure(figsize=(40,20))
plt.plot(train.Datetime, train['Count'], label='train')
plt.plot(valid.Datetime, valid['Count'], label='validation')
plt.plot(y_hat.Datetime, y_hat['Count'], label='Moving average 10 days forecast')
plt.xlabel('Datetime')
plt.ylabel('Passenger count')
plt.legend(loc='best')
plt.show()

# Simple Exponential Smoothing to predict time series

y_hat = valid.copy()
fit1 = SimpleExpSmoothing(train['Count']).fit(smoothing_level=0.1, optimized=False)
y_hat['Count'] = fit1.forecast(len(valid)+1)
# Calculate RMSE for SES 0.1
rmse.loc[len(rmse)]="Simple Exp Smoothing 0.1", sqrt(MSE(valid.Count, y_hat.Count))

fit1 = SimpleExpSmoothing(train['Count']).fit(smoothing_level=0.2, optimized=False)


y_hat['Count'] = fit1.forecast(len(valid)+1)
# Calculate RMSE for SES 0.2
rmse.loc[len(rmse)]="Simple Exp Smoothing 0.2", sqrt(MSE(valid.Count, y_hat.Count))

fit1 = SimpleExpSmoothing(train['Count']).fit(smoothing_level=0.6, optimized=False)


y_hat['Count'] = fit1.forecast(len(valid)+1)
# Calculate RMSE for SES 0.6
rmse.loc[len(rmse)]="Simple Exp Smoothing 0.6", sqrt(MSE(valid.Count, y_hat.Count))

# Visualize the Simple Exp Smoothing predictions (last fit above, smoothing level 0.6)


plt.figure(figsize=(40,20))
plt.plot(train.Datetime, train['Count'], label='train')
plt.plot(valid.Datetime, valid['Count'], label='validation')
plt.plot(y_hat.Datetime, y_hat['Count'], label='Simple Exp Smoothing forecast')
plt.xlabel('Datetime')
plt.ylabel('Passenger count')
plt.legend(loc='best')
plt.show()
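# For reference, SES keeps a single smoothed level:
#   y_hat(t+1) = alpha * y(t) + (1 - alpha) * y_hat(t)
# so a larger alpha weights recent observations more heavily.
# As a sketch (an assumed usage, not part of the original run), statsmodels
# can also estimate alpha itself:
fit_opt = SimpleExpSmoothing(train['Count']).fit(optimized=True)
print(fit_opt.params['smoothing_level'])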

# Holt's Linear Trend Model to predict time series

# Similar to SES but also takes trend into account

# Visualize the trend in data


sm.tsa.seasonal_decompose(np.asarray(train['Count']), freq=24).plot()  # 'freq' was renamed to 'period' in newer statsmodels, hence the warning below
result = sm.tsa.stattools.adfuller(train['Count'])
plt.show()
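# The Dickey-Fuller result computed above is never printed; as a sketch,
# report its test statistic and p-value to judge stationarity:
print('ADF statistic: %.4f, p-value: %.4f' % (result[0], result[1]))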

# The decomposition shows an increasing trend,
# so Holt's linear trend model should perform better than the methods above

fit1 = Holt(train['Count']).fit(smoothing_level = 0.1,smoothing_slope = 0.0001)


y_hat['Count'] = fit1.forecast(len(valid) + 1)

# Calculate RMSE for Holt's Linear Trending Model


rmse.loc[len(rmse)]="Holt's Linear Trend 0.0001", sqrt(MSE(valid.Count, y_hat.Count))

# Visualize Holt's predictions


plt.figure(figsize=(40,20))
plt.plot(train.Datetime, train['Count'], label='train')
plt.plot(valid.Datetime, valid['Count'], label='validation')
plt.plot(y_hat.Datetime, y_hat['Count'], label="Holt's Linear Trend Forecast")
plt.xlabel('Datetime')
plt.ylabel('Passenger count')
plt.legend(loc='best')
plt.show()
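# Holt's method extends SES with a trend term; the h-step-ahead forecast is
#   y_hat(T+h) = level(T) + h * trend(T)
# so the rising trend is extrapolated linearly across the validation window.
# A final look at every method recorded so far (a sketch, not part of the
# original run):
print(rmse.sort_values('RMSE'))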

<ipython-input-49-e369170aa96a>:67: FutureWarning: the 'freq' keyword is deprecated, use 'period' instead
  sm.tsa.seasonal_decompose(np.asarray(train['Count']), freq=24).plot()

C:\Users\compu lab\anaconda3\lib\site-packages\statsmodels\tsa\holtwinters.py:731: RuntimeWarning: invalid value encountered in greater_equal
  loc = initial_p >= ub

In [50]: valid = data.iloc[16056:18287, :]


data = data.iloc[0:16055, :]

