# Download as docx, pdf, or txt
# Download as docx, pdf, or txt
# You are on page 1 of 9

# Attach the tidyverse, installing it first only when it is not available yet
# (an unconditional install.packages() would re-download it on every run).
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Import the flight data file.
df <- read_csv("/Users/tabish/Downloads/Analyst Project/flight_data_SAN.csv")

# Convert the four timestamp columns from character to date-time. This is
# required to extract the date, year, month, hour etc. further down.
# NOTE(review): mdy_hm() and the year()/month()/day()/hour()/minute()
# accessors are lubridate functions; library(tidyverse) only attaches
# lubridate from tidyverse 2.0.0 onward -- confirm the installed version.
df$scheduled_arrival_dttm <- mdy_hm(df$scheduled_arrival_dttm)
df$scheduled_departure_dttm <- mdy_hm(df$scheduled_departure_dttm)
df$actual_departure_dttm <- mdy_hm(df$actual_departure_dttm)
df$actual_arrival_dttm <- mdy_hm(df$actual_arrival_dttm)

# Extract year, month, day, hour and minute from the actual departure
# timestamp into separate columns.
df$act_dep_year <- year(df$actual_departure_dttm)
df$act_dep_month <- month(df$actual_departure_dttm)
df$act_dep_day <- day(df$actual_departure_dttm)
df$act_dep_hour <- hour(df$actual_departure_dttm)
df$act_dep_min <- minute(df$actual_departure_dttm)

# Print how many taxi-out values are missing, then replace them with 0.
# NOTE(review): zero-filling lowers every mean computed from this column;
# confirm that a missing taxi-out time really should count as 0 minutes.
sum(is.na(df$taxiout))
index <- is.na(df$taxiout)
df$taxiout[index] <- 0

# Correlation check: mean taxi-out time vs. actual departure month ----

# Two-column working frame; data.frame() names the columns
# df.act_dep_month and df.taxiout.
df1 <- data.frame(df$act_dep_month, df$taxiout)

# Mean taxi-out time per departure month. The grouping key is taken from the
# piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(Month = df.act_dep_month) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Linear fit of the per-month means. The formula uses bare column names so the
# `data` argument is actually what supplies the variables.
cordepmonth <- lm(TaxiOutTime ~ Month, data = df_short1)
summary(cordepmonth)

# Scatter plot of the means with the fitted line on top.
plot(df_short1$Month, df_short1$TaxiOutTime, ylim = c(12, 20))
abline(cordepmonth)

# Pearson correlation test between month and mean taxi-out time.
cor.test(df_short1$Month, df_short1$TaxiOutTime, method = "pearson")

# Correlation check: mean taxi-out time vs. actual departure day of month ----

# Two-column working frame; data.frame() names the columns
# df.act_dep_day and df.taxiout.
df1 <- data.frame(df$act_dep_day, df$taxiout)

# Mean taxi-out time per departure day. The grouping key is taken from the
# piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(day = df.act_dep_day) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Linear fit of the per-day means. The formula uses bare column names so the
# `data` argument is actually what supplies the variables.
cordepday <- lm(TaxiOutTime ~ day, data = df_short1)
summary(cordepday)

# Scatter plot of the means with the fitted line on top.
plot(df_short1$day, df_short1$TaxiOutTime, ylim = c(12, 20))
abline(cordepday)

# Pearson correlation test between day and mean taxi-out time.
cor.test(df_short1$day, df_short1$TaxiOutTime, method = "pearson")

# Correlation check: mean taxi-out time vs. actual departure hour ----

# Two-column working frame; data.frame() names the columns
# df.act_dep_hour and df.taxiout.
df1 <- data.frame(df$act_dep_hour, df$taxiout)

# Mean taxi-out time per departure hour. The grouping key is taken from the
# piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(Hour = df.act_dep_hour) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Linear fit of the per-hour means. The formula uses bare column names so the
# `data` argument is actually what supplies the variables.
cordephr <- lm(TaxiOutTime ~ Hour, data = df_short1)
summary(cordephr)

# Scatter plot of the means with the fitted line on top.
plot(df_short1$Hour, df_short1$TaxiOutTime)
abline(cordephr)

# Pearson correlation test between hour and mean taxi-out time.
cor.test(df_short1$Hour, df_short1$TaxiOutTime, method = "pearson")

# Correlation check: mean taxi-out time vs. actual departure minute ----

# Two-column working frame; data.frame() names the columns
# df.act_dep_min and df.taxiout.
df1 <- data.frame(df$act_dep_min, df$taxiout)

# Mean taxi-out time per departure minute. The grouping key is taken from the
# piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(Minute = df.act_dep_min) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Linear fit of the per-minute means. The formula uses bare column names so
# the `data` argument is actually what supplies the variables.
cordepmin <- lm(TaxiOutTime ~ Minute, data = df_short1)
summary(cordepmin)

# Scatter plot of the means with the fitted line on top.
plot(df_short1$Minute, df_short1$TaxiOutTime, ylim = c(12, 20))
abline(cordepmin)

# Pearson correlation test between minute and mean taxi-out time.
cor.test(df_short1$Minute, df_short1$TaxiOutTime, method = "pearson")

# Correlation check: mean taxi-out time vs. total seat count ----

# Two-column working frame; data.frame() names the columns
# df.totalseatcount and df.taxiout.
df1 <- data.frame(df$totalseatcount, df$taxiout)

# Mean taxi-out time per seat count, computed once. The grouping key is taken
# from the piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(TotalSeats = df.totalseatcount) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Show the per-seat-count means (previously a second, identical aggregation
# was run only to print this table).
print(df_short1)

# Linear fit of the per-seat-count means. The formula uses bare column names
# so the `data` argument is actually what supplies the variables.
corseats1 <- lm(TaxiOutTime ~ TotalSeats, data = df_short1)
summary(corseats1)

# Scatter plot of the means with the fitted line on top.
plot(df_short1$TotalSeats, df_short1$TaxiOutTime)
abline(corseats1)

# Pearson correlation test between seat count and mean taxi-out time.
cor.test(df_short1$TotalSeats, df_short1$TaxiOutTime, method = "pearson")

# Correlation check: mean taxi-out time vs. airtime ----

# Two-column working frame; data.frame() names the columns
# df.airtime and df.taxiout.
df1 <- data.frame(df$airtime, df$taxiout)

# Mean taxi-out time per airtime value. The grouping key is taken from the
# piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(Airtime = df.airtime) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Linear fit of the per-airtime means. The formula uses bare column names so
# the `data` argument is actually what supplies the variables.
corairtime <- lm(TaxiOutTime ~ Airtime, data = df_short1)
summary(corairtime)

# Scatter plot of the means with the fitted line on top.
plot(df_short1$Airtime, df_short1$TaxiOutTime)
abline(corairtime)

# Pearson correlation test between airtime and mean taxi-out time.
cor.test(df_short1$Airtime, df_short1$TaxiOutTime, method = "pearson")

# Correlation check: mean taxi-out time vs. taxi-in time ----

# Two-column working frame; data.frame() names the columns
# df.taxiin and df.taxiout.
df1 <- data.frame(df$taxiin, df$taxiout)

# Mean taxi-out time per taxi-in value. The grouping key is taken from the
# piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(Taxiin = df.taxiin) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Linear fit of the per-taxi-in means. The formula uses bare column names so
# the `data` argument is actually what supplies the variables.
cortaxiin <- lm(TaxiOutTime ~ Taxiin, data = df_short1)
summary(cortaxiin)

# Scatter plot of the means with the fitted line on top.
plot(df_short1$Taxiin, df_short1$TaxiOutTime)
abline(cortaxiin)

# Pearson correlation test between taxi-in and mean taxi-out time.
cor.test(df_short1$Taxiin, df_short1$TaxiOutTime, method = "pearson")

# Correlation check: mean taxi-out time vs. departure variance ----

# Two-column working frame; data.frame() names the columns
# df.depvariance and df.taxiout.
df1 <- data.frame(df$depvariance, df$taxiout)

# Mean taxi-out time per departure-variance value. The grouping key is taken
# from the piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(DepVariance = df.depvariance) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Linear fit of the grouped means. The formula uses bare column names so the
# `data` argument is actually what supplies the variables.
cordepvar <- lm(TaxiOutTime ~ DepVariance, data = df_short1)
summary(cordepvar)

# Scatter plot of the means with the fitted line on top.
plot(df_short1$DepVariance, df_short1$TaxiOutTime)
abline(cordepvar)

# Pearson correlation test between departure variance and mean taxi-out time.
cor.test(df_short1$DepVariance, df_short1$TaxiOutTime, method = "pearson")

# Correlation check: mean taxi-out time vs. arrival variance ----

# Two-column working frame; data.frame() names the columns
# df.arrvariance and df.taxiout.
df1 <- data.frame(df$arrvariance, df$taxiout)

# Mean taxi-out time per arrival-variance value. The grouping key is taken
# from the piped frame itself (not from the outer `df`) and is named directly.
df_short1 <- df1 %>%
  group_by(ArrVariance = df.arrvariance) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Linear fit of the grouped means. The formula uses bare column names so the
# `data` argument is actually what supplies the variables.
corarrvar <- lm(TaxiOutTime ~ ArrVariance, data = df_short1)
summary(corarrvar)

# Scatter plot of the means (y axis fixed to 0-50) with the fitted line on top.
plot(df_short1$ArrVariance, df_short1$TaxiOutTime, ylim = c(0, 50))
abline(corarrvar)

# Pearson correlation test between arrival variance and mean taxi-out time.
cor.test(df_short1$ArrVariance, df_short1$TaxiOutTime, method = "pearson")
# Correlation check: taxi-out time vs. airline ----

# Two-column working frame; data.frame() names the columns
# df.airline and df.taxiout.
df1 <- data.frame(df$airline, df$taxiout)

# Mean taxi-out time per airline, grouped on the frame's own column.
df_short1 <- df1 %>%
  group_by(Airline = df.airline) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Show the per-airline means (the table printed here used to be grouped by
# seat count, left over from the seat-count check).
print(df_short1)

# Regress the row-level taxi-out times on airline. `data` now points at df1,
# the frame holding both columns; the previous `df5` is not defined anywhere
# in this script.
corAirline1 <- lm(df.taxiout ~ df.airline, data = df1)
summary(corAirline1)

# Correlation check: mean taxi-out time vs. aircraft ----

# Two-column working frame; data.frame() names the columns
# df.generalacft and df.taxiout.
df1 <- data.frame(df$generalacft, df$taxiout)

# Mean taxi-out time per aircraft, grouped on the frame's own column.
df_short1 <- df1 %>%
  group_by(Aircraft = df.generalacft) %>%
  summarize(TaxiOutTime = mean(df.taxiout))

# Fit on the per-aircraft means. Both variables now come from df_short1; the
# response used to be read from `df_short6`, which is not defined anywhere in
# this script.
# NOTE(review): if generalacft is categorical, df_short1 has one row per level
# and this fit has no residual degrees of freedom -- consider fitting on the
# row-level df1 instead, as the airline check does.
coraircraft1 <- lm(TaxiOutTime ~ Aircraft, data = df_short1)
summary(coraircraft1)

# Attach the packages used by the time-series analysis, installing each one
# only when it is not available yet (an unconditional install.packages()
# would re-download it on every run).
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
if (!requireNamespace("fpp", quietly = TRUE)) {
  install.packages("fpp")
}
library(fpp)

# Import the flight data file.
df <- read_csv("/Users/tabish/Downloads/Analyst Project/flight_data_SAN.csv")

# Work on a copy so the original data frame stays intact.
df_new <- df

# Convert the four timestamp columns from character to date-time. This is
# required to extract the date, year, month, hour etc. further down.
# NOTE(review): mdy_hm() and the year()/month()/day()/hour()/minute()
# accessors are lubridate functions; library(tidyverse) only attaches
# lubridate from tidyverse 2.0.0 onward -- confirm the installed version.
df_new$scheduled_arrival_dttm <- mdy_hm(df_new$scheduled_arrival_dttm)
df_new$scheduled_departure_dttm <- mdy_hm(df_new$scheduled_departure_dttm)
df_new$actual_departure_dttm <- mdy_hm(df_new$actual_departure_dttm)
df_new$actual_arrival_dttm <- mdy_hm(df_new$actual_arrival_dttm)

# Calendar date of the actual departure.
df_new$act_dep_date <- as.Date(df_new$actual_departure_dttm)

# Year, month, day, hour and minute of the actual departure timestamp.
df_new$act_dep_year <- year(df_new$actual_departure_dttm)
df_new$act_dep_month <- month(df_new$actual_departure_dttm)
df_new$act_dep_day <- day(df_new$actual_departure_dttm)
df_new$act_dep_hour <- hour(df_new$actual_departure_dttm)
df_new$act_dep_min <- minute(df_new$actual_departure_dttm)

# Print how many taxi-out values are missing, then replace them with 0.
# NOTE(review): zero-filling lowers every mean computed from this column;
# confirm that a missing taxi-out time really should count as 0 minutes.
sum(is.na(df_new$taxiout))
index <- is.na(df_new$taxiout)
df_new$taxiout[index] <- 0

# MONTHLY TIME SERIES ----

# Year-month key ("YYYY-MM") taken from the actual departure date.
df_new$act_dep <- format(df_new$act_dep_date, "%Y-%m")

# Two-column working frame; data.frame() names the columns
# df_new.act_dep and df_new.taxiout.
mydatamonth <- data.frame(df_new$act_dep, df_new$taxiout)

# Print how many rows have no departure month.
sum(is.na(mydatamonth$df_new.act_dep))

# Mean taxi-out time per year-month. Rows without a departure month are
# filtered out explicitly; the earlier approach recoded them to "0" and then
# always deleted the first summary row, which removed a real month whenever
# there was no missing departure month to begin with.
mydatamonth_summary <- mydatamonth %>%
  dplyr::filter(!is.na(df_new.act_dep)) %>%
  group_by(Month = df_new.act_dep) %>%
  summarize(TaxiOutTime = mean(df_new.taxiout))

# The series below is declared as Jan 2017 - Dec 2018 (24 points). ts() does
# not complain about a different number of values, so flag a mismatch here.
if (nrow(mydatamonth_summary) != 24L) {
  warning(
    "Expected 24 monthly means (2017-01 to 2018-12) but found ",
    nrow(mydatamonth_summary),
    "; ts() will recycle or truncate the values.",
    call. = FALSE
  )
}

# Monthly time series of the mean taxi-out time.
tsdata <- ts(
  mydatamonth_summary$TaxiOutTime,
  start = c(2017, 1),
  end = c(2018, 12),
  frequency = 12
)

# Plot the classical decomposition of the series, first multiplicative and
# then additive.
invisible(lapply(
  c("multiplicative", "additive"),
  function(decomp_kind) plot(decompose(tsdata, type = decomp_kind))
))

# Augmented Dickey-Fuller test for stationarity.
adf.test(tsdata)
# Conclusion recorded by the author: non-stationary series.

# First difference of the series, plotted.
tsdatadiff <- diff(tsdata, lag = 1, differences = 1)
plot(tsdatadiff)
# ETS model ----

# Fit an ETS model to the monthly series and print its summary.
tsdataets <- ets(tsdata)
summary(tsdataets)

# Forecast 3 periods ahead and show the standard forecast plot.
predictets <- forecast(tsdataets, h = 3)
plot(predictets)

# Overlay plot: observed series (blue), fitted values (red) and forecast mean
# (green). The x axis is widened to cover the forecast horizon; with the
# default limits the forecast lies to the right of the plotting region and is
# never drawn.
plot(
  tsdata,
  col = "blue",
  xlim = range(time(tsdata), time(predictets$mean))
)
lines(predictets$fitted, col = "red")
lines(predictets$mean, col = "green")

# Residual diagnostics for the forecast.
checkresiduals(predictets)

# ARIMA model ----

# Autocorrelation and partial autocorrelation of the series values,
# requested up to lag 30.
acf(coredata(tsdata), lag.max = 30)
pacf(coredata(tsdata), lag.max = 30)

# Automatic ARIMA selection with the search trace printed.
dataarima <- auto.arima(tsdata, trace = TRUE)

# Forecast 3 periods ahead and plot the result.
predictarima <- forecast(dataarima, h = 3)
plot(predictarima)

# DAILY TIME SERIES ----

# Two-column working frame; data.frame() names the columns
# df_new.act_dep_date and df_new.taxiout.
mydataday <- data.frame(df_new$act_dep_date, df_new$taxiout)

# Store the date as text, so the Day key of the summary is a character column.
mydataday$df_new.act_dep_date <- as.character(mydataday$df_new.act_dep_date)

# Print how many rows have no departure date.
sum(is.na(mydataday$df_new.act_dep_date))

# Mean taxi-out time per calendar day. Rows without a departure date are
# filtered out explicitly; the earlier approach recoded them to "0" and then
# always deleted the first summary row, which removed a real day whenever
# there was no missing departure date to begin with.
mydataday_summary <- mydataday %>%
  dplyr::filter(!is.na(df_new.act_dep_date)) %>%
  group_by(Day = df_new.act_dep_date) %>%
  summarize(TaxiOutTime = mean(df_new.taxiout))

# The series below is declared as day 1 of 2017 to day 365 of 2018
# (730 points). ts() does not complain about a different number of values,
# so flag a mismatch here.
if (nrow(mydataday_summary) != 730L) {
  warning(
    "Expected 730 daily means (2017-01-01 to 2018-12-31) but found ",
    nrow(mydataday_summary),
    "; ts() will recycle or truncate the values.",
    call. = FALSE
  )
}

# Daily time series of the mean taxi-out time.
tsdata <- ts(
  mydataday_summary$TaxiOutTime,
  start = c(2017, 1),
  end = c(2018, 365),
  frequency = 365
)

# Plot the classical decomposition of the series, first multiplicative and
# then additive.
invisible(lapply(
  c("multiplicative", "additive"),
  function(decomp_kind) plot(decompose(tsdata, type = decomp_kind))
))

# Augmented Dickey-Fuller test for stationarity.
adf.test(tsdata)
# Conclusion recorded by the author: stationary series.

# First difference of the series, plotted.
tsdatadiff <- diff(tsdata, lag = 1, differences = 1)
plot(tsdatadiff)

# ETS model ----

# Fit an ETS model to the daily series and print its summary.
tsdataets <- ets(tsdata)
summary(tsdataets)

# Forecast 90 periods ahead and show the standard forecast plot.
predictets <- forecast(tsdataets, h = 90)
plot(predictets)

# Overlay plot: observed series (blue), fitted values (red) and forecast mean
# (black). The x axis is widened to cover the forecast horizon; with the
# default limits the forecast lies to the right of the plotting region and is
# never drawn.
plot(
  tsdata,
  col = "blue",
  xlim = range(time(tsdata), time(predictets$mean))
)
lines(predictets$fitted, col = "red")
lines(predictets$mean, col = "black")

# Residual diagnostics and the forecast summary.
checkresiduals(predictets)
summary(predictets)

# ARIMA model ----

# Autocorrelation and partial autocorrelation, up to lag 30, of the series
# values and of the differenced series.
acf(coredata(tsdata), lag.max = 30)
pacf(coredata(tsdata), lag.max = 30)
acf(coredata(tsdatadiff), lag.max = 30)
pacf(coredata(tsdatadiff), lag.max = 30)

# Automatic seasonal ARIMA selection with one seasonal difference (D = 1) and
# the search trace printed.
dataarima <- auto.arima(tsdata, D = 1, trace = TRUE, seasonal = TRUE)

# Forecast 90 periods ahead and show the standard forecast plot.
predictarima <- forecast(dataarima, h = 90)
plot(predictarima)

# Overlay plot: observed series (black), fitted values (red) and forecast mean
# (blue). The x axis is widened to cover the forecast horizon; with the
# default limits the forecast lies to the right of the plotting region and is
# never drawn.
plot(
  tsdata,
  col = "black",
  xlim = range(time(tsdata), time(predictarima$mean))
)
lines(predictarima$fitted, col = "red")
lines(predictarima$mean, col = "blue")

# HOURLY TIME SERIES ----

# Rebuild an hour-resolution timestamp from the extracted year, month, day and
# hour columns, with minutes and seconds set to 0.
# tz = "UTC": ISOdatetime() otherwise uses the session's local time zone, where
# a clock hour skipped by daylight saving comes back as NA and the row would be
# lost below. The components were read from mdy_hm() output, which is UTC
# unless a tz was supplied.
df_new$act_dephour <- ISOdatetime(
  df_new$act_dep_year,
  df_new$act_dep_month,
  df_new$act_dep_day,
  df_new$act_dep_hour,
  0,
  0,
  tz = "UTC"
)

# Two-column working frame; data.frame() names the columns
# df_new.act_dephour and df_new.taxiout.
mydatahour <- data.frame(df_new$act_dephour, df_new$taxiout)

# Store the timestamp as text, so the Hour_dep key of the summary is a
# character column.
mydatahour$df_new.act_dephour <- as.character(mydatahour$df_new.act_dephour)

# Print how many rows have no departure hour.
sum(is.na(mydatahour$df_new.act_dephour))

# Mean taxi-out time per date+hour. Rows without a departure hour are filtered
# out explicitly; the earlier approach recoded them to "0" and then always
# deleted the first summary row, which removed a real hour whenever there was
# no missing departure hour to begin with.
mydatahour_summary <- mydatahour %>%
  dplyr::filter(!is.na(df_new.act_dephour)) %>%
  group_by(Hour_dep = df_new.act_dephour) %>%
  summarize(TaxiOutTime = mean(df_new.taxiout))

# Hourly time series of the mean taxi-out time.
# NOTE(review): frequency = 6375 is presumably the number of observed
# departure hours per year in this data set -- confirm where it comes from.
tsdata <- ts(mydatahour_summary$TaxiOutTime, frequency = 6375)

# Plot the classical decomposition of the series, first multiplicative and
# then additive.
invisible(lapply(
  c("multiplicative", "additive"),
  function(decomp_kind) plot(decompose(tsdata, type = decomp_kind))
))

# Augmented Dickey-Fuller test for stationarity.
adf.test(tsdata)
# Conclusion recorded by the author: stationary series.

# First difference of the series, plotted.
tsdatadiff <- diff(tsdata, lag = 1, differences = 1)
plot(tsdatadiff)

# ETS model ----

# Fit an ETS model to the hourly series and print its summary.
tsdataets <- ets(tsdata)
summary(tsdataets)

# Forecast 1530 periods ahead and show the standard forecast plot.
predictets <- forecast(tsdataets, h = 1530)
plot(predictets)

# Overlay plot: observed series (blue), fitted values (red) and forecast mean
# (black). The x axis is widened to cover the forecast horizon; with the
# default limits the forecast lies to the right of the plotting region and is
# never drawn.
plot(
  tsdata,
  col = "blue",
  xlim = range(time(tsdata), time(predictets$mean))
)
lines(predictets$fitted, col = "red")
lines(predictets$mean, col = "black")

# Residual diagnostics and the forecast summary.
checkresiduals(predictets)
summary(predictets)

# ARIMA model ----

# Autocorrelation and partial autocorrelation of the series values,
# up to lag 30.
acf(coredata(tsdata), lag.max = 30)
pacf(coredata(tsdata), lag.max = 30)

# Automatic seasonal ARIMA selection with the search trace printed.
dataarima <- auto.arima(tsdata, trace = TRUE, seasonal = TRUE)

# Forecast 1530 periods ahead and show the standard forecast plot.
predictarima <- forecast(dataarima, h = 1530)
plot(predictarima)

# Overlay plot: observed series (blue), fitted values (red) and forecast mean
# (green). The x axis is widened to cover the forecast horizon; with the
# default limits the forecast lies to the right of the plotting region and is
# never drawn.
plot(
  tsdata,
  col = "blue",
  xlim = range(time(tsdata), time(predictarima$mean))
)
lines(predictarima$fitted, col = "red")
lines(predictarima$mean, col = "green")

# Residual diagnostics and the forecast summary.
checkresiduals(predictarima)
summary(predictarima)

# You might also like