WA Rural Interstate– Daily Speed (ARIMA)

Subasish Das and Choalun Ma

2018-11-12

# Step 1: Load R Packages 
### options(repos='http://cran.rstudio.com/')
#install.packages("astsa")
#install.packages('ggplot2')
#install.packages('forecast')
#install.packages('tseries')
#install.packages("data.table")

library(astsa)
library(forecast)
library(tseries)
library(zoo)
library(tseries)

library(data.table)
library(dplyr)
library(tidyr)
library(naniar)
library(stringr)
library(ggplot2)
library(DT)
library(lubridate)
library(ggpubr)


setwd("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/")
load("WA_Rural_Interstate_41_TMC_TT_SI_reduce_withCrash.rData")
mytype = 'RI'
setwd(paste0("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/",mytype))

df_RI <- b02a
dim(df_RI)
## [1] 9303840      64
df_RI$spd_av = 3600*df_RI$DISTANCE/df_RI$Travel_TIME_ALL_VEHICLES
df_RI$spd_pv = 3600*df_RI$DISTANCE/df_RI$Travel_TIME_PASSENGER_VEHICLES
df_RI$spd_ft = 3600*df_RI$DISTANCE/df_RI$Travel_TIME_FREIGHT_TRUCKS

### Month, Day
df_RI$date <- as.character(df_RI$DATE)
df_RI$date <- str_pad(df_RI$DATE, 8, pad = "0")
df_RI$Month <- substr(df_RI$date, start = 1, stop = 2)
df_RI$Day   <- substr(df_RI$date, start = 3, stop = 4)
df_RI$Year  <- substr(df_RI$date, start = 5, stop = 8)
df_RI$MonthDay <- paste0(df_RI$Month,"_", df_RI$Day)
head(df_RI)
##            TimeStamp       TMC V1    DATE EPOCH15 Travel_TIME_ALL_VEHICLES
## 1:  114N04098_0101_0 114N04098  1 1012015       0                       NA
## 2:  114N04098_0101_1 114N04098  2 1012015       1                      222
## 3: 114N04098_0101_10 114N04098 11 1012015      10                       NA
## 4: 114N04098_0101_11 114N04098 12 1012015      11                      184
## 5: 114N04098_0101_12 114N04098 13 1012015      12                      227
## 6: 114N04098_0101_13 114N04098 14 1012015      13                      192
##    Travel_TIME_PASSENGER_VEHICLES Travel_TIME_FREIGHT_TRUCKS NP ADMIN_LEVE
## 1:                             NA                         NA  N        USA
## 2:                             NA                        222  N        USA
## 3:                             NA                         NA  N        USA
## 4:                             NA                        184  N        USA
## 5:                             NA                        227  N        USA
## 6:                             NA                        192  N        USA
##    ADMIN_LE_1 ADMIN_LE_2 DISTANCE ROAD_NUMBE ROAD_NAME LATITUDE LONGITUDE
## 1: Washington       King  2.79553       I-90           47.44331 -121.6681
## 2: Washington       King  2.79553       I-90           47.44331 -121.6681
## 3: Washington       King  2.79553       I-90           47.44331 -121.6681
## 4: Washington       King  2.79553       I-90           47.44331 -121.6681
## 5: Washington       King  2.79553       I-90           47.44331 -121.6681
## 6: Washington       King  2.79553       I-90           47.44331 -121.6681
##    ROAD_DIREC  ORN_FID    FID_1 ACCESS LSHL_TY2 LSHL_TYP MED_TYPE NHS_IND
## 1:  Eastbound 17787.59 6232.662      F        A        A        S       Y
## 2:  Eastbound 17787.59 6232.662      F        A        A        S       Y
## 3:  Eastbound 17787.59 6232.662      F        A        A        S       Y
## 4:  Eastbound 17787.59 6232.662      F        A        A        S       Y
## 5:  Eastbound 17787.59 6232.662      F        A        A        S       Y
## 6:  Eastbound 17787.59 6232.662      F        A        A        S       Y
##    PRK_ZNE RSHL_TY2 RSHL_TYP SURF_TYP SURF_TY2 TERRAIN COMP_DIR COUNTY
## 1:                A        A        P        P       M        E     17
## 2:                A        A        P        P       M        E     17
## 3:                A        A        P        P       M        E     17
## 4:                A        A        P        P       M        E     17
## 5:                A        A        P        P       M        E     17
## 6:                A        A        P        P       M        E     17
##    FUNC_CLS MEDBARTY ST_FUNC RTE_NBR          HPMS ROAD_INV SPD_LIMT BEGMP
## 1:       41       DE      R5      90 2.514882e-311       90       70 33.36
## 2:       41       DE      R5      90 2.514882e-311       90       70 33.36
## 3:       41       DE      R5      90 2.514882e-311       90       70 33.36
## 4:       41       DE      R5      90 2.514882e-311       90       70 33.36
## 5:       41       DE      R5      90 2.514882e-311       90       70 33.36
## 6:       41       DE      R5      90 2.514882e-311       90       70 33.36
##    ENDMP LSHLDWID   MEDWID NO_LANE1 NO_LANE2 NO_LANES RSHLDWID RSHL_WD2
## 1: 36.16 7.779324 100.0268        4        3        7 9.655901 9.584419
## 2: 36.16 7.779324 100.0268        4        3        7 9.655901 9.584419
## 3: 36.16 7.779324 100.0268        4        3        7 9.655901 9.584419
## 4: 36.16 7.779324 100.0268        4        3        7 9.655901 9.584419
## 5: 36.16 7.779324 100.0268        4        3        7 9.655901 9.584419
## 6: 36.16 7.779324 100.0268        4        3        7 9.655901 9.584419
##      SEG_LNG  lanewid rdwy_wd1 rdwy_wd2 rdwy_wid     AADT     mvmt
## 1: 0.5697374 12.25326  48.6882 37.13331 85.82151 32974.55 6.874649
## 2: 0.5697374 12.25326  48.6882 37.13331 85.82151 32974.55 6.874649
## 3: 0.5697374 12.25326  48.6882 37.13331 85.82151 32974.55 6.874649
## 4: 0.5697374 12.25326  48.6882 37.13331 85.82151 32974.55 6.874649
## 5: 0.5697374 12.25326  48.6882 37.13331 85.82151 32974.55 6.874649
## 6: 0.5697374 12.25326  48.6882 37.13331 85.82151 32974.55 6.874649
##    rodwycls ORN_FID_1 Total Fatal Injury PDO DAYMTH Crash   spd_av spd_pv
## 1:        6  17787.59    17     0      6  11   0101     0       NA     NA
## 2:        6  17787.59    17     0      6  11   0101     0 45.33292     NA
## 3:        6  17787.59    17     0      6  11   0101     0       NA     NA
## 4:        6  17787.59    17     0      6  11   0101     0 54.69515     NA
## 5:        6  17787.59    17     0      6  11   0101     0 44.33440     NA
## 6:        6  17787.59    17     0      6  11   0101     0 52.41619     NA
##      spd_ft     date Month Day Year MonthDay
## 1:       NA 01012015    01  01 2015    01_01
## 2: 45.33292 01012015    01  01 2015    01_01
## 3:       NA 01012015    01  01 2015    01_01
## 4: 54.69515 01012015    01  01 2015    01_01
## 5: 44.33440 01012015    01  01 2015    01_01
## 6: 52.41619 01012015    01  01 2015    01_01
day1<- df_RI[,-c(1)] %>% group_by(MonthDay) %>% summarize(Speed_All_Mean=mean(spd_av, na.rm=TRUE))
day1
## # A tibble: 365 x 2
##    MonthDay Speed_All_Mean
##    <chr>             <dbl>
##  1 01_01              60.4
##  2 01_02              59.9
##  3 01_03              60.3
##  4 01_04              55.3
##  5 01_05              59.0
##  6 01_06              59.4
##  7 01_07              59.2
##  8 01_08              59.2
##  9 01_09              60.0
## 10 01_10              59.9
## # ... with 355 more rows
# Step 2: Examine Data
speed_clean <- tsclean(ts(day1$Speed_All_Mean))
plot.ts(speed_clean)

# ggplot() + geom_line(data = Q1, aes(x = TimeStamp, y = speed_clean)) + ylab('Cleaned Speed Records')

day1$cnt_ma = ma(speed_clean, order=7) # using the clean count with no outliers
day1$cnt_ma30 = ma(speed_clean, order=30)
# Step 3: Decompose Your Data
count_ma = ts(na.omit(speed_clean), frequency=30)
decomp = stl(count_ma, s.window="periodic")
deseasonal_cnt <- seasadj(decomp)
plot(decomp)

# Step 4: Stationarity
# statinary test
adf.test(count_ma, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  count_ma
## Dickey-Fuller = -2.0188, Lag order = 7, p-value = 0.569
## alternative hypothesis: stationary
adf.test(deseasonal_cnt, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  deseasonal_cnt
## Dickey-Fuller = -1.9854, Lag order = 7, p-value = 0.5831
## alternative hypothesis: stationary
d1 = diff(deseasonal_cnt)
adf.test(d1, alternative = "stationary")
## Warning in adf.test(d1, alternative = "stationary"): p-value smaller than
## printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  d1
## Dickey-Fuller = -9.1652, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Step 5: Autocorrelations and Choosing Model Order
# check ACF and PACF
acf2(count_ma)

##          ACF  PACF
##   [1,]  0.77  0.77
##   [2,]  0.60  0.02
##   [3,]  0.50  0.08
##   [4,]  0.47  0.15
##   [5,]  0.54  0.28
##   [6,]  0.63  0.27
##   [7,]  0.72  0.31
##   [8,]  0.63 -0.15
##   [9,]  0.51 -0.07
##  [10,]  0.41 -0.07
##  [11,]  0.41  0.10
##  [12,]  0.48  0.06
##  [13,]  0.59  0.17
##  [14,]  0.64  0.02
##  [15,]  0.60  0.08
##  [16,]  0.50 -0.05
##  [17,]  0.38 -0.10
##  [18,]  0.40  0.11
##  [19,]  0.46  0.03
##  [20,]  0.55  0.05
##  [21,]  0.58 -0.04
##  [22,]  0.49 -0.19
##  [23,]  0.37 -0.10
##  [24,]  0.26 -0.08
##  [25,]  0.28  0.04
##  [26,]  0.34 -0.06
##  [27,]  0.42  0.00
##  [28,]  0.45 -0.03
##  [29,]  0.39 -0.01
##  [30,]  0.26 -0.11
##  [31,]  0.18 -0.01
##  [32,]  0.19 -0.03
##  [33,]  0.28  0.13
##  [34,]  0.40  0.09
##  [35,]  0.42  0.04
##  [36,]  0.35 -0.10
##  [37,]  0.21 -0.03
##  [38,]  0.13  0.00
##  [39,]  0.12 -0.03
##  [40,]  0.20  0.01
##  [41,]  0.31  0.07
##  [42,]  0.34  0.07
##  [43,]  0.27 -0.03
##  [44,]  0.14 -0.03
##  [45,]  0.08  0.05
##  [46,]  0.07  0.00
##  [47,]  0.14  0.02
##  [48,]  0.25  0.05
##  [49,]  0.29 -0.01
##  [50,]  0.22 -0.05
##  [51,]  0.10 -0.05
##  [52,]  0.03 -0.01
##  [53,]  0.03 -0.01
##  [54,]  0.10 -0.07
##  [55,]  0.19  0.01
##  [56,]  0.25  0.09
##  [57,]  0.17 -0.03
##  [58,]  0.05 -0.04
##  [59,] -0.02 -0.01
##  [60,] -0.02 -0.08
##  [61,]  0.05 -0.01
##  [62,]  0.13 -0.01
##  [63,]  0.20  0.09
##  [64,]  0.12 -0.09
##  [65,] -0.02 -0.08
##  [66,] -0.07  0.06
##  [67,] -0.06 -0.02
##  [68,]  0.02  0.03
##  [69,]  0.10 -0.03
##  [70,]  0.13 -0.02
##  [71,]  0.06 -0.03
##  [72,] -0.05  0.07
##  [73,] -0.11 -0.01
##  [74,] -0.10 -0.01
##  [75,] -0.01  0.06
##  [76,]  0.07 -0.01
##  [77,]  0.10  0.03
##  [78,]  0.03 -0.02
##  [79,] -0.06  0.03
##  [80,] -0.15  0.01
##  [81,] -0.15 -0.09
##  [82,] -0.08 -0.09
##  [83,]  0.01  0.00
##  [84,]  0.05  0.04
##  [85,] -0.01  0.06
##  [86,] -0.11 -0.04
##  [87,] -0.18 -0.02
##  [88,] -0.16  0.03
##  [89,] -0.11 -0.06
##  [90,] -0.04 -0.07
##  [91,]  0.01  0.00
##  [92,] -0.05 -0.02
##  [93,] -0.13  0.08
##  [94,] -0.18  0.03
##  [95,] -0.16  0.03
##  [96,] -0.12 -0.08
##  [97,] -0.04 -0.01
##  [98,] -0.01  0.02
##  [99,] -0.08 -0.03
## [100,] -0.16 -0.02
## [101,] -0.23 -0.05
## [102,] -0.21  0.04
## [103,] -0.17 -0.08
## [104,] -0.08  0.01
## [105,] -0.04  0.04
## [106,] -0.10  0.04
## [107,] -0.19  0.01
## [108,] -0.24  0.00
## [109,] -0.21 -0.01
## [110,] -0.16 -0.01
## [111,] -0.08 -0.03
## [112,] -0.06 -0.04
## [113,] -0.11 -0.05
## [114,] -0.20  0.00
## [115,] -0.26  0.01
## [116,] -0.25 -0.02
## [117,] -0.19  0.01
## [118,] -0.12 -0.02
## [119,] -0.10 -0.01
## [120,] -0.14 -0.01
acf2(deseasonal_cnt)

##          ACF  PACF
##   [1,]  0.78  0.78
##   [2,]  0.61  0.01
##   [3,]  0.50  0.05
##   [4,]  0.48  0.17
##   [5,]  0.55  0.29
##   [6,]  0.64  0.27
##   [7,]  0.73  0.30
##   [8,]  0.64 -0.16
##   [9,]  0.52 -0.07
##  [10,]  0.42 -0.07
##  [11,]  0.42  0.10
##  [12,]  0.48  0.07
##  [13,]  0.60  0.18
##  [14,]  0.65  0.01
##  [15,]  0.60  0.05
##  [16,]  0.50 -0.03
##  [17,]  0.39 -0.09
##  [18,]  0.40  0.11
##  [19,]  0.47  0.03
##  [20,]  0.56  0.04
##  [21,]  0.59 -0.04
##  [22,]  0.50 -0.20
##  [23,]  0.37 -0.12
##  [24,]  0.27 -0.06
##  [25,]  0.29  0.05
##  [26,]  0.34 -0.07
##  [27,]  0.43 -0.01
##  [28,]  0.46 -0.03
##  [29,]  0.39 -0.01
##  [30,]  0.26 -0.14
##  [31,]  0.18  0.03
##  [32,]  0.20 -0.01
##  [33,]  0.28  0.11
##  [34,]  0.40  0.10
##  [35,]  0.43  0.03
##  [36,]  0.35 -0.09
##  [37,]  0.21 -0.02
##  [38,]  0.13  0.00
##  [39,]  0.13 -0.02
##  [40,]  0.20  0.01
##  [41,]  0.32  0.07
##  [42,]  0.35  0.06
##  [43,]  0.27 -0.02
##  [44,]  0.15 -0.03
##  [45,]  0.07  0.04
##  [46,]  0.07  0.01
##  [47,]  0.14  0.00
##  [48,]  0.26  0.07
##  [49,]  0.30 -0.01
##  [50,]  0.22 -0.06
##  [51,]  0.10 -0.04
##  [52,]  0.03 -0.01
##  [53,]  0.03 -0.03
##  [54,]  0.10 -0.07
##  [55,]  0.19  0.01
##  [56,]  0.25  0.09
##  [57,]  0.18 -0.03
##  [58,]  0.06 -0.03
##  [59,] -0.01 -0.01
##  [60,] -0.03 -0.12
##  [61,]  0.05  0.01
##  [62,]  0.13  0.00
##  [63,]  0.20  0.09
##  [64,]  0.12 -0.07
##  [65,] -0.02 -0.11
##  [66,] -0.07  0.08
##  [67,] -0.06  0.00
##  [68,]  0.02  0.02
##  [69,]  0.10 -0.03
##  [70,]  0.13 -0.02
##  [71,]  0.06 -0.03
##  [72,] -0.05  0.08
##  [73,] -0.11  0.00
##  [74,] -0.10 -0.02
##  [75,] -0.01  0.05
##  [76,]  0.07 -0.01
##  [77,]  0.10  0.02
##  [78,]  0.03  0.01
##  [79,] -0.07  0.03
##  [80,] -0.15 -0.02
##  [81,] -0.15 -0.08
##  [82,] -0.08 -0.09
##  [83,]  0.00 -0.01
##  [84,]  0.05  0.04
##  [85,] -0.01  0.06
##  [86,] -0.12 -0.03
##  [87,] -0.18 -0.03
##  [88,] -0.16  0.04
##  [89,] -0.11 -0.07
##  [90,] -0.05 -0.11
##  [91,]  0.00  0.03
##  [92,] -0.05 -0.01
##  [93,] -0.13  0.09
##  [94,] -0.19  0.05
##  [95,] -0.16  0.01
##  [96,] -0.12 -0.06
##  [97,] -0.05  0.00
##  [98,] -0.01  0.02
##  [99,] -0.08 -0.01
## [100,] -0.17 -0.03
## [101,] -0.24 -0.06
## [102,] -0.21  0.05
## [103,] -0.17 -0.07
## [104,] -0.08  0.00
## [105,] -0.05  0.03
## [106,] -0.10  0.04
## [107,] -0.19 -0.01
## [108,] -0.24  0.03
## [109,] -0.21  0.00
## [110,] -0.16 -0.03
## [111,] -0.08 -0.03
## [112,] -0.06 -0.05
## [113,] -0.11 -0.06
## [114,] -0.20  0.00
## [115,] -0.26  0.02
## [116,] -0.25 -0.02
## [117,] -0.20  0.00
## [118,] -0.12  0.00
## [119,] -0.10 -0.02
## [120,] -0.14 -0.05

Seasonility Not in Consideration

# Step 6: Fitting an ARIMA model
auto.arima(deseasonal_cnt, seasonal=FALSE)
## Series: deseasonal_cnt 
## ARIMA(4,1,3) 
## 
## Coefficients:
##           ar1     ar2      ar3      ar4      ma1      ma2     ma3
##       -0.1154  0.2794  -0.6509  -0.3840  -0.2486  -0.5248  0.5934
## s.e.   0.0702  0.0563   0.0585   0.0581   0.0621   0.0829  0.0728
## 
## sigma^2 estimated as 0.201:  log likelihood=-221.94
## AIC=459.87   AICc=460.28   BIC=491.05
# Step 7: Evaluate and Iterate
# (try different model)
fit<-auto.arima(deseasonal_cnt, seasonal=FALSE)
tsdisplay(residuals(fit), lag.max=45, main='Model Residuals [Seasonality not considered]')

# step 8 forcast
fcast <- forecast(fit, h=30)
plot(fcast)

Seasonility in Consideration

# Step 6: Fitting an ARIMA model
auto.arima(deseasonal_cnt, seasonal=TRUE)
## Series: deseasonal_cnt 
## ARIMA(4,1,3)(0,0,1)[30] 
## 
## Coefficients:
##           ar1     ar2      ar3      ar4      ma1      ma2     ma3     sma1
##       -0.1081  0.2681  -0.6208  -0.3735  -0.2623  -0.5221  0.5706  -0.1718
## s.e.   0.0756  0.0617   0.0668   0.0598   0.0673   0.0903  0.0831   0.0715
## 
## sigma^2 estimated as 0.1978:  log likelihood=-218.89
## AIC=455.78   AICc=456.29   BIC=490.85
# Step 7: Evaluate and Iterate
# (try different model)
fit<-auto.arima(deseasonal_cnt, seasonal=TRUE)
tsdisplay(residuals(fit), lag.max=45, main='Model Residuals [Seasonality considered]')

# step 8 forcast
fcast <- forecast(fit, h=30)
plot(fcast)