Washington Rural Two-Lane – Daily Speed (ARIMA)

Subasish Das and Choalun Ma

2018-11-12

# Step 1: Load R Packages 
### options(repos='http://cran.rstudio.com/')
#install.packages("astsa")
#install.packages('ggplot2')
#install.packages('forecast')
#install.packages('tseries')
#install.packages("data.table")

library(astsa)
library(forecast)
library(tseries)
library(zoo)
library(tseries)

library(data.table)
library(dplyr)
library(tidyr)
library(naniar)
library(stringr)
library(ggplot2)
library(DT)
library(lubridate)
library(ggpubr)


mytype = 'R2'
setwd("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/")
load("./two-lane_undivided_WA_reduce_withCrash.rData")
setwd(paste0("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/",mytype))

df_R2 <- W_2un_nomed
dim(df_R2)
## [1] 23080608       64
df_R2$spd_av = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_ALL_VEHICLES
df_R2$spd_pv = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_PASSENGER_VEHICLES
df_R2$spd_ft = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_FREIGHT_TRUCKS

### Remove outliers
df_R2$spd_av = ifelse(df_R2$spd_av <120, df_R2$spd_av, NA)
df_R2$spd_pv = ifelse(df_R2$spd_pv <120, df_R2$spd_pv, NA)
df_R2$spd_ft = ifelse(df_R2$spd_ft <120, df_R2$spd_ft, NA)

### Month, Day
df_R2$date <- as.character(df_R2$DATE)
df_R2$date <- str_pad(df_R2$DATE, 8, pad = "0")
df_R2$Month <- substr(df_R2$date, start = 1, stop = 2)
df_R2$Day   <- substr(df_R2$date, start = 3, stop = 4)
df_R2$Year  <- substr(df_R2$date, start = 5, stop = 8)
df_R2$MonthDay <- paste0(df_R2$Month,"_", df_R2$Day)
head(df_R2)
##            TimeStamp       TMC      V1    DATE EPOCH15
## 1:  114N15190_0701_0 114N15190 1598113 7012015       0
## 2:  114N15190_0701_1 114N15190 1598114 7012015       1
## 3: 114N15190_0701_10 114N15190 1598123 7012015      10
## 4: 114N15190_0701_11 114N15190 1598124 7012015      11
## 5: 114N15190_0701_12 114N15190 1598125 7012015      12
## 6: 114N15190_0701_13 114N15190 1598126 7012015      13
##    Travel_TIME_ALL_VEHICLES Travel_TIME_PASSENGER_VEHICLES
## 1:                       NA                             NA
## 2:                       NA                             NA
## 3:                       NA                             NA
## 4:                       NA                             NA
## 5:                       NA                             NA
## 6:                       NA                             NA
##    Travel_TIME_FREIGHT_TRUCKS NP ADMIN_LEVE ADMIN_LE_1 ADMIN_LE_2 DISTANCE
## 1:                         NA  N        USA Washington  Jefferson  1.10912
## 2:                         NA  N        USA Washington  Jefferson  1.10912
## 3:                         NA  N        USA Washington  Jefferson  1.10912
## 4:                         NA  N        USA Washington  Jefferson  1.10912
## 5:                         NA  N        USA Washington  Jefferson  1.10912
## 6:                         NA  N        USA Washington  Jefferson  1.10912
##    ROAD_NUMBE ROAD_NAME LATITUDE LONGITUDE ROAD_DIREC  ORN_FID    FID_1
## 1:     WA-116           48.02978  -122.781  Westbound 23369.53 8774.035
## 2:     WA-116           48.02978  -122.781  Westbound 23369.53 8774.035
## 3:     WA-116           48.02978  -122.781  Westbound 23369.53 8774.035
## 4:     WA-116           48.02978  -122.781  Westbound 23369.53 8774.035
## 5:     WA-116           48.02978  -122.781  Westbound 23369.53 8774.035
## 6:     WA-116           48.02978  -122.781  Westbound 23369.53 8774.035
##    ACCESS LSHL_TY2 LSHL_TYP MED_TYPE NHS_IND PRK_ZNE RSHL_TY2 RSHL_TYP
## 1:      3     <NA>        B     <NA>       Y    <NA>     <NA>        B
## 2:      3     <NA>        B     <NA>       Y    <NA>     <NA>        B
## 3:      3     <NA>        B     <NA>       Y    <NA>     <NA>        B
## 4:      3     <NA>        B     <NA>       Y    <NA>     <NA>        B
## 5:      3     <NA>        B     <NA>       Y    <NA>     <NA>        B
## 6:      3     <NA>        B     <NA>       Y    <NA>     <NA>        B
##    SURF_TYP SURF_TY2 TERRAIN COMP_DIR COUNTY FUNC_CLS MEDBARTY ST_FUNC
## 1:        B     <NA>       R       NE     16       45     <NA>      R3
## 2:        B     <NA>       R       NE     16       45     <NA>      R3
## 3:        B     <NA>       R       NE     16       45     <NA>      R3
## 4:        B     <NA>       R       NE     16       45     <NA>      R3
## 5:        B     <NA>       R       NE     16       45     <NA>      R3
## 6:        B     <NA>       R       NE     16       45     <NA>      R3
##    RTE_NBR          HPMS ROAD_INV SPD_LIMT BEGMP ENDMP LSHLDWID MEDWID
## 1:     116 61161606111A0      116        0     0  1.12 5.431342      0
## 2:     116 61161606111A0      116        0     0  1.12 5.431342      0
## 3:     116 61161606111A0      116        0     0  1.12 5.431342      0
## 4:     116 61161606111A0      116        0     0  1.12 5.431342      0
## 5:     116 61161606111A0      116        0     0  1.12 5.431342      0
## 6:     116 61161606111A0      116        0     0  1.12 5.431342      0
##    NO_LANE1 NO_LANE2 NO_LANES RSHLDWID RSHL_WD2 SEG_LNG lanewid rdwy_wd1
## 1:        1        1        2  5.11466        0 0.14983 13.6313 27.17227
## 2:        1        1        2  5.11466        0 0.14983 13.6313 27.17227
## 3:        1        1        2  5.11466        0 0.14983 13.6313 27.17227
## 4:        1        1        2  5.11466        0 0.14983 13.6313 27.17227
## 5:        1        1        2  5.11466        0 0.14983 13.6313 27.17227
## 6:        1        1        2  5.11466        0 0.14983 13.6313 27.17227
##    rdwy_wd2 rdwy_wid AADT      mvmt rodwycls ORN_FID_1 Total Fatal Injury
## 1:        0 27.17227 6517 0.3580292        8  23369.53     2     0      0
## 2:        0 27.17227 6517 0.3580292        8  23369.53     2     0      0
## 3:        0 27.17227 6517 0.3580292        8  23369.53     2     0      0
## 4:        0 27.17227 6517 0.3580292        8  23369.53     2     0      0
## 5:        0 27.17227 6517 0.3580292        8  23369.53     2     0      0
## 6:        0 27.17227 6517 0.3580292        8  23369.53     2     0      0
##    PDO DAYMTH Crash spd_av spd_pv spd_ft     date Month Day Year MonthDay
## 1:   2   0701     0     NA     NA     NA 07012015    07  01 2015    07_01
## 2:   2   0701     0     NA     NA     NA 07012015    07  01 2015    07_01
## 3:   2   0701     0     NA     NA     NA 07012015    07  01 2015    07_01
## 4:   2   0701     0     NA     NA     NA 07012015    07  01 2015    07_01
## 5:   2   0701     0     NA     NA     NA 07012015    07  01 2015    07_01
## 6:   2   0701     0     NA     NA     NA 07012015    07  01 2015    07_01
day1<- df_R2[,-c(1)] %>% group_by(MonthDay) %>% summarize(Speed_All_Mean=mean(spd_av, na.rm=TRUE))
day1
## # A tibble: 365 x 2
##    MonthDay Speed_All_Mean
##    <chr>             <dbl>
##  1 01_01              49.4
##  2 01_02              48.0
##  3 01_03              48.1
##  4 01_04              44.6
##  5 01_05              47.7
##  6 01_06              48.5
##  7 01_07              48.2
##  8 01_08              48.2
##  9 01_09              48.9
## 10 01_10              48.5
## # ... with 355 more rows
# Step 2: Examine Data
speed_clean <- tsclean(ts(day1$Speed_All_Mean))
plot.ts(speed_clean)

# ggplot() + geom_line(data = Q1, aes(x = TimeStamp, y = speed_clean)) + ylab('Cleaned Speed Records')

day1$cnt_ma = ma(speed_clean, order=7) # using the clean count with no outliers
day1$cnt_ma30 = ma(speed_clean, order=30)
# Step 3: Decompose Your Data
count_ma = ts(na.omit(speed_clean), frequency=30)
decomp = stl(count_ma, s.window="periodic")
deseasonal_cnt <- seasadj(decomp)
plot(decomp)

# Step 4: Stationarity
# statinary test
adf.test(count_ma, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  count_ma
## Dickey-Fuller = -1.7171, Lag order = 7, p-value = 0.6963
## alternative hypothesis: stationary
adf.test(deseasonal_cnt, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  deseasonal_cnt
## Dickey-Fuller = -1.6652, Lag order = 7, p-value = 0.7182
## alternative hypothesis: stationary
d1 = diff(deseasonal_cnt)
adf.test(d1, alternative = "stationary")
## Warning in adf.test(d1, alternative = "stationary"): p-value smaller than
## printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  d1
## Dickey-Fuller = -7.1508, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Step 5: Autocorrelations and Choosing Model Order
# check ACF and PACF
acf2(count_ma)

##          ACF  PACF
##   [1,]  0.85  0.85
##   [2,]  0.80  0.27
##   [3,]  0.77  0.16
##   [4,]  0.74  0.09
##   [5,]  0.72  0.08
##   [6,]  0.71  0.10
##   [7,]  0.72  0.16
##   [8,]  0.67 -0.14
##   [9,]  0.60 -0.21
##  [10,]  0.57 -0.02
##  [11,]  0.53 -0.05
##  [12,]  0.50  0.00
##  [13,]  0.50  0.08
##  [14,]  0.48  0.00
##  [15,]  0.44 -0.06
##  [16,]  0.40  0.01
##  [17,]  0.37 -0.02
##  [18,]  0.34 -0.01
##  [19,]  0.32  0.04
##  [20,]  0.31 -0.02
##  [21,]  0.30  0.00
##  [22,]  0.27 -0.01
##  [23,]  0.25  0.03
##  [24,]  0.22 -0.02
##  [25,]  0.21  0.02
##  [26,]  0.21  0.04
##  [27,]  0.20  0.01
##  [28,]  0.20  0.03
##  [29,]  0.17 -0.09
##  [30,]  0.14 -0.06
##  [31,]  0.11 -0.10
##  [32,]  0.09 -0.01
##  [33,]  0.11  0.10
##  [34,]  0.11  0.06
##  [35,]  0.11 -0.01
##  [36,]  0.07 -0.08
##  [37,]  0.06  0.02
##  [38,]  0.05  0.06
##  [39,]  0.03 -0.04
##  [40,]  0.04  0.01
##  [41,]  0.05 -0.01
##  [42,]  0.03 -0.09
##  [43,]  0.01 -0.03
##  [44,] -0.01  0.00
##  [45,] -0.02  0.01
##  [46,] -0.04  0.01
##  [47,] -0.03  0.01
##  [48,] -0.02  0.00
##  [49,] -0.03  0.04
##  [50,] -0.05 -0.04
##  [51,] -0.05  0.04
##  [52,] -0.06  0.02
##  [53,] -0.06  0.00
##  [54,] -0.06 -0.02
##  [55,] -0.06 -0.04
##  [56,] -0.06  0.00
##  [57,] -0.08 -0.03
##  [58,] -0.08  0.01
##  [59,] -0.09 -0.03
##  [60,] -0.11 -0.06
##  [61,] -0.10  0.00
##  [62,] -0.10 -0.01
##  [63,] -0.10  0.02
##  [64,] -0.12 -0.02
##  [65,] -0.12  0.05
##  [66,] -0.12 -0.01
##  [67,] -0.13  0.00
##  [68,] -0.13 -0.05
##  [69,] -0.12  0.05
##  [70,] -0.13 -0.03
##  [71,] -0.14  0.02
##  [72,] -0.13  0.07
##  [73,] -0.14 -0.03
##  [74,] -0.12  0.07
##  [75,] -0.12 -0.09
##  [76,] -0.12 -0.01
##  [77,] -0.13 -0.04
##  [78,] -0.13  0.01
##  [79,] -0.12  0.02
##  [80,] -0.12  0.02
##  [81,] -0.12 -0.10
##  [82,] -0.11  0.01
##  [83,] -0.12  0.03
##  [84,] -0.14 -0.05
##  [85,] -0.14  0.04
##  [86,] -0.12  0.02
##  [87,] -0.12  0.03
##  [88,] -0.12  0.00
##  [89,] -0.11  0.02
##  [90,] -0.09  0.05
##  [91,] -0.10 -0.03
##  [92,] -0.11 -0.03
##  [93,] -0.08  0.03
##  [94,] -0.09 -0.03
##  [95,] -0.09 -0.04
##  [96,] -0.06  0.10
##  [97,] -0.06 -0.05
##  [98,] -0.07  0.02
##  [99,] -0.06  0.02
## [100,] -0.05 -0.04
## [101,] -0.06 -0.02
## [102,] -0.05  0.03
## [103,] -0.04 -0.06
## [104,] -0.04  0.01
## [105,] -0.03  0.13
## [106,] -0.02 -0.04
## [107,] -0.01  0.01
## [108,] -0.01  0.04
## [109,]  0.01  0.01
## [110,]  0.00 -0.06
## [111,] -0.01 -0.02
## [112,]  0.01  0.02
## [113,]  0.03  0.08
## [114,]  0.02 -0.07
## [115,]  0.03 -0.01
## [116,]  0.04  0.02
## [117,]  0.04  0.01
## [118,]  0.03  0.02
## [119,]  0.04 -0.01
## [120,]  0.05 -0.03
acf2(deseasonal_cnt)

##          ACF  PACF
##   [1,]  0.86  0.86
##   [2,]  0.81  0.26
##   [3,]  0.78  0.16
##   [4,]  0.75  0.08
##   [5,]  0.73  0.07
##   [6,]  0.72  0.09
##   [7,]  0.73  0.15
##   [8,]  0.67 -0.15
##   [9,]  0.60 -0.21
##  [10,]  0.58 -0.01
##  [11,]  0.53 -0.06
##  [12,]  0.51  0.01
##  [13,]  0.50  0.09
##  [14,]  0.49 -0.01
##  [15,]  0.44 -0.07
##  [16,]  0.40  0.00
##  [17,]  0.37 -0.01
##  [18,]  0.35  0.00
##  [19,]  0.33  0.05
##  [20,]  0.32 -0.02
##  [21,]  0.31 -0.01
##  [22,]  0.28 -0.02
##  [23,]  0.25  0.01
##  [24,]  0.23 -0.02
##  [25,]  0.21  0.03
##  [26,]  0.21  0.04
##  [27,]  0.21  0.01
##  [28,]  0.20  0.01
##  [29,]  0.17 -0.09
##  [30,]  0.14 -0.09
##  [31,]  0.11 -0.07
##  [32,]  0.09 -0.01
##  [33,]  0.11  0.11
##  [34,]  0.11  0.07
##  [35,]  0.10 -0.01
##  [36,]  0.07 -0.08
##  [37,]  0.06  0.03
##  [38,]  0.05  0.06
##  [39,]  0.03 -0.05
##  [40,]  0.04  0.03
##  [41,]  0.05 -0.02
##  [42,]  0.03 -0.10
##  [43,]  0.01 -0.02
##  [44,] -0.02 -0.02
##  [45,] -0.03 -0.01
##  [46,] -0.04  0.01
##  [47,] -0.03  0.02
##  [48,] -0.02  0.01
##  [49,] -0.02  0.06
##  [50,] -0.05 -0.05
##  [51,] -0.05  0.04
##  [52,] -0.06  0.01
##  [53,] -0.06 -0.01
##  [54,] -0.06 -0.02
##  [55,] -0.06 -0.03
##  [56,] -0.05  0.01
##  [57,] -0.07 -0.03
##  [58,] -0.08  0.02
##  [59,] -0.09 -0.04
##  [60,] -0.11 -0.09
##  [61,] -0.10  0.03
##  [62,] -0.10 -0.03
##  [63,] -0.10  0.04
##  [64,] -0.12 -0.01
##  [65,] -0.12  0.05
##  [66,] -0.13 -0.03
##  [67,] -0.13  0.01
##  [68,] -0.13 -0.05
##  [69,] -0.12  0.04
##  [70,] -0.13 -0.01
##  [71,] -0.14  0.00
##  [72,] -0.13  0.07
##  [73,] -0.14 -0.01
##  [74,] -0.12  0.06
##  [75,] -0.13 -0.12
##  [76,] -0.12 -0.01
##  [77,] -0.13 -0.04
##  [78,] -0.13  0.02
##  [79,] -0.12  0.05
##  [80,] -0.12  0.00
##  [81,] -0.12 -0.10
##  [82,] -0.12 -0.01
##  [83,] -0.12  0.02
##  [84,] -0.14 -0.05
##  [85,] -0.14  0.06
##  [86,] -0.12  0.02
##  [87,] -0.11  0.03
##  [88,] -0.12  0.01
##  [89,] -0.11  0.02
##  [90,] -0.10  0.01
##  [91,] -0.11 -0.03
##  [92,] -0.11 -0.05
##  [93,] -0.09  0.06
##  [94,] -0.09 -0.02
##  [95,] -0.09 -0.03
##  [96,] -0.07  0.08
##  [97,] -0.07 -0.04
##  [98,] -0.07  0.02
##  [99,] -0.06  0.01
## [100,] -0.06 -0.03
## [101,] -0.06 -0.04
## [102,] -0.05  0.04
## [103,] -0.04 -0.04
## [104,] -0.04  0.02
## [105,] -0.03  0.11
## [106,] -0.02 -0.05
## [107,] -0.01  0.00
## [108,]  0.00  0.05
## [109,]  0.01  0.04
## [110,]  0.00 -0.10
## [111,] -0.01  0.00
## [112,]  0.01 -0.02
## [113,]  0.03  0.09
## [114,]  0.02 -0.08
## [115,]  0.04  0.00
## [116,]  0.04  0.02
## [117,]  0.04  0.02
## [118,]  0.03  0.02
## [119,]  0.04 -0.01
## [120,]  0.04 -0.05

Seasonility Not in Consideration

# Step 6: Fitting an ARIMA model
auto.arima(deseasonal_cnt, seasonal=FALSE)
## Series: deseasonal_cnt 
## ARIMA(3,1,2) 
## 
## Coefficients:
##          ar1      ar2      ar3      ma1     ma2
##       0.8595  -0.2881  -0.1150  -1.3642  0.5770
## s.e.  0.1900   0.0883   0.0804   0.1853  0.1387
## 
## sigma^2 estimated as 0.09534:  log likelihood=-86.53
## AIC=185.05   AICc=185.29   BIC=208.44
# Step 7: Evaluate and Iterate
# (try different model)
fit<-auto.arima(deseasonal_cnt, seasonal=FALSE)
tsdisplay(residuals(fit), lag.max=45, main='Model Residuals [Seasonality not considered]')

# step 8 forcast
fcast <- forecast(fit, h=30)
plot(fcast)

Seasonility in Consideration

# Step 6: Fitting an ARIMA model
auto.arima(deseasonal_cnt, seasonal=TRUE)
## Series: deseasonal_cnt 
## ARIMA(2,1,2)(0,0,2)[30] 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2     sma1     sma2
##       0.9856  -0.3316  -1.4786  0.6152  -0.0251  -0.2670
## s.e.  0.1716   0.0863   0.1698  0.1280   0.0678   0.0848
## 
## sigma^2 estimated as 0.09196:  log likelihood=-81.67
## AIC=177.33   AICc=177.65   BIC=204.61
# Step 7: Evaluate and Iterate
# (try different model)
fit<-auto.arima(deseasonal_cnt, seasonal=TRUE)
tsdisplay(residuals(fit), lag.max=45, main='Model Residuals [Seasonality considered]')

# step 8 forcast
fcast <- forecast(fit, h=30)
plot(fcast)