Washington Rural Multi-lane Undivided

Subasish Das and Choalun Ma

2018-11-12

# Step 1: Load R Packages 
### options(repos='http://cran.rstudio.com/')
#install.packages("astsa")
#install.packages('ggplot2')
#install.packages('forecast')
#install.packages('tseries')
#install.packages("data.table")

library(astsa)
library(forecast)
library(tseries)
library(zoo)
library(tseries)

library(data.table)
library(dplyr)
library(tidyr)
library(naniar)
library(stringr)
library(ggplot2)
library(DT)
library(lubridate)
library(ggpubr)


setwd("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/")
load("./multi-lane_undivided_WA_reduce_withCrash.rData")
mytype = 'RMU'
setwd(paste0("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/",mytype))

df_RMU <- W_mun_nomed
dim(df_RMU)
## [1] 946944     64
df_RMU$spd_av = 3600*df_RMU$DISTANCE/df_RMU$Travel_TIME_ALL_VEHICLES
df_RMU$spd_pv = 3600*df_RMU$DISTANCE/df_RMU$Travel_TIME_PASSENGER_VEHICLES
df_RMU$spd_ft = 3600*df_RMU$DISTANCE/df_RMU$Travel_TIME_FREIGHT_TRUCKS

### Month, Day
df_RMU$date <- as.character(df_RMU$DATE)
df_RMU$date <- str_pad(df_RMU$DATE, 8, pad = "0")
df_RMU$Month <- substr(df_RMU$date, start = 1, stop = 2)
df_RMU$Day   <- substr(df_RMU$date, start = 3, stop = 4)
df_RMU$Year  <- substr(df_RMU$date, start = 5, stop = 8)
df_RMU$MonthDay <- paste0(df_RMU$Month,"_", df_RMU$Day)
head(df_RMU)
##            TimeStamp       TMC     V1    DATE EPOCH15
## 1:  114N05556_0101_0 114N05556 479137 1012015       0
## 2:  114N05556_0101_1 114N05556 479138 1012015       1
## 3: 114N05556_0101_10 114N05556 479147 1012015      10
## 4: 114N05556_0101_11 114N05556 479148 1012015      11
## 5: 114N05556_0101_12 114N05556 479149 1012015      12
## 6: 114N05556_0101_13 114N05556 479150 1012015      13
##    Travel_TIME_ALL_VEHICLES Travel_TIME_PASSENGER_VEHICLES
## 1:                       NA                             NA
## 2:                       NA                             NA
## 3:                       NA                             NA
## 4:                      156                             NA
## 5:                       NA                             NA
## 6:                       NA                             NA
##    Travel_TIME_FREIGHT_TRUCKS NP ADMIN_LEVE ADMIN_LE_1 ADMIN_LE_2 DISTANCE
## 1:                         NA  N        USA Washington    Whitman  0.67404
## 2:                         NA  N        USA Washington    Whitman  0.67404
## 3:                         NA  N        USA Washington    Whitman  0.67404
## 4:                        156  N        USA Washington    Whitman  0.67404
## 5:                         NA  N        USA Washington    Whitman  0.67404
## 6:                         NA  N        USA Washington    Whitman  0.67404
##    ROAD_NUMBE ROAD_NAME LATITUDE LONGITUDE ROAD_DIREC  ORN_FID    FID_1
## 1:     US-195            46.8799 -117.3648 Southbound 31759.53 9634.893
## 2:     US-195            46.8799 -117.3648 Southbound 31759.53 9634.893
## 3:     US-195            46.8799 -117.3648 Southbound 31759.53 9634.893
## 4:     US-195            46.8799 -117.3648 Southbound 31759.53 9634.893
## 5:     US-195            46.8799 -117.3648 Southbound 31759.53 9634.893
## 6:     US-195            46.8799 -117.3648 Southbound 31759.53 9634.893
##    ACCESS LSHL_TY2 LSHL_TYP MED_TYPE NHS_IND PRK_ZNE RSHL_TY2 RSHL_TYP
## 1:      5                 C                Y       X                 C
## 2:      5                 C                Y       X                 C
## 3:      5                 C                Y       X                 C
## 4:      5                 C                Y       X                 C
## 5:      5                 C                Y       X                 C
## 6:      5                 C                Y       X                 C
##    SURF_TYP SURF_TY2 TERRAIN COMP_DIR COUNTY FUNC_CLS MEDBARTY ST_FUNC
## 1:        A                R        N     38       43               R1
## 2:        A                R        N     38       43               R1
## 3:        A                R        N     38       43               R1
## 4:        A                R        N     38       43               R1
## 5:        A                R        N     38       43               R1
## 6:        A                R        N     38       43               R1
##    RTE_NBR          HPMS ROAD_INV SPD_LIMT BEGMP ENDMP LSHLDWID MEDWID
## 1:     195 6195380374900      195       60 35.95 36.64        0      0
## 2:     195 6195380374900      195       60 35.95 36.64        0      0
## 3:     195 6195380374900      195       60 35.95 36.64        0      0
## 4:     195 6195380374900      195       60 35.95 36.64        0      0
## 5:     195 6195380374900      195       60 35.95 36.64        0      0
## 6:     195 6195380374900      195       60 35.95 36.64        0      0
##    NO_LANE1 NO_LANE2 NO_LANES RSHLDWID RSHL_WD2    SEG_LNG  lanewid
## 1: 1.961917 1.977804        4        0        0 0.05879853 13.23761
## 2: 1.961917 1.977804        4        0        0 0.05879853 13.23761
## 3: 1.961917 1.977804        4        0        0 0.05879853 13.23761
## 4: 1.961917 1.977804        4        0        0 0.05879853 13.23761
## 5: 1.961917 1.977804        4        0        0 0.05879853 13.23761
## 6: 1.961917 1.977804        4        0        0 0.05879853 13.23761
##    rdwy_wd1 rdwy_wd2 rdwy_wid     AADT      mvmt rodwycls ORN_FID_1 Total
## 1: 51.96074        0 51.96074 10130.56 0.2192078       10  31759.53    10
## 2: 51.96074        0 51.96074 10130.56 0.2192078       10  31759.53    10
## 3: 51.96074        0 51.96074 10130.56 0.2192078       10  31759.53    10
## 4: 51.96074        0 51.96074 10130.56 0.2192078       10  31759.53    10
## 5: 51.96074        0 51.96074 10130.56 0.2192078       10  31759.53    10
## 6: 51.96074        0 51.96074 10130.56 0.2192078       10  31759.53    10
##    Fatal Injury PDO DAYMTH Crash   spd_av spd_pv   spd_ft     date Month
## 1:     0      2   8   0101     0       NA     NA       NA 01012015    01
## 2:     0      2   8   0101     0       NA     NA       NA 01012015    01
## 3:     0      2   8   0101     0       NA     NA       NA 01012015    01
## 4:     0      2   8   0101     0 15.55477     NA 15.55477 01012015    01
## 5:     0      2   8   0101     0       NA     NA       NA 01012015    01
## 6:     0      2   8   0101     0       NA     NA       NA 01012015    01
##    Day Year MonthDay
## 1:  01 2015    01_01
## 2:  01 2015    01_01
## 3:  01 2015    01_01
## 4:  01 2015    01_01
## 5:  01 2015    01_01
## 6:  01 2015    01_01
day1<- df_RMU[,-c(1)] %>% group_by(MonthDay) %>% summarize(Speed_All_Mean=mean(spd_av, na.rm=TRUE))
day1
## # A tibble: 365 x 2
##    MonthDay Speed_All_Mean
##    <chr>             <dbl>
##  1 01_01              36.6
##  2 01_02              35.9
##  3 01_03              37.0
##  4 01_04              33.9
##  5 01_05              36.0
##  6 01_06              37.5
##  7 01_07              37.3
##  8 01_08              36.5
##  9 01_09              36.0
## 10 01_10              35.8
## # ... with 355 more rows
# Step 2: Examine Data
speed_clean <- tsclean(ts(day1$Speed_All_Mean))
plot.ts(speed_clean)

# ggplot() + geom_line(data = Q1, aes(x = TimeStamp, y = speed_clean)) + ylab('Cleaned Speed Records')

day1$cnt_ma = ma(speed_clean, order=7) # using the clean count with no outliers
day1$cnt_ma30 = ma(speed_clean, order=30)
# Step 3: Decompose Your Data
count_ma = ts(na.omit(speed_clean), frequency=30)
decomp = stl(count_ma, s.window="periodic")
deseasonal_cnt <- seasadj(decomp)
plot(decomp)

# Step 4: Stationarity
# statinary test
adf.test(count_ma, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  count_ma
## Dickey-Fuller = -3.7714, Lag order = 7, p-value = 0.0207
## alternative hypothesis: stationary
adf.test(deseasonal_cnt, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  deseasonal_cnt
## Dickey-Fuller = -3.7924, Lag order = 7, p-value = 0.01965
## alternative hypothesis: stationary
d1 = diff(deseasonal_cnt)
adf.test(d1, alternative = "stationary")
## Warning in adf.test(d1, alternative = "stationary"): p-value smaller than
## printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  d1
## Dickey-Fuller = -10.073, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Step 5: Autocorrelations and Choosing Model Order
# check ACF and PACF
acf2(count_ma)

##          ACF  PACF
##   [1,]  0.49  0.49
##   [2,]  0.56  0.43
##   [3,]  0.51  0.24
##   [4,]  0.49  0.14
##   [5,]  0.53  0.19
##   [6,]  0.43  0.00
##   [7,]  0.63  0.35
##   [8,]  0.40 -0.09
##   [9,]  0.49  0.05
##  [10,]  0.45  0.02
##  [11,]  0.41 -0.04
##  [12,]  0.46  0.05
##  [13,]  0.40  0.05
##  [14,]  0.56  0.19
##  [15,]  0.35 -0.08
##  [16,]  0.46  0.04
##  [17,]  0.40 -0.03
##  [18,]  0.39  0.01
##  [19,]  0.46  0.09
##  [20,]  0.38  0.02
##  [21,]  0.52  0.09
##  [22,]  0.35 -0.03
##  [23,]  0.43 -0.03
##  [24,]  0.37 -0.03
##  [25,]  0.37  0.03
##  [26,]  0.40 -0.05
##  [27,]  0.31 -0.06
##  [28,]  0.49  0.14
##  [29,]  0.31 -0.04
##  [30,]  0.38 -0.03
##  [31,]  0.31 -0.05
##  [32,]  0.35  0.05
##  [33,]  0.37  0.00
##  [34,]  0.29 -0.01
##  [35,]  0.47  0.13
##  [36,]  0.27 -0.06
##  [37,]  0.37  0.01
##  [38,]  0.32  0.02
##  [39,]  0.32 -0.01
##  [40,]  0.34 -0.01
##  [41,]  0.25 -0.06
##  [42,]  0.44  0.07
##  [43,]  0.24 -0.01
##  [44,]  0.32 -0.06
##  [45,]  0.29  0.02
##  [46,]  0.29  0.02
##  [47,]  0.33  0.03
##  [48,]  0.26  0.00
##  [49,]  0.39  0.00
##  [50,]  0.24  0.02
##  [51,]  0.33  0.04
##  [52,]  0.31  0.05
##  [53,]  0.27 -0.03
##  [54,]  0.33  0.01
##  [55,]  0.26  0.00
##  [56,]  0.36 -0.04
##  [57,]  0.22 -0.02
##  [58,]  0.27 -0.09
##  [59,]  0.24 -0.04
##  [60,]  0.25  0.03
##  [61,]  0.26 -0.04
##  [62,]  0.19 -0.04
##  [63,]  0.30  0.00
##  [64,]  0.20  0.06
##  [65,]  0.25  0.01
##  [66,]  0.23  0.02
##  [67,]  0.22 -0.02
##  [68,]  0.24 -0.01
##  [69,]  0.16 -0.04
##  [70,]  0.27 -0.05
##  [71,]  0.17  0.01
##  [72,]  0.21 -0.02
##  [73,]  0.18 -0.03
##  [74,]  0.21  0.05
##  [75,]  0.22  0.02
##  [76,]  0.11 -0.09
##  [77,]  0.27  0.10
##  [78,]  0.12 -0.04
##  [79,]  0.20  0.03
##  [80,]  0.16 -0.03
##  [81,]  0.12 -0.10
##  [82,]  0.14 -0.07
##  [83,]  0.06 -0.04
##  [84,]  0.21  0.02
##  [85,]  0.06 -0.03
##  [86,]  0.12 -0.01
##  [87,]  0.10 -0.04
##  [88,]  0.09  0.01
##  [89,]  0.14  0.06
##  [90,]  0.05  0.00
##  [91,]  0.20  0.10
##  [92,]  0.03 -0.06
##  [93,]  0.14  0.04
##  [94,]  0.08 -0.05
##  [95,]  0.08  0.03
##  [96,]  0.10 -0.02
##  [97,]  0.00 -0.08
##  [98,]  0.15  0.03
##  [99,]  0.03  0.02
## [100,]  0.09 -0.01
## [101,]  0.04 -0.02
## [102,]  0.03 -0.05
## [103,]  0.06 -0.03
## [104,] -0.03  0.01
## [105,]  0.09 -0.02
## [106,]  0.00  0.05
## [107,]  0.06  0.00
## [108,]  0.03  0.04
## [109,]  0.00 -0.06
## [110,]  0.02 -0.02
## [111,] -0.05  0.00
## [112,]  0.12  0.08
## [113,] -0.04 -0.05
## [114,]  0.03 -0.03
## [115,] -0.01 -0.01
## [116,] -0.03 -0.03
## [117,]  0.01  0.01
## [118,] -0.05  0.08
## [119,]  0.06 -0.06
## [120,] -0.08 -0.04
acf2(deseasonal_cnt)

##          ACF  PACF
##   [1,]  0.50  0.50
##   [2,]  0.57  0.43
##   [3,]  0.53  0.24
##   [4,]  0.51  0.14
##   [5,]  0.53  0.18
##   [6,]  0.44 -0.01
##   [7,]  0.64  0.36
##   [8,]  0.41 -0.11
##   [9,]  0.51  0.06
##  [10,]  0.46  0.01
##  [11,]  0.42 -0.04
##  [12,]  0.47  0.04
##  [13,]  0.41  0.06
##  [14,]  0.57  0.19
##  [15,]  0.37 -0.08
##  [16,]  0.47  0.02
##  [17,]  0.41 -0.02
##  [18,]  0.39 -0.01
##  [19,]  0.48  0.12
##  [20,]  0.39  0.02
##  [21,]  0.53  0.09
##  [22,]  0.36 -0.04
##  [23,]  0.43 -0.04
##  [24,]  0.38 -0.03
##  [25,]  0.38  0.05
##  [26,]  0.41 -0.04
##  [27,]  0.32 -0.06
##  [28,]  0.50  0.14
##  [29,]  0.32 -0.03
##  [30,]  0.36 -0.09
##  [31,]  0.33 -0.01
##  [32,]  0.36  0.07
##  [33,]  0.38 -0.01
##  [34,]  0.30  0.00
##  [35,]  0.47  0.11
##  [36,]  0.28 -0.08
##  [37,]  0.37  0.04
##  [38,]  0.33  0.00
##  [39,]  0.33 -0.01
##  [40,]  0.34 -0.02
##  [41,]  0.27 -0.06
##  [42,]  0.44  0.07
##  [43,]  0.25  0.01
##  [44,]  0.32 -0.06
##  [45,]  0.31  0.02
##  [46,]  0.29  0.02
##  [47,]  0.34  0.03
##  [48,]  0.26 -0.02
##  [49,]  0.40  0.04
##  [50,]  0.25  0.03
##  [51,]  0.34  0.04
##  [52,]  0.32  0.04
##  [53,]  0.28 -0.04
##  [54,]  0.33  0.00
##  [55,]  0.26  0.03
##  [56,]  0.37 -0.04
##  [57,]  0.23 -0.02
##  [58,]  0.27 -0.08
##  [59,]  0.25 -0.04
##  [60,]  0.24 -0.01
##  [61,]  0.27 -0.02
##  [62,]  0.20 -0.03
##  [63,]  0.30 -0.01
##  [64,]  0.21  0.07
##  [65,]  0.25  0.00
##  [66,]  0.23  0.01
##  [67,]  0.23  0.02
##  [68,]  0.25 -0.02
##  [69,]  0.17 -0.04
##  [70,]  0.26 -0.06
##  [71,]  0.18 -0.01
##  [72,]  0.21  0.00
##  [73,]  0.19 -0.01
##  [74,]  0.21  0.04
##  [75,]  0.23  0.03
##  [76,]  0.12 -0.11
##  [77,]  0.28  0.10
##  [78,]  0.12 -0.07
##  [79,]  0.20  0.06
##  [80,]  0.17 -0.03
##  [81,]  0.13 -0.10
##  [82,]  0.14 -0.09
##  [83,]  0.06 -0.05
##  [84,]  0.21  0.02
##  [85,]  0.07  0.00
##  [86,]  0.13 -0.01
##  [87,]  0.11 -0.04
##  [88,]  0.09 -0.01
##  [89,]  0.14  0.08
##  [90,]  0.04 -0.03
##  [91,]  0.20  0.12
##  [92,]  0.04 -0.06
##  [93,]  0.14  0.03
##  [94,]  0.09 -0.07
##  [95,]  0.07  0.05
##  [96,]  0.10 -0.05
##  [97,]  0.00 -0.04
##  [98,]  0.15  0.01
##  [99,]  0.04  0.03
## [100,]  0.08 -0.06
## [101,]  0.05  0.00
## [102,]  0.03 -0.05
## [103,]  0.07  0.00
## [104,] -0.03  0.00
## [105,]  0.10 -0.01
## [106,]  0.00  0.00
## [107,]  0.05  0.03
## [108,]  0.03 -0.01
## [109,]  0.00 -0.02
## [110,]  0.02 -0.02
## [111,] -0.05  0.01
## [112,]  0.11  0.05
## [113,] -0.05 -0.05
## [114,]  0.02 -0.03
## [115,]  0.00  0.04
## [116,] -0.02 -0.03
## [117,]  0.01  0.02
## [118,] -0.05  0.06
## [119,]  0.06 -0.04
## [120,] -0.09 -0.08

Seasonility Not in Consideration

# Step 6: Fitting an ARIMA model
auto.arima(deseasonal_cnt, seasonal=FALSE)
## Series: deseasonal_cnt 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.8769
## s.e.   0.0275
## 
## sigma^2 estimated as 0.5475:  log likelihood=-407.08
## AIC=818.15   AICc=818.18   BIC=825.95
# Step 7: Evaluate and Iterate
# (try different model)
fit<-auto.arima(deseasonal_cnt, seasonal=FALSE)
tsdisplay(residuals(fit), lag.max=45, main='Model Residuals [Seasonality not considered]')

# step 8 forcast
fcast <- forecast(fit, h=30)
plot(fcast)

Seasonility in Consideration

# Step 6: Fitting an ARIMA model
auto.arima(deseasonal_cnt, seasonal=TRUE)
## Series: deseasonal_cnt 
## ARIMA(0,1,3)(2,0,0)[30] 
## 
## Coefficients:
##           ma1     ma2      ma3     sar1     sar2
##       -0.9411  0.1711  -0.1092  -0.0628  -0.0520
## s.e.   0.0531  0.0762   0.0544   0.0593   0.0598
## 
## sigma^2 estimated as 0.5427:  log likelihood=-403.6
## AIC=819.21   AICc=819.44   BIC=842.59
# Step 7: Evaluate and Iterate
# (try different model)
fit<-auto.arima(deseasonal_cnt, seasonal=TRUE)
tsdisplay(residuals(fit), lag.max=45, main='Model Residuals [Seasonality considered]')

# step 8 forcast
fcast <- forecast(fit, h=30)
plot(fcast)