# Step 1: Load R Packages
### options(repos='http://cran.rstudio.com/')
#install.packages("astsa")
#install.packages('ggplot2')
#install.packages('forecast')
#install.packages('tseries')
#install.packages("data.table")
library(astsa)
library(forecast)
library(tseries)
library(zoo)
library(tseries)
library(data.table)
library(dplyr)
library(tidyr)
library(naniar)
library(stringr)
library(ggplot2)
library(DT)
library(lubridate)
library(ggpubr)
mytype = 'R2'
setwd("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/")
load("./two-lane_undivided_WA_reduce_withCrash.rData")
setwd(paste0("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/",mytype))
df_R2 <- W_2un_nomed
dim(df_R2)
## [1] 23080608 64
df_R2$spd_av = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_ALL_VEHICLES
df_R2$spd_pv = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_PASSENGER_VEHICLES
df_R2$spd_ft = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_FREIGHT_TRUCKS
### Remove outliers
df_R2$spd_av = ifelse(df_R2$spd_av <120, df_R2$spd_av, NA)
df_R2$spd_pv = ifelse(df_R2$spd_pv <120, df_R2$spd_pv, NA)
df_R2$spd_ft = ifelse(df_R2$spd_ft <120, df_R2$spd_ft, NA)
### Month, Day
df_R2$date <- as.character(df_R2$DATE)
df_R2$date <- str_pad(df_R2$DATE, 8, pad = "0")
df_R2$Month <- substr(df_R2$date, start = 1, stop = 2)
df_R2$Day <- substr(df_R2$date, start = 3, stop = 4)
df_R2$Year <- substr(df_R2$date, start = 5, stop = 8)
df_R2$MonthDay <- paste0(df_R2$Month,"_", df_R2$Day)
head(df_R2)
## TimeStamp TMC V1 DATE EPOCH15
## 1: 114N15190_0701_0 114N15190 1598113 7012015 0
## 2: 114N15190_0701_1 114N15190 1598114 7012015 1
## 3: 114N15190_0701_10 114N15190 1598123 7012015 10
## 4: 114N15190_0701_11 114N15190 1598124 7012015 11
## 5: 114N15190_0701_12 114N15190 1598125 7012015 12
## 6: 114N15190_0701_13 114N15190 1598126 7012015 13
## Travel_TIME_ALL_VEHICLES Travel_TIME_PASSENGER_VEHICLES
## 1: NA NA
## 2: NA NA
## 3: NA NA
## 4: NA NA
## 5: NA NA
## 6: NA NA
## Travel_TIME_FREIGHT_TRUCKS NP ADMIN_LEVE ADMIN_LE_1 ADMIN_LE_2 DISTANCE
## 1: NA N USA Washington Jefferson 1.10912
## 2: NA N USA Washington Jefferson 1.10912
## 3: NA N USA Washington Jefferson 1.10912
## 4: NA N USA Washington Jefferson 1.10912
## 5: NA N USA Washington Jefferson 1.10912
## 6: NA N USA Washington Jefferson 1.10912
## ROAD_NUMBE ROAD_NAME LATITUDE LONGITUDE ROAD_DIREC ORN_FID FID_1
## 1: WA-116 48.02978 -122.781 Westbound 23369.53 8774.035
## 2: WA-116 48.02978 -122.781 Westbound 23369.53 8774.035
## 3: WA-116 48.02978 -122.781 Westbound 23369.53 8774.035
## 4: WA-116 48.02978 -122.781 Westbound 23369.53 8774.035
## 5: WA-116 48.02978 -122.781 Westbound 23369.53 8774.035
## 6: WA-116 48.02978 -122.781 Westbound 23369.53 8774.035
## ACCESS LSHL_TY2 LSHL_TYP MED_TYPE NHS_IND PRK_ZNE RSHL_TY2 RSHL_TYP
## 1: 3 <NA> B <NA> Y <NA> <NA> B
## 2: 3 <NA> B <NA> Y <NA> <NA> B
## 3: 3 <NA> B <NA> Y <NA> <NA> B
## 4: 3 <NA> B <NA> Y <NA> <NA> B
## 5: 3 <NA> B <NA> Y <NA> <NA> B
## 6: 3 <NA> B <NA> Y <NA> <NA> B
## SURF_TYP SURF_TY2 TERRAIN COMP_DIR COUNTY FUNC_CLS MEDBARTY ST_FUNC
## 1: B <NA> R NE 16 45 <NA> R3
## 2: B <NA> R NE 16 45 <NA> R3
## 3: B <NA> R NE 16 45 <NA> R3
## 4: B <NA> R NE 16 45 <NA> R3
## 5: B <NA> R NE 16 45 <NA> R3
## 6: B <NA> R NE 16 45 <NA> R3
## RTE_NBR HPMS ROAD_INV SPD_LIMT BEGMP ENDMP LSHLDWID MEDWID
## 1: 116 61161606111A0 116 0 0 1.12 5.431342 0
## 2: 116 61161606111A0 116 0 0 1.12 5.431342 0
## 3: 116 61161606111A0 116 0 0 1.12 5.431342 0
## 4: 116 61161606111A0 116 0 0 1.12 5.431342 0
## 5: 116 61161606111A0 116 0 0 1.12 5.431342 0
## 6: 116 61161606111A0 116 0 0 1.12 5.431342 0
## NO_LANE1 NO_LANE2 NO_LANES RSHLDWID RSHL_WD2 SEG_LNG lanewid rdwy_wd1
## 1: 1 1 2 5.11466 0 0.14983 13.6313 27.17227
## 2: 1 1 2 5.11466 0 0.14983 13.6313 27.17227
## 3: 1 1 2 5.11466 0 0.14983 13.6313 27.17227
## 4: 1 1 2 5.11466 0 0.14983 13.6313 27.17227
## 5: 1 1 2 5.11466 0 0.14983 13.6313 27.17227
## 6: 1 1 2 5.11466 0 0.14983 13.6313 27.17227
## rdwy_wd2 rdwy_wid AADT mvmt rodwycls ORN_FID_1 Total Fatal Injury
## 1: 0 27.17227 6517 0.3580292 8 23369.53 2 0 0
## 2: 0 27.17227 6517 0.3580292 8 23369.53 2 0 0
## 3: 0 27.17227 6517 0.3580292 8 23369.53 2 0 0
## 4: 0 27.17227 6517 0.3580292 8 23369.53 2 0 0
## 5: 0 27.17227 6517 0.3580292 8 23369.53 2 0 0
## 6: 0 27.17227 6517 0.3580292 8 23369.53 2 0 0
## PDO DAYMTH Crash spd_av spd_pv spd_ft date Month Day Year MonthDay
## 1: 2 0701 0 NA NA NA 07012015 07 01 2015 07_01
## 2: 2 0701 0 NA NA NA 07012015 07 01 2015 07_01
## 3: 2 0701 0 NA NA NA 07012015 07 01 2015 07_01
## 4: 2 0701 0 NA NA NA 07012015 07 01 2015 07_01
## 5: 2 0701 0 NA NA NA 07012015 07 01 2015 07_01
## 6: 2 0701 0 NA NA NA 07012015 07 01 2015 07_01
day1<- df_R2[,-c(1)] %>% group_by(MonthDay) %>% summarize(Speed_All_Mean=mean(spd_av, na.rm=TRUE))
day1
## # A tibble: 365 x 2
## MonthDay Speed_All_Mean
## <chr> <dbl>
## 1 01_01 49.4
## 2 01_02 48.0
## 3 01_03 48.1
## 4 01_04 44.6
## 5 01_05 47.7
## 6 01_06 48.5
## 7 01_07 48.2
## 8 01_08 48.2
## 9 01_09 48.9
## 10 01_10 48.5
## # ... with 355 more rows
# Step 2: Examine Data
speed_clean <- tsclean(ts(day1$Speed_All_Mean))
plot.ts(speed_clean)
# ggplot() + geom_line(data = Q1, aes(x = TimeStamp, y = speed_clean)) + ylab('Cleaned Speed Records')
day1$cnt_ma = ma(speed_clean, order=7) # using the clean count with no outliers
day1$cnt_ma30 = ma(speed_clean, order=30)
# Step 3: Decompose Your Data
count_ma = ts(na.omit(speed_clean), frequency=30)
decomp = stl(count_ma, s.window="periodic")
deseasonal_cnt <- seasadj(decomp)
plot(decomp)
# Step 4: Stationarity
# statinary test
adf.test(count_ma, alternative = "stationary")
##
## Augmented Dickey-Fuller Test
##
## data: count_ma
## Dickey-Fuller = -1.7171, Lag order = 7, p-value = 0.6963
## alternative hypothesis: stationary
adf.test(deseasonal_cnt, alternative = "stationary")
##
## Augmented Dickey-Fuller Test
##
## data: deseasonal_cnt
## Dickey-Fuller = -1.6652, Lag order = 7, p-value = 0.7182
## alternative hypothesis: stationary
d1 = diff(deseasonal_cnt)
adf.test(d1, alternative = "stationary")
## Warning in adf.test(d1, alternative = "stationary"): p-value smaller than
## printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: d1
## Dickey-Fuller = -7.1508, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# Step 5: Autocorrelations and Choosing Model Order
# check ACF and PACF
acf2(count_ma)
## ACF PACF
## [1,] 0.85 0.85
## [2,] 0.80 0.27
## [3,] 0.77 0.16
## [4,] 0.74 0.09
## [5,] 0.72 0.08
## [6,] 0.71 0.10
## [7,] 0.72 0.16
## [8,] 0.67 -0.14
## [9,] 0.60 -0.21
## [10,] 0.57 -0.02
## [11,] 0.53 -0.05
## [12,] 0.50 0.00
## [13,] 0.50 0.08
## [14,] 0.48 0.00
## [15,] 0.44 -0.06
## [16,] 0.40 0.01
## [17,] 0.37 -0.02
## [18,] 0.34 -0.01
## [19,] 0.32 0.04
## [20,] 0.31 -0.02
## [21,] 0.30 0.00
## [22,] 0.27 -0.01
## [23,] 0.25 0.03
## [24,] 0.22 -0.02
## [25,] 0.21 0.02
## [26,] 0.21 0.04
## [27,] 0.20 0.01
## [28,] 0.20 0.03
## [29,] 0.17 -0.09
## [30,] 0.14 -0.06
## [31,] 0.11 -0.10
## [32,] 0.09 -0.01
## [33,] 0.11 0.10
## [34,] 0.11 0.06
## [35,] 0.11 -0.01
## [36,] 0.07 -0.08
## [37,] 0.06 0.02
## [38,] 0.05 0.06
## [39,] 0.03 -0.04
## [40,] 0.04 0.01
## [41,] 0.05 -0.01
## [42,] 0.03 -0.09
## [43,] 0.01 -0.03
## [44,] -0.01 0.00
## [45,] -0.02 0.01
## [46,] -0.04 0.01
## [47,] -0.03 0.01
## [48,] -0.02 0.00
## [49,] -0.03 0.04
## [50,] -0.05 -0.04
## [51,] -0.05 0.04
## [52,] -0.06 0.02
## [53,] -0.06 0.00
## [54,] -0.06 -0.02
## [55,] -0.06 -0.04
## [56,] -0.06 0.00
## [57,] -0.08 -0.03
## [58,] -0.08 0.01
## [59,] -0.09 -0.03
## [60,] -0.11 -0.06
## [61,] -0.10 0.00
## [62,] -0.10 -0.01
## [63,] -0.10 0.02
## [64,] -0.12 -0.02
## [65,] -0.12 0.05
## [66,] -0.12 -0.01
## [67,] -0.13 0.00
## [68,] -0.13 -0.05
## [69,] -0.12 0.05
## [70,] -0.13 -0.03
## [71,] -0.14 0.02
## [72,] -0.13 0.07
## [73,] -0.14 -0.03
## [74,] -0.12 0.07
## [75,] -0.12 -0.09
## [76,] -0.12 -0.01
## [77,] -0.13 -0.04
## [78,] -0.13 0.01
## [79,] -0.12 0.02
## [80,] -0.12 0.02
## [81,] -0.12 -0.10
## [82,] -0.11 0.01
## [83,] -0.12 0.03
## [84,] -0.14 -0.05
## [85,] -0.14 0.04
## [86,] -0.12 0.02
## [87,] -0.12 0.03
## [88,] -0.12 0.00
## [89,] -0.11 0.02
## [90,] -0.09 0.05
## [91,] -0.10 -0.03
## [92,] -0.11 -0.03
## [93,] -0.08 0.03
## [94,] -0.09 -0.03
## [95,] -0.09 -0.04
## [96,] -0.06 0.10
## [97,] -0.06 -0.05
## [98,] -0.07 0.02
## [99,] -0.06 0.02
## [100,] -0.05 -0.04
## [101,] -0.06 -0.02
## [102,] -0.05 0.03
## [103,] -0.04 -0.06
## [104,] -0.04 0.01
## [105,] -0.03 0.13
## [106,] -0.02 -0.04
## [107,] -0.01 0.01
## [108,] -0.01 0.04
## [109,] 0.01 0.01
## [110,] 0.00 -0.06
## [111,] -0.01 -0.02
## [112,] 0.01 0.02
## [113,] 0.03 0.08
## [114,] 0.02 -0.07
## [115,] 0.03 -0.01
## [116,] 0.04 0.02
## [117,] 0.04 0.01
## [118,] 0.03 0.02
## [119,] 0.04 -0.01
## [120,] 0.05 -0.03
## ACF PACF
## [1,] 0.86 0.86
## [2,] 0.81 0.26
## [3,] 0.78 0.16
## [4,] 0.75 0.08
## [5,] 0.73 0.07
## [6,] 0.72 0.09
## [7,] 0.73 0.15
## [8,] 0.67 -0.15
## [9,] 0.60 -0.21
## [10,] 0.58 -0.01
## [11,] 0.53 -0.06
## [12,] 0.51 0.01
## [13,] 0.50 0.09
## [14,] 0.49 -0.01
## [15,] 0.44 -0.07
## [16,] 0.40 0.00
## [17,] 0.37 -0.01
## [18,] 0.35 0.00
## [19,] 0.33 0.05
## [20,] 0.32 -0.02
## [21,] 0.31 -0.01
## [22,] 0.28 -0.02
## [23,] 0.25 0.01
## [24,] 0.23 -0.02
## [25,] 0.21 0.03
## [26,] 0.21 0.04
## [27,] 0.21 0.01
## [28,] 0.20 0.01
## [29,] 0.17 -0.09
## [30,] 0.14 -0.09
## [31,] 0.11 -0.07
## [32,] 0.09 -0.01
## [33,] 0.11 0.11
## [34,] 0.11 0.07
## [35,] 0.10 -0.01
## [36,] 0.07 -0.08
## [37,] 0.06 0.03
## [38,] 0.05 0.06
## [39,] 0.03 -0.05
## [40,] 0.04 0.03
## [41,] 0.05 -0.02
## [42,] 0.03 -0.10
## [43,] 0.01 -0.02
## [44,] -0.02 -0.02
## [45,] -0.03 -0.01
## [46,] -0.04 0.01
## [47,] -0.03 0.02
## [48,] -0.02 0.01
## [49,] -0.02 0.06
## [50,] -0.05 -0.05
## [51,] -0.05 0.04
## [52,] -0.06 0.01
## [53,] -0.06 -0.01
## [54,] -0.06 -0.02
## [55,] -0.06 -0.03
## [56,] -0.05 0.01
## [57,] -0.07 -0.03
## [58,] -0.08 0.02
## [59,] -0.09 -0.04
## [60,] -0.11 -0.09
## [61,] -0.10 0.03
## [62,] -0.10 -0.03
## [63,] -0.10 0.04
## [64,] -0.12 -0.01
## [65,] -0.12 0.05
## [66,] -0.13 -0.03
## [67,] -0.13 0.01
## [68,] -0.13 -0.05
## [69,] -0.12 0.04
## [70,] -0.13 -0.01
## [71,] -0.14 0.00
## [72,] -0.13 0.07
## [73,] -0.14 -0.01
## [74,] -0.12 0.06
## [75,] -0.13 -0.12
## [76,] -0.12 -0.01
## [77,] -0.13 -0.04
## [78,] -0.13 0.02
## [79,] -0.12 0.05
## [80,] -0.12 0.00
## [81,] -0.12 -0.10
## [82,] -0.12 -0.01
## [83,] -0.12 0.02
## [84,] -0.14 -0.05
## [85,] -0.14 0.06
## [86,] -0.12 0.02
## [87,] -0.11 0.03
## [88,] -0.12 0.01
## [89,] -0.11 0.02
## [90,] -0.10 0.01
## [91,] -0.11 -0.03
## [92,] -0.11 -0.05
## [93,] -0.09 0.06
## [94,] -0.09 -0.02
## [95,] -0.09 -0.03
## [96,] -0.07 0.08
## [97,] -0.07 -0.04
## [98,] -0.07 0.02
## [99,] -0.06 0.01
## [100,] -0.06 -0.03
## [101,] -0.06 -0.04
## [102,] -0.05 0.04
## [103,] -0.04 -0.04
## [104,] -0.04 0.02
## [105,] -0.03 0.11
## [106,] -0.02 -0.05
## [107,] -0.01 0.00
## [108,] 0.00 0.05
## [109,] 0.01 0.04
## [110,] 0.00 -0.10
## [111,] -0.01 0.00
## [112,] 0.01 -0.02
## [113,] 0.03 0.09
## [114,] 0.02 -0.08
## [115,] 0.04 0.00
## [116,] 0.04 0.02
## [117,] 0.04 0.02
## [118,] 0.03 0.02
## [119,] 0.04 -0.01
## [120,] 0.04 -0.05
Seasonility Not in Consideration
# Step 6: Fitting an ARIMA model
auto.arima(deseasonal_cnt, seasonal=FALSE)
## Series: deseasonal_cnt
## ARIMA(3,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2
## 0.8595 -0.2881 -0.1150 -1.3642 0.5770
## s.e. 0.1900 0.0883 0.0804 0.1853 0.1387
##
## sigma^2 estimated as 0.09534: log likelihood=-86.53
## AIC=185.05 AICc=185.29 BIC=208.44
# Step 7: Evaluate and Iterate
# (try different model)
fit<-auto.arima(deseasonal_cnt, seasonal=FALSE)
tsdisplay(residuals(fit), lag.max=45, main='Model Residuals [Seasonality not considered]')
# step 8 forcast
fcast <- forecast(fit, h=30)
plot(fcast)
Seasonility in Consideration
# Step 6: Fitting an ARIMA model
auto.arima(deseasonal_cnt, seasonal=TRUE)
## Series: deseasonal_cnt
## ARIMA(2,1,2)(0,0,2)[30]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 sma2
## 0.9856 -0.3316 -1.4786 0.6152 -0.0251 -0.2670
## s.e. 0.1716 0.0863 0.1698 0.1280 0.0678 0.0848
##
## sigma^2 estimated as 0.09196: log likelihood=-81.67
## AIC=177.33 AICc=177.65 BIC=204.61
# Step 7: Evaluate and Iterate
# (try different model)
fit<-auto.arima(deseasonal_cnt, seasonal=TRUE)
tsdisplay(residuals(fit), lag.max=45, main='Model Residuals [Seasonality considered]')
# step 8 forcast
fcast <- forecast(fit, h=30)
plot(fcast)