Washington Rural Two-Lane

Subasish Das and Chaolun Ma

2018-11-12

Washington Conflated Data (Rural Two-Lane)

library(data.table)
library(dplyr)
library(wavelets)
library(tidyr)
library(naniar)
library(stringr)
library(ggplot2)
library(DT)
library(lubridate)
library(ggpubr)



mytype = 'R2'
setwd("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/")
load("./two-lane_undivided_WA_reduce_withCrash.rData")
setwd(paste0("/scratch/user/cma16/Task4_Deliverable2/Process4/AllCrash/FacilityBased/",mytype))

df_R2 <- W_2un_nomed
dim(df_R2)
## [1] 23080608       64
### Calculating Speed
df_R2$spd_av = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_ALL_VEHICLES
df_R2$spd_pv = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_PASSENGER_VEHICLES
df_R2$spd_ft = 3600*df_R2$DISTANCE/df_R2$Travel_TIME_FREIGHT_TRUCKS

df_R2$spd_av = ifelse(df_R2$spd_av <120, df_R2$spd_av, NA)
df_R2$spd_pv = ifelse(df_R2$spd_pv <120, df_R2$spd_pv, NA)
df_R2$spd_ft = ifelse(df_R2$spd_ft <120, df_R2$spd_ft, NA)

### Month, Day
df_R2$date <- as.character(df_R2$DATE)
df_R2$date <- str_pad(df_R2$DATE, 8, pad = "0")
df_R2$Month <- substr(df_R2$date, start = 1, stop = 2)
df_R2$Day   <- substr(df_R2$date, start = 3, stop = 4)
df_R2$Year  <- substr(df_R2$date, start = 5, stop = 8)

head(df_R2,2)
##           TimeStamp       TMC      V1    DATE EPOCH15
## 1: 114N15190_0701_0 114N15190 1598113 7012015       0
## 2: 114N15190_0701_1 114N15190 1598114 7012015       1
##    Travel_TIME_ALL_VEHICLES Travel_TIME_PASSENGER_VEHICLES
## 1:                       NA                             NA
## 2:                       NA                             NA
##    Travel_TIME_FREIGHT_TRUCKS NP ADMIN_LEVE ADMIN_LE_1 ADMIN_LE_2 DISTANCE
## 1:                         NA  N        USA Washington  Jefferson  1.10912
## 2:                         NA  N        USA Washington  Jefferson  1.10912
##    ROAD_NUMBE ROAD_NAME LATITUDE LONGITUDE ROAD_DIREC  ORN_FID    FID_1
## 1:     WA-116           48.02978  -122.781  Westbound 23369.53 8774.035
## 2:     WA-116           48.02978  -122.781  Westbound 23369.53 8774.035
##    ACCESS LSHL_TY2 LSHL_TYP MED_TYPE NHS_IND PRK_ZNE RSHL_TY2 RSHL_TYP
## 1:      3     <NA>        B     <NA>       Y    <NA>     <NA>        B
## 2:      3     <NA>        B     <NA>       Y    <NA>     <NA>        B
##    SURF_TYP SURF_TY2 TERRAIN COMP_DIR COUNTY FUNC_CLS MEDBARTY ST_FUNC
## 1:        B     <NA>       R       NE     16       45     <NA>      R3
## 2:        B     <NA>       R       NE     16       45     <NA>      R3
##    RTE_NBR          HPMS ROAD_INV SPD_LIMT BEGMP ENDMP LSHLDWID MEDWID
## 1:     116 61161606111A0      116        0     0  1.12 5.431342      0
## 2:     116 61161606111A0      116        0     0  1.12 5.431342      0
##    NO_LANE1 NO_LANE2 NO_LANES RSHLDWID RSHL_WD2 SEG_LNG lanewid rdwy_wd1
## 1:        1        1        2  5.11466        0 0.14983 13.6313 27.17227
## 2:        1        1        2  5.11466        0 0.14983 13.6313 27.17227
##    rdwy_wd2 rdwy_wid AADT      mvmt rodwycls ORN_FID_1 Total Fatal Injury
## 1:        0 27.17227 6517 0.3580292        8  23369.53     2     0      0
## 2:        0 27.17227 6517 0.3580292        8  23369.53     2     0      0
##    PDO DAYMTH Crash spd_av spd_pv spd_ft     date Month Day Year
## 1:   2   0701     0     NA     NA     NA 07012015    07  01 2015
## 2:   2   0701     0     NA     NA     NA 07012015    07  01 2015
#####
df1= df_R2[,c("spd_av","spd_pv","spd_ft")]
df2= df_R2[,c("date","spd_av","spd_pv","spd_ft")]
df3= df_R2[,c("Month","spd_av","spd_pv","spd_ft")]
df4= df_R2[,c("Day","spd_av","spd_pv","spd_ft")]
df5= df_R2[,c("Year","spd_av","spd_pv","spd_ft")]


######################################################
ConvEpoc2HM <- function(x) {
  # for a given epoc number, get its hour:min
  yy <- x*15
  y.hr <- yy %/% 60
  y.min <- yy %% 60
  x <- paste(str_pad(y.hr, 2, side = 'left', pad='0'), 
                     str_pad(y.min, 2, side = 'left', pad='0'), 
                     '00', sep = ':')
}

df_R2$Hour1 <- ConvEpoc2HM(df_R2$EPOCH15)
DATE4 <- paste(strptime(df_R2$date, format = "%m%d%Y", tz =""), df_R2$Hour1, sep = ' ')
df_R2$PCT_TIME <- as.POSIXct(DATE4, tz ="", format = "%Y-%m-%d %H:%M:%OS")
df_R2$Hour <- strftime(df_R2$PCT_TIME, format="%H")
df_R2$DOW <- wday(df_R2$PCT_TIME, label = TRUE)


df6= df_R2[,c("Hour","spd_av","spd_pv","spd_ft")]
df7= df_R2[,c("DOW","spd_av","spd_pv","spd_ft")]

################################################################





#### Operating Speed by Month
long <- melt(df3, id.vars = c("Month"))
ggviolin(long, "Month", "value", fill = "Month",
   add = "boxplot", add.params = list(fill = "white"))+coord_flip()+facet_grid(. ~ "variable")+
geom_hline(yintercept=52, linetype="dashed", 
                color = "red", size=1)+labs(title="Operating Speed by Month", x="Month",y="Speed (mph)")+ theme(legend.position="none")
## Warning: Removed 54740310 rows containing non-finite values
## (stat_ydensity).
## Warning: Removed 54740310 rows containing non-finite values (stat_boxplot).

#### Operating Speed by Month [All Vehicles]
long <- melt(df3[,c(1,2)], id.vars = c("Month"))
ggviolin(long, "Month", "value", fill = "Month",
         add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
  geom_hline(yintercept=52, linetype="dashed", 
             color = "red", size=1)+labs(title="Operating Speed (All vehicles) by Month", x="Month",y="Speed (mph)")+ theme(legend.position="none")
## Warning: Removed 16440591 rows containing non-finite values
## (stat_ydensity).
## Warning: Removed 16440591 rows containing non-finite values (stat_boxplot).

#### Operating Speed by Month [Passenger Cars]
long <- melt(df3[,c(1,3)], id.vars = c("Month"))
ggviolin(long, "Month", "value", fill = "Month",
         add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
  geom_hline(yintercept=52, linetype="dashed", 
             color = "red", size=1)+labs(title="Operating Speed (Passenger Cars) by Month", x="Month",y="Speed (mph)")+ theme(legend.position="none")
## Warning: Removed 18617983 rows containing non-finite values
## (stat_ydensity).
## Warning: Removed 18617983 rows containing non-finite values (stat_boxplot).

#### Operating Speed by Month [Fright Trucks]
long <- melt(df3[,c(1,4)], id.vars = c("Month"))
ggviolin(long, "Month", "value", fill = "Month",
         add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
  geom_hline(yintercept=52, linetype="dashed", 
             color = "red", size=1)+labs(title="Operating Speed (Freight Trucks) by Month", x="Month",y="Speed (mph)")+ theme(legend.position="none")
## Warning: Removed 19681736 rows containing non-finite values
## (stat_ydensity).
## Warning: Removed 19681736 rows containing non-finite values (stat_boxplot).

gg_miss_upset(df1)

gg_miss_var(df3, facet = Month, show_pct = TRUE)

gg_miss_var(df4, facet = Day, show_pct = TRUE)

gg_miss_var(df5, facet = Year, show_pct = TRUE)

gg_miss_var(df2, facet = date, show_pct = TRUE)

### Missingness by Date
df2a <- df2 %>%
  group_by(date) %>%
  miss_var_summary()
datatable(
  df2a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by Month
df3a <- df3 %>%
  group_by(Month) %>%
  miss_var_summary()
datatable(
  df3a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by Day
df4a <- df4 %>%
  group_by(Day) %>%
  miss_var_summary()
datatable(
  df4a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by Year
df5a <- df5 %>%
  group_by(Year) %>%
  miss_var_summary()
datatable(
  df5a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### TMC Level
tmc1 <- df_R2[,c('TMC')] %>%
  group_by(TMC) %>%
  summarize(Count=n())
datatable(
  tmc1, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
#### Operating Speed by Hour
long <- melt(df6, id.vars = c("Hour"))
long$Hour <- as.factor(long$Hour)
ggviolin(long, "Hour", "value", fill = "Hour",
   add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
geom_hline(yintercept=54, linetype="dashed", 
                color = "red", size=1)+labs(title="Operating Speed by Hour", x="Hour",y="Speed (mph)")+ theme(legend.position="none")
## Warning: Removed 54740310 rows containing non-finite values
## (stat_ydensity).
## Warning: Removed 54740310 rows containing non-finite values (stat_boxplot).

#### Operating Speed by DOW
long <- melt(df7, id.vars = c("DOW"))
ggviolin(long, "DOW", "value", fill = "DOW",
   add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
geom_hline(yintercept=54, linetype="dashed", 
                color = "red", size=1)+labs(title="Operating Speed by DOW", x="Day of Week",y="Speed (mph)")+ theme(legend.position="none")
## Warning: Removed 54740310 rows containing non-finite values
## (stat_ydensity).

## Warning: Removed 54740310 rows containing non-finite values (stat_boxplot).

gg_miss_var(df6, facet = Hour, show_pct = TRUE)

### Missingness by Hour
df6a <- df6 %>%
  group_by(Hour) %>%
  miss_var_summary()
datatable(
  df6a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by DOW
df7a <- df7 %>%
  group_by(DOW) %>%
  miss_var_summary()
datatable(
  df7a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)