Ohio Rural Interstate Roadways

Subasish Das and Choalun Ma

2018-11-11

Ohio Conflated Data (Rural Interstate)

library(data.table)
library(dplyr)
library(tidyr)
library(naniar)
library(stringr)
library(ggplot2)
library(DT)
library(lubridate)
library(ggpubr)

mytype = 'RI'
setwd("/scratch/user/cma16/Task4_Deliverable2/OHprocess4/AllCrash/FacilityBased/")
load("./OH_Principal_Arterial_Rural_Interstate_1_TMC_TT_SI_reduce_withCrash.rData")

setwd(paste0("/scratch/user/cma16/Task4_Deliverable2/OHprocess4/AllCrash/FacilityBased/",mytype))
df_RMC <- b02a
dim(df_RMC)
## [1] 3039720      48
### Calculating Speed
df_RMC$spd_av = 3600*df_RMC$DISTANCE/df_RMC$Travel_TIME_ALL_VEHICLES
df_RMC$spd_pv = 3600*df_RMC$DISTANCE/df_RMC$Travel_TIME_PASSENGER_VEHICLES
df_RMC$spd_ft = 3600*df_RMC$DISTANCE/df_RMC$Travel_TIME_FREIGHT_TRUCKS


### Month, Day
df_RMC$date <- as.character(df_RMC$DATE)
df_RMC$date <- str_pad(df_RMC$DATE, 8, pad = "0")
df_RMC$Month <- substr(df_RMC$date, start = 1, stop = 2)
df_RMC$Day   <- substr(df_RMC$date, start = 3, stop = 4)
df_RMC$Year  <- substr(df_RMC$date, start = 5, stop = 8)

ConvEpoc2HM <- function(x) {
  # for a given epoc number, get its hour:min
  y.hr <- x
  y.min <- 0
  x <- paste(str_pad(y.hr, 2, side = 'left', pad='0'), 
             str_pad(y.min, 2, side = 'left', pad='0'), 
             '00', sep = ':')
}


df_RMC$Hour1 <- ConvEpoc2HM(df_RMC$EPOCH1h)
DATE4 <- paste(strptime(df_RMC$date, format = "%m%d%Y", tz =""), df_RMC$Hour1, sep = ' ')
df_RMC$PCT_TIME <- as.POSIXct(DATE4, tz ="", format = "%Y-%m-%d %H:%M:%OS")
df_RMC$Hour <- strftime(df_RMC$PCT_TIME, format="%H")
df_RMC$DOW <- wday(df_RMC$PCT_TIME, label = TRUE)


head(df_RMC,2)
##           TimeStamp       TMC    DATE EPOCH1h Travel_TIME_ALL_VEHICLES
## 1: 108N05179_0101_0 108N05179 1012015       0                       30
## 2: 108N05179_0101_1 108N05179 1012015       1                        5
##    Travel_TIME_PASSENGER_VEHICLES Travel_TIME_FREIGHT_TRUCKS ADMIN_LEVE
## 1:                             NA                         30        USA
## 2:                              5                         NA        USA
##    ADMIN_LE_1 ADMIN_LE_2 DISTANCE ROAD_NUMBE ROAD_NAME LATITUDE LONGITUDE
## 1:       Ohio       Wood  0.08766       I-75           41.16766 -83.64961
## 2:       Ohio       Wood  0.08766       I-75           41.16766 -83.64961
##    ROAD_DIREC ORN_FID COUNTY divided SURF_TYP NHS_CDE HPMS ACCESS  AADT_YR
## 1: Southbound 33004.3    WOO       D        G       N           F 12.03248
## 2: Southbound 33004.3    WOO       D        G       N           F 12.03248
##    FED_FACI PK_LANES MED_TYPE FED_MEDW BEGMP ENDMP   SEG_LNG cnty_rte
## 1:        2        4 4.032475 65.93505     0 25.24 0.2367525 WOO0075R
## 2:        2        4 4.032475 65.93505     0 25.24 0.2367525 WOO0075R
##    rte_nbr     aadt  aadt_bc  aadt_pt surf_wid no_lanes func_cls rodwycls
## 1:   0075R 49134.52 13491.41 35643.11       48        4        1        6
## 2:   0075R 49134.52 13491.41 35643.11       48        4        1        6
##    Total K A B C O DAYMTH Crash  spd_av  spd_pv  spd_ft     date Month Day
## 1:     0 0 0 0 0 0   0101     0 10.5192      NA 10.5192 01012015    01  01
## 2:     0 0 0 0 0 0   0101     0 63.1152 63.1152      NA 01012015    01  01
##    Year    Hour1            PCT_TIME Hour DOW
## 1: 2015 00:00:00 2015-01-01 00:00:00   00 Thu
## 2: 2015 01:00:00 2015-01-01 01:00:00   01 Thu
#####
df1= df_RMC[,c("spd_av", "spd_pv", "spd_ft")]
df2= df_RMC[,c("date","spd_av","spd_pv","spd_ft")]
df3= df_RMC[,c("Month","spd_av","spd_pv","spd_ft")]
df4= df_RMC[,c("Day","spd_av","spd_pv","spd_ft")]
df5= df_RMC[,c("Year","spd_av","spd_pv","spd_ft")]
df6= df_RMC[,c("Hour","spd_av","spd_pv","spd_ft")]
df7= df_RMC[,c("DOW","spd_av","spd_pv","spd_ft")]

Operating Speed

#### Operating Speed by Month
long <- melt(df3, id.vars = c("Month"))
ggviolin(long, "Month", "value", fill = "Month",
   add = "boxplot", add.params = list(fill = "white"))+coord_flip()+facet_grid(. ~ "variable")+
geom_hline(yintercept=63, linetype="dashed", 
                color = "red", size=1)+labs(title="Operating Speed by Month", x="Month",y="Speed (mph)")+ theme(legend.position="none")

#### Operating Speed by Month [All Vehicles]
long <- melt(df3[,c(1,2)], id.vars = c("Month"))
ggviolin(long, "Month", "value", fill = "Month",
         add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
  geom_hline(yintercept=65, linetype="dashed", 
             color = "red", size=1)+labs(title="Operating Speed (All vehicles) by Month", x="Month",y="Speed (mph)")+ theme(legend.position="none")

#### Operating Speed by Month [Passenger Cars]
long <- melt(df3[,c(1,3)], id.vars = c("Month"))
ggviolin(long, "Month", "value", fill = "Month",
         add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
  geom_hline(yintercept=68, linetype="dashed", 
             color = "red", size=1)+labs(title="Operating Speed (Passenger Cars) by Month", x="Month",y="Speed (mph)")+ theme(legend.position="none")

#### Operating Speed by Month [Fright Trucks]
long <- melt(df3[,c(1,4)], id.vars = c("Month"))
ggviolin(long, "Month", "value", fill = "Month",
         add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
  geom_hline(yintercept=62, linetype="dashed", 
             color = "red", size=1)+labs(title="Operating Speed (Freight Trucks) by Month", x="Month",y="Speed (mph)")+ theme(legend.position="none")

#### Operating Speed by DOW
long <- melt(df7, id.vars = c("DOW"))
ggviolin(long, "DOW", "value", fill = "DOW",
   add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
geom_hline(yintercept=63, linetype="dashed", 
                color = "red", size=1)+labs(title="Operating Speed by Day of Week", x="Day of Week",y="Speed (mph)")+ theme(legend.position="none")

#### Operating Speed by Hour
long <- melt(df6, id.vars = c("Hour"))
long$Hour <- as.factor(long$Hour)
ggviolin(long, "Hour", "value", fill = "Hour",
   add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
geom_hline(yintercept=63, linetype="dashed", 
                color = "red", size=1)+labs(title="Operating Speed by Hour", x="Hour",y="Speed (mph)")+ theme(legend.position="none")
## Warning: Removed 381865 rows containing non-finite values (stat_ydensity).
## Warning: Removed 381865 rows containing non-finite values (stat_boxplot).

#### Operating Speed by Hour
long <- melt(df6, id.vars = c("Hour"))
long$Hour <- as.factor(long$Hour)
ggviolin(long, "Hour", "value", fill = "Hour",
   add = "boxplot", add.params = list(fill = "white"))+coord_flip()+
geom_hline(yintercept=63, linetype="dashed", 
                color = "red", size=1)+labs(title="Operating Speed by Hour", x="Hour",y="Speed (mph)")+ theme(legend.position="none")
## Warning: Removed 381865 rows containing non-finite values (stat_ydensity).
## Warning: Removed 381865 rows containing non-finite values (stat_boxplot).

Missing Value Plots

# vis_miss(df1, warn_large_data=F)
theme_set(theme_bw(base_size = 18))

gg_miss_var(df3, facet = Month, show_pct = TRUE)

gg_miss_var(df4, facet = Day, show_pct = TRUE)

gg_miss_var(df5, facet = Year, show_pct = TRUE)

gg_miss_var(df6, facet = Hour, show_pct = TRUE)

gg_miss_var(df7, facet = DOW, show_pct = TRUE)

gg_miss_var(df2, facet = date, show_pct = TRUE)

Missing Value Tables

### Missingness by Date
df2a <- df2 %>%
  group_by(date) %>%
  miss_var_summary()
datatable(
  df2a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by Hour
df6a <- df6 %>%
  group_by(Hour) %>%
  miss_var_summary()
datatable(
  df6a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by Month
df3a <- df3 %>%
  group_by(Month) %>%
  miss_var_summary()
datatable(
  df3a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by DOW
df7a <- df7 %>%
  group_by(DOW) %>%
  miss_var_summary()
datatable(
  df7a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by Day
df4a <- df4 %>%
  group_by(Day) %>%
  miss_var_summary()
datatable(
  df4a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### Missingness by Year
df5a <- df5 %>%
  group_by(Year) %>%
  miss_var_summary()
datatable(
  df5a, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)
### TMC Level
tmc1 <- df_RMC[,c("TMC")] %>%
  group_by(TMC) %>%
  summarize(Count=n())
datatable(
  tmc1, extensions = 'Buttons', options = list(
    dom = 'Bfrtip',
    buttons = c('csv', 'excel', 'print')
  )
)