Read in TxDoT Crash data

Mapping
Supporting Activism
Texas
Harris County
Read in the crash data from TxDoT for Harris county
Author

Alan Jackson

Published

April 12, 2025

filenames <- list.files(path = inpath, pattern="*csv$")

filenames <- paste(inpath, filenames, sep="/")

DF <- read_csv(filenames[1], col_types = cols(.default = col_character()))

for (i in filenames[-1]) {
  print(i)
  df <- read_csv(i, col_types = cols(.default = col_character()))
  DF <- full_join(DF, df)
}
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2015_20250412162933_crash_20160101-20161231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2015_20250412163026_crash_20150101-20151231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162001_crash_20230101-20231231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162131_crash_20220101-20221231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162247_crash_20210101-20211231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162436_crash_20200101-20201231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162547_crash_20190101-20191231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162656_crash_20180101-20181231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2023_20250412161759_crash_20240101-20241231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2023_20250412162001_crash_20230401-20231231_HARRIS.csv"
df <- DF %>% select(Crash_ID, Crash_Fatal_Fl, Crash_Date,   Crash_Time, Rpt_City_ID,
Rpt_Block_Num, Rpt_Street_Pfx,  Rpt_Street_Name, Rpt_Street_Sfx, At_Intrsct_Fl,
Latitude, Longitude, Street_Name, Street_Nbr, Tot_Injry_Cnt, Death_Cnt,
Harm_Evnt_ID, Crash_Sev_ID)

df <- left_join(df, City, by="Rpt_City_ID")
df <- left_join(df, Severity, by="Crash_Sev_ID")
df <- left_join(df, Harm, by="Harm_Evnt_ID")

#   Create datetime 

df <- df %>% 
  mutate(Date=mdy_hm(paste(Crash_Date, Crash_Time)))

# df <- df %>% 
#   rename(City=City.x, Severity=Severity.x) %>% 
#   select(-City.y, -Severity.y)
# df <- left_join(df, Charge, by="Charge_Cat_ID")

#   Save raw data

# saveRDS(DF, paste(path, "Raw_Crash_Data_2015_2024.rds", sep="/"))

Do some quality control checks

# Look for NA's in the various fields

df %>%
    map_df(function(x) sum(is.na(x))) %>%
    gather(feature, num_nulls) %>%
    print(n = 100)
# A tibble: 22 × 2
   feature         num_nulls
   <chr>               <int>
 1 Crash_ID                0
 2 Crash_Fatal_Fl          0
 3 Crash_Date              0
 4 Crash_Time              0
 5 Rpt_City_ID             0
 6 Rpt_Block_Num       38057
 7 Rpt_Street_Pfx     852638
 8 Rpt_Street_Name         3
 9 Rpt_Street_Sfx     232054
10 At_Intrsct_Fl           0
11 Latitude           135457
12 Longitude          135457
13 Street_Name             2
14 Street_Nbr         875180
15 Tot_Injry_Cnt           0
16 Death_Cnt               0
17 Harm_Evnt_ID            0
18 Crash_Sev_ID            0
19 City               404369
20 Severity                0
21 Harmed                  0
22 Date                    0
sort(unique(df$Crash_Fatal_Fl))
[1] "N" "Y"
sort(unique(df$Crash_Date)) %>%  head()
[1] "01/01/2015" "01/01/2016" "01/01/2017" "01/01/2018" "01/01/2019"
[6] "01/01/2020"
#  make it a date

df$Crash_Date <- mdy(df$Crash_Date)

df %>% 
  ggplot(aes(Crash_Date)) +
    geom_histogram()

sort(unique(df$Crash_Time)) %>% head()
[1] "00:00 AM" "01:00 AM" "01:00 PM" "01:01 AM" "01:01 PM" "01:02 AM"
#   make it a time

df <- df %>% 
  mutate(Crash_Time=ymd_hm(paste("2000/1/1",Crash_Time)))

df %>% 
  ggplot(aes(Crash_Time)) +
    geom_histogram()

sort(unique(df$Rpt_City_ID))
  [1] "1013" "1061" "107"  "1073" "108"  "111"  "112"  "1159" "1185" "121" 
 [11] "1322" "1323" "1326" "1348" "135"  "1353" "136"  "1389" "14"   "141" 
 [21] "1444" "1506" "1532" "1535" "160"  "164"  "169"  "1926" "2071" "208" 
 [31] "211"  "212"  "218"  "219"  "22"   "223"  "228"  "243"  "244"  "254" 
 [41] "255"  "260"  "264"  "269"  "278"  "283"  "29"   "291"  "295"  "296" 
 [51] "298"  "30"   "304"  "311"  "315"  "317"  "325"  "326"  "33"   "3313"
 [61] "333"  "346"  "359"  "369"  "37"   "379"  "382"  "383"  "391"  "405" 
 [71] "408"  "409"  "413"  "414"  "422"  "430"  "434"  "435"  "441"  "442" 
 [81] "4461" "447"  "453"  "459"  "46"   "4728" "478"  "5006" "523"  "5644"
 [91] "5659" "57"   "59"   "6"    "625"  "647"  "672"  "6756" "843"  "851" 
[101] "855"  "8746" "899"  "92"   "952"  "9999"
sort(unique(df$Rpt_Block_Num)) %>%  head()
[1] "#1"     "`10261" "`123"   "`12406" "`19200" "0"     
#   Show all non-numeric addresses
df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>%  head(20)
 [1] "35OO"       "UNIT"       "UNIT BLOCK" "1400O"      "UNIT"      
 [6] "1-99"       "135OO"      "45OO"       "11OOO"      "123OO"     
[11] "5-99"       "18,000"     "O"          "2372F"      "1500`"     
[16] "UNIT BLOCK" "1400 BLK"   "UNIT BLOCK" "1100 BLOCK" "UNIT"      
#   Remove non-numeric stuff

df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, " BLOCK","")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "O","0")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "^\\d+-","")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "PP","00")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, " 1/2","")
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "\\D+","")

#   Show all non-numeric addresses
df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>%  head(20)
 [1] "0RE"       "0CK"       "0A"        "0NE"       "0WN"       "0B"       
 [7] "121000UTH" "0CK"       "0CK"       "0CK"       "0CK"       "0NE"      
[13] "0CK"       "176000E"   "00T"       "0CK"       "0CK"       "0CK"      
[19] "0CK"       "0CK"      
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "0CK","")
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "0\\D+","")

df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>%  head(20)
character(0)
sort(unique(df$Rpt_Street_Pfx))
[1] "E"  "N"  "NE" "NW" "S"  "SE" "SW" "W" 
sort(unique(df$Rpt_Street_Sfx))
 [1] "AVE"  "BLVD" "CIR"  "CT"   "CV"   "DR"   "EXPY" "FWY"  "HWY"  "LN"  
[11] "LOOP" "PARK" "PATH" "PKWY" "PL"   "PT"   "RD"   "ST"   "TRC"  "TRL" 
[21] "WAY" 
sort(unique(df$Severity))
[1] "FATAL"                    "NON-INCAPACITATING"      
[3] "NOT INJURED"              "POSSIBLE INJURY"         
[5] "SUSPECTED SERIOUS INJURY" "UNKNOWN"                 
sort(unique(df$Harmed))
 [1] "Animal"                     "Fixed Object"              
 [3] "Motor Vehicle in Transport" "Other Non-Collision"       
 [5] "Other Object"               "Overturned"                
 [7] "Parked Car"                 "Pedal Cyclist"             
 [9] "Pedestrian"                 "Rail Road"                 
df %>% 
  ggplot(aes(x=Harmed)) +
  geom_bar() +
  coord_flip()

#     Save file out

# saveRDS(df, paste(path, "Cleaned_2015_2024.rds", sep="/"))