filenames <- list.files(path = inpath, pattern="*csv$")
filenames <- paste(inpath, filenames, sep="/")
DF <- read_csv(filenames[1], col_types = cols(.default = col_character()))
for (i in filenames[-1]) {
print(i)
df <- read_csv(i, col_types = cols(.default = col_character()))
DF <- full_join(DF, df)
}
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2015_20250412162933_crash_20160101-20161231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2015_20250412163026_crash_20150101-20151231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162001_crash_20230101-20231231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162131_crash_20220101-20221231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162247_crash_20210101-20211231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162436_crash_20200101-20201231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162547_crash_20190101-20191231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2018_20250412162656_crash_20180101-20181231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2023_20250412161759_crash_20240101-20241231_HARRIS.csv"
[1] "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Traffic_crashes/Unzipped/extract_public_2023_20250412162001_crash_20230401-20231231_HARRIS.csv"
df <- DF %>% select(Crash_ID, Crash_Fatal_Fl, Crash_Date, Crash_Time, Rpt_City_ID,
Rpt_Block_Num, Rpt_Street_Pfx, Rpt_Street_Name, Rpt_Street_Sfx, At_Intrsct_Fl,
Latitude, Longitude, Street_Name, Street_Nbr, Tot_Injry_Cnt, Death_Cnt,
Harm_Evnt_ID, Crash_Sev_ID)
df <- left_join(df, City, by="Rpt_City_ID")
df <- left_join(df, Severity, by="Crash_Sev_ID")
df <- left_join(df, Harm, by="Harm_Evnt_ID")
# Create datetime
df <- df %>%
mutate(Date=mdy_hm(paste(Crash_Date, Crash_Time)))
# df <- df %>%
# rename(City=City.x, Severity=Severity.x) %>%
# select(-City.y, -Severity.y)
# df <- left_join(df, Charge, by="Charge_Cat_ID")
# Save raw data
# saveRDS(DF, paste(path, "Raw_Crash_Data_2015_2024.rds", sep="/"))
Do some quality control checks
# Look for NA's in the various fields
df %>%
map_df(function(x) sum(is.na(x))) %>%
gather(feature, num_nulls) %>%
print(n = 100)
# A tibble: 22 × 2
feature num_nulls
<chr> <int>
1 Crash_ID 0
2 Crash_Fatal_Fl 0
3 Crash_Date 0
4 Crash_Time 0
5 Rpt_City_ID 0
6 Rpt_Block_Num 38057
7 Rpt_Street_Pfx 852638
8 Rpt_Street_Name 3
9 Rpt_Street_Sfx 232054
10 At_Intrsct_Fl 0
11 Latitude 135457
12 Longitude 135457
13 Street_Name 2
14 Street_Nbr 875180
15 Tot_Injry_Cnt 0
16 Death_Cnt 0
17 Harm_Evnt_ID 0
18 Crash_Sev_ID 0
19 City 404369
20 Severity 0
21 Harmed 0
22 Date 0
sort(unique(df$Crash_Fatal_Fl))
sort(unique(df$Crash_Date)) %>% head()
[1] "01/01/2015" "01/01/2016" "01/01/2017" "01/01/2018" "01/01/2019"
[6] "01/01/2020"
# make it a date
df$Crash_Date <- mdy(df$Crash_Date)
df %>%
ggplot(aes(Crash_Date)) +
geom_histogram()
sort(unique(df$Crash_Time)) %>% head()
[1] "00:00 AM" "01:00 AM" "01:00 PM" "01:01 AM" "01:01 PM" "01:02 AM"
# make it a time
df <- df %>%
mutate(Crash_Time=ymd_hm(paste("2000/1/1",Crash_Time)))
df %>%
ggplot(aes(Crash_Time)) +
geom_histogram()
sort(unique(df$Rpt_City_ID))
[1] "1013" "1061" "107" "1073" "108" "111" "112" "1159" "1185" "121"
[11] "1322" "1323" "1326" "1348" "135" "1353" "136" "1389" "14" "141"
[21] "1444" "1506" "1532" "1535" "160" "164" "169" "1926" "2071" "208"
[31] "211" "212" "218" "219" "22" "223" "228" "243" "244" "254"
[41] "255" "260" "264" "269" "278" "283" "29" "291" "295" "296"
[51] "298" "30" "304" "311" "315" "317" "325" "326" "33" "3313"
[61] "333" "346" "359" "369" "37" "379" "382" "383" "391" "405"
[71] "408" "409" "413" "414" "422" "430" "434" "435" "441" "442"
[81] "4461" "447" "453" "459" "46" "4728" "478" "5006" "523" "5644"
[91] "5659" "57" "59" "6" "625" "647" "672" "6756" "843" "851"
[101] "855" "8746" "899" "92" "952" "9999"
sort(unique(df$Rpt_Block_Num)) %>% head()
[1] "#1" "`10261" "`123" "`12406" "`19200" "0"
# Show all non-numeric addresses
df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>% head(20)
[1] "35OO" "UNIT" "UNIT BLOCK" "1400O" "UNIT"
[6] "1-99" "135OO" "45OO" "11OOO" "123OO"
[11] "5-99" "18,000" "O" "2372F" "1500`"
[16] "UNIT BLOCK" "1400 BLK" "UNIT BLOCK" "1100 BLOCK" "UNIT"
# Remove non-numeric stuff
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, " BLOCK","")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "O","0")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "^\\d+-","")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, "PP","00")
df$Rpt_Block_Num <- str_replace_all(df$Rpt_Block_Num, " 1/2","")
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "\\D+","")
# Show all non-numeric addresses
df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>% head(20)
[1] "0RE" "0CK" "0A" "0NE" "0WN" "0B"
[7] "121000UTH" "0CK" "0CK" "0CK" "0CK" "0NE"
[13] "0CK" "176000E" "00T" "0CK" "0CK" "0CK"
[19] "0CK" "0CK"
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "0CK","")
df$Rpt_Block_Num <- str_replace(df$Rpt_Block_Num, "0\\D+","")
df$Rpt_Block_Num[grepl("\\D+", df$Rpt_Block_Num)] %>% head(20)
sort(unique(df$Rpt_Street_Pfx))
[1] "E" "N" "NE" "NW" "S" "SE" "SW" "W"
sort(unique(df$Rpt_Street_Sfx))
[1] "AVE" "BLVD" "CIR" "CT" "CV" "DR" "EXPY" "FWY" "HWY" "LN"
[11] "LOOP" "PARK" "PATH" "PKWY" "PL" "PT" "RD" "ST" "TRC" "TRL"
[21] "WAY"
sort(unique(df$Severity))
[1] "FATAL" "NON-INCAPACITATING"
[3] "NOT INJURED" "POSSIBLE INJURY"
[5] "SUSPECTED SERIOUS INJURY" "UNKNOWN"
[1] "Animal" "Fixed Object"
[3] "Motor Vehicle in Transport" "Other Non-Collision"
[5] "Other Object" "Overturned"
[7] "Parked Car" "Pedal Cyclist"
[9] "Pedestrian" "Rail Road"
df %>%
ggplot(aes(x=Harmed)) +
geom_bar() +
coord_flip()
# Save file out
# saveRDS(df, paste(path, "Cleaned_2015_2024.rds", sep="/"))