library(tidyverse)
library(stringr)
<- "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Houston_Permits/"
savepath
<- "https://www.houstontx.gov/planning/DevelopRegs/docs_pdfs/Permit_eReport/"
url_begin
::opts_chunk$set(echo = TRUE) knitr
Read Houston City Permit Data
Read in Permit Data
The permit data is stored weekly on a website, either as a docx table or an excel spreadsheet (and once as an html file), the choice of format seemingly random. Additionally, the file names are inconsistent: each contains the date, but not in a consistent fashion. These files are obviously generated each week by hand - which is a terrible indictment of the fourth largest city in the country, but there it is. Earlier I had read in data through March 2022, so this will extend that data up through 9 Sep 2024. But wait! It gets worse. At least one supposed xls file is actually html! Jerks.
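Since the extension cannot be trusted, it is worth sniffing the actual contents of a downloaded file before deciding how to read it. The processing loop below does this with the system file command; here is a minimal R-only sketch of the same idea (the helper name is_really_html is mine, purely illustrative):

# Hypothetical helper: peek at the first line of a file to see if it is secretly HTML
is_really_html <- function(path) {
  first_line <- readLines(path, n=1, warn=FALSE, skipNul=TRUE)
  any(stringr::str_detect(first_line,
                          stringr::regex("<!DOCTYPE html|<html", ignore_case=TRUE)))
}
# is_really_html(paste0(savepath, "Web-eReport-07-17-23.xls")) # example only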
Step 1: Read in the files
Some sample names:
docs_pdfs/Permit_eReport/2024/Web-eReport-02-12-24.docx
docs_pdfs/Permit_eReport/2024/Web-eReport-02-19-24.docx
docs_pdfs/Permit_eReport/2024/Web%20eReport%2002-26-24.html
docs_pdfs/Permit_eReport/2024/Web%20eReport%2003-04-24.docx
docs_pdfs/Permit_eReport/2024/Web%20eReport%2003-11-24.docx
docs_pdfs/Permit_eReport/2023/Web-eReport-07-17-23.xls
docs_pdfs/Permit_eReport/2023/Web-eReport-07-24-23.xls
Naah, this is a nightmare. I’ll just take the webpage and edit it by hand to create a list of files to download.
Replace blanks with %20, and trim off everything up to the year folder, where it exists.
# Read in the list of files and download them

filenames <- read_csv(paste0(savepath, "Permits_2022-9Sep2024.txt"))

for (filename in filenames$Filenames) { # read files in and save them
  filename <- stringr::str_replace_all(filename, " ", "%20")
  # print(paste("Read file", filename))
  if (str_detect(filename, "^#")) {next} # bad file, commented out
  outname <- str_remove(filename, "^202[0-4]/")
  # If already downloaded, skip
  if (!file.exists(paste0(savepath, outname))) {
    # print(paste("Output", outname))
    download.file(paste0(url_begin, filename),
                  destfile=paste0(savepath, outname), mode="wb")
  }
}
# Now let's start trying to tear them apart. Note that there are a handful of excel
# spreadsheets in and amongst the docx tables. And one html. Gaaa...
# And some are corrupt. I'll mark those in the input file of names.

for (filename in filenames$Filenames) { # read files in and extract
  if (str_detect(filename, "^#")) {next} # bad file, commented out
  outname <- str_remove(filename, "^202[0-4]/")
  # print(paste("Input", outname))
  fileout <- paste0(savepath, outname, ".rds")
  if (file.exists(fileout)) {next} # skip if file already made
  if (str_detect(outname, "xls")) { # Oh for God's sake, some are excel????
    # but is it really excel or are you just trying to screw with me?
    file_query <- system2(command="file", args=paste0(savepath, outname),
                          stdout=TRUE)
    if (stringr::str_detect(file_query, "HTML")) { # xls extension, but really html
      print("---- xls but html")
      tmp <- rvest::read_html(paste0(savepath, outname)) %>%
        rvest::html_table()
      tmp <- tmp[[1]]
      tmp <- tmp %>% # remove headers and trailers
        filter(stringr::str_detect(X1, "^77")) %>%
        mutate(X5=stringr::str_replace_all(X5, " +", " ")) # shrink long strings of blanks
      names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
                      "Address", "Comments")
      saveRDS(tmp, fileout)
    } else { # genuinely excel
      print("---- xls")
      tmp <- readxl::read_excel(paste0(savepath, outname), col_names=FALSE)
      tmp <- tmp[-c(1,4),]
      tmp <- tmp[,1:6]
      names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
                      "Address", "Comments")
      tmp <- tmp %>% mutate(Permit_date = as.Date(as.numeric(Permit_date),
                                                  origin = "1899-12-30"))
      saveRDS(tmp, fileout)
    }
  } else if (str_detect(outname, "html")) { # the odd html file, but honest at least
    # print("---- html")
    tmp <- rvest::read_html(paste0(savepath, outname)) %>%
      rvest::html_table()
    tmp <- tmp[[1]]
    tmp <- tmp %>% # remove headers and trailers
      filter(stringr::str_detect(X1, "^77")) %>%
      select(X1, X2, X3, X4, X5, X6) %>%
      mutate(X5=stringr::str_replace_all(X5, " +", " ")) # shrink long strings of blanks
    names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
                    "Address", "Comments")
    saveRDS(tmp, fileout)
  } else { # and some are docx?
    report <- docxtractr::read_docx(paste0(savepath, outname))
    tables <- docxtractr::docx_extract_all_tbls(report)
    tmp <- tables[[1]]
    tmp <- tmp[-c(1,2),]
    tmp <- tmp[,1:6]
    names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
                    "Address", "Comments")
    saveRDS(tmp, fileout)
  }
}
Combine
Hopefully the individual files are all compatible now and combining them will be a simple task.
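Before binding everything, a quick sanity check that every intermediate .rds file really carries the same six columns can save some grief. A minimal sketch, assuming the only .rds files in savepath at this point are the per-week files written above:

# Optional sanity check: do all the per-week .rds files share the expected columns?
expected <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
              "Address", "Comments")
rds_files <- list.files(savepath, pattern="\\.rds$", full.names=TRUE)
bad <- rds_files[!sapply(rds_files, function(f) identical(names(readRDS(f)), expected))]
if (length(bad) > 0) print(bad) # any files that need another look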
# If already done this, then skip
if (!file.exists(paste0(savepath, "Permit_data_2022-9Sep2024.rds"))) {
  df <- NULL
  for (filename in filenames$Filenames) { # read files in and extract
    if (str_detect(filename, "^#")) {next} # bad file, commented out
    outname <- str_remove(filename, "^202[0-4]/")
    # print(paste("Input", outname))
    fileout <- paste0(savepath, outname, ".rds")
    #------------------- now try combining
    foo <- readRDS(fileout)
    foo$Permit_date <- lubridate::as_date(foo$Permit_date)
    df <- bind_rows(df, foo)
  }
  df <- df[!df$Zipcode=="",] # eliminate blank lines
  df <- df[!is.na(df$Permit_date),] # eliminate nearly blank lines
  df <- df[!df$Zipcode=="Zip Code",]
  saveRDS(df, paste0(savepath, "Permit_data_2022-9Sep2024.rds"))
}
Now we need to Geocode the data
Let’s clean up the address field where necessary, and then use my handy, dandy Houston geocoder to add lat longs to the data.
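One note before the code: the cleanup pipeline below leans on a string_replace() helper that is not defined in this post. From the way it is called, it appears to take a column name, a regex, a replacement, and an action flag, and to write the result to an Output column, so the first call works on Address and every later call refines Output. Here is a minimal sketch of what such a helper might look like - a reconstruction for readability, not the actual function:

# Sketch of the string_replace() helper used below (a reconstruction, not the original):
# apply a regex replacement to the named column and store the result in Output
string_replace <- function(df, column, pattern, replacement, action="Apply") {
  if (action == "Apply") {
    df %>% mutate(Output = stringr::str_replace_all(.data[[column]], pattern, replacement))
  } else {
    df # a non-"Apply" mode might just preview the rows that would change
  }
}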
df <- readRDS(paste0(savepath, "Permit_data_2022-9Sep2024.rds"))

library(postmastr)
# detach("package:GeocodeHou", unload=TRUE)
library(GeocodeHou)

# Protect Nasa Rd 1, it is unique in having a number following RD
foo <- df %>% mutate(Address=str_replace(Address, "NASA RD 1", "NASA ROAD 1"))
foo <- foo %>% # head(5000) %>%
  # get rid of extra white space
  mutate(Address=str_squish(Address)) %>%
  # get rid of addresses like 325 1/2. Get rid of the fraction
  string_replace("Address", "^(\\d+)\\s?\\d/\\d ", "\\1 ", "Apply") %>%
  # Collapse farm roads
  string_replace("Output", " F M ", " FM ", "Apply") %>%
  # change & to AND
  string_replace("Output", "&", "AND", "Apply") %>%
  # Remove BLD (Building) designator
  string_replace("Output", " BLD\\s?[\\w]+$", "", "Apply") %>%
  # Take care of Buffalo Speedway
  string_replace("Output", "BUFFALO SPEEDWAY", "BUFFALO SPWY", "Apply") %>%
  # Streets designated as Private are special and hard to geocode.
  # We will remove that designation, but then flag it in a comments column
  mutate(Special=ifelse(str_detect(Address, "\\(PVT\\)"), "Private", "")) %>%
  # Remove anything in parentheses
  string_replace("Output", "\\(.+\\)", "", "Apply") %>%
  # Remove "A AND B"
  string_replace("Output", " A\\s?AND\\s?B", "", "Apply") %>%
  # Remove "FL" for Floor
  string_replace("Output", " FL[R]?\\s?\\w+$", " ", "Apply") %>%
  # Remove "STE" for Suite
  string_replace("Output", " STE [A-Z0-9-\\.]+$", " ", "Apply") %>%
  # Remove "." but carefully. Collapse extra white space afterwards
  mutate(Output=str_replace_all(Output, "\\.", " ")) %>%
  mutate(Output=str_squish(Output)) %>%
  # Remove stuff with dashes in it except John-A
  mutate(Output=str_replace(Output, "JOHN-A", "JOHN:A")) %>%
  string_replace("Output", " [\\w]+-[\\w]+$", "", "Apply") %>%
  mutate(Output=str_replace(Output, "JOHN:A", "JOHN-A")) %>%
  # Convert ST to SAINT
  string_replace("Output", "^(\\d+) ST ", "\\1 SAINT ", "Apply") %>%
  string_replace("Output", "^(\\d+ [NSEW]) ST ", "\\1 SAINT ", "Apply") %>%
  # Correct Parkway abbreviation
  string_replace("Output", " PKY", " PKWY", "Apply") %>%
  # Correct Plaza streets
  string_replace("Output",
                 "(LAWNDALE|LOVE|GREENWAY|MEYERLAND|WILCREST) PLAZA",
                 "\\1 PLZ", "Apply") %>%
  # Clean up stuff added to AVENUE
  string_replace("Output", "AVENUE ([A-Z]{1}) .+", "AVENUE \\1", "Apply") %>%
  # Clean up stuff added to Riverway
  string_replace("Output", "RIVERWAY \\d+", "RIVERWAY", "Apply") %>%
  # Clean up Sam Houston Pkwy
  string_replace("Output", "SAM HOUSTON NORTH", "SAM HOUSTON", "Apply") %>%
  string_replace("Output", "SAM HOUSTON SOUTH", "SAM HOUSTON", "Apply") %>%
  string_replace("Output", "SAM HOUSTON EAST", "SAM HOUSTON", "Apply") %>%
  string_replace("Output", "SAM HOUSTON WEST", "SAM HOUSTON", "Apply") %>%
  # Repair staff sgt macario
  string_replace("Output", "S/SGT MACARIO", "SSGT MACARIO", "Apply") %>%
  # Repair avenida de las americas
  string_replace("Output", "AMERICAS .+$", "AMERICAS", "Apply") %>%
  # Repair el camino real
  string_replace("Output", "EL CAMINO REAL .+$", "EL CAMINO REAL", "Apply") %>%
  # Repair LOOP
  string_replace("Output", "NORTH LOOP .+$", "NORTH LOOP FWY", "Apply") %>%
  string_replace("Output", "SOUTH LOOP .+$", "SOUTH LOOP FWY", "Apply") %>%
  string_replace("Output", "EAST LOOP .+$", "EAST LOOP FWY", "Apply") %>%
  string_replace("Output", "WEST LOOP .+$", "WEST LOOP FWY", "Apply")
# Road types built into a big "or" string
Types <- paste(c(" ALY", "AVE", "BLF", "BLVD", "BND", "BR", "BRG", "BRK",
                 "BYP", "CIR", "COR", "CRES", "CRK", "CRST", "CT", "CTR",
                 "CTS", "CV", "CYN", "DL", "DR", "DRS", "EST", "ESTS",
                 "EXT", "FLD", "FLS", "FRK", "FRST", "FRY", "FWY", "GDN",
                 "GDNS", "GLN", "GRN", "GRV", "HBR", "HL", "HLS", "HOLW",
                 "HTS", "HVN", "HWY", "INLT", "IS", "KNL", "KNLS", "LDG",
                 "LK", "LKS", "LN", "LNDG", "MDW", "MDWS",
                 "ML", "MNR", "MTN", "PATH", "PKWY",
                 "PL", "PLZ", "PNE", "PNES", "PR", "PRT", "PSGE", "PT",
                 "RD", "RDG", "RIV", "RST", "SHR", "SHRS",
                 "SPG", "SPWY", "SQ", "ST", "STA", "STRM", "TER",
                 "TRCE", "TRL", "VIS", "VL", "VLG", "VLY", "VW",
                 "WLS", "XING"), collapse="| ")
# Removed MT, WALK, and SPUR as they caused problems, put in More_types

# Delete anything appearing after the road type designator
foo <- foo %>%
  string_replace("Output", paste0("( ",Types,") .+$"), "\\1", "Apply")

More_types <- " MALL| WAY| PLACE| WALK| SPUR| MT"

foo <- foo %>%
  string_replace("Output", paste0("(",More_types,") \\w?\\d+$"), "\\1", "Apply")

# Fix farm roads: change F M to FM and delete stuff following designation
Farm_roads <- "FM 1960|FM 2100|FM 2351|FM 529|FM 2920|FM 1485|FM 2855|FM 1093|FM 2234|FM 362|FM 1942|FM 1314|FM 1463|FM 723|FM 1464|FM 686|FM 2978|FM 528|FM 521 |FM 1098|FM 149|FM 1959|FM 359|FM 1488|FM 249|FM 2917|FM 1736|FM 526"

foo <- foo %>%
  string_replace("Output", paste0("(",Farm_roads,") .+$"), "\\1", "Apply")

# Unprotect Nasa Rd 1, it is unique in having a number following RD
foo <- foo %>% mutate(Output=str_replace(Output, "NASA ROAD 1", "NASA RD 1"))
# What are ambiguous suffixes?
# Names from GeocodeHou
All_types <- df_names_only$Street_type %>% sort() %>% unique()
All_types <- All_types[2:length(All_types)] # get rid of blank
All_types <- paste0(" ", All_types)
All_types <- paste0(All_types, collapse="|")

foobar <- df_names_only %>%
  group_by(Street_name) %>%
  summarize(last(Street_name)) %>%
  mutate(Hit=str_detect(Street_name, All_types))
# Parse the addresses using postmastr
# Protect names like Avenue N,S,E,W
foo <- foo %>%
  mutate(Output=str_replace(Output, "AVENUE ", "XAVENUE"))

pm_dictionary(type = "state", filter = c("TX"), case = "title", locale = "us")
# A tibble: 2 × 2
#   state.output state.input
#   <chr>        <chr>
# 1 TX           TX
# 2 TX           Texas

dirs <- pm_dictionary(type = "directional",
                      filter = c("N", "S", "E", "W"), locale = "us")
TX <- pm_dictionary(type = "state", filter = "TX",
                    case = c("title", "upper"), locale = "us")
hou <- pm_append(type="suffix", input=c("SPEEDWAY", "SPWY", "PLACE"),
                 output=c("SPWY", "SPWY", "PL"))
Type_dict <- pm_dictionary(type="suffix", append=hou, case="upper")

foo <- pm_identify(foo, var="Output") # add ID fields

foo2 <- pm_prep(foo, var="Output", type="street") # Prep data
foo2 <- pm_houseFrac_parse(foo2)
foo2 <- pm_house_parse(foo2)
foo2 <- pm_streetDir_parse(foo2, dirs)
foo2 <- pm_streetSuf_parse(foo2, Type_dict)
foo2 <- pm_street_parse(foo2)

foo2 <- foo2 %>%
  mutate(pm.street=str_replace(pm.street, " 1 At 2", " 1/2"))
foo2 <- foo2 %>%
  mutate(pm.street=str_to_upper(pm.street)) %>%
  mutate(pm.streetSuf=str_to_upper(pm.streetSuf)) %>%
  mutate(pm.preDir=replace_na(pm.preDir, "")) %>%
  mutate(pm.streetSuf=replace_na(pm.streetSuf, ""))

foo <- pm_replace(street=foo2, source=foo)

# Unprotect AVENUE NSEW
foo <- foo %>%
  mutate(pm.street = str_replace(pm.street, "XAVENUE", "AVENUE ")) %>%
  mutate(Output = str_replace(Output, "XAVENUE", "AVENUE "))
Now we geocode
We have cleaned up the addresses and parsed them into their component parts. To geocode, we work our way through the parsed parts looking for matches. Exact matches are great, but sometimes we need to do some repair work on the failures to get a good match. About 50 matches per second is the rate that I see.
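For a sense of scale, match_exactly() from GeocodeHou takes the parsed house number, prefix, street name, street type, and zipcode, and returns a Success flag plus Lat/Lon (and a Fail reason when it misses), which is how the loop below sorts results into exact and failed matches. A rough runtime estimate at that rate (the single-address example is made up, purely illustrative):

# Back-of-the-envelope runtime at ~50 matches per second
62969 / 50 / 60   # roughly 21 minutes for the full set, hence the cached chunk
# Illustrative single lookup, reading the same fields the loop uses:
# tmp <- match_exactly("1600", "", "SMITH", "ST", "77002")
# tmp$Success; tmp$Lat; tmp$Lon; tmp$Fail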
Exact_match <- NULL
Failed_match <- NULL

for (i in 1:nrow(foo)){ # first look for exact matches
  # if (i%%1000==0) { print(paste("---",i,"---")) } # it is slow, I need reassurance
  tmp <- match_exactly(foo[i,]$pm.house, foo[i,]$pm.preDir, foo[i,]$pm.street,
                       foo[i,]$pm.streetSuf, foo[i,]$Zipcode)
  if (tmp$Success){ # success
    Exact_match <- cbind(foo[i,], tmp) %>%
      select(pm.id, pm.house, pm.preDir, pm.street, pm.streetSuf, Zipcode, Lat, Lon) %>%
      rbind(., Exact_match)
  } else { # Fail exact match
    Failed_match <- cbind(foo[i,], tmp) %>%
      select(pm.id, pm.house, pm.preDir, pm.street, pm.streetSuf, Zipcode, Fail,
             Lat, Lon) %>%
      rbind(., Failed_match)
  }
}
saveRDS(Exact_match, paste0(savepath, "Keep_Exactmatch.rds"))
saveRDS(Failed_match, paste0(savepath, "Keep_Failedmatch.rds"))
Now let’s work on the failed matches. I have broken this code out into a separate chunk because the previous chunk is cached - it runs rather slowly, so let’s only run it once! Note that we got 54,063 exact matches, and failed for 8,906 addresses.
# Try to fix a bad zipcode
Failed_match <- Failed_match %>% mutate(Correction=NA)

for (i in 1:nrow(Failed_match)){
  # if (i%%100==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_zipcode(target$pm.house, target$pm.preDir, target$pm.street,
                        target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ # success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Fail <- paste(Failed_match[i,]$Fail, "Zipcode")
    Failed_match[i,]$Correction <- tmp$New_zipcode
  }
}
Look for bad names
for (i in 1:nrow(Failed_match)){
  if (Failed_match[i,]$Fail!="Street_name") {next} # skip if name isn't the issue
  if (Failed_match[i,]$Lat > 0) {next} # skip if zipcode resolved it
  # if (i%%10==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_name(target$pm.house, target$pm.preDir, target$pm.street,
                     target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ # success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Correction <- tmp$New_name
  } else {
    Failed_match[i,]$Fail <- paste("Street_name", tmp$Fail)
  }
}
Bad types
for (i in 1:nrow(Failed_match)){
  if (Failed_match[i,]$Fail!="Street_type") {next} # skip if type isn't the issue
  # if (i%%10==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_type(target$pm.house, target$pm.preDir, target$pm.street,
                     target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ # success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Correction <- tmp$New_type
  } else {
    Failed_match[i,]$Fail <- paste("Street_type", tmp$Fail)
  }
}
Bad numbers
for (i in 1:nrow(Failed_match)){
  if (Failed_match[i,]$Fail!="Street_num") {next} # skip if number isn't the issue
  # if (i%%10==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_number(target$pm.house, target$pm.preDir, target$pm.street,
                       target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ # success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Correction <- tmp$Result
  } else {
    Failed_match[i,]$Fail <- paste("Street_num", tmp$Fail)
  }
}
Bad prefix
for (i in 1:nrow(Failed_match)){
  if (Failed_match[i,]$Fail!="Prefix") {next} # skip if prefix isn't the issue
  # if (i%%10==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_prefix(target$pm.house, target$pm.preDir, target$pm.street,
                       target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ # success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Correction <- tmp$New_prefix
  } else {
    Failed_match[i,]$Fail <- paste("Prefix", tmp$Fail)
  }
}
Managed to recover and correct 3,670 addresses, so about 5,300 addresses out of the original 62,969 could not be geocoded. I suspect I need to update my street database - since it has been a couple of years - and that would certainly catch a few more addresses.
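A quick accounting check on those numbers:

# Accounting for the match results quoted above
54063 + 8906   # 62,969 addresses parsed in total
8906 - 3670    # 5,236 failures remain after the repair passes, i.e. "about 5,300"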
Now we will reconsolidate our data to make a final file
saveRDS(Exact_match, paste0(savepath, "Keep_Exactmatch_final.rds"))
saveRDS(Failed_match, paste0(savepath, "Keep_Failedmatch_final.rds"))
Final <- left_join(foo, select(Exact_match, pm.id, Lat, Lon), by="pm.id")

Final <- left_join(Final,
                   select(Failed_match, pm.id, Lat, Lon, Fail, Correction),
                   by="pm.id")

Final <- Final %>%
  rowwise() %>%
  mutate(Lat.x=ifelse(is.na(Lat.x), Lat.y, Lat.x)) %>%
  mutate(Lon.x=ifelse(is.na(Lon.x), Lon.y, Lon.x)) %>%
  ungroup() %>%
  rename(Lon=Lon.x, Lat=Lat.x) %>%
  select(-Lat.y, -Lon.y)

##### Apply corrections for each field
# Prefix
Final <- Final %>%
  mutate(pm.preDir=case_when(
    replace_na(str_detect(Fail, "^Prefix$"), FALSE) ~ Correction,
    TRUE ~ pm.preDir
  ))
# Name
Final <- Final %>%
  mutate(pm.street=case_when(
    replace_na(str_detect(Fail, "^Street_name$"), FALSE) ~ Correction,
    TRUE ~ pm.street
  ))
# Zipcode
Final <- Final %>%
  mutate(Zipcode=case_when(
    replace_na(str_detect(Fail, "^Street_name Zipcode$"), FALSE) ~ Correction,
    TRUE ~ Zipcode
  ))
# Type
Final <- Final %>%
  mutate(pm.streetSuf=case_when(
    replace_na(str_detect(Fail, "^Street_type$"), FALSE) ~ Correction,
    TRUE ~ pm.streetSuf
  ))

# Rename some columns to better document the contents
Final <- Final %>%
  rename(Number=pm.house, Prefix=pm.preDir, Name=pm.street, Type=pm.streetSuf,
         Original_address=Address, Cleaned_address=Output)
saveRDS(Final, paste0(savepath, "Clean_Final_2022-9Sep2024.rds"))