library(tidyverse)
library(stringr)
savepath <- "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Houston_Permits/"
url_begin <- "https://www.houstontx.gov/planning/DevelopRegs/docs_pdfs/Permit_eReport/"
knitr::opts_chunk$set(echo = TRUE)Read Houston City Permit Data
Read in Permit Data
The permit data is stored weekly on a website either as a docx table of an excel spreadsheet, and once as an html file, the choice of which seemingly random. Additionally the names of the files are inconsistent and while having the date in the name, not named in a consistent fashion. These files are obviously generated each week by hand - which is a terrible indictment of the fourth largest city in the country, but there it is. Earlier I had read in data through March 2022. So this will extend that data up through 9 Sep 2024. But wait! It gets worse. At least one supposed xls file is actually html! Jerks.
Step 1 Read in the files
Some sample names: docs_pdfs/Permit_eReport/2024/Web-eReport-02-12-24.docx docs_pdfs/Permit_eReport/2024/Web-eReport-02-19-24.docx docs_pdfs/Permit_eReport/2024/Web%20eReport%2002-26-24.html docs_pdfs/Permit_eReport/2024/Web%20eReport%2003-04-24.docx docs_pdfs/Permit_eReport/2024/Web%20eReport%2003-11-24.docx docs_pdfs/Permit_eReport/2023/Web-eReport-07-17-23.xls docs_pdfs/Permit_eReport/2023/Web-eReport-07-24-23.xls
Naah, this is a nightmare. I’ll just take the webpage and edit it by hand to create a list of files to download.
replace blanks with %20, and trim up to the year folder. Where it exists.
# Read in list of files and read them in
filenames <- read_csv(paste0(savepath, "Permits_2022-9Sep2024.txt"))
for (filename in filenames$Filenames) { # read files in and save them
filename <- stringr::str_replace_all(filename, " ", "%20")
# print(paste("Read file", filename))
if (str_detect(filename, "^#")) {next} # bad file, commented out
outname <- str_remove(filename, "^202[0-4]/")
# If already donloaded, skip
if (!file.exists(paste0(savepath, outname))) {
# print(paste("Output", outname))
download.file(paste0(url_begin, filename), destfile=paste0(savepath, outname), mode="wb")
}
}
# Now let's start trying to tear them apart, Note that there are a handful of excel
# spreadsheets in and amongst the docx tables. And one html. Gaaa...
# And some are corrupt. I'll mark those in the input file of names.
for (filename in filenames$Filenames) { # read files in and extract
if (str_detect(filename, "^#")) {next} # bad file, commented out
outname <- str_remove(filename, "^202[0-4]/")
# print(paste("Input", outname))
fileout <- paste0(savepath, outname, ".rds")
if (file.exists(fileout)) {next} # skip is already made file
if (str_detect(outname, "xls")) { # Oh for God's sake, some are excel????
# but is really excel or are you just trying to screw with me?
file_query <- system2(command="file", args=paste0(paste0(savepath, outname)),
stdout=TRUE)
if (stringr::str_detect(file_query, "HTML")) {print("HTML file")
# print("---- xls but html")
tmp <- rvest::read_html(paste0(paste0(savepath, outname))) %>%
rvest::html_table()
tmp <- tmp[[1]]
tmp <- tmp %>% # remove headers and trailers
filter(stringr::str_detect(X1, "^77")) %>%
mutate(X5=stringr::str_replace_all(X5, " *", " ")) # shrink long strings of blanks
names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
"Address", "Comments")
saveRDS(tmp, fileout)
}
else {
print("---- xml")
tmp <- readxl::read_excel(paste0(savepath, outname), col_names=FALSE)
tmp <- tmp[-c(1,4),]
tmp <- tmp[,1:6]
names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
"Address", "Comments")
tmp <- tmp %>% mutate(Permit_date =
as.Date(as.numeric(Permit_date),
origin = "1899-12-30"))
saveRDS(tmp, fileout)
}
}
else if (str_detect(outname, "html")){ # the odd html file, but honest at least
# print("---- html")
tmp <- rvest::read_html(paste0(paste0(savepath, outname))) %>%
rvest::html_table()
tmp <- tmp[[1]]
tmp <- tmp %>% # remove headers and trailers
filter(stringr::str_detect(X1, "^77")) %>%
select(X1, X2, X3, X4, X5, X6) %>%
mutate(X5=stringr::str_replace_all(X5, " *", " ")) # shrink long strings of blanks
names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
"Address", "Comments")
saveRDS(tmp, fileout)
} else { # and some are docx?
report <- docxtractr::read_docx(paste0(savepath, outname))
tables <- docxtractr::docx_extract_all_tbls(report)
tmp <- tables[[1]]
tmp <- tmp[-c(1,2),]
tmp <- tmp[,1:6]
names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no",
"Address", "Comments")
saveRDS(tmp, fileout)
}
}Combine
Hopefully the individual files are all compatible now and combining them will be a simple task.
# If already done this, then skip
if (!file.exists(paste0(savepath, "Permit_data_2022-9Sep2024.rds"))) {
df <- NULL
for (filename in filenames$Filenames) { # read files in and extract
if (str_detect(filename, "^#")) {next} # bad file, commented out
outname <- str_remove(filename, "^202[0-4]/")
# print(paste("Input", outname))
fileout <- paste0(savepath, outname, ".rds")
#------------------- now try combining
foo <- readRDS(fileout)
foo$Permit_date <- lubridate::as_date(foo$Permit_date)
df <- bind_rows(df, foo)
}
df <- df[!df$Zipcode=="",] # eliminate blank lines
df <- df[!is.na(df$Permit_date),] # eliminate nearly blank lines
df <- df[!df$Zipcode=="Zip Code",]
saveRDS(df, paste0(savepath, "Permit_data_2022-9Sep2024.rds"))
}Now we need to Geocode the data
Let’s clean up the address field where necessary, and then use my handy, dandy Houston geocoder to add lat longs to the data.
df <- readRDS(paste0(savepath, "Permit_data_2022-9Sep2024.rds"))
library(postmastr)
# detach("package:GeocodeHou", unload=TRUE)
library(GeocodeHou)
# Protect Nasa Rd 1, it is unique in having a number following RD
foo <- df %>% mutate(Address=str_replace(Address, "NASA RD 1", "NASA ROAD 1"))
foo <- foo %>% # head(5000) %>%
# get rid of extra white space
mutate(Address=str_squish(Address)) %>%
# get rid of address like 325 1/2. Get rid of the fraction
string_replace( "Address", "^(\\d+)\\s?\\d/\\d ", "\\1 ", "Apply") %>%
# Collapse farm roads
string_replace("Output", " F M ", " FM ", "Apply") %>%
# change & to AND
string_replace("Output", "&", "AND", "Apply") %>%
# Remove BLD (Building) designator
string_replace("Output", " BLD\\s?[\\w]+$", "", "Apply") %>%
# Take care of Buffalo Speedway
string_replace("Output", "BUFFALO SPEEDWAY", "BUFFALO SPWY", "Apply") %>%
# streets designated are Private are special and hard to geocode.
# We will remove that designation, but then flag it in a comments column
mutate(Special=ifelse(str_detect(Address, "\\(PVT\\)"), "Private", "")) %>%
# Remove anything in parenthesis
string_replace("Output", "\\(.+\\)", "", "Apply") %>%
# Remove "A AND B"
string_replace("Output", " A\\s?AND\\s?B", "", "Apply") %>%
# Remove "FL" for Floor
string_replace("Output", " FL[R]?\\s?\\w+$", " ", "Apply") %>%
# Remove "STE" for Suite
string_replace("Output", " STE [A-Z0-9-\\.]+$", " ", "Apply") %>%
# Remove "." but carefully. Collapse extra white space afterwards
mutate(Output=str_replace_all(Output, "\\.", " ")) %>%
mutate(Output=str_squish(Output)) %>%
# Remove stuff with dashes in them except John-A
mutate(Output=str_replace(Output, "JOHN-A", "JOHN:A")) %>%
string_replace("Output", " [\\w]+-[\\w]+$", "", "Apply") %>%
mutate(Output=str_replace(Output, "JOHN:A", "JOHN-A")) %>%
# Convert ST to SAINT
string_replace("Output", "^(\\d+) ST ", "\\1 SAINT ", "Apply") %>%
string_replace("Output", "^(\\d+ [NSEW]) ST ", "\\1 SAINT ", "Apply") %>%
# Correct Parkway abbreviation
string_replace("Output", " PKY", " PKWY", "Apply") %>%
# Correct Plaza streets
string_replace("Output",
"(LAWNDALE|LOVE|GREENWAY|MEYERLAND|WILCREST) PLAZA",
"\\1 PLZ", "Apply") %>%
# Clean up stuff added to AVENUE
string_replace("Output", "AVENUE ([A-Z]{1}) .+", "AVENUE \\1", "Apply") %>%
# Clean up stuff added to Riverway
string_replace("Output", "RIVERWAY \\d+", "RIVERWAY", "Apply") %>%
# Clean up Sam Houston Pkwy
string_replace("Output", "SAM HOUSTON NORTH", "SAM HOUSTON", "Apply") %>%
string_replace("Output", "SAM HOUSTON SOUTH", "SAM HOUSTON", "Apply") %>%
string_replace("Output", "SAM HOUSTON EAST", "SAM HOUSTON", "Apply") %>%
string_replace("Output", "SAM HOUSTON WEST", "SAM HOUSTON", "Apply") %>%
# Repair staff sgt macario
string_replace("Output", "S/SGT MACARIO", "SSGT MACARIO", "Apply") %>%
# Repair avenida
string_replace("Output", "AMERICAS .+$", "AMERICAS", "Apply") %>%
# Repair el camino real
string_replace("Output", "EL CAMINO REAL .+$", "EL CAMINO REAL", "Apply") %>%
# Repair LOOP
string_replace("Output", "NORTH LOOP .+$", "NORTH LOOP FWY", "Apply") %>%
string_replace("Output", "SOUTH LOOP .+$", "SOUTH LOOP FWY", "Apply") %>%
string_replace("Output", "EAST LOOP .+$", "EAST LOOP FWY", "Apply") %>%
string_replace("Output", "WEST LOOP .+$", "WEST LOOP FWY", "Apply")
# Road types built into a big "or" string
Types <- paste(c(" ALY", "AVE", "BLF", "BLVD", "BND", "BR", "BRG", "BRK",
"BYP", "CIR", "COR", "CRES", "CRK", "CRST", "CT", "CTR",
"CTS", "CV", "CYN", "DL", "DR", "DRS", "EST", "ESTS",
"EXT", "FLD", "FLS", "FRK", "FRST", "FRY", "FWY", "GDN",
"GDNS", "GLN", "GRN", "GRV", "HBR", "HL", "HLS", "HOLW",
"HTS", "HVN", "HWY", "INLT", "IS", "KNL", "KNLS", "LDG",
"LK", "LKS", "LN", "LNDG", "MDW", "MDWS",
"ML", "MNR", "MTN", "PATH", "PKWY",
"PL", "PLZ", "PNE", "PNES", "PR", "PRT", "PSGE", "PT",
"RD", "RDG", "RIV", "RST", "SHR", "SHRS",
"SPG", "SPWY", "SQ", "ST", "STA", "STRM", "TER",
"TRCE", "TRL", "VIS", "VL", "VLG", "VLY", "VW",
"WLS", "XING"), collapse="| ")
# Removed MT, WALK, and SPUR as they caused problems, put in More_types
# Delete anything appearing after the road type designator
foo <- foo %>%
string_replace("Output", paste0("( ",Types,") .+$"), "\\1", "Apply")
More_types <- " MALL| WAY| PLACE| WALK| SPUR| MT"
foo <- foo %>%
string_replace("Output", paste0("(",More_types,") \\w?\\d+$"), "\\1", "Apply")
# Fix farm roads: change F M to FM and delete stuff following designation
Farm_roads <- "FM 1960|FM 2100|FM 2351|FM 529|FM 2920|FM 1485|FM 2855|FM 1093|FM 2234|FM 362|FM 1942|FM 1314|FM 1463|FM 723|FM 1464|FM 686|FM 2978|FM 528|FM 521 |FM 1098|FM 149|FM 1959|FM 359|FM 1488|FM 249|FM 2917|FM 1736|FM 526"
foo <- foo %>%
string_replace("Output", paste0("(",Farm_roads,") .+$"), "\\1", "Apply")
# Unprotect Nasa Rd 1, it is unique in having a number following RD
foo <- foo %>% mutate(Output=str_replace(Output, "NASA ROAD 1", "NASA RD 1"))
# What are ambiguous suffixes?
# Names from GeocodeHou
All_types <- df_names_only$Street_type %>% sort() %>% unique()
All_types <- All_types[2:length(All_types)] # get rid of blank
All_types <- paste0(" ", All_types)
All_types <- paste0(All_types, collapse="|")
foobar <- df_names_only %>%
group_by(Street_name) %>%
summarize(last(Street_name)) %>%
mutate(Hit=str_detect(Street_name, All_types))
# Parse the addresses using postmastr
# Protect names like Avenue N,S,E,W
foo <- foo %>%
mutate(Output=str_replace(Output, "AVENUE ", "XAVENUE"))
pm_dictionary(type = "state", filter = c("TX"), case = "title", locale = "us")# A tibble: 2 × 2
state.output state.input
<chr> <chr>
1 TX TX
2 TX Texas
dirs <- pm_dictionary(type = "directional",
filter = c("N", "S", "E", "W"), locale = "us")
TX <- pm_dictionary(type = "state", filter = "TX",
case = c("title", "upper"), locale = "us")
hou <- pm_append(type="suffix", input=c("SPEEDWAY", "SPWY", "PLACE"),
output=c("SPWY", "SPWY", "PL"))
Type_dict <- pm_dictionary(type="suffix", append=hou, case="upper")
foo <- pm_identify(foo, var="Output") # add ID fields
foo2 <- pm_prep(foo, var="Output", type="street") # Prep data
foo2 <- pm_houseFrac_parse(foo2)
foo2 <- pm_house_parse(foo2)
foo2 <- pm_streetDir_parse(foo2, dirs)
foo2 <- pm_streetSuf_parse(foo2, Type_dict)
foo2 <- pm_street_parse(foo2)
foo2 <- foo2 %>%
mutate(pm.street=str_replace(pm.street, " 1 At 2", " 1/2"))
foo2 <- foo2 %>%
mutate(pm.street=str_to_upper(pm.street)) %>%
mutate(pm.streetSuf=str_to_upper(pm.streetSuf)) %>%
mutate(pm.preDir=replace_na(pm.preDir, "")) %>%
mutate(pm.streetSuf=replace_na(pm.streetSuf, ""))
foo <- pm_replace(street=foo2, source=foo)
# Unprotect AVENUE NSEW
foo <- foo %>%
mutate(pm.street = str_replace(pm.street, "XAVENUE", "AVENUE ")) %>%
mutate(Output = str_replace(Output, "XAVENUE", "AVENUE "))Now we geocode
We have cleaned up the addresses, and then parsed them into component parts. To geocode we work our way through the parsed parts looking for matches. Exact matches are great, but sometimes we need to do some repair work on the failures to get a good match. About 50 matches per second is the rate that I see.
Exact_match <- NULL
Failed_match <- NULL
for (i in 1:nrow(foo)){ # first look for exact matches
# if (i%%1000==0) { print(paste("---",i,"---")) } # it is slow, I need reassurance
tmp <- match_exactly(foo[i,]$pm.house, foo[i,]$pm.preDir, foo[i,]$pm.street,
foo[i,]$pm.streetSuf, foo[i,]$Zipcode)
if (tmp$Success){ # success
Exact_match <- cbind(foo[i,], tmp) %>%
select(pm.id, pm.house, pm.preDir, pm.street, pm.streetSuf, Zipcode, Lat, Lon) %>%
rbind(., Exact_match)
} else { # Fail exact match
Failed_match <- cbind(foo[i,], tmp) %>%
select(pm.id, pm.house, pm.preDir, pm.street, pm.streetSuf, Zipcode, Fail,
Lat, Lon) %>%
rbind(., Failed_match)
}
}
saveRDS(Exact_match, paste0(savepath, "Keep_Exactmatch.rds"))
saveRDS(Failed_match, paste0(savepath, "Keep_Failedmatch.rds"))Now let’s work on the failed matches. I have broken code chunk out because the previous chunk is cached - it runs rather slowly so let’s only run it once!
Note that we got 54,063 exact matches, and failed for 8906 addresses
# Try to fix a bad zipcode
Failed_match <- Failed_match %>% mutate(Correction=NA)
for (i in 1:nrow(Failed_match)){
# if (i%%100==0) { print(paste("---",i,"---")) }
target <- Failed_match[i,]
tmp <- repair_zipcode(target$pm.house, target$pm.preDir, target$pm.street,
target$pm.streetSuf, target$Zipcode)
if (tmp$Success){ # success
Failed_match[i,]$Lat <- tmp$Lat
Failed_match[i,]$Lon <- tmp$Lon
Failed_match[i,]$Fail <- paste(Failed_match[i,]$Fail, "Zipcode")
Failed_match[i,]$Correction <- tmp$New_zipcode
}
}Look for bad names
for (i in 1:nrow(Failed_match)){
if (Failed_match[i,]$Fail!="Street_name") {next} # skip if name isn't the issue
if (Failed_match[i,]$Lat > 0) {next} # skip if zipcode resolved it
# if (i%%10==0) { print(paste("---",i,"---")) }
target <- Failed_match[i,]
tmp <- repair_name(target$pm.house, target$pm.preDir, target$pm.street,
target$pm.streetSuf, target$Zipcode)
if (tmp$Success){ # success
Failed_match[i,]$Lat <- tmp$Lat
Failed_match[i,]$Lon <- tmp$Lon
Failed_match[i,]$Correction <- tmp$New_name
} else {
Failed_match[i,]$Fail <- paste("Street_name",tmp$Fail)
}
}Bad types
for (i in 1:nrow(Failed_match)){
if (Failed_match[i,]$Fail!="Street_type") {next} # skip if type isn't the issue
# if (i%%10==0) { print(paste("---",i,"---")) }
target <- Failed_match[i,]
tmp <- repair_type(target$pm.house, target$pm.preDir, target$pm.street,
target$pm.streetSuf, target$Zipcode)
if (tmp$Success){ # success
Failed_match[i,]$Lat <- tmp$Lat
Failed_match[i,]$Lon <- tmp$Lon
Failed_match[i,]$Correction <- tmp$New_type
} else {
Failed_match[i,]$Fail <- paste("Street_type",tmp$Fail)
}
}Bad numbers
for (i in 1:nrow(Failed_match)){
if (Failed_match[i,]$Fail!="Street_num") {next} # skip if name isn't the issue
# if (i%%10==0) { print(paste("---",i,"---")) }
target <- Failed_match[i,]
tmp <- repair_number(target$pm.house, target$pm.preDir, target$pm.street,
target$pm.streetSuf, target$Zipcode)
if (tmp$Success){ # success
Failed_match[i,]$Lat <- tmp$Lat
Failed_match[i,]$Lon <- tmp$Lon
Failed_match[i,]$Correction <- tmp$Result
} else {
Failed_match[i,]$Fail <- paste("Street_num",tmp$Fail)
}
}Bad prefix
for (i in 1:nrow(Failed_match)){
if (Failed_match[i,]$Fail!="Prefix") {next} # skip if name isn't the issue
# if (i%%10==0) { print(paste("---",i,"---")) }
target <- Failed_match[i,]
tmp <- repair_prefix(target$pm.house, target$pm.preDir, target$pm.street,
target$pm.streetSuf, target$Zipcode)
if (tmp$Success){ # success
Failed_match[i,]$Lat <- tmp$Lat
Failed_match[i,]$Lon <- tmp$Lon
Failed_match[i,]$Correction <- tmp$New_prefix
} else {
Failed_match[i,]$Fail <- paste("Prefix",tmp$Fail)
}
}Managed to recover 3,670 addresses and correct them, so about 5,300 addresses out of the original 62,969 were not able to be geocoded. I suspect I need to update my street database - since it has been a couple of years - and that would certainly catch a few more addresses.
Now we will reconsolidate our data to make a final file
saveRDS(Exact_match, paste0(savepath, "Keep_Exactmatch_final.rds"))
saveRDS(Failed_match, paste0(savepath, "Keep_Failedmatch_final.rds"))
Final <- left_join(foo, select(Exact_match, pm.id, Lat, Lon), by="pm.id")
Final <- left_join(Final,
select(Failed_match, pm.id, Lat, Lon, Fail, Correction),
by="pm.id")
Final <- Final %>%
rowwise() %>%
mutate(Lat.x=ifelse(is.na(Lat.x), Lat.y, Lat.x)) %>%
mutate(Lon.x=ifelse(is.na(Lon.x), Lon.y, Lon.x)) %>%
ungroup() %>%
rename(Lon=Lon.x, Lat=Lat.x) %>%
select(-Lat.y, -Lon.y)
##### Apply corrections for each field
# Prefix
Final <- Final %>%
mutate(pm.preDir=case_when(
replace_na(str_detect(Fail, "^Prefix$"), FALSE) ~ Correction,
TRUE ~ pm.preDir
))
# Name
Final <- Final %>%
mutate(pm.street=case_when(
replace_na(str_detect(Fail, "^Street_name$"), FALSE) ~ Correction,
TRUE ~ pm.street
))
# Zipcode
Final <- Final %>%
mutate(Zipcode=case_when(
replace_na(str_detect(Fail, "^Street_name Zipcode$"), FALSE) ~ Correction,
TRUE ~ Zipcode
))
# Type
Final <- Final %>%
mutate(pm.streetSuf=case_when(
replace_na(str_detect(Fail, "^Street_type$"), FALSE) ~ Correction,
TRUE ~ pm.streetSuf
))
# Rename some columns to better document
Final <- Final %>%
rename(Number=pm.house, Prefix=pm.preDir, Name=pm.street, Type=pm.streetSuf,
Original_address=Address, Cleaned_address=Output)
saveRDS(Final, paste0(savepath, "Clean_Final_2022-9Sep2024.rds"))