Read Houston City Permit Data

Mapping
Houston
Data
Read in the City of Houston permit data and clean it up. It really is a dog’s breakfast: different filetypes, inconsistent filenames, really just a PITA.
Author

Alan Jackson

Published

September 14, 2024

Read in Permit Data

The permit data is stored weekly on a website, either as a docx table or an excel spreadsheet, and once as an html file, the choice seemingly random. Additionally, the filenames are inconsistent: each contains the date, but not in any consistent fashion. These files are obviously generated by hand each week - which is a terrible indictment of the fourth largest city in the country, but there it is. Earlier I had read in data through March 2022, so this will extend that data up through 9 Sep 2024. But wait! It gets worse. At least one supposed xls file is actually html! Jerks.

library(tidyverse)
library(stringr)

savepath <-  "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Houston_Permits/"

url_begin <- "https://www.houstontx.gov/planning/DevelopRegs/docs_pdfs/Permit_eReport/"

knitr::opts_chunk$set(echo = TRUE)

Step 1: Read in the files

Some sample names:

docs_pdfs/Permit_eReport/2024/Web-eReport-02-12-24.docx
docs_pdfs/Permit_eReport/2024/Web-eReport-02-19-24.docx
docs_pdfs/Permit_eReport/2024/Web%20eReport%2002-26-24.html
docs_pdfs/Permit_eReport/2024/Web%20eReport%2003-04-24.docx
docs_pdfs/Permit_eReport/2024/Web%20eReport%2003-11-24.docx
docs_pdfs/Permit_eReport/2023/Web-eReport-07-17-23.xls
docs_pdfs/Permit_eReport/2023/Web-eReport-07-24-23.xls

Naah, this is a nightmare. I’ll just take the webpage and edit it by hand to create a list of files to download.

Replace blanks with %20, and trim up to the year folder, where it exists.
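For instance, here is that transformation applied to one of the sample names above (an illustration only, not part of the pipeline):

fn <- "2024/Web eReport 02-26-24.html"
fn <- stringr::str_replace_all(fn, " ", "%20") # "2024/Web%20eReport%2002-26-24.html"
stringr::str_remove(fn, "^202[0-4]/")          # "Web%20eReport%2002-26-24.html"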

#   Read in list of files and read them in

filenames <- read_csv(paste0(savepath, "Permits_2022-9Sep2024.txt"))

for (filename in filenames$Filenames) { # read files in and save them
  filename <- stringr::str_replace_all(filename, " ", "%20")
  # print(paste("Read file", filename))
  if (str_detect(filename, "^#")) {next} # bad file, commented out
  outname <- str_remove(filename, "^202[0-4]/")
  #   If already downloaded, skip
  if (!file.exists(paste0(savepath, outname))) {
    # print(paste("Output", outname))
    download.file(paste0(url_begin, filename), destfile=paste0(savepath, outname), mode="wb")
  }
}

#   Now let's start trying to tear them apart. Note that there are a handful of excel
#   spreadsheets in and amongst the docx tables. And one html. Gaaa...
#   And some are corrupt. I'll mark those in the input file of names.

for (filename in filenames$Filenames) { # read files in and extract
  if (str_detect(filename, "^#")) {next} # bad file, commented out
  outname <- str_remove(filename, "^202[0-4]/")
  # print(paste("Input", outname))
  fileout <- paste0(savepath, outname, ".rds")
  if (file.exists(fileout)) {next} # skip if we already made this file
  
  if (str_detect(outname, "xls")) { # Oh for God's sake, some are excel????
    # but is really excel or are you just trying to screw with me?
      file_query <- system2(command="file", args=paste0(savepath, outname),
                            stdout=TRUE)
      if (stringr::str_detect(file_query, "HTML")) {print("HTML file")
        # print("----  xls but html")
        tmp <- rvest::read_html(paste0(savepath, outname)) %>% 
          rvest::html_table()
        tmp <- tmp[[1]]
        tmp <- tmp %>% #  remove headers and trailers
          filter(stringr::str_detect(X1, "^77")) %>% 
          mutate(X5=stringr::str_replace_all(X5, "  *", " ")) # shrink long strings of blanks
        names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no", 
                      "Address", "Comments")
        saveRDS(tmp, fileout)
      }
      else {
        print("----  xml")
        tmp <- readxl::read_excel(paste0(savepath, outname), col_names=FALSE)  
        tmp <- tmp[-c(1,4),] 
        tmp <- tmp[,1:6] 
        names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no", 
                      "Address", "Comments")
        tmp <- tmp %>% mutate(Permit_date = 
                                as.Date(as.numeric(Permit_date), 
                                        origin = "1899-12-30"))
        saveRDS(tmp, fileout)
      }
  }
    else if (str_detect(outname, "html")){ # the odd html file, but honest at least
      # print("----  html")
      tmp <- rvest::read_html(paste0(savepath, outname)) %>% 
        rvest::html_table()
      tmp <- tmp[[1]]
      tmp <- tmp %>% #  remove headers and trailers
        filter(stringr::str_detect(X1, "^77")) %>% 
        select(X1, X2, X3, X4, X5, X6) %>% 
        mutate(X5=stringr::str_replace_all(X5, "  *", " ")) # shrink long strings of blanks
      names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no", 
                    "Address", "Comments")
      saveRDS(tmp, fileout)
    } else { # and some are docx?
      report <- docxtractr::read_docx(paste0(savepath, outname))
      tables <- docxtractr::docx_extract_all_tbls(report)
      tmp <- tables[[1]]
      tmp <- tmp[-c(1,2),] 
      tmp <- tmp[,1:6] 
      names(tmp) <- c("Zipcode", "Permit_date", "Permit_type", "Project_no", 
                     "Address", "Comments")
      saveRDS(tmp, fileout)

    }
}

Combine

Hopefully the individual files are all compatible now and combining them will be a simple task.
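Before binding, a quick sanity check that every weekly file produced the same six columns can save grief later. A sketch, not part of the original workflow, assuming only the weekly eReport .rds files live in savepath:

col_check <- purrr::map(
  list.files(savepath, pattern = "eReport.*\\.rds$", full.names = TRUE),
  ~ names(readRDS(.x)))
stopifnot(length(unique(col_check)) == 1) # all weekly files agree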

#   If already done this, then skip
if (!file.exists(paste0(savepath, "Permit_data_2022-9Sep2024.rds"))) {

  df <- NULL
  
  for (filename in filenames$Filenames) { # read files in and extract
    if (str_detect(filename, "^#")) {next} # bad file, commented out
    outname <- str_remove(filename, "^202[0-4]/")
    # print(paste("Input", outname))
    fileout <- paste0(savepath, outname, ".rds")
    #-------------------  now try combining
    
    foo <- readRDS(fileout)
    foo$Permit_date <- lubridate::as_date(foo$Permit_date)
    df <- bind_rows(df, foo)
  }
  
  df <- df[!df$Zipcode=="",]  # eliminate blank lines
  df <- df[!is.na(df$Permit_date),] # eliminate nearly blank lines
  df <- df[!df$Zipcode=="Zip Code",]
  
  saveRDS(df, paste0(savepath, "Permit_data_2022-9Sep2024.rds"))
}

Now we need to geocode the data

Let’s clean up the address field where necessary, and then use my handy, dandy Houston geocoder to add lat longs to the data.
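The cleanup below leans heavily on a string_replace() helper. A minimal version consistent with how it is called here - pipe in a data frame, name the source column, give a regex and a replacement, plus an "Apply" flag - might look like the sketch below, with results accumulating in an Output column:

string_replace <- function(df, col, pattern, replacement, mode = "Apply") {
  new_vals <- stringr::str_replace_all(df[[col]], pattern, replacement)
  if (mode == "Apply") {df$Output <- new_vals} # in "Apply" mode, store the result
  df
}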

df <- readRDS(paste0(savepath, "Permit_data_2022-9Sep2024.rds"))

library(postmastr)
# detach("package:GeocodeHou", unload=TRUE)
library(GeocodeHou)

# Protect Nasa Rd 1, it is unique in having a number following RD

foo <- df %>% mutate(Address=str_replace(Address, "NASA RD 1", "NASA ROAD 1"))
foo <- foo %>% # head(5000) %>% 
#   get rid of extra white space
       mutate(Address=str_squish(Address)) %>% 
#   get rid of address like 325 1/2. Get rid of the fraction
       string_replace( "Address", "^(\\d+)\\s?\\d/\\d ", "\\1 ", "Apply") %>% 
#   Collapse farm roads 
       string_replace("Output", " F M ", " FM ", "Apply") %>% 
#   change & to AND
       string_replace("Output", "&", "AND", "Apply") %>% 
#   Remove BLD (Building) designator
       string_replace("Output", " BLD\\s?[\\w]+$", "", "Apply") %>% 
#   Take care of Buffalo Speedway
       string_replace("Output", "BUFFALO SPEEDWAY", "BUFFALO SPWY", "Apply") %>% 
#   Streets designated as Private are special and hard to geocode.
#   We will remove that designation, but then flag it in a comments column
  mutate(Special=ifelse(str_detect(Address, "\\(PVT\\)"), "Private", "")) %>% 
#   Remove anything in parenthesis
       string_replace("Output", "\\(.+\\)", "", "Apply") %>% 
#   Remove "A AND B"
       string_replace("Output", " A\\s?AND\\s?B", "", "Apply") %>% 
#   Remove "FL" for Floor
       string_replace("Output", " FL[R]?\\s?\\w+$", " ", "Apply") %>% 
#   Remove "STE" for Suite
       string_replace("Output", " STE [A-Z0-9-\\.]+$", " ", "Apply") %>% 
#   Remove "." but carefully. Collapse extra white space afterwards
       mutate(Output=str_replace_all(Output, "\\.", " ")) %>% 
       mutate(Output=str_squish(Output)) %>%  
#   Remove stuff with dashes in them except John-A
       mutate(Output=str_replace(Output, "JOHN-A", "JOHN:A")) %>% 
       string_replace("Output", " [\\w]+-[\\w]+$", "", "Apply") %>% 
       mutate(Output=str_replace(Output, "JOHN:A", "JOHN-A")) %>% 
#   Convert ST to SAINT
       string_replace("Output", "^(\\d+) ST ", "\\1 SAINT ", "Apply") %>% 
       string_replace("Output", "^(\\d+ [NSEW]) ST ", "\\1 SAINT ", "Apply") %>% 
#   Correct Parkway abbreviation
       string_replace("Output", " PKY", " PKWY", "Apply") %>% 
#   Correct Plaza streets
       string_replace("Output", 
                      "(LAWNDALE|LOVE|GREENWAY|MEYERLAND|WILCREST) PLAZA", 
                      "\\1 PLZ", "Apply") %>% 
#   Clean up stuff added to AVENUE
     string_replace("Output", "AVENUE ([A-Z]{1}) .+", "AVENUE \\1", "Apply") %>%   
#   Clean up stuff added to Riverway
       string_replace("Output", "RIVERWAY \\d+", "RIVERWAY", "Apply") %>%  
#   Clean up Sam Houston Pkwy
       string_replace("Output", "SAM HOUSTON NORTH", "SAM HOUSTON", "Apply") %>%  
       string_replace("Output", "SAM HOUSTON SOUTH", "SAM HOUSTON", "Apply") %>%  
       string_replace("Output", "SAM HOUSTON EAST",  "SAM HOUSTON", "Apply") %>%  
       string_replace("Output", "SAM HOUSTON WEST",  "SAM HOUSTON", "Apply") %>%   
#   Repair staff sgt macario
       string_replace("Output", "S/SGT MACARIO", "SSGT MACARIO", "Apply") %>%  
#   Repair avenida
       string_replace("Output", "AMERICAS .+$", "AMERICAS", "Apply") %>% 
#   Repair el camino real
       string_replace("Output", "EL CAMINO REAL .+$", "EL CAMINO REAL", "Apply") %>% 
#   Repair LOOP
       string_replace("Output", "NORTH LOOP .+$", "NORTH LOOP FWY", "Apply") %>% 
       string_replace("Output", "SOUTH LOOP .+$", "SOUTH LOOP FWY", "Apply") %>% 
       string_replace("Output", "EAST LOOP .+$", "EAST LOOP FWY", "Apply") %>% 
       string_replace("Output", "WEST LOOP .+$", "WEST LOOP FWY", "Apply") 

#   Road types built into a big "or" string

Types <- paste(c(" ALY", "AVE", "BLF", "BLVD", "BND", "BR", "BRG", "BRK", 
                 "BYP", "CIR",  "COR", "CRES", "CRK", "CRST", "CT", "CTR",
                 "CTS", "CV", "CYN", "DL", "DR",  "DRS", "EST", "ESTS",
                  "EXT", "FLD", "FLS", "FRK", "FRST", "FRY", "FWY", "GDN", 
                  "GDNS", "GLN", "GRN", "GRV", "HBR", "HL", "HLS", "HOLW",
                  "HTS", "HVN", "HWY",  "INLT", "IS", "KNL", "KNLS", "LDG",
                  "LK", "LKS", "LN", "LNDG", "MDW", "MDWS",
                  "ML", "MNR", "MTN", "PATH", "PKWY",
                  "PL",  "PLZ", "PNE", "PNES", "PR", "PRT", "PSGE", "PT",
                  "RD", "RDG", "RIV", "RST", "SHR", "SHRS",
                  "SPG", "SPWY", "SQ", "ST", "STA", "STRM", "TER",
                  "TRCE", "TRL", "VIS", "VL", "VLG", "VLY", "VW", 
                  "WLS",  "XING"), collapse="| ")
#   Removed MT, WALK, and SPUR as they caused problems, put in More_types

#   Delete anything appearing after the road type designator

 foo <- foo %>% 
   string_replace("Output", paste0("( ",Types,") .+$"), "\\1", "Apply")  

 More_types <- " MALL| WAY| PLACE| WALK| SPUR| MT"
 
 foo <- foo %>% 
   string_replace("Output", paste0("(",More_types,") \\w?\\d+$"), "\\1", "Apply")  

 #    Fix farm roads: F M was already collapsed to FM above; now delete stuff following the designation
 
Farm_roads <- "FM 1960|FM 2100|FM 2351|FM 529|FM 2920|FM 1485|FM 2855|FM 1093|FM 2234|FM 362|FM 1942|FM 1314|FM 1463|FM 723|FM 1464|FM 686|FM 2978|FM 528|FM 521 |FM 1098|FM 149|FM 1959|FM 359|FM 1488|FM 249|FM 2917|FM 1736|FM 526"

foo <- foo %>% 
  string_replace("Output", paste0("(",Farm_roads,") .+$"), "\\1", "Apply")

# Unprotect Nasa Rd 1, it is unique in having a number following RD

foo <- foo %>% mutate(Output=str_replace(Output, "NASA ROAD 1", "NASA RD 1"))

#     What are ambiguous suffixes?

#   Names from GeocodeHou
All_types <- df_names_only$Street_type %>% sort() %>% unique() 
  
All_types <- All_types[2:length(All_types)] # get rid of blank

All_types <- paste0(" ", All_types)

All_types <- paste0(All_types, collapse="|")

foobar <- df_names_only %>% 
  group_by(Street_name) %>% 
    summarize(last(Street_name)) %>% 
  mutate(Hit=str_detect(Street_name, All_types))

#   Parse the addresses using postmastr

#   Protect names like Avenue N,S,E,W

foo <- foo %>% 
  mutate(Output=str_replace(Output, "AVENUE ", "XAVENUE"))

pm_dictionary(type = "state", filter = c("TX"), case = "title", locale = "us")
# A tibble: 2 × 2
  state.output state.input
  <chr>        <chr>      
1 TX           TX         
2 TX           Texas      
dirs <- pm_dictionary(type = "directional", 
                      filter = c("N", "S", "E", "W"), locale = "us")
TX <- pm_dictionary(type = "state", filter = "TX", 
                    case = c("title", "upper"), locale = "us")

hou <- pm_append(type="suffix", input=c("SPEEDWAY", "SPWY", "PLACE"), 
                 output=c("SPWY", "SPWY", "PL"))
Type_dict <- pm_dictionary(type="suffix", append=hou, case="upper")

foo <- pm_identify(foo, var="Output") # add ID fields

foo2 <- pm_prep(foo, var="Output", type="street") # Prep data

foo2 <- pm_houseFrac_parse(foo2)

foo2 <- pm_house_parse(foo2)

foo2 <- pm_streetDir_parse(foo2, dirs)

foo2 <- pm_streetSuf_parse(foo2, Type_dict)
 
foo2 <- pm_street_parse(foo2)

foo2 <- foo2 %>% 
  mutate(pm.street=str_replace(pm.street, " 1 At 2", " 1/2")) 

foo2 <- foo2 %>% 
  mutate(pm.street=str_to_upper(pm.street)) %>% 
  mutate(pm.streetSuf=str_to_upper(pm.streetSuf)) %>% 
  mutate(pm.preDir=replace_na(pm.preDir, "")) %>% 
  mutate(pm.streetSuf=replace_na(pm.streetSuf, ""))

foo <- pm_replace(street=foo2, source=foo)

#   Unprotect AVENUE NSEW

foo <- foo %>% 
  mutate(pm.street = str_replace(pm.street, "XAVENUE", "AVENUE ")) %>% 
  mutate(Output = str_replace(Output, "XAVENUE", "AVENUE "))

Now we geocode

We have cleaned up the addresses, and then parsed them into component parts. To geocode we work our way through the parsed parts looking for matches. Exact matches are great, but sometimes we need to do some repair work on the failures to get a good match. About 50 matches per second is the rate that I see.
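For a single address, the call looks like this (a made-up address; match_exactly() from GeocodeHou returns a Success flag, Lat/Lon, and a failure reason when it fails):

tmp <- match_exactly("1234", "", "MAIN", "ST", "77002")
if (tmp$Success) {print(paste("Matched at", tmp$Lat, tmp$Lon))}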

Exact_match <- NULL
Failed_match <- NULL

for (i in 1:nrow(foo)){ # first look for exact matches
  # if (i%%1000==0) { print(paste("---",i,"---")) } # it is slow, I need reassurance
  tmp <- match_exactly(foo[i,]$pm.house, foo[i,]$pm.preDir, foo[i,]$pm.street,
                       foo[i,]$pm.streetSuf, foo[i,]$Zipcode)
  if (tmp$Success){ #   success
    Exact_match <- cbind(foo[i,], tmp) %>% 
      select(pm.id, pm.house, pm.preDir, pm.street, pm.streetSuf, Zipcode, Lat, Lon) %>% 
      rbind(., Exact_match)
  } else { #  Fail exact match
    Failed_match <- cbind(foo[i,], tmp) %>% 
      select(pm.id, pm.house, pm.preDir, pm.street, pm.streetSuf, Zipcode, Fail,
             Lat, Lon) %>% 
      rbind(., Failed_match)
  }
}

saveRDS(Exact_match, paste0(savepath, "Keep_Exactmatch.rds"))
saveRDS(Failed_match, paste0(savepath, "Keep_Failedmatch.rds"))

Now let’s work on the failed matches. I have broken this code chunk out because the previous chunk is cached - it runs rather slowly, so let’s only run it once!

Note that we got 54,063 exact matches and failed for 8,906 addresses.

#   Try to fix a bad zipcode

Failed_match <- Failed_match %>% mutate(Correction=NA)

for (i in 1:nrow(Failed_match)){
  # if (i%%100==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_zipcode(target$pm.house, target$pm.preDir, target$pm.street,
                       target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ #   success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Fail <- paste(Failed_match[i,]$Fail, "Zipcode")
    Failed_match[i,]$Correction <- tmp$New_zipcode
  } 
}

Look for bad names

for (i in 1:nrow(Failed_match)){
  if (Failed_match[i,]$Fail!="Street_name") {next} # skip if name isn't the issue
  if (Failed_match[i,]$Lat > 0) {next} # skip if zipcode resolved it
  # if (i%%10==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_name(target$pm.house, target$pm.preDir, target$pm.street,
                       target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ #   success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Correction <- tmp$New_name
  } else {
    Failed_match[i,]$Fail <- paste("Street_name",tmp$Fail)
  }
}

Bad types

for (i in 1:nrow(Failed_match)){
  if (Failed_match[i,]$Fail!="Street_type") {next} # skip if type isn't the issue
  # if (i%%10==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_type(target$pm.house, target$pm.preDir, target$pm.street,
                       target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ #   success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Correction <- tmp$New_type
  } else {
    Failed_match[i,]$Fail <- paste("Street_type",tmp$Fail)
  }
}

Bad numbers

for (i in 1:nrow(Failed_match)){
  if (Failed_match[i,]$Fail!="Street_num") {next} # skip if name isn't the issue
  # if (i%%10==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_number(target$pm.house, target$pm.preDir, target$pm.street,
                       target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ #   success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Correction <- tmp$Result
  } else {
    Failed_match[i,]$Fail <- paste("Street_num",tmp$Fail)
  }
}

Bad prefix

for (i in 1:nrow(Failed_match)){
  if (Failed_match[i,]$Fail!="Prefix") {next} # skip if name isn't the issue
  # if (i%%10==0) { print(paste("---",i,"---")) }
  target <- Failed_match[i,]
  tmp <- repair_prefix(target$pm.house, target$pm.preDir, target$pm.street,
                       target$pm.streetSuf, target$Zipcode)
  if (tmp$Success){ #   success
    Failed_match[i,]$Lat <- tmp$Lat
    Failed_match[i,]$Lon <- tmp$Lon
    Failed_match[i,]$Correction <- tmp$New_prefix
  } else {
    Failed_match[i,]$Fail <- paste("Prefix",tmp$Fail)
  }
}

I managed to recover 3,670 addresses and correct them, so about 5,300 addresses out of the original 62,969 could not be geocoded. I suspect I need to update my street database - since it has been a couple of years - and that would certainly catch a few more addresses.
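A quick tally consistent with the checks used in the loops above (assuming unrepaired failures keep a non-positive or missing Lat):

nrow(Exact_match)                       # exact matches (54,063 in my run)
sum(Failed_match$Lat > 0, na.rm = TRUE) # failures later repaired (3,670)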

Now we will reconsolidate our data to make a final file

saveRDS(Exact_match, paste0(savepath, "Keep_Exactmatch_final.rds"))
saveRDS(Failed_match, paste0(savepath, "Keep_Failedmatch_final.rds"))

Final <- left_join(foo, select(Exact_match, pm.id, Lat, Lon), by="pm.id")

Final <- left_join(Final,
                   select(Failed_match, pm.id, Lat, Lon, Fail, Correction),
                   by="pm.id")

Final <- Final %>% 
  rowwise() %>% 
    mutate(Lat.x=ifelse(is.na(Lat.x), Lat.y, Lat.x)) %>% 
    mutate(Lon.x=ifelse(is.na(Lon.x), Lon.y, Lon.x)) %>% 
  ungroup() %>% 
  rename(Lon=Lon.x, Lat=Lat.x) %>% 
  select(-Lat.y, -Lon.y)
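
#   Incidentally, dplyr::coalesce() does the same pick-first-non-NA job
#   without the rowwise() pass:
#     mutate(Lat.x = coalesce(Lat.x, Lat.y), Lon.x = coalesce(Lon.x, Lon.y))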

#####   Apply corrections for each field

#   Prefix

Final <- Final %>% 
  mutate(pm.preDir=case_when(
    replace_na(str_detect(Fail, "^Prefix$"), FALSE) ~ Correction,
    TRUE                                            ~ pm.preDir
  ))

#   Name

Final <- Final %>% 
  mutate(pm.street=case_when(
    replace_na(str_detect(Fail, "^Street_name$"), FALSE) ~ Correction,
    TRUE                                                 ~ pm.street
  ))

#   Zipcode

Final <- Final %>% 
  mutate(Zipcode=case_when(
    replace_na(str_detect(Fail, "^Street_name Zipcode$"), FALSE) ~ Correction,
    TRUE                                                         ~ Zipcode
  ))

#   Type

Final <- Final %>% 
  mutate(pm.streetSuf=case_when(
    replace_na(str_detect(Fail, "^Street_type$"), FALSE) ~ Correction,
    TRUE                                                 ~ pm.streetSuf
  ))
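
#   Quick tally of which failure categories were corrected (inspection only):
Final %>% count(Fail, sort = TRUE)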


#   Rename some columns to better document the data

Final <- Final %>% 
  rename(Number=pm.house, Prefix=pm.preDir, Name=pm.street, Type=pm.streetSuf,
         Original_address=Address, Cleaned_address=Output)

saveRDS(Final, paste0(savepath, "Clean_Final_2022-9Sep2024.rds"))