FEMA flood assistance data

FEMA
Supporting Activism
Analyze the flood assistance data from FEMA
Author

Alan Jackson

Published

March 20, 2025

FEMA dataset

Looking at the OpenFEMA Dataset: Individuals and Households Program - Valid Registrations - v1

This is about 25 million records, csv format, 9.4 Gb. I’m only interested in data pertaining to flooding, so the first step will be to pull records that are related to flooding. Data is from 2002 up to when I downloaded the data, March 2025.

There is also a much smaller dataset we will load that has disaster summaries that will let us attach names to the various disasters.

library(tidyverse)
library(gt)
library(leaflet)

googlecrs <- "EPSG:4326"

path <- "/home/ajackson/Dropbox/Rprojects/ERD/Data/FEMA_HousingAssist/"
datapath <- "/home/ajackson/Dropbox/Rprojects/Curated_Data_Files/Zip_Yr_Pop/"

#   Read CSV, all as character.

#   Commented out lines because I don't want to do that more than once

# df <- read_csv(paste0(path, "IndividualsAndHouseholdsProgramValidRegistrations.csv"),
#                col_types=stringr::str_dup("c", 71))
# 
# nanoparquet::write_parquet(df, paste0(path, "IndividualsAndHouseholdsProgramValidRegistrations.parquet"))
# 
# df <- df %>%
#   mutate(declarationDate=date(ymd_hms(declarationDate)))

#   Let's read in the disaster declaration summaries as well

summaries <- read_csv(paste0(path, "DisasterDeclarationsSummaries.csv"),
               col_types=stringr::str_dup("c", 28))

Determine how to pull out flood records

# df_test <- df %>% slice_sample(n=50000)  #    Grab 50,000 records

# saveRDS(df_test, paste0(path, "df_test.rds"))

df_test <- readRDS(paste0(path, "df_test.rds"))


df_test %>% 
  mutate(waterLevel=as.numeric(waterLevel)) %>% 
  ggplot(aes(x=waterLevel)) +
    geom_histogram() +
  labs(
    title="Water Level distribution"
  )

df_test %>% 
  ggplot(aes(x=highWaterLocation)) +
    geom_bar() +
    coord_flip() +
  labs(
    title="High Water Location"
  )

df_test %>% 
  ggplot(aes(x=floodDamage)) +
    geom_bar() +
    coord_flip() +
  labs(
    title="Flood Damage Flag"
  )

df_test %>% 
  mutate(floodDamageAmount=as.numeric(floodDamageAmount)) %>% 
  ggplot(aes(x=floodDamageAmount)) +
    geom_histogram() +
  labs(
    title="Flood Damage Amount"
  )

#   make a table

Damages <- tribble(~Variable, ~Flood)

Damages_value <- df_test %>% 
  mutate(waterLevel=as.numeric(waterLevel)) %>% 
  filter(waterLevel>0) %>% 
  count()

Damages[1,]$Variable <- "Water Level"
Damages[1,]$Flood <- Damages_value$n[1]

Damages_value <- df_test %>% 
  filter(!is.na(highWaterLocation)) %>% 
  count()

Damages[2,]$Variable <- "High Water Location"
Damages[2,]$Flood <- Damages_value$n[1]

Damages_value <- df_test %>% 
  filter(floodDamage=="1") %>% 
  count()

Damages[3,]$Variable <- "Flood Damage Flag"
Damages[3,]$Flood <- Damages_value$n[1]

Damages_value <- df_test %>% 
  mutate(floodDamageAmount=as.numeric(floodDamageAmount)) %>% 
  filter(floodDamageAmount>0) %>% 
  count()

Damages[4,]$Variable <- "Flood Damage Value"
Damages[4,]$Flood <- Damages_value$n[1]
   
Damages %>% 
  gt()
Variable Flood
Water Level 6718
High Water Location 6868
Flood Damage Flag 5633
Flood Damage Value 5468
#   Look at combinations

foo <- df_test %>% 
  filter(!is.na(highWaterLocation)) 

df_test %>% 
  mutate(floodDamageAmount=as.numeric(floodDamageAmount)) %>% 
  filter(floodDamageAmount>0) %>% 
  filter(floodDamage=="0")  
# A tibble: 0 × 71
# ℹ 71 variables: incidentType <chr>, declarationDate <date>,
#   disasterNumber <chr>, county <chr>, damagedStateAbbreviation <chr>,
#   damagedCity <chr>, damagedZipCode <chr>, applicantAge <chr>,
#   householdComposition <chr>, occupantsUnderTwo <chr>, occupants2to5 <chr>,
#   occupants6to18 <chr>, occupants19to64 <chr>, occupants65andOver <chr>,
#   grossIncome <chr>, ownRent <chr>, primaryResidence <chr>,
#   residenceType <chr>, homeOwnersInsurance <chr>, floodInsurance <chr>, …
#   Nothing from above. But there can be flagged flood damage with a $0 amount.

Lets look at some simple statistics

Some abbreviations:

Abbrev Full Name
IHP Individual Housing Program
HA Housing Assistance
ONA Other Needs Assistance
TSA Transitional Sheltering Assistance
#   top 10 states for number of applications

df2 %>% 
  group_by(damagedStateAbbreviation) %>% 
    summarize(Applications=n(), 
              Accepted=sum(ihpEligible|haEligible|onaEligible|tsaEligible)) %>% 
  ungroup() %>% 
  mutate(Rejected=Applications-Accepted) %>% 
  arrange(desc(Applications)) %>% 
  rename(State=damagedStateAbbreviation) %>% 
  head(10) %>% 
  mutate(State = fct_reorder(State, Applications)) %>%
  select(-Applications) %>% 
  pivot_longer(!State, names_to = "Status", values_to = "Applications") %>% 
  ggplot(aes(y=Applications, x=State, fill=Status)) + 
    geom_col() +
    scale_y_continuous(labels = scales::label_comma())+
    scale_fill_manual(values =c('green','red')) +
    coord_flip() +
  labs(title="Number of Flood Disaster Assistance Applications by State",
       subtitle="From 2002 until March 2025, Top Ten")

#   top 10 zipcodes for number of applications

df2 %>% 
  group_by(damagedZipCode) %>% 
    summarize(Applications=n(), 
              Accepted=sum(ihpEligible|haEligible|onaEligible|tsaEligible),
              State=first(damagedStateAbbreviation)) %>% 
  ungroup() %>% 
  mutate(Rejected=Applications-Accepted) %>% 
  arrange(desc(Applications)) %>% 
  rename(Zip=damagedZipCode) %>% 
  head(10) %>% 
  mutate(Zip=paste(Zip, '-', State)) %>% 
  mutate(Zip = fct_reorder(Zip, Applications)) %>%
  select(-Applications, -State) %>% 
  pivot_longer(!Zip, names_to = "Status", values_to = "Applications") %>% 
  ggplot(aes(y=Applications, x=Zip, fill=Status)) + 
    geom_col() +
    scale_y_continuous(labels = scales::label_comma())+
    scale_fill_manual(values =c('green','red')) +
    coord_flip() +
  labs(title="Number of Flood Disaster Assistance Applications by Zip",
       subtitle="From 2002 until March 2025, Top Ten")

#   top 10 zipcodes for number of disasters

df2 %>% 
  group_by(damagedZipCode) %>% 
    summarize(Applications=n(), 
              Floods=as.integer(n_distinct(declarationDate)),
              State=first(damagedStateAbbreviation)
              ) %>% 
  ungroup() %>% 
  arrange(desc(Floods), desc(Applications)) %>% 
  rename(Zip=damagedZipCode) %>% 
  head(15) %>% 
  mutate(Zip=paste(Zip, '-', State)) %>% 
  mutate(Zip = fct_reorder(Zip, Applications)) %>%
  select(-State) %>% 
  mutate(Floods=fct(as.character(Floods))) %>% 
  ggplot(aes(y=Applications, x=Zip, fill=Floods)) + 
    geom_col() +
    scale_y_continuous(labels = scales::label_comma())+
    scale_fill_manual(values =c('#F0E442','#0072B2', "green"), name="# Floods") +
    coord_flip() +
  labs(title="Number of Flood Disaster Assistance Applications by Zip",
       subtitle="From 2002 until March 2025, top fifteen by number of floods")

# Distribution of # floods by zipcode

df2 %>% 
  group_by(damagedZipCode) %>% 
    summarize(Applications=n(), 
              Floods=n_distinct(declarationDate),
              State=first(damagedStateAbbreviation)
              ) %>% 
  ungroup() %>% 
  rename(Zip=damagedZipCode) %>% 
  ggplot(aes(x=Floods)) + 
    geom_histogram(bins=13) +
  labs(title="Number of Flood Events by Zip Code",
       subtitle="From 2002 until March 2025",
       x="Number of Flood Events",
       y="Number of Zip Codes")

Let’s make a map of number of flood events, for # > 4

foo <- df2 %>%
  group_by(damagedZipCode) %>%
    summarize(Applications=n(),
              Floods=n_distinct(declarationDate),
              State=first(damagedStateAbbreviation)
              ) %>%
  ungroup() %>%
  filter(Floods>4) %>%
  filter(State!="PR") %>% #    Drop Puerto Rico
  rename(Zip=damagedZipCode)

 #    Get zipcode polygons

ZCTA_geom <- readRDS(paste0(datapath, "ZCTA_geometry.rds"))

foo2 <- left_join(foo, ZCTA_geom, by=join_by(Zip==GEOID)) %>%
  select(-NAME, -variable, -estimate, -moe) %>%
  sf::st_as_sf() %>% 
  sf::st_transform(crs=googlecrs)

#   make map

pal <- colorNumeric("YlOrBr",
                        c(min(foo2$Floods, na.rm=TRUE),
                          max(foo2$Floods, na.rm=TRUE)),
                        na.color = "transparent")
 
map <- leaflet(data=foo2) %>% addTiles() %>% 
  addPolygons(data=foo2,
              weight=2, 
              color="black",
              fillColor = pal(foo2$Floods),
        popup = paste(
        "<div class='leaflet-popup-scrolled' style='max-width:150px;max-height:200px'>",
        "Zip code:", foo2$Zip, "<br>",
        "Applications:", foo2$Applications, "<br>",
        "# of floods:", foo2$Floods, 
        "</div>"
      ))

map
# htmlwidgets::saveWidget(map, paste0(path, "test.html"), selfcontained = TRUE)

# tmap::tmap_options(basemaps="OpenStreetMap")
# tmap::tmap_mode("view")
# 
# tmap::tm_shape(foo2) +
#   tmap::tm_polygons("Floods", fill_alpha=0.5, popup.vars=c("Zip", 
#                                                 "Applications",
#                                                 "Floods"
#                                                 )) 
#     
# Distribution of # applications per # floods and per zip

df2 %>% 
  group_by(damagedZipCode, declarationDate) %>% 
    summarize(Applications=n(), 
              State=first(damagedStateAbbreviation)
              ) %>% 
  ungroup() %>% 
  rename(Zip=damagedZipCode) %>% 
  filter(Applications>1000) %>% 
  ggplot(aes(x=Applications)) + 
    geom_histogram() +
  labs(title="Number of Applications by Zip Code and Event",
       subtitle="From 2002 until March 2025",
       y="Number of Flood Events",
       x="Number of Applications")

Add # applications to each zip and flood and then filter out small numbers

If there are fewer than 10 applications for a given flood within a single zipcode, I think that is noise and I don’t care.

Applications <- df2 %>% 
  group_by(damagedZipCode, disasterNumber) %>% 
    summarize(Applications=n() 
              ) %>% 
  ungroup() %>% 
  rename(Zip=damagedZipCode) %>% 
  filter(Applications>=10)

df3 <- inner_join(df2, Applications, join_by(damagedZipCode==Zip, 
                                             disasterNumber==disasterNumber)) %>% 
  filter(damagedStateAbbreviation !="PR")  #    Drop Puerto Rico

By_zip_flood <- df3 %>% 
  group_by(damagedZipCode, disasterNumber) %>% 
    summarize(Applications=first(Applications),
              declarationDate=first(declarationDate),
              Owner=sum(ownRent=="Owner"),
              State=first(damagedStateAbbreviation)
              ) %>% 
  ungroup() %>% 
  inner_join(., summaries %>% 
                   select(disasterNumber, declarationTitle) %>% 
                   distinct(),
             by="disasterNumber")

Add in the census data

This is tricky, since ZCTA’s (zip codes) change. However, what I care about is the population now - in several cases, after a major disaster, the population fell significantly. If there are fewer than 100 households in the ZCTA today, then that ZCTA is no longer relevant.

#   Add in population (number of households)

Census_data <- readRDS(paste0(datapath, "Pop_House_Poverty_by_ZCTA_year.rds"))

#   Extract only zipcodes that have flooded

Census_data <- Census_data %>% 
  filter(ZCTA %in% Applications$Zip)

# Look for ZCTA's which had the highest percent of applications with >5 floods

Num_floods <- By_zip_flood %>% 
  group_by(damagedZipCode) %>% 
    summarize(Num_Floods=n()) %>% 
  filter(Num_Floods>5)

#   Combine with data by zip and flood

Num_floods <- inner_join(Num_floods, By_zip_flood, by="damagedZipCode")

#   Create list of zipcodes with >5 floods

Many_floods <- Num_floods %>% 
  group_by(damagedZipCode) %>% 
    summarize(Num_Floods=first(Num_Floods),
              State=first(State),
              Applications=sum(Applications)) %>% 
  ungroup() %>% 
  rename(ZCTA=damagedZipCode)

#   Add average households for each zip

Census_avg <- Census_data %>% 
  group_by(ZCTA) %>% 
    summarize(M_Pop=mean(Pop), #  mean Pop
              M_Household=mean(Household), #  mean households
              M_Poverty=mean(Poverty), #  mean poverty
              D_Pop=last(Pop)-first(Pop), # delta Pop
              D_Poverty=last(Poverty)-first(Poverty), # delta poverty
              D_Poverty_pct=D_Poverty/M_Poverty, #  % delta poverty
              D_Pop_pct=D_Pop/M_Pop, #  % delta pop
              M_Pov_pct=M_Poverty/M_Pop) %>% #  % mean poverty
  ungroup()

Many_floods <- Many_floods %>% 
  inner_join(., Census_avg, by="ZCTA") %>% 
  mutate(Pct_app=Applications/M_Household)

Many_floods %>% 
  ggplot(aes(x=D_Pop)) +
    geom_histogram() +
  labs(title="Distribution of Population Changes in Zip Codes",
       subtitle="Where the number of floods was >5 between 2011 and 2023",
       x="Population in 2023 - Population in 2011")

Many_floods %>% 
  filter(Pct_app>0.5) %>% # Applications as pct of pop > 0.5 total over all floods
  select(ZCTA, D_Pop_pct, D_Poverty_pct) %>% 
  pivot_longer(!ZCTA, names_to="Group", values_to="Pct") %>% 
  ggplot(aes(x=Pct, fill=Group)) +
    geom_histogram(position="identity", alpha=0.3, bins=20) +
  labs(title="Zip Codes where Applications as Pct of Pop > 50%, and >= 5 Flood Events",
       subtitle="Note that while total population declines, the percent in poverty rises",
       x="Percent change in population 2011-2023") +
  scale_fill_discrete(labels = c("D_Pop_pct" = "Total Pop", 
                                 "D_Poverty_pct" = "Poverty Pop"))

Many_floods %>% 
  filter(Pct_app>0.5) %>% # Applications as pct of pop > 0.5 total over all floods
  filter(D_Pop_pct<0) %>% # Overall population decreased
  filter(D_Poverty_pct>0) %>% # percent in poverty increased
  select(ZCTA, State, Applications, Num_Floods, M_Pop, D_Pop, D_Poverty) %>% 
  mutate(M_Pop=signif(M_Pop,3)) %>% 
  arrange(desc(M_Pop)) %>% 
  gt() %>% 
  cols_label(
    ZCTA="Zip Code",
    Num_Floods="# Floods",
    M_Pop="Population",
    D_Pop="Pop Change",
    D_Poverty="Poverty Change"
  )
Zip Code State Applications # Floods Population Pop Change Poverty Change
60617 IL 14522 6 82900 -9515 601
60628 IL 22152 6 67500 -17717 56
60619 IL 16978 6 63100 -7809 1217
70065 LA 11428 6 51100 -2377 593
60643 IL 11504 6 49800 -1839 196
60409 IL 11419 6 36700 -1820 695
60827 IL 8207 6 27800 -3087 159
77632 TX 5648 6 23100 -1839 436
60419 IL 9699 6 22400 -2002 125
60473 IL 7712 6 22200 -767 468
60104 IL 7026 6 18900 -706 43
60478 IL 3143 6 16900 -75 223
60429 IL 3525 6 15500 -1686 351
77078 TX 2508 6 15400 -993 141
33931 FL 5159 6 10100 -1861 151
60155 IL 2570 6 7890 -22 110
70344 LA 2203 7 5870 -1166 101
70733 LA 594 6 1510 -405 106
24818 WV 236 6 1250 -213 4
41348 KY 659 6 1250 -437 65
41669 KY 172 6 966 -164 39
70358 LA 1087 6 827 -233 69
#   Let's look at the top 12 zips by Increasing poverty, Decreasing Population,
#   and average number in poverty > 50

Top_12 <- Many_floods %>% 
  filter(M_Poverty>=50) %>% #  at least 50 households in poverty on average 
  filter(D_Pop_pct<D_Poverty_pct) %>% # Poverty increasing more than population
  filter(D_Poverty_pct>0) %>% # Increasing Poverty
  arrange(desc(D_Poverty_pct)) %>% #  sort by normalized Poverty change
  head(12)

Census_top_12 <- Census_data %>% 
  inner_join(., Top_12, by="ZCTA")

Census_top_12 %>% 
  mutate(Pop_pct=Pop/M_Pop,
         Poverty_pct=Poverty/Pop) %>%
  mutate(Zip=paste(ZCTA, State, signif(Pct_app,2))) %>% 
  select(Zip, Year, Poverty_pct) %>% 
  # pivot_longer(cols=-c(Year, Zip), names_to="Measure", values_to="Value") %>% 
  
  ggplot(aes(x=Year, y=Poverty_pct*100)) + 
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(Zip)) + 
  labs(title="Top 12 Zip Codes by Increasing Poverty from 2011-2023",
       subtitle="Where poverty is increasing more than the population",
       y="Percent Households in Poverty")

Census_top_12 %>% 
  mutate(Pop_pct=100*Pop/M_Pop,
         Poverty_pct=100*Poverty/M_Pop) %>%
  mutate(Zip=paste(ZCTA, State, signif(Pct_app,2))) %>% 
  select(Zip, Year, Poverty_pct, Pop_pct) %>% 
  pivot_longer(cols=-c(Year, Zip), names_to="Measure", values_to="Value") %>% 
  
  ggplot(aes(x=Year, y=Value, color=Measure)) + 
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(Zip)) +
  labs(title="Top 12 Zip Codes by Increasing Poverty from 2011-2023",
       subtitle="Where poverty is increasing more than the population",
       caption="Heading is Zip, State, #applications/Population",
       y="Percent Households (normalized to mean")