acfrs_filename_govname_ncesid.Rmd

---
title: "Initial Mapping School Districts: ACFRs & NCES"
output: html_document
date: '2022-08-31'
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
#library(threadr)
```
This notebook takes a list of ACFR file names (provided by Marc) and maps them to the district names in the NCES list.
Result: "final_match.csv". 

Abbreviations (NCES name = ACFRs name):

- HSD = High School District
- CHSD = Community High School District
- CUSD = Community Unit School District
- CCSD = Community Consolidated School District
- Spec Educ Coop = Special Education Cooperative
- ROE = Regional Office of Education
- sd = public schools district
- USD = School District
- UD = Consolidated School District

# Example of 63 School Districts

This is a hand-matched example of 63 school districts that appear in both the NCES data and the ACFRs.

We analyze the matching patterns between these two datasets to help improve matching in the full dataset.

```{r}
# Hand-matched workbook: keep the ACFR name, the NCES district name and the two
# state identifier columns, renaming the two name columns while selecting.
example_match_60sd <- select(
  rio::import(
    here::here("data", "Mappings for Largest School Districts (2).xlsx")
  ),
  name = `Name in ACFR System`,
  nces_name = `District Name`,
  `State District ID`,
  `State...7`
)

# Normalise both name columns of the hand-matched example so they can be
# compared for equality. Each column is lower-cased, stripped of district-type
# phrases, trimmed, stripped of leftover fragments, and trimmed again (removing
# words leaves stray spaces behind). The two columns are independent, so each
# one is processed start-to-finish inside a single mutate().
example_match_60sd_clean <- mutate(
  example_match_60sd,

  # ACFR side
  name = str_to_lower(name),
  name = str_remove_all(name, "(the)?\\s*school district\\s*(of)?|county|independent school district|district school board|\\s*board of education\\s* (of the)?|public schools|the school board of|board of education|public school system"),
  name = str_trim(name),
  name = str_remove_all(name, "^of\\s*|-$|municipalno.*|\\.|'s|’s|[0-9]*$"),
  name = str_trim(name),

  # NCES side
  nces_name = str_to_lower(nces_name),
  nces_name = str_remove_all(nces_name, "(the)?\\s*school district\\s*(of)?|independent school district|district( school board)?|board of education|isd|public schools|schools|(co)? pblc schs|county|(city )?sd"),
  nces_name = str_trim(nces_name),
  nces_name = str_remove_all(nces_name, "^of\\s*|-$|municipalno.*|\\.|'s|’s|[0-9]*$"),
  nces_name = str_trim(nces_name)
)
  
# Split the cleaned example by whether the two cleaned names are identical.
# Rows where the comparison is NA end up in neither table.
example_matched <- example_match_60sd_clean %>%
  mutate(same_name = name == nces_name) %>%
  filter(same_name)

examples_NOT_matched <- example_match_60sd_clean %>%
  mutate(same_name = name == nces_name) %>%
  filter(!same_name)

```

# Round 1
## ACFRs
```{r}
# Read the list of ACFR file names and derive two columns from each one:
#   state               - the first space-separated token of the file name
#   acfrs_original_name - the file name without the "2020.pdf" suffix, without
#                         its first two characters, lower-cased
# NOTE(review): assumes the workbook has a column called `acfrs_file_name`
# and that every file name starts with a two-letter state code - confirm.
# NOTE(review): str_sub(x, 3) keeps the separator after the state code, so
# acfrs_original_name presumably starts with a space; later chunks squish it
# before comparing against literal names.
acfrs_file_name <- rio::import("data/sd_list.xlsx") %>%
  mutate(
    state = str_split(acfrs_file_name, " ", simplify = TRUE)[, 1],
    # fixed(): match the literal suffix; as a regex the "." would match any
    # character.
    acfrs_original_name = str_remove_all(acfrs_file_name, fixed("2020.pdf")),
    acfrs_original_name = str_sub(acfrs_original_name, 3),
    acfrs_original_name = str_to_lower(acfrs_original_name)
  ) %>%
  select(state, acfrs_original_name)
```


```{r}
# Round-1 matching key for the ACFR side.
# Builds `name` from acfrs_original_name: lower-case it, drop "no.", "#" and
# apostrophes, turn "/", "." and "-" into spaces, remove district-type phrases,
# then collapse whitespace.
# The removals are order-dependent: longer / more specific phrases are removed
# first and the generic "school district" family comes last, so do not reorder.
acfrs_school_districts <- acfrs_file_name %>% 
  # acfrs_original_name is already lower-cased at import; this is a no-op kept for safety
  mutate(name = str_to_lower(acfrs_original_name)) %>% 
  mutate(name = str_remove_all(name, "no\\.|#|'")) %>% 
  mutate(name = str_replace_all(name, "/", " ")) %>%
  mutate(name = str_replace_all(name, "\\.", " ")) %>% 
  mutate(name = str_replace_all(name, "-", " ")) %>% 
  
  mutate(name = str_remove_all(name, "(community consolidated school district)|(community consolidated schools district)|(joint unified school district)|(center unified school district)"),
         #name = str_remove_all(name, ""),
         name = str_remove_all(name, "(consolidated high school district)|(consolidated independent school district)"),
         # "(joint unified school district)" was already removed two steps above, so that alternative is a no-op here
         name = str_remove_all(name, "(union high school district)|(city school district)|(union elementary school district)|(union school district)|(county unified school district)|(joint unified school district)"), # cali
         name = str_remove_all(name, "(county school district)|(county independent school district)"),
         name = str_remove_all(name, "(community unit school district)|(community unit district)"),
         name = str_remove_all(name, "(public school district)|(public schools district)|(independent school district)|(district school board)"),
         #OH
         name = str_remove_all(name, "(union exempted village school district)|(exempted village school district)|(county joint vocational school district)"), 
         name = str_remove_all(name, "(high school district)|(local school district)"),
         name = str_remove_all(name, "educational service district"),
         name = str_remove_all(name, "fractional township"), 
         name = str_remove_all(name, "(the school districts of)|(board of education)|(the school board of)|(public school system)"),
         name = str_remove_all(name, "unified school district"),
         name = str_remove_all(name, "(community school district)|(comm unit school)|(community school dist)"),
         # "scool" is a deliberate misspelling, presumably to catch typos in the file names
         name = str_remove_all(name,"(elementary school district)|(elementary scool district)"),
         name = str_remove_all(name,"public schools"),
         name = str_remove_all(name,"grade school district"),
         # generic fallbacks, applied last
         name = str_remove_all(name, "(school district)|(comm sch dist)|(elem sch dist)|(sch dist)|(ind sch dist)")) %>% 
  # removing phrases leaves runs of spaces; squish so the key can be joined on
  mutate(name = str_squish(name))
```

## Govt name in NCES
```{r}
# File Paul sent to Marc: "dataformarc" file. Email Sep 7, 2022
# Census <-> NCES id link table. `name` in this file is the government unit
# name (also the name used in "Govt_Units_2021_Final.xlsx", sheets 3 and 4).
# Output columns: censusid, ncesid, gov_unit_original_name (lower-cased
# government unit name) and name (the same value, trimmed), which becomes the
# working matching key.
govname_nces_id <- rio::import("data/censusID_necesID_link.xlsx") %>%
  mutate(
    gov_unit_original_name = str_to_lower(name),
    name = str_trim(gov_unit_original_name)
  ) %>%
  select(
    censusid = idcensus,
    ncesid = `NCES Agency Identification Number`,
    gov_unit_original_name,
    name
  )

# NCES list only has 13,713: data downloaded Jan 27, 2020. Not including charter schools. Student > 1.
# The first 14 rows of the sheet are skipped; the six columns used downstream
# are picked and renamed in a single select().
nces <- select(
  rio::import(here::here("data", "ncesdata_DBBFFFC.xlsx"), skip = 14),
  ncesid = `NCES District ID`,
  nces_original_name = `District Name`,
  county_nces = `County Name*`,
  city_nces = City,
  state = State,
  student = `Students*`
)
```

## Cleaning govt unit name
```{r}
# Round-1 matching key for the Census / NCES side.
# Starts from the NCES list (left join, so only NCES districts are kept) and
# attaches the Census government-unit name through the shared `ncesid`.
# The government-unit `name` is then normalised the same way as the ACFR key:
# punctuation becomes spaces, district-type phrases and their abbreviations
# are removed (specific phrases first, generic ones last - order matters),
# digits are dropped for Texas only, and whitespace is collapsed.
# NCES rows without a link row keep name = NA.
census_gov_unit <- nces %>%
  # explicit key: `ncesid` is the only column the two tables share
  left_join(govname_nces_id, by = "ncesid") %>%

  mutate(name = str_replace_all(name, "\\.", " ")) %>%
  mutate(name = str_replace_all(name, "/", " ")) %>%
  mutate(name = str_replace_all(name, "-|&", " ")) %>%

  mutate(
    name = str_remove_all(name, "(community consolidated school district)"),
    name = str_remove_all(name, "(community unit school district)|(community unit)"),
    name = str_remove_all(name, "consolidated school district"),
    name = str_remove_all(name, "county school district"),
    name = str_remove_all(name, "community consolidated schools district"),
    name = str_remove_all(name, "(community high school district)|(high school district)"),
    # already removed by the first step; kept so the sequence is unchanged
    name = str_remove_all(name, "community consolidated school district"),
    name = str_remove_all(name, "consolidated high school district"),
    name = str_remove_all(name, "(city unified sch dist)|(joint unified school district)"),
    name = str_remove_all(name, "(unified school district)|(union high school dist)|(co office of ed)|(unified sch dist)|(unified school dist)|(union elem sch dist)|(co unif sch dist)|(union elementary sch dist)"),

    name = str_remove_all(name, "(public school district)|(public schools)|(unit school district)|(union school district)"),
    name = str_remove_all(name, "(elementary school district)|(elementary scool district)|(elem school district)|(elementary school dist)"),
    # Ohio
    name = str_remove_all(name, "(local school district)|(local sch dist)|(local school dist)|(jt voc sch dist)|(exempted sch dist)|(city sch dist)|(ex vlg sch dist)|(union sch dist)|(ex vlg school dist)"),
    name = str_remove_all(name, "co jt voc sch dist"),

    name = str_remove_all(name, "(community school district)|(community unit district)|(comm college district)|(uni sch dist)|(un sch dist)"),
    name = str_remove_all(name, "(co ind sch dist)|(unif school dist)|(unif sch dist)|(union elem sch dt)|(jt unified sch dist)|(jt union high school dist)|(elem sch district)|(jt elem sch dist)"),
    name = str_remove_all(name, "(district school board)|(ind sch district)|(ind sch dist)|(cons sch dist)|(ind school district)"),

    # generic fallbacks, applied last; note "(district)" removes the word anywhere
    name = str_remove_all(name, "(school district)|(district)|(comm sch dist)|(elem sch dist)|(sch dist)|(fr t h school district)|(elem sch dt)|(union el sch d)|(jt uni sch dist)")
  ) %>%

  # Texas: drop every digit from the key
  mutate(name = ifelse(state == "TX", str_remove_all(name, "[0-9]"), name)) %>%
  mutate(name = str_squish(name))

# Round 1: exact match of the cleaned keys within a state.
# The join keys are spelled out (state + cleaned name are the only shared
# columns) so an extra shared column added later cannot silently change the
# match. ACFR rows without a census id are dropped.
round1 <- acfrs_school_districts %>%
  left_join(census_gov_unit, by = c("state", "name")) %>%
  drop_na(censusid)
#round1 %>% filter(enrollment == 0)

# eyeball the three original names side by side
round1 %>% select(nces_original_name, acfrs_original_name, gov_unit_original_name)

saveRDS(round1, "round1.RDS")
```

# Round 2 
## ACFRs
```{r}
# Round-2 ACFR key: take the ACFR rows that round 1 did not match and clean
# `name` more aggressively (more district-type words, Oklahoma-specific number
# codes), then collapse whitespace. Steps are order-dependent.
acfrs_sd_2 <- acfrs_school_districts %>% filter(!acfrs_original_name %in% round1$acfrs_original_name)%>% 
  mutate(name = str_replace_all(name, "-|,|&|#|_", " ")) %>%  
  mutate(name = str_remove_all(name, "(school district of the city of)|(consolidated school district)|(consolidated schools)|(community schools)|intermediate|(office of education)|(city sch dist)|(independent public school district)"),
         # leading "the" only (anchored at the start of the string)
         name = str_remove_all(name, "^(the)"),
    name = str_remove_all(name, "central|(union free)|(counties boces)|(county board of cooperative educational services)|centre|(community school)|(community high school)"),
  #Michigan
  name = str_remove_all(name, "(union free school district)|(city school district)|(board of cooperative educational services of)|(schools)|township|(district schools)|(union schools)|(public school of)"),
  
  name = str_remove_all(name, "public school"),
  name = str_remove_all(name, "^of "),
  # NOTE(review): the trailing "$" binds only to the last alternative
  # "(union districit)"; every other alternative is removed anywhere in the
  # string. "districit" looks like a typo - confirm whether it mirrors the data.
  name = str_remove_all(name, "(school)|county|consolidated|(isd)|( joint)|( district)|( community)|(union districit)$")) %>% 
  # Oklahoma: strip district-number codes
  mutate(name = ifelse(state == "OK", str_replace_all(name, " 00", " "), name)) %>% 
  # NOTE(review): "( c)" comes before "( c )" in the alternation, so "( c )"
  # can never match, and "( c)" also eats the " c" at the start of any word
  # (" city" becomes "  ity"); the next step's "( ity)" presumably cleans that up.
  mutate(name = ifelse(state == "OK", str_replace_all(name, "( i )|(c0)|( c)|( c )|( 0)|( 1 )", " "), name)) %>%
mutate(name = ifelse(state == "OK", str_replace_all(name, "( ity)|( 0)", " "), name)) %>%
  mutate(name = str_remove_all(name, "number|(independent)")) %>% 
  mutate(name = str_replace_all(name, "( d )|( no )|( o[0-9])", " ")) %>% 
  mutate(name = str_squish(name))
```

## Gov Units
```{r}
# Round-2 census key: take the census rows whose government-unit name was not
# matched in round 1 and strip further district-type words, abbreviations and
# leftover fragments from `name`, then collapse whitespace.
# Steps are order-dependent. In the patterns that end in "$", the anchor binds
# only to the LAST alternative; the other alternatives match anywhere.
census_sd_2 <- census_gov_unit %>% filter(!gov_unit_original_name %in% round1$gov_unit_original_name) %>% 
mutate(name = str_replace_all(name, "-|,|&|#|_", " "),
       name = str_remove_all(name, "'")) %>%  
  # Michigan: drop every digit from the key
  mutate(name = ifelse(state == "MI", str_remove_all(name, "[0-9]"), name)) %>% 
mutate(name = str_remove_all(name, "(consolidated school district)|(cons school)|(joint community college)|(county community school corporation)"),
       
       name = str_remove_all(name, "(union free school district)|(uf sch dist)|(central sch dist)|(union free)|(ctl high school dist)|(pt ool dist)|central|(centre union free school dist)"),
  name = str_remove_all(name, "(city school dist)|(central sch)|(comm college)|(community college)|(ctl sch dist)|(ctl school dist)|(co comm coll)|(community high school)|(pub sch dist)|(comm sch dist)"),
  # NOTE(review): fragments such as "( ool dist)" presumably clean up pieces
  # left behind by earlier removals - confirm against the data.
  name = str_remove_all(name, "(school district)|central|( ool dist)|(u f school dist)|(ctl sch)|(uf school dist)|(school dist)|(comm schs)|(city sch dist)|(township sch dist)"),
  
  name = str_remove_all(name,"(community sch dist)|(comm school dist)"),
  name = str_remove_all(name, "^of "),
  
  name = str_remove_all(name, "(u f)|(twp)$"),
  name = str_remove_all(name, "(uf)|(isd)$"),
  name = str_remove_all(name, "( ool)|( pt)|( csd)$"),
  name = str_remove_all(name, "( ctl)|(c s d)|( schs)$"),
  
  name = str_remove_all(name, "(comm schools)|(schs dist)|(public school)|(consolidated school)|(community schools)|(joint union)"),
  # NOTE(review): short alternatives such as "(pub)", "( co)" and "( sch)" are
  # unanchored and also remove those letters from inside longer words.
  name = str_remove_all(name, 
"( schools)|( public)|(co schools)|(township)|(pub)|(twp)|( comm)|( community)|( cmty)|(twp f)|(pub fr)|(consol)|( sch)|( scools)|(college)|(township f)|(twp fr)|( co)|( union)|( joint)$"),
  name = str_replace_all(name, "( i 00)", " ")) %>% 

  mutate(name = str_squish(name)) 

```

```{r}
# Round 2: exact match on the round-2 keys within a state. Join keys are
# explicit (state + cleaned name are the only shared columns). ACFR rows
# without a census id are dropped.
round2 <- acfrs_sd_2 %>%
  left_join(census_sd_2, by = c("state", "name")) %>%
  drop_na(censusid)

# cumulative matches after rounds 1 and 2
round1_2 <- round1 %>% rbind(round2)
saveRDS(round2, "round2.RDS")
```

# Round 3
```{r}
# Unmatched ACFRs per state after round 2, largest backlog first
acfrs_sd_2 %>%
  left_join(census_sd_2) %>%
  filter(is.na(censusid)) %>%
  count(state, sort = TRUE)

# ACFR rows still unmatched after rounds 1 and 2 (round-1 keys, not round-2 keys)
acfrs_sd_3 <- filter(
  acfrs_school_districts,
  !(acfrs_original_name %in% round1_2$acfrs_original_name)
)
```

```{r}
# Census rows whose government-unit name is still unmatched after rounds 1 and 2
census_sd_3 <- filter(
  census_gov_unit,
  !(gov_unit_original_name %in% round1_2$gov_unit_original_name)
)

```

Now we need to match `acfrs_sd_3` against `census_sd_3`.
```{r}
# Round-3 ACFR key: further phrase removal on the rows left after round 2.
# In the patterns that end in "$", the anchor binds only to the LAST
# alternative; the other alternatives are removed anywhere in the string.
acfrs_sd_3_clean <- acfrs_sd_3 %>% #filter(state == "ME") %>% arrange(name) %>% 
  mutate(name = str_remove_all(name, "(community schools district)|(county schools district)"),
    # "disrict" is spelled this way on purpose, presumably to catch a typo in
    # the file names; "(community schools)" is listed twice (harmless)
    name = str_remove_all(name, "(school disrict)|(community schools)|(community schools)"),
         
         # NOTE(review): "( r)" is unanchored, so it also strips " r" from the
         # start of any later word (e.g. " river" becomes "iver") - confirm intended
         name = str_remove_all(name, "( r)|( county)|( consolidated)$")) %>% 
    # Nebraska: drop a trailing district number
    mutate(name = ifelse(state == "NE", str_remove_all(name, " [0-9]+$"), name)) %>% 
  mutate(name = str_remove_all(name, "( municipal)|( city)|( union)$")) %>% 
  mutate(name = str_squish(name))
```

```{r}
# Round-3 census key: further clean-up of the census rows left after round 2.
# Only rows with student > 0 are kept. Steps are order-dependent; in the
# pattern that ends in "$", the anchor binds only to the last alternative.
# NOTE(review): `student > 0` assumes the column was imported as numeric - confirm.
census_sd_3_clean <- census_sd_3 %>%
  filter(student > 0) %>%
  #filter(state == "ME") %>%
  mutate(name = str_remove_all(name, "(ind school dist)|(independent rict)|(community college dist)|(community college)|(br school dist)")) %>%
  mutate(
    name = str_remove_all(name, "( rict )|(county unified school system)"),
    name = str_remove_all(name, "(city sd)|(city pub[0-9])|(city pub)"),
    name = str_replace_all(name, "( 0)", " "),
    # Expand the abbreviation "serv" to "service". Word boundaries are
    # required: the unanchored pattern "serv" also matched inside names that
    # already contain "service" and turned them into "serviceice".
    name = str_replace_all(name, "\\bserv\\b", "service")
  ) %>%
  # name = str_remove_all(name, " [0-9]+$")) %>% #filter(str_detect(name, "city pub"))
  # Nebraska: drop a trailing district number
  mutate(name = ifelse(state == "NE", str_remove_all(name, " [0-9]+$"), name)) %>%
  mutate(
    name = str_remove_all(name, "( college)|( independent)|( cons)|( i s)|( rict)|( co cons)|( school)|( city)|( co)|( comm)|( ind sh)|( indep)$"),
    # "( cons)" above turns " consolidated" into "olidated"; remove the first
    # such leftover (str_remove, not str_remove_all, as in the original)
    name = str_remove(name, "(olidated)|( munc)$")
  ) %>%
  mutate(name = str_squish(name))
```

```{r}
# Round 3: exact match on the round-3 keys within a state, with explicit join
# keys (state + cleaned name are the only shared columns).
round_3 <- acfrs_sd_3_clean %>%
  left_join(census_sd_3_clean, by = c("state", "name")) %>%
  drop_na(censusid)

# NOT matched after round 3, per state, largest backlog first
acfrs_sd_3_clean %>%
  filter(!acfrs_original_name %in% round_3$acfrs_original_name) %>%
  count(state) %>%
  arrange(desc(n)) #filter(str_detect(name, "goose"))
```

```{r}
# All matches from rounds 1-3 stacked together, reduced to the reporting
# columns in a fixed order.
round123 <- select(
  rbind(round1_2, round_3),
  state, acfrs_original_name, nces_original_name, gov_unit_original_name,
  name, ncesid, censusid, county_nces, city_nces, student
)
#%>% write_csv("acfrs_necs_census_matched.csv")

# length(unique(result$acfrs_original_name))
# length(unique(result$ncesid))
# length(result$acfrs_original_name)    

```
# Round 4
```{r}
#Sep 12
# ACFR rows still unmatched after round 3 (round-1 keys)
acfrs_sd_4 <- filter(
  acfrs_school_districts,
  !(acfrs_original_name %in% round123$acfrs_original_name)
)

# unmatched ACFRs per state, largest backlog first
count(acfrs_sd_4, state, sort = TRUE)

# census rows whose government-unit name is still unmatched after round 3
census_sd_4 <- filter(
  census_gov_unit,
  !(gov_unit_original_name %in% round123$gov_unit_original_name)
)
# From here, go state by state
```
## IN
```{r}
# Indiana, ACFR side: of the still-unmatched Indiana rows, keep only those
# whose original name contains "ecas", then strip Indiana-specific
# school-corporation phrases from the key.
in_acfrs <- acfrs_sd_4 %>% filter(state == "IN") %>% filter(str_detect(acfrs_original_name, "ecas")) %>% 
  mutate(name = str_remove_all(name, "(community schools ecas)|(community school corporation ecas)|(community schools inc)|(com school corporation)|(consol school corporation eca)|(comm school corporation ecas)|(community school corporation of)|(community school corporation)"),
         name = str_remove_all(name, "(county schools)|(school corporation ecas)|(school corporation)|(community schools eca)|(community schools)|(metropolitan of)|(school city of)|(county consolidated)"),
         # "$" binds only to "( schools)"; the other alternatives match anywhere
         name = str_remove_all(name, "( ecas)|(cnty cmnty)|(consolidated)|( schools)$"),
         
         # change acfrs name to census name
         # (compared before squishing, so the key must equal the literal exactly)
         name = ifelse(name == "n vermillion eca", "north vermillion", name),
         
         name = str_squish(name))

  # spot check
  in_acfrs %>% filter(str_detect(acfrs_original_name, "perry")) 
```

```{r}
# Indiana, census side: strip school-corporation phrases from the key of the
# still-unmatched Indiana census rows and sort by the cleaned key.
in_census <- census_sd_4 %>% filter(state == "IN") %>% 
  mutate(name = str_remove_all(name, "(community school inc)|(community school corporation)|(community schools of)|(county consolidated school corporation)|(consolidated school corporation)|(county community school corporation)|(consolidated schools)"),
         name = str_remove_all(name, "(school corporation)|(community schools)"),
         # "$" binds only to "( school corp)"; " county" and " metropolitan" match anywhere
         name = str_remove_all(name, "( county)|( metropolitan)|( school corp)$"),
         
         name = str_squish(name)) %>% arrange(name)

# spot check
in_census %>% filter(str_detect(name, "perry")) 
```


```{r}
# Indiana matches: exact match on state + cleaned name (keys made explicit);
# ACFR rows without a census id are dropped.
in_matched <- in_acfrs %>%
  left_join(in_census, by = c("state", "name")) %>%
  drop_na(censusid)

# ACFR columns to attach to the census listing in the export below
in_acfrs_matched <- in_matched %>% select(state, acfrs_original_name, name)

# spot check
in_matched %>% filter(str_detect(acfrs_original_name, "perry"))
```


```{r}
# Export every remaining Indiana census unit with its matched ACFR name
# (NA when unmatched). Join keys are explicit; when one census unit matched
# several ACFRs only the first row per censusid is kept.
in_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  left_join(in_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>% #filter(str_detect(name, "perry"))
  write_csv("IN_match_unmatched.csv")
```

## CA

```{r}
# California, ACFR side: strip county-office / union phrases from the key of
# the still-unmatched California rows, then overwrite the key for specific
# districts with a hand-picked common label that the census side also uses.
ca_acfrs <- acfrs_sd_4 %>% filter(state == "CA") %>% #filter(str_detect(acfrs_original_name == "huntington beach city"))

  mutate(name = str_remove_all(name, "(county office of education)|(schools district)|(county superintendent of schools)|(county education office)|(union school distict)|(union elementary)"),
        
         # "$" binds only to "( joint)"; " valley" is removed anywhere
         name = str_remove_all(name, "( valley)|( joint)$"),
         
         # change name to a common to meet census
     # the literal comparisons below need the original name without stray
     # whitespace (presumably the import step leaves a leading space - confirm)
     acfrs_original_name = str_squish(acfrs_original_name), # MUST squish original name
     name = ifelse(acfrs_original_name == "galt joint union elementary school district", "galt elementary", name),
     name = ifelse(acfrs_original_name == "galt joint union high school district", "galt high", name),
    name = ifelse(acfrs_original_name == "gold oak union elementary school district", "gold oak element", name),
    
    
    # first matching rule wins; anything not listed keeps its cleaned key
    name = case_when(acfrs_original_name == "huntington beach city school district" ~ "huntington elementary",
                     acfrs_original_name == "huntington beach union high school district" ~ "huntington high",
                     acfrs_original_name == "rim of the world unified school district" ~ "rim world",
                     acfrs_original_name == "san rafael city elementary school district" ~ "san rafael elementary",
                     acfrs_original_name == "san rafael city high school district" ~ "san rafael high", 
                     name == "three rivers" ~ "three river", 
                     TRUE ~ name),
    
         name = str_squish(name)) #%>% filter(str_detect(name, "galt"))
  
```

```{r}
# California, census side: strip California-specific district phrases and
# abbreviations from the key of the still-unmatched census rows, then
# overwrite the key for specific government units with the hand-picked labels
# used on the ACFR side.
# Steps are order-dependent. In the patterns that end in "$", the anchor binds
# only to the last alternative; the other alternatives match anywhere.
ca_census <- census_sd_4 %>%
  filter(state == "CA") %>% #filter(str_detect(name, "union high"))
  mutate(
    name = str_remove_all(name, "(joint unified)|(un high school dist)|(union elem school dist)|(co spl schs oper by co supt)|(co spl sch oper by co supt)|(jt unif school district)|(valley jt unified sch dist)|(co off of education)|(valley unified sch dt)|(unified school d)|(county office of education)|(valley jt unif sch dist)|(elem school dist)|(county special schools operated by co supt)"),
    name = str_remove_all(name, "(jt high)|(union elem)|(elementary sch)|(joint union high)|(jt union high sch)|(union high)|(un school dist)|(unified school dst)|(co office of education)|(union sch dt)|(cmty unif sch dist)|(co spl schs)"),
    name = str_remove_all(name, "(union elementary)|(school dist)|(valley elem)|(co special schools)|(elem sch)|(elem school dist)|(elementary sch dist)|(county special schools)|(unified sch dt)|(joint union)|(unif sch dist)|(county selpa)"),
    # Fixed: the original pattern "( valley)||( el)$" contained an empty
    # alternative, which always matched before "( el)$" could be tried, so a
    # trailing " el" was never removed.
    name = str_remove_all(name, "( valley)|( el)$"),
    name = str_remove_all(name, "( jt)|( ctr)|(em sch)|( rict)$"),
    name = str_remove_all(name, "( val)|(em sch)|( county)|( cy)$"),
    name = str_remove_all(name, "( j t)|( sch)|(school dist)$"),
    name = str_remove_all(name, "( vly)|( ist)|( un)$"),

    # change name to a common to meet acfrs
    gov_unit_original_name = str_squish(gov_unit_original_name),
    name = ifelse(gov_unit_original_name == "galt jt union elem sch dist", "galt elementary", name),
    name = ifelse(gov_unit_original_name == "galt jt union high school dist", "galt high", name),
    name = ifelse(gov_unit_original_name == "gen shafter elem sch dist", "general shafter", name),
    name = ifelse(gov_unit_original_name == "gold oak un elem sch dist", "gold oak element", name),

    name = ifelse(gov_unit_original_name == "calexico unif sch dist", "calexico", name),
    name = ifelse(gov_unit_original_name == "el segundo uni sch dist", "e l segundo", name),
    # the ACFR side removes " valley", so the census key drops it too
    name = ifelse(gov_unit_original_name == "grass valley elem school dist", "grass", name),

    # first matching rule wins; anything not listed keeps its cleaned key.
    # Same-named districts are disambiguated by county.
    name = case_when(
      gov_unit_original_name == "howell mt elem sch dist" ~ "howell mountain",
      gov_unit_original_name == "huntington bch city elem school dist" ~ "huntington elementary",
      gov_unit_original_name == "huntington beach uhs dist" ~ "huntington high",
      # NOTE(review): "an francisco" (no leading "s") is kept as in the
      # original; presumably it mirrors the ACFR-side key - confirm.
      gov_unit_original_name == "san francisco unif sch dist" ~ "an francisco",
      name == "pacific" & county_nces == "Fresno County" ~ "pacific (fresno county)",
      name == "pacific" & county_nces == "Humboldt County" ~ "pacific (humboldt county)",

      gov_unit_original_name == "san rafael elementary district" ~ "san rafael elementary",
      gov_unit_original_name == "san rafael high school district" ~ "san rafael high",
      gov_unit_original_name == "victor valley jt union high school dist" ~ "victor",
      gov_unit_original_name == "south bay union elem sch dist" & county_nces == "Humboldt County" ~ "south bay (humboldt county)",
      gov_unit_original_name == "south bay union school district" & county_nces == "San Diego County" ~ "south bay (san diego county)",
      TRUE ~ name
    )
  ) %>%
  mutate(name = str_squish(name)) %>%
  arrange(name) #%>% filter(str_detect(name, "willow"))
```


```{r}
# California matches: exact match on state + cleaned name (keys made
# explicit); ACFR rows without a census id are dropped.
ca_matched <- ca_acfrs %>%
  left_join(ca_census, by = c("state", "name")) %>%
  drop_na(censusid)

# ACFR columns to attach to the census listing in the export below
ca_acfrs_matched <- ca_matched %>% select(state, acfrs_original_name, name)

# eyeball the matched pairs
ca_matched %>% select(acfrs_original_name, gov_unit_original_name, name)
```

```{r}

# Export every remaining California census unit with its matched ACFR name
# (NA when unmatched). Join keys are explicit; only the first row per
# censusid is kept.
ca_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  left_join(ca_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("CA_match_unmatched.csv")

```


## OH

```{r}
# Ohio, ACFR side: small Ohio-specific fixes to the key of the still-unmatched
# rows. The placeholder step `ifelse(name == "", "", name)` from the original
# was a no-op and has been removed.
oh_acfrs <- acfrs_sd_4 %>%
  filter(state == "OH") %>%
  arrange(name) %>%
  mutate(
    name = str_remove_all(name, "heights university heights"),
    # only a trailing "central" is removed
    name = str_remove_all(name, "(central)$"),
    name = str_replace_all(name, "mc donald", "mcdonald"),
    name = str_squish(name)
  )
  
  
```

```{r}
# Ohio, census side: strip Ohio district abbreviations from the key of the
# still-unmatched census rows, expand "hgts", then apply hand-picked overrides.
# In the patterns that end in "$", the anchor binds only to the last alternative.
oh_census <-  census_sd_4 %>% filter(state == "OH") %>% arrange(name) %>% 
  # "(city school dist)" is listed twice (harmless)
  mutate(name = str_remove_all(name, "(hgts univ hgts)|(city schools)|(city sch dis t)|(ex vill school dist)|(cent loc sch dist)|(city school dist)|(loc sch dist)|(city school dist)|(local sch dt)"),
         #name = str_remove_all(name, ""),
        # "( loc)" is listed twice (harmless)
        name = str_remove_all(name, "( loc)|( exem)|( loc)|( ex village)$"), 
         # NOTE(review): "( lo)" is unanchored, so " local ..." loses its first
         # three characters; "(cal sch dt)$" presumably removes that leftover
         name = str_remove_all(name, "(ex vil)|(city sch)|( lo)|( cty)|(cal sch dt)$"),
         name = str_replace_all(name, "hgts", "heights"),
         
         # overrides keyed on the original government-unit name
         name = case_when(gov_unit_original_name == "buckeye cent loc sch dist" ~ "buckeye", 
                       gov_unit_original_name == "pymatuning vall loc sch dist" ~ "pymatuning valley",
                       gov_unit_original_name == "reading cmnty city sch dist" ~ "reading community",
        
                       gov_unit_original_name == "ripley union lewis local sch dist" ~ "ripley union lewis huntington",
                       TRUE ~ name),
        # spelling fixes keyed on the cleaned key (compared before squishing,
        # so the key must equal the literal exactly)
        name = case_when(name == "north olmstead" ~ "north olmsted", 
                         name == "washington courthouse" ~ "washington court house",
                         name == "yellow spgs" ~ "yellow springs",
                         TRUE ~ name),
         name = str_squish(name)) #%>% filter(str_detect(name, "tri"))
```


```{r}
# Ohio matches: exact match on state + cleaned name (keys made explicit);
# ACFR rows without a census id are dropped.
oh_matched <- oh_acfrs %>%
  left_join(oh_census, by = c("state", "name")) %>%
  drop_na(censusid)

# eyeball the matched pairs
oh_matched %>% select(acfrs_original_name, gov_unit_original_name, name)
```


```{r}
# Ohio ACFR rows that found no census partner
anti_join(oh_acfrs, oh_matched, by = "acfrs_original_name")
```

```{r}
# Ohio census units that found no ACFR partner, sorted by cleaned key
anti_join(oh_census, oh_matched, by = "gov_unit_original_name") %>%
  arrange(name)
```
```{r}
# ACFR columns to attach to the census listing
oh_acfrs_matched <- oh_matched %>% select(state, acfrs_original_name, name)

# Export every remaining Ohio census unit with its matched ACFR name (NA when
# unmatched). Join keys are explicit; only the first row per censusid is kept.
oh_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  left_join(oh_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("OH_match_unmatched.csv")
```

## NE

```{r}
# Nebraska, ACFR side: strip "public school(s)" and a trailing one- or
# two-digit number from the key, then overwrite the key for three districts
# whose names are set by hand.
# The stray empty alternative in the original pattern "(public schools)|" has
# been removed; it matched the empty string and had no effect.
ne_acfrs <- acfrs_sd_4 %>%
  filter(state == "NE") %>%
  arrange(name) %>%
  mutate(
    name = str_remove_all(name, "(public schools)"),
    name = str_remove_all(name, "(public school)"),
    name = str_remove_all(name, "[0-9]{1,2}$"),
    name = str_replace_all(name, "mc donald", "mcdonald"),

    # change acfrs name to census name; the original name is squished first so
    # the literal comparisons are not defeated by stray whitespace
    acfrs_original_name = str_squish(acfrs_original_name),
    name = case_when(
      acfrs_original_name == "blue hill school district no. 91-0074" ~ "blue hill 74",
      acfrs_original_name == "don iphan-trumbu ll public schools district no. 40-0126" ~ "doniphan-trumbull",
      acfrs_original_name == "dorchester school district no. 44" ~ "dorchester 44",
      TRUE ~ name
    ),
    name = str_squish(name)
  ) # %>% filter(str_detect(name, "blue hill"))
  
```

```{r}
# Nebraska, census side: the key is rebuilt from the NCES district name
# (lower-cased), discarding the key derived from the government-unit name.
# Hyphens become spaces, "schools" phrases are removed, and three units get
# the same hand-picked labels as on the ACFR side.
ne_census <-  census_sd_4 %>% filter(state == "NE") %>% arrange(name) %>% 
  # use NCES instead of gov unit names
  mutate(name = str_to_lower(nces_original_name)) %>% 
  mutate(name = str_replace_all(name, "-", " "),
         name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)"),
        # NOTE(review): "$" matches only the empty end-of-string position, so
        # this step removes nothing
        name = str_remove_all(name, "$"),
         name = str_remove_all(name, "(schools)$"),
  # overrides keyed on the original government-unit name
  name = case_when(gov_unit_original_name == "blue hill vill school dist 74" ~ "blue hill 74",
                       gov_unit_original_name == "doniphan-trumbull public schools" ~ "doniphan-trumbull",
                       gov_unit_original_name == "dorchester vlg sch di 44" ~ "dorchester 44",
                      TRUE ~ name),
         name = str_squish(name)) #%>% filter(str_detect(name, "hill"))
```


```{r}
# Nebraska matches: exact match on state + cleaned name (keys made explicit);
# ACFR rows without a census id are dropped.
ne_matched <- ne_acfrs %>%
  left_join(ne_census, by = c("state", "name")) %>%
  drop_na(censusid)

# eyeball the matched pairs
ne_matched %>% select(acfrs_original_name, gov_unit_original_name, name)
```


```{r}
# Nebraska ACFR rows that found no census partner
anti_join(ne_acfrs, ne_matched, by = "acfrs_original_name")
```

```{r}
# Nebraska census units that found no ACFR partner, sorted by cleaned key
anti_join(ne_census, ne_matched, by = "nces_original_name") %>%
  arrange(name)
```

```{r}
# ACFR columns to attach to the census listing
ne_acfrs_matched <- ne_matched %>% select(state, acfrs_original_name, name)

# Export every remaining Nebraska census unit with its matched ACFR name (NA
# when unmatched). Join keys are explicit; only the first row per censusid is
# kept.
ne_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  left_join(ne_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("NE_match_unmatched.csv")
```
## OK

```{r}
# Oklahoma, ACFR side: strip "public school(s)" and a hand-collected list of
# district-number codes from the key of the still-unmatched rows, then apply
# hand-picked overrides.
ok_acfrs <-  
  acfrs_sd_4 %>% filter(state == "OK") %>% arrange(name) %>% 
  
  # NOTE(review): "(school district no. i-095)" looks unreachable - the round-1
  # cleaning already removed "no.", "school district" and replaced "-" with a
  # space in `name`. Confirm and drop.
  mutate(name = str_remove_all(name, "(public schools)|(school district no. i-095)"),
         # codes are unanchored: e.g. "(i 2)" also matches the start of "i 20"
         name = str_remove_all(name, "(public school)|(i 95)|(i 11)|(c 29 pottawatomie)|(i 27)|(i 2)|( i 10)|(55 c029)|(i 90)|(no i 365)|(d 29)|(i 51)|(c 32)|(60 i 103)"),
         # "$" binds only to "(  c 9)"; "(i 4)" matches anywhere
         name = str_remove_all(name, "(i 4)|(  c 9)$"),
         name = str_replace_all(name, "mc donald", "mcdonald"),
         
         # change acfrs name to census name
         acfrs_original_name = str_squish(acfrs_original_name),
         # NOTE(review): the "don iphan-trumbu ll" and "dorchester" rules are
         # the same literals used in the Nebraska chunk and look copy-pasted;
         # confirm whether any Oklahoma row can match them.
         name = case_when(acfrs_original_name == "davidson school district no. c-9" ~ "davidson", 
                       acfrs_original_name == "don iphan-trumbu ll public schools district no. 40-0126" ~ "doniphan-trumbull",
                      acfrs_original_name == "dorchester school district no. 44" ~ "dorchester 44",
                       TRUE ~ name),
         name = str_squish(name)) #%>% filter(str_detect(name, "lamont"))
  
```

```{r}
# Oklahoma, census side: the key is rebuilt from the NCES district name
# (lower-cased), discarding the key derived from the government-unit name.
# Hyphens become spaces and "schools" / "high school" phrases are removed.
ok_census <-  census_sd_4 %>% filter(state == "OK") %>% arrange(name) %>% 
  # use NCES instead of gov unit names
  mutate(name = str_to_lower(nces_original_name)) %>% 

  mutate(name = str_replace_all(name, "-", " "),
         name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)|(high school)"),
        # NOTE(review): "$" matches only the empty end-of-string position, so
        # this step removes nothing
        name = str_remove_all(name, "$"),
         name = str_remove_all(name, "(schools)$"),
  # NOTE(review): placeholder for overrides - it only fires when
  # gov_unit_original_name is the empty string, setting the key to ""
  name = case_when(gov_unit_original_name == "" ~ "",
                      TRUE ~ name),
         name = str_squish(name)) #%>% filter(str_detect(name, "hill"))
```


```{r}
# Oklahoma matches: exact match on state + cleaned name (keys made explicit);
# ACFR rows without a census id are dropped.
ok_matched <- ok_acfrs %>%
  left_join(ok_census, by = c("state", "name")) %>%
  drop_na(censusid)
# eyeball the matched pairs
ok_matched %>% select(acfrs_original_name, gov_unit_original_name)
```


```{r}
# Oklahoma ACFR rows that found no census partner
anti_join(ok_acfrs, ok_matched, by = "acfrs_original_name")

```

```{r}
# Oklahoma census units that found no ACFR partner, sorted by cleaned key
anti_join(ok_census, ok_matched, by = "nces_original_name") %>%
  arrange(name)
```

```{r}
# ACFR columns to attach to the census listing
ok_acfrs_matched <- ok_matched %>% select(state, acfrs_original_name, name)

# Export every remaining Oklahoma census unit with its matched ACFR name (NA
# when unmatched). Join keys are explicit; only the first row per censusid is
# kept.
ok_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  left_join(ok_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("OK_match_unmatched.csv")

```
## MO

```{r}
# Missouri, ACFR side: strip "public schools", "county", "reorganized" codes
# and a few literal numbers from the key of the still-unmatched rows, fix two
# spellings, then set the key by hand for three reorganized districts (using
# roman-numeral labels).
mo_acfrs <-  
  acfrs_sd_4 %>% filter(state == "MO") %>% arrange(name) %>% 
  
  # NOTE(review): "(school district no. i-095)" is the same literal used in the
  # Oklahoma chunk and looks unreachable here - the round-1 cleaning already
  # removed "no.", "school district" and "-" from `name`. Confirm and drop.
  mutate(name = str_remove_all(name, "(public schools)|(school district no. i-095)|(consolidated no)"),
         # unanchored: "(124)" removes those digits anywhere in the key
         name = str_remove_all(name, "(county)|(reorganized 2)|(124)|(reorganized r 2)"),
         # "$" binds only to "( 81)"; "schools" is removed anywhere
         name = str_remove_all(name, "(schools)|( 81)$"),
        
         # first occurrence only (str_replace, not str_replace_all)
         name = str_replace(name, "de soto 73", "desoto 73"),
         name = str_replace(name, "salem r80", "salem r 80"),
         # change acfrs name to census name
         acfrs_original_name = str_squish(acfrs_original_name),
         name = case_when(acfrs_original_name == "campbell reorganized school district no.2" ~ "campbell r ii",
                       acfrs_original_name == "fredericktown r-1 school district" ~ "fredericktown r i",
                      acfrs_original_name == "hayti reorganized school district no.2" ~ "hayti r ii",
                       TRUE ~ name),
         
         name = str_squish(name)) #%>% filter(str_detect(name, "lamont"))
  
```

```{r}
# Missouri census/NCES side: build the join key `name` from the lower-cased
# NCES district name.
mo_census <- census_sd_4 %>%
  filter(state == "MO") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      # dashes and dots become spaces, so an abbreviation like "co." is the bare word "co"
      str_replace_all("-|\\.", " ") %>%
      # `\\bco\\b` removes only the standalone word; the previous `(co.)` had an
      # unescaped dot and also deleted "col", "cou", ... inside longer names
      str_remove_all("(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(\\bco\\b)|(florissant r ii)") %>%
      str_remove_all("( 101)|( 58)$") %>%
      str_remove_all("(schools)|( of warren)$"),
    name = case_when(
      gov_unit_original_name == "puxico sch dist r 8" ~ "puxico r viii",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
mo_matched <- left_join(mo_acfrs, mo_census) %>%
  filter(!is.na(censusid))

# Show the matched pairs for a visual check.
select(mo_matched, acfrs_original_name, gov_unit_original_name, nces_original_name)
```


```{r}
# ACFR entities whose original name is absent from the matched set.
anti_join(mo_acfrs, mo_matched, by = "acfrs_original_name")
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(mo_census, mo_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every MO census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
mo_acfrs_matched <- mo_matched %>%
  select(state, acfrs_original_name, name)

mo_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(mo_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("MO_match_unmatched.csv")

```
## OR 

```{r}
# Oregon ACFR side: strip district-type wording and joint-district suffixes
# from `name`, then override specific entities to the census-side key.
# The "de soto 73" / "salem r80" replacements copied from the Missouri chunk
# were dropped: they target Missouri district names.
or_acfrs <- acfrs_sd_4 %>%
  filter(state == "OR") %>%
  arrange(name) %>%
  mutate(
    # the overrides below compare against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_remove_all("(public schools)|(consolidated no)") %>%
      str_remove_all("(jt)$") %>%
      str_remove_all("(29j)|(10jt)|( 2 c)|j$"),
    # change acfrs name to census name
    name = case_when(
      acfrs_original_name == "centennial school district no. 28jt" ~ "centennial 28",
      acfrs_original_name == "north santiam school district no. 29j" ~ "north santiam 29",
      acfrs_original_name == "pendleton school district 16r" ~ "pendleton 16",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# Oregon census/NCES side: build the join key `name` from the lower-cased
# NCES district name, then override specific districts by gov-unit name.
or_census <- census_sd_4 %>%
  filter(state == "OR") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.", " ") %>%
      str_remove_all("(co unit)|(school dist 10j)|(school dist)|(bay )|(unified 7)|(county)") %>%
      str_remove_all("( sd)") %>%
      str_remove_all("( 2c)|j$") %>%
      str_remove_all("( 57)|( 40)|(511)|( 8)|( city)|(10)|(county)$"),
    name = case_when(
      gov_unit_original_name == "corvallis sch dist 509-j" ~ "corvallis 509",
      gov_unit_original_name == "crow-apple gate-lorane school dist 66" ~ "crow applegate lorane 66",
      gov_unit_original_name == "grant admin school dist 3" ~ "grant 3",
      gov_unit_original_name == "harney county school dist 4" ~ "harney 4",
      gov_unit_original_name == "helix school district #1r" ~ "helix 1",
      gov_unit_original_name == "jewell sch dist 8" ~ "jewell",
      gov_unit_original_name == "morrow co sch dist 1" ~ "morrow",
      gov_unit_original_name == "perrydale sch dist 21" ~ "perrydale",
      gov_unit_original_name == "pilot rock sch dist 2" ~ "pilot rock",
      gov_unit_original_name == "scio school dist 95" ~ "scio",
      gov_unit_original_name == "seaside sch dist 10" ~ "seaside",
      gov_unit_original_name == "silver falls school district 4j" ~ "silver falls",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
or_matched <- left_join(or_acfrs, or_census) %>%
  filter(!is.na(censusid))

# Show the matched pairs for a visual check.
select(or_matched, acfrs_original_name, nces_original_name, gov_unit_original_name, name)
```


```{r}
# ACFR entities whose original name is absent from the matched set.
anti_join(or_acfrs, or_matched, by = "acfrs_original_name")
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(or_census, or_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every OR census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
or_acfrs_matched <- or_matched %>%
  select(state, acfrs_original_name, name)

or_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(or_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("OR_match_unmatched.csv")

```

## NJ

```{r}
# New Jersey ACFR side: strip municipality/district wording from `name`, then
# override specific entities to the census-side key.
nj_acfrs <- acfrs_sd_4 %>%
  filter(state == "NJ") %>%
  arrange(name) %>%
  mutate(
    # the overrides below compare against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_remove_all("(school district of the borough of)|(school district of)|(school district of town of )|(of the)") %>%
      str_remove_all("(public schools)|(borough of)|(school distrcit)|(township of )") %>%
      str_remove_all("(county)|(township)") %>%
      str_remove_all("^(city of)") %>%
      str_remove_all("(^(of )|(town of ))") %>%
      str_remove_all("(schools)|(school)|(borough)$"),
    # change acfrs name to census name
    name = case_when(
      acfrs_original_name == "caldwell-west caldwell school district" ~ "caldwell west",
      acfrs_original_name == "matawan-abredeen regional school district" ~ "matawan aberdeen regional",
      acfrs_original_name == "passaic board of education-passaic public schools" ~ "passaic",
      acfrs_original_name == "scotch plains-fanwood regional school district" ~ "scotch plains fanwood regional",
      acfrs_original_name == "south orange and maplewood school district board of education" ~ "south orange maplewood",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# New Jersey census/NCES side: build the join key `name` from the lower-cased
# NCES district name.
nj_census <- census_sd_4 %>%
  filter(state == "NJ") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.", " ") %>%
      str_remove_all("(township school district)|(city school district)|(boro school district)|(public school district)|(borough school district)|(high school district)|(school district of )") %>%
      str_remove_all("( 101)|( 58)|(high school)|(borough)$") %>%
      str_remove_all("(schools)|(school district)|(township)|( city)|(public)$"),
    name = case_when(
      gov_unit_original_name == "paulsboro boro sch dist" ~ "paulsboro",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
nj_matched <- left_join(nj_acfrs, nj_census) %>%
  filter(!is.na(censusid))

# Spot-check the rows whose key contains "paulsboro".
select(nj_matched, acfrs_original_name, gov_unit_original_name, nces_original_name, name) %>%
  filter(str_detect(name, "paulsboro"))
```


```{r}
# ACFR entities whose original name is absent from the matched set, sorted by key.
anti_join(nj_acfrs, nj_matched, by = "acfrs_original_name") %>%
  arrange(name)


```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(nj_census, nj_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every NJ census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
nj_acfrs_matched <- nj_matched %>%
  select(state, acfrs_original_name, name)

nj_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(nj_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("NJ_match_unmatched.csv")

```
## MI
```{r}
# Michigan ACFR side: normalise punctuation in `name`, strip district-type
# wording, then apply the hand-made overrides.
mi_acfrs <- acfrs_sd_4 %>%
  filter(state == "MI") %>%
  arrange(name) %>%
  mutate(
    # the overrides below compare against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_replace_all("-|\\.|#|_", " ") %>%
      str_remove_all("(school district of the city of )|(union schools district)") %>%
      str_remove_all("(public school)|(community schools)|(community district)|(school system)|(area schools of gogebic county)") %>%
      str_remove_all("(schools)|(school)$"),
    # change acfrs name to census name
    name = case_when(
      # template leftover: an empty original name yields an empty key
      acfrs_original_name == "" ~ "",
      acfrs_original_name == "detroit public schools community district" ~ "detroit community district",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# Michigan census/NCES side: build the join key `name` from the lower-cased
# NCES district name.
mi_census <- census_sd_4 %>%
  filter(state == "MI") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.|#", " ") %>%
      # drop literal parentheses but keep their contents
      str_replace_all("\\(|\\)", "") %>%
      str_remove_all("(township school district)|(public school district)|(community schools)|(in the counties of oakland and lapee)|(area schools of gogebic county)|(union schools district )") %>%
      str_remove_all("(school district)|(public schools)|(s/d )") %>%
      str_remove_all("(schools)$"),
    # template leftover: rows whose gov-unit name is empty get an empty key
    name = case_when(
      gov_unit_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
mi_matched <- left_join(mi_acfrs, mi_census) %>%
  filter(!is.na(censusid))
```


```{r}
# ACFR entities whose original name is absent from the matched set.
anti_join(mi_acfrs, mi_matched, by = "acfrs_original_name")
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(mi_census, mi_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every MI census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
mi_acfrs_matched <- mi_matched %>%
  select(state, acfrs_original_name, name)

mi_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(mi_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("MI_match_unmatched.csv")

```
## AR
```{r}
# Arkansas ACFR side: drop digit runs and "consolidated" from `name`, then
# apply the hand-made overrides.
ar_acfrs <- acfrs_sd_4 %>%
  filter(state == "AR") %>%
  arrange(name) %>%
  mutate(
    # the overrides below compare against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_remove_all("[0-9]{1,2}") %>%
      str_remove_all("consolidated"),
    # change acfrs name to census name
    name = case_when(
      acfrs_original_name == "cave city school district no. 2a" ~ "cave 2a",
      # template leftover: an empty original name yields an empty key
      acfrs_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# Arkansas census/NCES side: build the join key `name` from the lower-cased
# NCES district name; two overrides also depend on the county.
ar_census <- census_sd_4 %>%
  filter(state == "AR") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.|\\/", " ") %>%
      str_remove_all("(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(central sch dist)|(city school district)|(county school dist)|(is central sch dist)") %>%
      str_remove_all("(school district)|(sch dist)|(school dist)|(cons school dist)") %>%
      str_remove_all("[0-9]{1,2}"),
    name = case_when(
      gov_unit_original_name == "cave city sch dist 2 a" ~ "cave 2a",
      gov_unit_original_name == "south conway co sch dist" ~ "south conway",
      gov_unit_original_name == "south side school district" & county_nces == "Van Buren County" ~ "south side (van buren)",
      gov_unit_original_name == "texarkana sch dist 7" & county_nces == "Miller County" ~ "texarkana (miller county)",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
ar_matched <- left_join(ar_acfrs, ar_census) %>%
  filter(!is.na(censusid))

# Spot-check the rows whose key contains "south side".
select(ar_matched, acfrs_original_name, gov_unit_original_name, nces_original_name, name) %>%
  filter(str_detect(name, "south side"))
```


```{r}
# ACFR entities whose original name is absent from the matched set.
anti_join(ar_acfrs, ar_matched, by = "acfrs_original_name")
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(ar_census, ar_matched, by = "nces_original_name") %>%
  arrange(name)


```

```{r}
# Export every AR census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
ar_acfrs_matched <- ar_matched %>%
  select(state, acfrs_original_name, name)

ar_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(ar_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("AR_match_unmatched.csv")

```

## MN

```{r}
# Minnesota ACFR side: strip district-type wording and leading/trailing digit
# runs from `name`, then apply the hand-made overrides.
mn_acfrs <- acfrs_sd_4 %>%
  filter(state == "MN") %>%
  arrange(name) %>%
  mutate(
    # the overrides below compare against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_remove_all("(independent school dist)|(community schools)") %>%
      str_remove_all("^[0-9]{1,4}") %>%
      str_remove_all("[0-9]{1,4}$"),
    # change acfrs name to census name
    name = case_when(
      # template leftover: an empty original name yields an empty key
      acfrs_original_name == "" ~ "",
      acfrs_original_name == "burnsville-eagan-savage independent school district 191" ~ "burnsville",
      acfrs_original_name == "eastern carver county schools independent school district no. 112" ~ "eastern 112",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# Minnesota census/NCES side: build the join key `name` from the lower-cased
# NCES district name, then override specific districts by gov-unit name.
mn_census <- census_sd_4 %>%
  filter(state == "MN") %>%
  arrange(name) %>%
  mutate(
    # the key starts from the NCES name; gov-unit names are only used for the overrides
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.", " ") %>%
      str_remove_all("county public school") %>%
      str_remove_all("(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(public school district)|(ind school dist)|(public school dist)|(school district)|(public sch)|(public sc)|(ind sch dist)") %>%
      str_remove_all("(school dist)|(sch dist)|(isd)") %>%
      # template leftover: `$` is a zero-width end-of-string match, so nothing is removed
      str_remove_all("$"),
    name = case_when(
      gov_unit_original_name == "bagley sch district 162" ~ "bagley 162",
      gov_unit_original_name == "dassel-cokato public sch district 466" ~ "dassel cokato 466",
      gov_unit_original_name == "eastern carver county isd 112" ~ "eastern 112",
      gov_unit_original_name == "faribault sch dist 656" ~ "fairbault 656",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```

```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
mn_matched <- left_join(mn_acfrs, mn_census) %>%
  filter(!is.na(censusid))
```


```{r}
# ACFR entities whose original name is absent from the matched set, sorted by key.
anti_join(mn_acfrs, mn_matched, by = "acfrs_original_name") %>%
  arrange(name)

```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(mn_census, mn_matched, by = "nces_original_name") %>%
  arrange(name)


```

```{r}
# Export every MN census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
mn_acfrs_matched <- mn_matched %>%
  select(state, acfrs_original_name, name)

mn_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(mn_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("MN_match_unmatched.csv")

```
## NY

```{r}
# New York ACFR side: strip BOCES/board/district wording from `name`, then
# apply the hand-made override.
ny_acfrs <- acfrs_sd_4 %>%
  filter(state == "NY") %>%
  arrange(name) %>%
  mutate(
    # the override below compares against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_remove_all("(board of cooperative educational services of )|(supervisory district of )|(board of education city of)") %>%
      str_remove_all("(board of cooperative educational services)|(enlarged of )|(of the city of )") %>%
      str_remove_all("(county)|(enlarged)|(union free)|(boces)|(central)") %>%
      str_remove_all("^(of )"),
    # change acfrs name to census name
    name = case_when(
      acfrs_original_name == "leroy central school district" ~ "le roy",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# New York census/NCES side: build the join key `name` from the lower-cased
# NCES district name.
ny_census <- census_sd_4 %>%
  filter(state == "NY") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.", " ") %>%
      str_remove_all("(city school districts)|(union free school district)|(public schs)|(city school district)|(city central school district)") %>%
      str_remove_all("(csd)|(school distict)|(town of)|(school district)|(central school district)") %>%
      # template leftover: `$` is a zero-width end-of-string match, so nothing is removed
      str_remove_all("$"),
    # template leftover: rows whose gov-unit name is empty get an empty key
    name = case_when(
      gov_unit_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
ny_matched <- left_join(ny_acfrs, ny_census) %>%
  filter(!is.na(censusid))
```


```{r}
# ACFR entities whose original name is absent from the matched set, sorted by key.
anti_join(ny_acfrs, ny_matched, by = "acfrs_original_name") %>%
  arrange(name)
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(ny_census, ny_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every NY census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
ny_acfrs_matched <- ny_matched %>%
  select(state, acfrs_original_name, name)

ny_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(ny_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("NY_match_unmatched.csv")

```

## IL

```{r}
# Illinois ACFR side: strip district-type wording from `name`, then override
# specific entities so the key equals the cleaned NCES-side key.
il_acfrs <- acfrs_sd_4 %>%
  filter(state == "IL") %>%
  arrange(name) %>%
  mutate(
    # the overrides below compare against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_remove_all("(community)|(counties regional office of education)|(counties special education district 801)|(union elementary consolidated)|(community schools unit district)|(of the city of )|(township high school disrtict )|(cooperative association for special education)") %>%
      str_remove_all("(cc school dist)|(ccsd)|(special education district joint agreement 865)|(city schools district)|(community consl)|(counties special education association)|(elementary school district)") %>%
      str_remove_all("(county)|(community)") %>%
      # removes the space after the first " and ", gluing "and" to the next word
      str_replace(" and ", " and"),
    # change acfrs name to census name
    name = case_when(
      acfrs_original_name == "alsip hazelgreen and oak lawn school district 126" ~ "alsip hazlgrn oaklwn 126",
      acfrs_original_name == "berwyn south school district 100" ~ "berwyn 100",
      acfrs_original_name == "carmi-white county community unit school district no.5" ~ "carmi 5",
      acfrs_original_name == "chicago board of education" ~ "chicago 299",
      acfrs_original_name == "country club hill school district 160" ~ "country club hills 160",
      acfrs_original_name == "dolton west school district 148" ~ "dolton 148",
      acfrs_original_name == "kildeer countryside consolidated school district" ~ "kildeer countryside community consl 96",
      acfrs_original_name == "lagrange elementary school district 102" ~ "la grange 102",
      # This condition used to appear twice; the first branch
      # ("lemont bromberek combined 113 a") shadowed this one. Only the
      # "csd 113a" form is kept, which is what the IL census cleaning yields
      # for an NCES name of the form "lemont-bromberek csd 113a".
      acfrs_original_name == "lemont-bromberek combined school district 113a" ~ "lemont bromberek csd 113a",
      acfrs_original_name == "moline coal valley school dist 40" ~ "moline 40",
      acfrs_original_name == "posen-robbins elementary school district 143 1/2" ~ "posen robbins 143 5",
      # compared case-insensitively: the old literal had a capital "H", unlike
      # every other literal in this list
      # NOTE(review): the target is spelled "praire" - confirm against the cleaned NCES name
      str_to_lower(acfrs_original_name) == "prairie-hills elementary school district 144" ~ "praire hills 144",
      # still unmapped: "Thornton Fractional Township High School District No. 215"
      acfrs_original_name == "school district u-46" ~ "u 46 (elgin area)",
      acfrs_original_name == "valley view public schools community unit district" ~ "valley view 365 u",
      acfrs_original_name == "bureau henry and stark counties regional office of education no. 28" ~ "bureau henry stark",
      acfrs_original_name == "dewitt livingston logan and mclean counties regional office of education no. 17" ~ "dewitt livingston",
      acfrs_original_name == "park ridge-niles school district 64" ~ "park ridge 64",
      acfrs_original_name == "speed s.e.j.a. 802" ~ "speed seja 802",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
#mutate(case_when(name== "calumetlaurium keweenaw" ~ "calumet laureium keweenaw"))
```

```{r}
# Illinois census/NCES side: build the join key `name` from the lower-cased
# NCES district name, then override specific districts by gov-unit name.
il_census <- census_sd_4 %>%
  filter(state == "IL") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.|\\/|#", " ") %>%
      str_remove_all("(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(chsd)|(ccsd)|(cusd)|(union sd)|(twp hsd)|(cons sd)|(usd)|(cons hsd)|(sp ed district)|(union e cons d)|(township hsd)|(coop spec ed)|(cntys sp ed assoc)|(city of)") %>%
      str_remove_all("(roe)|(county)|(cud)|(county spec ed dist)|(esd)|(city)") %>%
      str_replace_all(" sd ", " ") %>%
      str_replace_all("spec educ assoc", "special education association") %>%
      str_replace_all("365 u", "365u"),
    name = case_when(
      gov_unit_original_name == "berwyn school district 100" ~ "berwyn 100",
      gov_unit_original_name == "bureau/henry/stark roe" ~ "bureau henry stark",
      gov_unit_original_name == "dewitt/livingstn/logan/mclean roe" ~ "dewitt livingston",
      gov_unit_original_name == "massac unit district 1" ~ "massac 1",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
il_matched <- left_join(il_acfrs, il_census) %>%
  filter(!is.na(censusid))

# Spot-check one overridden district.
select(il_matched, acfrs_original_name, gov_unit_original_name, nces_original_name, name) %>%
  filter(name == "alsip hazlgrn oaklwn 126")
```


```{r}
# ACFR entities whose original name is absent from the matched set.
anti_join(il_acfrs, il_matched, by = "acfrs_original_name")
 
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(il_census, il_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every IL census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
il_acfrs_matched <- il_matched %>%
  select(state, acfrs_original_name, name)

il_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(il_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("IL_match_unmatched.csv")
```

## TX

```{r}
# Texas ACFR side: light cleanup of `name`, then apply the hand-made overrides.
# Unlike the other states, `acfrs_original_name` is compared as-is here
# (it is not squished first).
tx_acfrs <- acfrs_sd_4 %>%
  filter(state == "TX") %>%
  arrange(name) %>%
  mutate(
    name = name %>%
      str_remove_all("indepedent") %>%
      str_remove_all("(county consolidated common)") %>%
      # only the first "mountain" in each name is abbreviated
      str_replace("mountain", "mt"),
    name = case_when(
      acfrs_original_name == "fort sam houston independent school district" ~ "ft sam houston",
      acfrs_original_name == "pharrsan juanalamo independent school district" ~ "pharr san juan alamo",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# Texas census/NCES side: build the join key `name` from the lower-cased
# NCES district name.
tx_census <- census_sd_4 %>%
  filter(state == "TX") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      # apostrophes are deleted outright; other punctuation becomes a space
      str_remove_all("'") %>%
      str_replace_all("-|\\.|-|\\/|#", " ") %>%
      str_remove_all("(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(county consolidated csd)|( cisd)") %>%
      str_remove_all("( isd)|( csd)|(county)$") %>%
      # template leftover: `$` is a zero-width end-of-string match, so nothing is removed
      str_remove_all("$"),
    # template leftover: rows whose gov-unit name is empty get an empty key
    name = case_when(
      gov_unit_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
tx_matched <- left_join(tx_acfrs, tx_census) %>%
  filter(!is.na(censusid))
```


```{r}
# ACFR entities whose original name is absent from the matched set.
anti_join(tx_acfrs, tx_matched, by = "acfrs_original_name")


```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(tx_census, tx_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every TX census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
tx_acfrs_matched <- tx_matched %>%
  select(state, acfrs_original_name, name)

tx_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(tx_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("TX_match_unmatched.csv")

```
## PA

```{r}
# Pennsylvania ACFR side: strip district-type wording from `name`, turn the
# roman numeral "iv" into "4", then apply the hand-made overrides.
pa_acfrs <- acfrs_sd_4 %>%
  filter(state == "PA") %>%
  arrange(name) %>%
  mutate(
    # the overrides below compare against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_remove_all("(intermediate)|(school)|(unit)") %>%
      str_remove_all("(of borough of )|(county)|(of the city of )|(borough)") %>%
      str_remove_all("^(the )") %>%
      # word boundaries limit this to a standalone "iv"; the bare pattern also
      # rewrote "iv" inside words (e.g. "riverside" -> "r4erside")
      str_replace_all("\\biv\\b", "4"),
    # change acfrs name to census name
    name = case_when(
      acfrs_original_name == "harborcreek school district" ~ "harbor creek",
      # template leftover: an empty original name yields an empty key
      acfrs_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# Pennsylvania census/NCES side: build the join key `name` from the
# lower-cased NCES district name.
pa_census <- census_sd_4 %>%
  filter(state == "PA") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.|\\/|#", " ") %>%
      str_remove_all("(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(school dist)") %>%
      str_remove_all("(intermediate)|(school)|(unit)|(county)|(borough)") %>%
      str_remove_all("( sd)|(city)$") %>%
      str_replace_all(" iu ", " "),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
pa_matched <- left_join(pa_acfrs, pa_census) %>%
  filter(!is.na(censusid))
```


```{r}
# ACFR entities whose original name is absent from the matched set, sorted by key.
anti_join(pa_acfrs, pa_matched, by = "acfrs_original_name") %>%
  arrange(name)
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(pa_census, pa_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every PA census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
pa_acfrs_matched <- pa_matched %>%
  select(state, acfrs_original_name, name)

pa_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(pa_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("PA_match_unmatched.csv")

```
## TN

```{r}
# Tennessee ACFR side: strip board/system/district wording from `name`.
# No per-entity overrides are applied for this state.
tn_acfrs <- acfrs_sd_4 %>%
  filter(state == "TN") %>%
  arrange(name) %>%
  mutate(
    name = name %>%
      str_remove_all("(board of education)|(community school system)|(city school board of education)") %>%
      str_remove_all("(community schools)|(city schools)|(school system)|(county schools)") %>%
      str_remove_all("(city school)|(special school district)|(county)") %>%
      str_squish()
  )
  
```

```{r}
# Tennessee census/NCES side: build the join key `name` from the lower-cased
# NCES district name.
tn_census <- census_sd_4 %>%
  filter(state == "TN") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.|#", " ") %>%
      str_remove_all("(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(town schs)|(co sp dist)|(sp dist)|(special school district)") %>%
      str_remove_all("(county)") %>%
      # template leftover: `$` is a zero-width end-of-string match, so nothing is removed
      str_remove_all("$"),
    # template leftover: rows whose gov-unit name is empty get an empty key
    name = case_when(
      gov_unit_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
tn_matched <- left_join(tn_acfrs, tn_census) %>%
  filter(!is.na(censusid))
```


```{r}
# ACFR entities whose original name is absent from the matched set.
anti_join(tn_acfrs, tn_matched, by = "acfrs_original_name")
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(tn_census, tn_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every TN census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
tn_acfrs_matched <- tn_matched %>%
  select(state, acfrs_original_name, name)

tn_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(tn_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("TN_match_unmatched.csv")

```
## LA

```{r}
# Louisiana ACFR side: strip school-board wording from `name`, then apply the
# hand-made overrides.
la_acfrs <- acfrs_sd_4 %>%
  filter(state == "LA") %>%
  arrange(name) %>%
  mutate(
    # the overrides below compare against the whitespace-normalised original name
    acfrs_original_name = str_squish(acfrs_original_name),
    name = name %>%
      str_remove_all("(parish school board)|(city school board)|(community school system)") %>%
      str_remove_all("(city of )") %>%
      str_remove_all("(school board)"),
    # change acfrs name to census name
    name = case_when(
      acfrs_original_name == "central community school system" ~ "central community",
      # template leftover: an empty original name yields an empty key
      acfrs_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
  
```

```{r}
# Louisiana census/NCES side: build the join key `name` from the lower-cased
# NCES district name.
la_census <- census_sd_4 %>%
  filter(state == "LA") %>%
  arrange(name) %>%
  mutate(
    # use NCES instead of gov unit names
    name = nces_original_name %>%
      str_to_lower() %>%
      str_replace_all("-|\\.", " ") %>%
      # the trailing `|` (an empty alternative) was dropped from this pattern
      str_remove_all("(city of)|(comm schools)|(community schools)|(public schs)|(high school)") %>%
      str_remove_all("( parish)$") %>%
      str_remove_all("(school district)"),
    # template leftover: rows whose gov-unit name is empty get an empty key
    name = case_when(
      gov_unit_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```


```{r}
# ACFR rows that found a census counterpart. No `by` is given, so the join
# keys on every column name the two tables share.
la_matched <- left_join(la_acfrs, la_census) %>%
  filter(!is.na(censusid))
```


```{r}
# ACFR entities whose original name is absent from the matched set.
anti_join(la_acfrs, la_matched, by = "acfrs_original_name")
```

```{r}
# Census districts whose NCES name is absent from the matched set, sorted by key.
anti_join(la_census, la_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Export every LA census district with its matched ACFR name (NA when
# unmatched), keeping the first row per censusid.
la_acfrs_matched <- la_matched %>%
  select(state, acfrs_original_name, name)

la_census %>%
  select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>%
  # `state` and `name` are the only columns the two selected tables share;
  # naming them keeps the join stable if columns are added later
  left_join(la_acfrs_matched, by = c("state", "name")) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("LA_match_unmatched.csv")

```
<!-- ## TN -->

<!-- ```{r} -->
<!-- tn_acfrs <-   -->
<!--   acfrs_sd_4 %>% filter(state == "TN") %>% arrange(name) %>%  -->

<!--   mutate(name = str_remove_all(name, "(city school board of education)|(town of)|(special school district)"), -->
<!--          name = str_remove_all(name, "(city schools)|(county schools)|(community schools)|(city school)|(school system)|(community school)"), -->
<!--          name = str_remove_all(name, "(county|special)"), -->

<!--          # name = str_replace(name, "", ""), -->
<!--          # name = str_replace(name, "", ""), -->
<!--          # # change acfrs name to census name -->
<!--          # acfrs_original_name = str_squish(acfrs_original_name), -->
<!--          # name = case_when(acfrs_original_name == "" ~ "", -->
<!--          #               acfrs_original_name == "" ~ "", -->
<!--          #              acfrs_original_name == "" ~ "", -->
<!--          #               TRUE ~ name), -->

<!--          name = str_squish(name)) #%>% filter(str_detect(name, "lamont")) -->

<!-- ``` -->

<!-- ```{r} -->
<!-- tn_census <-  census_sd_4 %>% filter(state == "TN") %>% arrange(name) %>%  -->
<!--   # use NCES instead of gov unit names -->
<!--   mutate(name = str_to_lower(nces_original_name)) %>%  -->

<!--   mutate(name = str_replace_all(name, "-|\\.", " "), -->
<!--          name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(school district)|(co sp dist)|(sp dist)"), -->
<!--         name = str_remove_all(name, "( county)|( city)$"), -->
<!--          name = str_remove_all(name, "$"), -->

<!--   name = case_when(gov_unit_original_name == "" ~ "", -->
<!--                       TRUE ~ name), -->
<!--          name = str_squish(name)) #%>% filter(str_detect(name, "hill")) -->
<!-- ``` -->


<!-- ```{r} -->
<!-- tn_matched <- tn_acfrs %>% left_join(tn_census) %>% drop_na(censusid) -->
<!-- ``` -->


<!-- ```{r} -->
<!-- tn_acfrs %>% filter(!acfrs_original_name %in% tn_matched$acfrs_original_name) -->
<!-- ``` -->

<!-- ```{r} -->
<!-- tn_census %>% filter(!nces_original_name %in% tn_matched$nces_original_name) %>% arrange(name) -->

<!-- ``` -->

<!-- ```{r} -->
<!-- tn_acfrs_matched <- tn_matched %>% select(state, acfrs_original_name, name) -->

<!-- tn_census %>% select(state, nces_original_name, gov_unit_original_name, name, ncesid, censusid, student) %>% left_join(tn_acfrs_matched) %>% filter(!duplicated(censusid)) %>%  -->
<!-- write_csv("TN_match_unmatched.csv") -->

<!-- ``` -->
## CO

```{r}
# Colorado, ACFR side: strip boilerplate phrases from the existing `name`
# column and apply hand-written overrides so the key lines up with co_census.
# The removal steps run in the order written; later patterns see the output of
# earlier ones.
co_acfrs <- acfrs_sd_4 %>%
  filter(state == "CO") %>%
  arrange(name) %>%
  mutate(name = str_remove_all(name, "(board of cooperative educational services)")) %>%
  mutate(name = str_remove_all(name, "(counties of )|(county schools district)|(of the counties of)")) %>%
  mutate(name = str_remove_all(name, "(county)|(boces)|(number)")) %>%
  # str_replace (not str_replace_all): only the first " and " / " no " per name
  mutate(
    name = str_replace(name, " and ", " "),
    name = str_replace(name, " no ", " ")
  ) %>%
  # change acfrs name to census name
  mutate(
    acfrs_original_name = str_squish(acfrs_original_name),
    name = case_when(
      acfrs_original_name == "school district no. 1 in the city and county of denver and state of colorado" ~ "denver 1",
      acfrs_original_name == "bayfield school district 10-jtr" ~ "bayfield 10",
      acfrs_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  ) #%>% filter(str_detect(name, "lamont"))
  
```

```{r}
# Colorado, census side: derive a normalized `name` join key from the NCES
# district name, then override specific districts (keyed on
# gov_unit_original_name) whose cleaned name would not line up with co_acfrs.
co_census <- census_sd_4 %>%
  filter(state == "CO") %>%
  arrange(name) %>%
  # use NCES instead of gov unit names
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphens and periods become spaces so tokens separate cleanly
    name = str_replace_all(name, "-|\\.", " "),
    name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(of the count)|(consolidated school district no)|(board of cooperative educational services)"),
    name = str_remove_all(name, "(school district)|(school district no)|(sch dist)"),
    name = str_remove_all(name, "(county)|(consolidated)|(schools)|(in the of e)"),
    name = str_replace_all(name, " no ", " "),
    # Manual overrides. The original listed the "gunnison watershed" branch
    # twice; the second copy was unreachable and has been removed.
    name = case_when(
      gov_unit_original_name == "denver sch dist 1" ~ "denver 1",
      gov_unit_original_name == "bayfield sch dist 10 jt" ~ "bayfield 10",
      gov_unit_original_name == "delta co sch dist j 50" ~ "delta joint 50j",
      gov_unit_original_name == "eagle co sch dist re-50" ~ "eagle re50j",
      gov_unit_original_name == "falcon sch dist 49" ~ "el paso 49",
      gov_unit_original_name == "florence sch dist re-2" ~ "fremont re 2",
      gov_unit_original_name == "gunnison watershed sch dist re-1j" ~ "gunnison watershed re 1",
      gov_unit_original_name == "mapleton sch dist 1" ~ "mapleton adams 1",
      gov_unit_original_name == "monte vista sch dist c-8" ~ "monte vista 8",
      TRUE ~ name
    ),
    name = str_squish(name)
  ) #%>% filter(str_detect(name, "hill"))
```


```{r}
# CO matches: join ACFR rows to census rows on the columns the two tables
# share, then keep only rows that actually picked up a censusid.
co_matched <- drop_na(left_join(co_acfrs, co_census), censusid)
```


```{r}
# CO ACFR entities whose original name does not appear in co_matched
anti_join(co_acfrs, co_matched, by = "acfrs_original_name") %>%
  arrange(name)
```

```{r}
# CO census units whose NCES name does not appear in co_matched, sorted by key
anti_join(co_census, co_matched, by = "nces_original_name") %>%
  arrange(name)


```

```{r}
# Matched ACFR names together with the normalized key they matched on
co_acfrs_matched <- select(co_matched, state, acfrs_original_name, name)

# Every CO census unit, with its ACFR name where one matched; only the first
# row per censusid is kept. Written out for manual review.
co_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(co_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("CO_match_unmatched.csv")

```

## KS 

```{r}
# Kansas, ACFR side: strip boilerplate phrases from the existing `name` column
# so the key lines up with ks_census.
ks_acfrs <- acfrs_sd_4 %>%
  filter(state == "KS") %>%
  arrange(name) %>%
  mutate(
    # a leading "no " is dropped only at the very start of the name
    name = str_remove_all(name, "^(no )"),
    # The original pattern ended in a stray "|" (an empty alternative that can
    # only match the empty string); it is dropped here.
    name = str_remove_all(name, "(number )|(county special education cooperative)"),
    # NOTE(review): "distrct" is kept exactly as written; it presumably targets
    # a misspelling present in the ACFR data. Confirm before "correcting" it.
    name = str_remove_all(name, "(special education interlocal)|(unified school distrct)"),
    # No manual acfrs -> census name overrides for KS yet; add a
    # case_when(acfrs_original_name == "..." ~ "...", TRUE ~ name) step here.
    name = str_squish(name)
  ) #%>% filter(str_detect(name, "lamont"))
  
```

```{r}
# Kansas, census side: derive a normalized `name` join key.
# Unlike the LA/CO chunks, this one starts from the government-unit name
# (gov_unit_original_name), not the NCES name. The previous comment said
# "use NCES instead of gov unit names", which contradicted the code.
ks_census <- census_sd_4 %>%
  filter(state == "KS") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(gov_unit_original_name),
    # hyphens and periods become spaces so tokens separate cleanly
    name = str_replace_all(name, "-|\\.", " "),
    name = str_remove_all(name, "(public schools)|(public school district)|(community schools)|(public schs)|(high school)|(co comm schools)|(co community)|(county unified school district)"),
    name = str_remove_all(name, "(unified school district)|(comm schools)|(county schools)|(public school)"),
    # the "$" anchor applies to "(county)" only; " schools" and " city" are
    # removed wherever they occur
    name = str_remove_all(name, "( schools)|( city)|(county)$"),
    # No manual overrides for KS yet. The former placeholder
    # `case_when(gov_unit_original_name == "" ~ "", TRUE ~ name)` was removed;
    # add real `gov_unit_original_name == "..." ~ "..."` overrides here if needed.
    name = str_squish(name)
  ) #%>% filter(str_detect(name, "367"))
```


```{r}
# KS matches: join ACFR rows to census rows on the columns the two tables
# share, then keep only rows that actually picked up a censusid.
ks_matched <- drop_na(left_join(ks_acfrs, ks_census), censusid)
```


```{r}
# KS ACFR entities whose original name does not appear in ks_matched
anti_join(ks_acfrs, ks_matched, by = "acfrs_original_name")
```

```{r}
# KS census units whose NCES name does not appear in ks_matched, sorted by key
anti_join(ks_census, ks_matched, by = "nces_original_name") %>%
  arrange(name)

```

```{r}
# Matched ACFR names together with the normalized key they matched on
ks_acfrs_matched <- select(ks_matched, state, acfrs_original_name, name)

# Every KS census unit, with its ACFR name where one matched; only the first
# row per censusid is kept. Written out for manual review.
ks_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(ks_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("KS_match_unmatched.csv")

```
## CA

```{r}
# California, ACFR side: strip boilerplate phrases from the existing `name`
# column and apply hand-written overrides so the key lines up with ca_census.
# The removal steps run in the order written; later patterns see the output of
# earlier ones.
ca_acfrs <- acfrs_sd_4 %>%
  filter(state == "CA") %>%
  arrange(name) %>%
  # a leading "no " is dropped only at the very start of the name
  mutate(name = str_remove_all(name, "^(no )")) %>%
  mutate(name = str_remove_all(name, "(county office of education)|(county special education cooperative)|(county superintendent of schools)|(county education office)|((county special education local plan area))")) %>%
  # NOTE(review): the misspelled "distirct"/"distict"/"districit" variants are
  # kept exactly as written; presumably they target typos in the ACFR data.
  mutate(name = str_remove_all(name, "(public school)|(unified school distirct)|(union school distict)|(union school districit)")) %>%
  mutate(name = str_remove_all(name, "(joint)|(city)|(community)|(valley)")) %>%
  # change acfrs name to census name
  mutate(
    acfrs_original_name = str_squish(acfrs_original_name),
    name = case_when(
      acfrs_original_name == "galt joint union high school district" ~ "galt high",
      acfrs_original_name == "gold oak union elementary school district" ~ "gold oak elementary",
      acfrs_original_name == "gen shafter elem sch dist" ~ "general shafter",
      acfrs_original_name == "e l segundo unified school district" ~ "el segundo",
      acfrs_original_name == "huntington beach city school district" ~ "huntington elementary",
      acfrs_original_name == "huntington beach union high school district" ~ "huntington high",
      TRUE ~ name
    ),
    name = str_squish(name)
  ) #%>% filter(str_detect(name, "galt"))

```

```{r}
# California, census side: derive a normalized `name` join key from the
# government-unit name by expanding a few abbreviations and stripping the many
# district-type suffix spellings, then apply manual overrides and disambiguate
# districts that share a name across counties.
# The removal steps run in the order written; later patterns see the output of
# earlier ones, so do not reorder them.
ca_census <- census_sd_4 %>%
  filter(state == "CA") %>%
  arrange(name) %>%
  mutate(name = str_to_lower(gov_unit_original_name)) %>%
  mutate(
    # hyphens and periods become spaces so tokens separate cleanly
    name = str_replace_all(name, "-|\\.", " "),
    name = str_replace_all(name, "pk", "park"),
    name = str_replace_all(name, " mt", " mountain"),

    # FIX: the next two patterns used to be wrapped across two source lines
    # inside the string literal, which embedded a newline in the regex. The
    # alternative right after each line break ("(val elementary school
    # district)" and "(co spl sch oper by co sup)") therefore required a
    # literal newline and could never match. Each pattern is now on one line.
    name = str_remove_all(name, "(county special schools operated by co supt)|(jt unified school dist)|(val elementary school district)|(cmty unif sch dist)|(county office of education)"),
    name = str_remove_all(name, "(union elem sch dist)|(co spl schs oper by co supt)|(jt union high sch dist)|(jt unified sch dist)|(co spl sch oper by co supt)|(unified elementary school district)|(jt un high school dist)|(jt unified jt sch dist)|(jt school district)|(jt unif school district)|(cy elementary school district)|(co spl sch oper by co sup)|(bch city elem school dist)|(elementary sch dist)"),

    name = str_remove_all(name, "(unif sch dist)|(jt uni sch dist)|(val unified sch dist)|(unified school district)|(jt unified sch dis)|(unif sch dis)|(jt unif sch dist)|(jt union high school dist)|(ctr unif sch dist)"),
    name = str_remove_all(name, "(joint unified)|(jt high sch dist)|(jt uhs dist)|(uni sch dist)|(union elem school dist)"),
    name = str_remove_all(name, "(union elementary)|(elem sch dist)|(county schools)|(public school)|(un school dist)| (elementary sch)|(jt union)|(elem school dist)|(el sch dist)|(union high sch dist)|(co special schools)"),
    name = str_remove_all(name, "( schools)|( city)|(county)|(valley)|(union elem)|(pk unif sch dis)|(jt high)|(school dist)|(unified school dst)"),
    name = str_remove_all(name, "(co spl schs)|(joint union high rict)|(un elem sch dist)|(cy uni sch dist)|(county union high school district)|(unified school d)|(union sch dt)|(j t unified school dist)|(val union elem sch)|(county special schools)|(joint union high)|(unified sch)"),
    # the "$" anchor applies to "(sch dt)" only
    name = str_remove_all(name, "(val)|( jt)|(elem sch)|(unified)|(sch dt)$"),

    # Manual overrides keyed on the untouched government-unit name
    name = case_when(
      gov_unit_original_name == "galt jt union high school dist" ~ "galt high",
      gov_unit_original_name == "gen shafter elem sch dist" ~ "general shafter",
      gov_unit_original_name == "gold oak un elem sch dist" ~ "gold oak elementary",
      gov_unit_original_name == "calexico unif sch dist" ~ "calexico",
      gov_unit_original_name == "huntington bch city elem school dist" ~ "huntington elementary",
      gov_unit_original_name == "huntington beach uhs dist" ~ "huntington high",
      TRUE ~ name
    ),

    # Disambiguate same-named districts by county.
    # FIX: the original tested name == "paficic" (typo for "pacific"), and did
    # so before str_squish(), so leftover whitespace from the removals above
    # would also have defeated the comparison. Both are corrected here.
    # NOTE(review): ca_acfrs has no matching "pacific (... county)" /
    # "south bay (... county)" keys, so these rows will only join once the
    # ACFR side is given the same disambiguated names.
    name = case_when(
      str_squish(name) == "pacific" & county_nces == "Humboldt County" ~ "pacific (humboldt county)",
      str_squish(name) == "pacific" & county_nces == "Fresno County" ~ "pacific (fresno county)",
      nces_original_name == "South Bay Union Elementary" & county_nces == "Humboldt County" ~ "south bay (humboldt county)",
      nces_original_name == "South Bay Union" & county_nces == "San Diego County" ~ "south bay (san diego county)",
      TRUE ~ name
    ),
    name = str_squish(name)
  ) #%>% filter(str_detect(name, "hanford"))

# 
# name = case_when("gardner edgerton antioch 231" ~ "gardner 231"))
#  name = case_when(gov_unit_original_name = "minidoka county jt sch district 331" ~ "minidoka joint 331"

```


```{r}
# CA matches: join ACFR rows to census rows on the columns the two tables
# share, then keep only rows that actually picked up a censusid.
ca_matched <- drop_na(left_join(ca_acfrs, ca_census), censusid)
```

```{r}
# CA ACFR entities whose original name does not appear in ca_matched
anti_join(ca_acfrs, ca_matched, by = "acfrs_original_name")

```

```{r}
# CA census units whose NCES name does not appear in ca_matched, sorted by key
anti_join(ca_census, ca_matched, by = "nces_original_name") %>%
  arrange(name)
```

```{r}
# Matched ACFR names together with the normalized key they matched on
ca_acfrs_matched <- ca_matched %>%
  select(state, acfrs_original_name, name)

# Every CA census unit, with its ACFR name where one matched; only the first
# row per censusid is kept. Written out for manual review.
# FIX: this used to write to "KS_match_unmatched.csv" (copy-paste from the KS
# section), silently overwriting the Kansas output with California data.
ca_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(ca_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("CA_match_unmatched.csv")

```
# End round 4 - state by state
```{r}
# after round 3, how many ACFRS left in total NOT matched
acfrs_sd_4 <- acfrs_school_districts %>%
  filter(!acfrs_original_name %in% round123$acfrs_original_name)

# round 4 state by state: stack the per-state match tables in one call.
# bind_rows() replaces the former chain of rbind() calls; it matches columns by
# name and fills any column missing from a table with NA instead of erroring.
# NOTE(review): the TN chunks that would build tn_matched are commented out
# (HTML comments) earlier in this document. Confirm tn_matched is defined
# somewhere before this chunk runs, or remove it from the list.
round4 <- bind_rows(
  in_matched,
  oh_matched,
  ca_matched,
  ne_matched,
  ok_matched,
  mo_matched,
  or_matched,
  nj_matched,
  mi_matched,
  ar_matched,
  mn_matched,
  ny_matched,
  il_matched,
  tx_matched,
  pa_matched,
  tn_matched,
  la_matched,
  co_matched,
  ks_matched
)

# cumulative matches from rounds 1-3 plus round 4
round1234 <- bind_rows(round123, round4)
```

```{r}
# Census-side tables for every state handled in round 4, stacked in one call.
# bind_rows() replaces the former chain of rbind() calls; it matches columns by
# name and fills any column missing from a table with NA instead of erroring.
# NOTE(review): the TN chunks that would build tn_census are commented out
# (HTML comments) earlier in this document. Confirm tn_census is defined
# somewhere before this chunk runs, or remove it from the list.
states_census <- bind_rows(
  in_census,
  oh_census,
  ca_census,
  ne_census,
  ok_census,
  mo_census,
  or_census,
  nj_census,
  mi_census,
  ar_census,
  mn_census,
  ny_census,
  il_census,
  tx_census,
  pa_census,
  tn_census,
  la_census,
  co_census,
  ks_census
)

# acfrs_sd_4 %>% filter(!acfrs_original_name %in% states_matched$acfrs_original_name
#                 )

# ACFR entities still unmatched after round 3, counted per state
acfrs_sd_4 %>%
  count(state) %>%
  arrange(desc(n))

# after round 3, how many census left in total NOT matched
census_sd_4 <- census_gov_unit %>%
  filter(!gov_unit_original_name %in% round123$gov_unit_original_name)

# Census units from states NOT processed in round 4 (neither their NCES name
# nor their gov-unit name appears in states_census); no ACFR name attached.
# FIX: the original ended this pipe with `-> foo1` and then called
# write_csv("states_left_UNmatched.csv") on its own line, which passes the file
# name as the data and no path, so it errors. The table is now assigned and
# written explicitly.
foo1 <- census_sd_4 %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  filter(
    !nces_original_name %in% states_census$nces_original_name &
      !gov_unit_original_name %in% states_census$gov_unit_original_name
  ) %>%
  mutate(acfrs_original_name = NA)
write_csv(foo1, "states_left_UNmatched.csv")

# Census units from the round-4 states, with the ACFR name where one matched;
# only the first row per censusid is kept.
# NOTE(review): states_acfrs_matched is not defined anywhere in this part of
# the document. By analogy with the per-state `*_acfrs_matched` tables it is
# presumably `round4 %>% select(state, acfrs_original_name, name)`; confirm.
# FIX: same dangling write_csv() problem as above (the pipe into it had been
# commented out), so the call is now explicit.
foo2 <- states_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(states_acfrs_matched) %>%
  filter(!duplicated(censusid))
write_csv(foo2, "20states_match_unmatched.csv")

```

# Result

```{r}
# Universe of school districts: NCES rows joined to the gov-unit/NCES-id
# crosswalk on their shared columns.
# (Author's recorded count: 13,713 districts.)
tot_nces_gov <- select(
  left_join(nces, govname_nces_id),
  state, nces_original_name, gov_unit_original_name,
  ncesid, censusid, county_nces, student
)

# Everything matched across rounds 1-4, reduced to the id columns plus the
# ACFR name. (Author's recorded count: 8,274 matches.)
tot_matched <- select(
  round1234, # %>% filter(!duplicated(nces_original_name))
  state, nces_original_name, gov_unit_original_name,
  ncesid, censusid, county_nces, student, acfrs_original_name
)

# Author's notes on the final file: all 13,713 NCES rows, 8,274 with an ACFR
# name, the rest (recorded as 5,766) unmatched; 10,415 ACFR file names in
# total, minus 8,274 matched, leaves 2,141 ACFRs left over.
nrow(acfrs_file_name)
nrow(tot_matched)

# Final deliverable: every NCES district, with its ACFR name where one matched
# (unmatched districts are kept; no drop_na on acfrs_original_name).
write.csv(left_join(tot_nces_gov, tot_matched), "final_match.csv")
```