-
Notifications
You must be signed in to change notification settings - Fork 0
/
acfrs_filename_govname_ncesid.Rmd
1778 lines (1338 loc) · 78.2 KB
/
acfrs_filename_govname_ncesid.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Initial Mapping School Districts: ACFRs & NCES"
output: html_document
date: '2022-08-31'
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
#library(threadr)
```
This notebook maps the list of ACFR file names (provided by Marc) to the district names in the NCES list.
Result: "final_match.csv".
NCES name = ACFRs name
HSD = High School District
CHSD = Community High School District
CUSD = Community Unit School District
CCSD = Community Consolidated School District
Spec Educ Coop = Special Education Cooperative
ROE = Regional Office of Education
sd = public schools district
USD = School District
UD = Consolidated School District
# Example of 63 School Districts
This is a hand-matched example of 63 school districts in the NCES data and the ACFRs.
Analyzing the pattern of matches between these two datasets helps improve matching in the full dataset.
```{r}
# Load the hand-matched sample and keep the two name columns plus the state
# identifier columns.
example_match_60sd <- rio::import(
  here::here("data", "Mappings for Largest School Districts (2).xlsx")
) %>%
  rename(
    name = `Name in ACFR System`,
    nces_name = `District Name`
  ) %>%
  select(name, nces_name, `State District ID`, `State...7`)

# Normalise both name columns so they can be compared for equality:
# lower-case, strip generic district wording, then strip leftover
# prefixes/suffixes. Trimming is repeated because every removal can leave
# whitespace behind.
example_match_60sd_clean <- example_match_60sd %>%
  mutate(
    nces_name = str_to_lower(nces_name),
    name = str_to_lower(name),
    # ACFR names, first pass: generic district wording
    name = str_remove_all(name, "(the)?\\s*school district\\s*(of)?|county|independent school district|district school board|\\s*board of education\\s* (of the)?|public schools|the school board of|board of education|public school system"),
    name = str_trim(name, side = "both"),
    # ACFR names, second pass: leading "of", trailing dash/digits, possessives
    name = str_remove_all(name, "^of\\s*|-$|municipalno.*|\\.|'s|’s|[0-9]*$"),
    name = str_trim(name, side = "both"),
    # NCES names, first pass: generic district wording and abbreviations
    nces_name = str_remove_all(nces_name, "(the)?\\s*school district\\s*(of)?|independent school district|district( school board)?|board of education|isd|public schools|schools|(co)? pblc schs|county|(city )?sd"),
    nces_name = str_trim(nces_name, side = "both"),
    # NCES names, second pass: same leftovers as on the ACFR side
    nces_name = str_remove_all(nces_name, "^of\\s*|-$|municipalno.*|\\.|'s|’s|[0-9]*$"),
    nces_name = str_trim(nces_name, side = "both")
  )

# Rows whose cleaned names are identical.
example_matched <- example_match_60sd_clean %>%
  mutate(same_name = name == nces_name) %>%
  filter(same_name)

# Rows whose cleaned names differ (rows with a missing name appear in neither
# table, because filter() drops NA conditions).
examples_NOT_matched <- example_match_60sd_clean %>%
  mutate(same_name = name == nces_name) %>%
  filter(!same_name)
```
# Round 1
## ACFRs
```{r}
# Read the list of ACFR PDF file names and derive two columns per file:
#   state               - first space-delimited token of the file name
#   acfrs_original_name - file name without the "2020.pdf" suffix, from the
#                         third character onward, lower-cased
acfrs_file_name <- rio::import("data/sd_list.xlsx") %>%
  dplyr::mutate(
    state = str_split(acfrs_file_name, " ", simplify = TRUE)[, 1],
    # Escape the dot: as a bare regex "2020.pdf" would also match strings such
    # as "2020xpdf", because "." matches any character.
    acfrs_original_name = str_remove_all(acfrs_file_name, "2020\\.pdf"),
    # Keep everything from character 3 onward (presumably this skips the
    # two-letter state code - confirm against the file-name format). Any
    # separator after the code is kept, so the result may start with a space.
    acfrs_original_name = str_sub(acfrs_original_name, 3),
    acfrs_original_name = str_to_lower(acfrs_original_name)
  ) %>%
  select(state, acfrs_original_name)
```
```{r}
# Build the round-1 matching key `name` for every ACFR district.
# Steps: lower-case the original name, normalise punctuation, then strip
# district-type phrases. The removals run in the order written and the generic
# "school district" pattern comes last, so the order of these calls matters.
acfrs_school_districts <- acfrs_file_name %>%
mutate(name = str_to_lower(acfrs_original_name)) %>%
# drop "no.", "#" and apostrophes outright
mutate(name = str_remove_all(name, "no\\.|#|'")) %>%
# turn "/", "." and "-" into spaces
mutate(name = str_replace_all(name, "/", " ")) %>%
mutate(name = str_replace_all(name, "\\.", " ")) %>%
mutate(name = str_replace_all(name, "-", " ")) %>%
mutate(name = str_remove_all(name, "(community consolidated school district)|(community consolidated schools district)|(joint unified school district)|(center unified school district)"),
#name = str_remove_all(name, ""),
name = str_remove_all(name, "(consolidated high school district)|(consolidated independent school district)"),
# NOTE(review): "(joint unified school district)" was already removed by the first pattern above
name = str_remove_all(name, "(union high school district)|(city school district)|(union elementary school district)|(union school district)|(county unified school district)|(joint unified school district)"), # California
name = str_remove_all(name, "(county school district)|(county independent school district)"),
name = str_remove_all(name, "(community unit school district)|(community unit district)"),
name = str_remove_all(name, "(public school district)|(public schools district)|(independent school district)|(district school board)"),
# Ohio
name = str_remove_all(name, "(union exempted village school district)|(exempted village school district)|(county joint vocational school district)"),
name = str_remove_all(name, "(high school district)|(local school district)"),
name = str_remove_all(name, "educational service district"),
name = str_remove_all(name, "fractional township"),
name = str_remove_all(name, "(the school districts of)|(board of education)|(the school board of)|(public school system)"),
name = str_remove_all(name, "unified school district"),
name = str_remove_all(name, "(community school district)|(comm unit school)|(community school dist)"),
# "scool" is a misspelling targeted on purpose by the pattern
name = str_remove_all(name,"(elementary school district)|(elementary scool district)"),
name = str_remove_all(name,"public schools"),
name = str_remove_all(name,"grade school district"),
# generic catch-all, applied after all the more specific phrases
name = str_remove_all(name, "(school district)|(comm sch dist)|(elem sch dist)|(sch dist)|(ind sch dist)")) %>%
# collapse the whitespace left behind by the removals
mutate(name = str_squish(name))
```
## Govt name in NCES
```{r}
# Census ID <-> NCES ID link table (file Paul sent to Marc: "dataformarc",
# email Sep 7, 2022). The `name` column in this file is the government unit
# name - also the name used in "Govt_Units_2021_Final.xlsx", sheets 3 and 4.
# It is kept lower-cased as `gov_unit_original_name`, and a trimmed copy
# becomes the working matching key `name`.
govname_nces_id <- rio::import("data/censusID_necesID_link.xlsx") %>%
  rename(
    gov_unit_original_name = name,
    ncesid = `NCES Agency Identification Number`,
    censusid = idcensus
  ) %>%
  mutate(
    gov_unit_original_name = str_to_lower(gov_unit_original_name),
    name = str_trim(gov_unit_original_name)
  ) %>%
  select(censusid, ncesid, gov_unit_original_name, name)

# NCES list only has 13,713: data downloaded Jan 27, 2020. Not including
# charter schools. Student > 1.
# The first 14 rows of the sheet are skipped on import; columns are selected
# and renamed in one step.
nces <- rio::import(here::here("data", "ncesdata_DBBFFFC.xlsx"), skip = 14) %>%
  select(
    ncesid = `NCES District ID`,
    nces_original_name = `District Name`,
    county_nces = `County Name*`,
    city_nces = City,
    state = State,
    student = `Students*`
  )
```
## Cleaning govt unit name
```{r}
# Attach census IDs and government-unit names to the NCES districts, then build
# the round-1 matching key `name` from the government-unit name by normalising
# punctuation and stripping district-type phrases and abbreviations.
# The join has no explicit `by`, so it uses every column the two tables share.
census_gov_unit <- nces %>% left_join(govname_nces_id) %>%
# Only get those 13,713 in NCES to match with ACFRs
# turn ".", "/", "-" and "&" into spaces
mutate(name = str_replace_all(name, "\\.", " ")) %>%
mutate(name = str_replace_all(name, "/", " ")) %>%
mutate(name = str_replace_all(name, "-|&", " ")) %>%
mutate(name = str_remove_all(name, "(community consolidated school district)"),
name = str_remove_all(name, "(community unit school district)|(community unit)"),
name = str_remove_all(name, "consolidated school district"),
name = str_remove_all(name, "county school district"),
name = str_remove_all(name, "community consolidated schools district"),
name = str_remove_all(name, "(community high school district)|(high school district)"),
# NOTE(review): same pattern as the first removal above, so this call finds nothing new
name = str_remove_all(name, "community consolidated school district"),
name = str_remove_all(name, "consolidated high school district"),
name = str_remove_all(name,"(city unified sch dist)|(joint unified school district)"),
name = str_remove_all(name, "(unified school district)|(union high school dist)|(co office of ed)|(unified sch dist)|(unified school dist)|(union elem sch dist)|(co unif sch dist)|(union elementary sch dist)"),
name = str_remove_all(name,"(public school district)|(public schools)|(unit school district)|(union school district)"),
name = str_remove_all(name,"(elementary school district)|(elementary scool district)|(elem school district)|(elementary school dist)"),
# Ohio
name = str_remove_all(name, "(local school district)|(local sch dist)|(local school dist)|(jt voc sch dist)|(exempted sch dist)|(city sch dist)|(ex vlg sch dist)|(union sch dist)|(ex vlg school dist)"),
name = str_remove_all(name, "co jt voc sch dist"),
name = str_remove_all(name, "(community school district)|(community unit district)|(comm college district)|(uni sch dist)|(un sch dist)"),
name = str_remove_all(name, "(co ind sch dist)|(unif school dist)|(unif sch dist)|(union elem sch dt)|(jt unified sch dist)|(jt union high school dist)|(elem sch district)|(jt elem sch dist)"), #
name = str_remove_all(name, "(district school board)|(ind sch district)|(ind sch dist)|(cons sch dist)|(ind school district)"),
# generic catch-all; "(district)" removes the word wherever it occurs
name = str_remove_all(name, "(school district)|(district)|(comm sch dist)|(elem sch dist)|(sch dist)|(fr t h school district)|(elem sch dt)|(union el sch d)|(jt uni sch dist)")
) %>%
# Texas: strip every digit from the name
mutate(name = ifelse(state == "TX", str_remove_all(name, "[0-9]"), name)) %>%
# collapse the whitespace left behind by the removals
mutate(name = str_squish(name))
# Round 1: match ACFR districts to census/NCES districts on state plus the
# cleaned name, and keep only the ACFR rows that found a census ID.
# The join keys are spelled out so the match cannot silently change if either
# table later gains another shared column.
round1 <- acfrs_school_districts %>%
  left_join(census_gov_unit, by = c("state", "name")) %>%
  drop_na(censusid)
#round1 %>% filter(enrollment == 0)
# Show the three original names side by side for a visual check.
round1 %>% select(nces_original_name, acfrs_original_name, gov_unit_original_name)
saveRDS(round1, "round1.RDS")
```
# Round 2
## ACFRs
```{r}
# Round 2, ACFR side: take the ACFR districts not matched in round 1 and apply
# a second, more aggressive clean-up of the matching key `name`.
#
# "Already matched" is decided on state + original name. The original name has
# had its state prefix stripped, so filtering on the name alone would also drop
# same-named districts in other states that round 1 did not match.
acfrs_sd_2 <- acfrs_school_districts %>%
  anti_join(round1, by = c("state", "acfrs_original_name")) %>%
  mutate(name = str_replace_all(name, "-|,|&|#|_", " ")) %>%
  mutate(
    name = str_remove_all(name, "(school district of the city of)|(consolidated school district)|(consolidated schools)|(community schools)|intermediate|(office of education)|(city sch dist)|(independent public school district)"),
    name = str_remove_all(name, "^(the)"),
    name = str_remove_all(name, "central|(union free)|(counties boces)|(county board of cooperative educational services)|centre|(community school)|(community high school)"),
    # Michigan
    name = str_remove_all(name, "(union free school district)|(city school district)|(board of cooperative educational services of)|(schools)|township|(district schools)|(union schools)|(public school of)"),
    name = str_remove_all(name, "public school"),
    name = str_remove_all(name, "^of "),
    # NOTE(review): the trailing "$" anchors only the last alternative; the
    # others are removed wherever they occur - confirm this is intended.
    name = str_remove_all(name, "(school)|county|consolidated|(isd)|( joint)|( district)|( community)|(union districit)$")
  ) %>%
  # Oklahoma: strip the district-number fragments embedded in the names
  mutate(name = ifelse(state == "OK", str_replace_all(name, " 00", " "), name)) %>%
  mutate(name = ifelse(state == "OK", str_replace_all(name, "( i )|(c0)|( c)|( c )|( 0)|( 1 )", " "), name)) %>%
  mutate(name = ifelse(state == "OK", str_replace_all(name, "( ity)|( 0)", " "), name)) %>%
  mutate(name = str_remove_all(name, "number|(independent)")) %>%
  mutate(name = str_replace_all(name, "( d )|( no )|( o[0-9])", " ")) %>%
  # collapse the whitespace left behind by the removals
  mutate(name = str_squish(name))
```
## Gov Units
```{r}
# Round 2, census side: take the census districts not matched in round 1 and
# apply a second clean-up of the matching key `name`.
#
# "Already matched" is decided on censusid. Government-unit names repeat across
# states, so filtering on the name alone would also drop unmatched districts
# that merely share a name with a matched one.
census_sd_2 <- census_gov_unit %>%
  filter(!censusid %in% round1$censusid) %>%
  mutate(
    name = str_replace_all(name, "-|,|&|#|_", " "),
    name = str_remove_all(name, "'")
  ) %>%
  # Michigan: strip every digit from the name
  mutate(name = ifelse(state == "MI", str_remove_all(name, "[0-9]"), name)) %>%
  mutate(
    name = str_remove_all(name, "(consolidated school district)|(cons school)|(joint community college)|(county community school corporation)"),
    name = str_remove_all(name, "(union free school district)|(uf sch dist)|(central sch dist)|(union free)|(ctl high school dist)|(pt ool dist)|central|(centre union free school dist)"),
    name = str_remove_all(name, "(city school dist)|(central sch)|(comm college)|(community college)|(ctl sch dist)|(ctl school dist)|(co comm coll)|(community high school)|(pub sch dist)|(comm sch dist)"),
    name = str_remove_all(name, "(school district)|central|( ool dist)|(u f school dist)|(ctl sch)|(uf school dist)|(school dist)|(comm schs)|(city sch dist)|(township sch dist)"),
    name = str_remove_all(name, "(community sch dist)|(comm school dist)"),
    name = str_remove_all(name, "^of "),
    # NOTE(review): in the patterns below the trailing "$" anchors only the
    # last alternative; earlier alternatives are removed anywhere in the name.
    name = str_remove_all(name, "(u f)|(twp)$"),
    name = str_remove_all(name, "(uf)|(isd)$"),
    name = str_remove_all(name, "( ool)|( pt)|( csd)$"),
    name = str_remove_all(name, "( ctl)|(c s d)|( schs)$"),
    name = str_remove_all(name, "(comm schools)|(schs dist)|(public school)|(consolidated school)|(community schools)|(joint union)"),
    name = str_remove_all(
      name,
      "( schools)|( public)|(co schools)|(township)|(pub)|(twp)|( comm)|( community)|( cmty)|(twp f)|(pub fr)|(consol)|( sch)|( scools)|(college)|(township f)|(twp fr)|( co)|( union)|( joint)$"
    ),
    name = str_replace_all(name, "( i 00)", " ")
  ) %>%
  # collapse the whitespace left behind by the removals
  mutate(name = str_squish(name))
```
```{r}
# Round 2: match the re-cleaned leftovers on state + cleaned name, keeping only
# ACFR rows that found a census ID. Join keys are explicit so the match cannot
# silently change if either table gains another shared column.
round2 <- acfrs_sd_2 %>%
  left_join(census_sd_2, by = c("state", "name")) %>%
  drop_na(censusid)
# Cumulative matches after rounds 1 and 2.
round1_2 <- round1 %>% rbind(round2)
saveRDS(round2, "round2.RDS")
```
# Round 3
```{r}
# After round 2: number of ACFR districts per state that are still unmatched.
acfrs_sd_2 %>%
  left_join(census_sd_2, by = c("state", "name")) %>%
  filter(is.na(censusid)) %>%
  count(state) %>%
  arrange(desc(n))
# After round 2: all ACFR districts still unmatched (input to round 3).
# Compared on state + original name, because the original name alone is not
# unique across states and would drop unmatched namesakes elsewhere.
acfrs_sd_3 <- acfrs_school_districts %>%
  anti_join(round1_2, by = c("state", "acfrs_original_name"))
```
```{r}
# After round 2: census districts still unmatched (input to round 3).
# Compared on censusid, because government-unit names repeat across states and
# a name-only filter would also drop unmatched districts sharing a name with a
# matched one.
census_sd_3 <- census_gov_unit %>%
  filter(!censusid %in% round1_2$censusid)
```
Now we need to match `acfrs_sd_3` and `census_sd_3`.
```{r}
# Round 3, ACFR side: a third clean-up pass over the matching key `name` of the
# ACFR districts still unmatched after round 2.
acfrs_sd_3_clean <- acfrs_sd_3 %>% # filter(state == "ME") %>% arrange(name) %>%
  mutate(
    name = str_remove_all(name, "(community schools district)|(county schools district)"),
    # "disrict" is a misspelling targeted by the pattern
    name = str_remove_all(name, "(school disrict)|(community schools)|(community schools)"),
    # NOTE(review): "$" anchors only the last alternative; " r" and " county"
    # are removed wherever they occur.
    name = str_remove_all(name, "( r)|( county)|( consolidated)$"),
    # Nebraska: drop a trailing district number
    name = ifelse(state == "NE", str_remove_all(name, " [0-9]+$"), name),
    name = str_remove_all(name, "( municipal)|( city)|( union)$"),
    # collapse the whitespace left behind by the removals
    name = str_squish(name)
  )
```
```{r}
# Round 3, census side: a third clean-up pass over the matching key `name` of
# the census districts still unmatched after round 2. Only districts with a
# positive student count are kept.
census_sd_3_clean <- census_sd_3 %>%
  filter(student > 0) %>%
  # filter(state == "ME") %>%
  mutate(name = str_remove_all(name, "(ind school dist)|(independent rict)|(community college dist)|(community college)|(br school dist)")) %>%
  mutate(
    name = str_remove_all(name, "( rict )|(county unified school system)"),
    name = str_remove_all(name, "(city sd)|(city pub[0-9])|(city pub)"),
    name = str_replace_all(name, "( 0)", " "),
    # Expand the abbreviation "serv" only when it stands alone as a word.
    # The unanchored pattern "serv" also matched inside "service" and turned it
    # into "serviceice".
    name = str_replace_all(name, "\\bserv\\b", "service")
  ) %>%
  # name = str_remove_all(name, " [0-9]+$")) %>% #filter(str_detect(name, "city pub"))
  # Nebraska: drop a trailing district number
  mutate(name = ifelse(state == "NE", str_remove_all(name, " [0-9]+$"), name)) %>%
  mutate(
    # NOTE(review): "$" anchors only the last alternative in both patterns
    # below; earlier alternatives are removed anywhere in the name.
    name = str_remove_all(name, "( college)|( independent)|( cons)|( i s)|( rict)|( co cons)|( school)|( city)|( co)|( comm)|( ind sh)|( indep)$"),
    # str_remove() (not _all): only the first occurrence is removed
    name = str_remove(name, "(olidated)|( munc)$")
  ) %>%
  # collapse the whitespace left behind by the removals
  mutate(name = str_squish(name))
```
```{r}
# Round 3: match on state + cleaned name, keeping only ACFR rows that found a
# census ID. Join keys are explicit so the match cannot silently change if
# either table gains another shared column.
round_3 <- acfrs_sd_3_clean %>%
  left_join(census_sd_3_clean, by = c("state", "name")) %>%
  drop_na(censusid)
# NOT matched after round 3, counted per state. Compared on state + original
# name so that a match in one state does not hide an unmatched namesake in
# another state.
acfrs_sd_3_clean %>%
  anti_join(round_3, by = c("state", "acfrs_original_name")) %>%
  count(state) %>%
  arrange(desc(n)) # filter(str_detect(name, "goose"))
```
```{r}
# Cumulative matches after rounds 1-3, reduced to the columns used downstream.
round123 <- rbind(round1_2, round_3) %>%
  select(
    state, acfrs_original_name, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, county_nces, city_nces, student
  )
#%>% write_csv("acfrs_necs_census_matched.csv")
# length(unique(result$acfrs_original_name))
# length(unique(result$ncesid))
# length(result$acfrs_original_name)
```
# Round 4
```{r}
# Sep 12
# After round 3: ACFR districts still unmatched. Compared on state + original
# name, because the original name alone is not unique across states and a
# name-only filter would drop unmatched namesakes in other states.
acfrs_sd_4 <- acfrs_school_districts %>%
  anti_join(round123, by = c("state", "acfrs_original_name"))
# Remaining unmatched ACFR districts per state.
acfrs_sd_4 %>% count(state) %>% arrange(desc(n))
# After round 3: census districts still unmatched. Compared on censusid,
# because government-unit names repeat across states.
census_sd_4 <- census_gov_unit %>%
  filter(!censusid %in% round123$censusid)
# From here, go state by state
```
## IN
```{r}
# Indiana, ACFR side: unmatched Indiana districts whose original name contains
# "ecas", with Indiana-specific wording stripped from the matching key.
in_acfrs <- acfrs_sd_4 %>%
  filter(
    state == "IN",
    str_detect(acfrs_original_name, "ecas")
  ) %>%
  mutate(
    name = str_remove_all(name, "(community schools ecas)|(community school corporation ecas)|(community schools inc)|(com school corporation)|(consol school corporation eca)|(comm school corporation ecas)|(community school corporation of)|(community school corporation)"),
    name = str_remove_all(name, "(county schools)|(school corporation ecas)|(school corporation)|(community schools eca)|(community schools)|(metropolitan of)|(school city of)|(county consolidated)"),
    name = str_remove_all(name, "( ecas)|(cnty cmnty)|(consolidated)|( schools)$"),
    # align this one ACFR key with the census spelling
    name = ifelse(name == "n vermillion eca", "north vermillion", name),
    name = str_squish(name)
  )
# Spot check: rows whose original name mentions "perry".
filter(in_acfrs, str_detect(acfrs_original_name, "perry"))
```
```{r}
# Indiana, census side: unmatched Indiana districts with Indiana-specific
# wording stripped from the matching key, sorted by that key.
in_census <- census_sd_4 %>%
  filter(state == "IN") %>%
  mutate(
    name = str_remove_all(name, "(community school inc)|(community school corporation)|(community schools of)|(county consolidated school corporation)|(consolidated school corporation)|(county community school corporation)|(consolidated schools)"),
    name = str_remove_all(name, "(school corporation)|(community schools)"),
    name = str_remove_all(name, "( county)|( metropolitan)|( school corp)$"),
    name = str_squish(name)
  ) %>%
  arrange(name)
# Spot check: rows whose cleaned name mentions "perry".
filter(in_census, str_detect(name, "perry"))
```
```{r}
# Indiana matches: ACFR rows that found a census ID via the columns the two
# tables share (the join has no explicit `by`).
in_matched <- drop_na(left_join(in_acfrs, in_census), censusid)
# Slim version used to label the census table in the export below.
in_acfrs_matched <- select(in_matched, state, acfrs_original_name, name)
# Spot check: matched rows whose original name mentions "perry".
filter(in_matched, str_detect(acfrs_original_name, "perry"))
```
```{r}
# Export every remaining Indiana census district, with the matched ACFR name
# attached where there is one (NA otherwise).
in_census %>%
  select(
    state, nces_original_name, gov_unit_original_name, name,
    ncesid, censusid, student
  ) %>%
  left_join(in_acfrs_matched) %>%
  # keep only the first row for each censusid
  filter(!duplicated(censusid)) %>% # filter(str_detect(name, "perry"))
  write_csv("IN_match_unmatched.csv")
```
## CA
```{r}
# California, ACFR side: unmatched California districts with
# California-specific wording stripped from the matching key, followed by
# hand-written overrides for names that cannot be reconciled by pattern.
ca_acfrs <- acfrs_sd_4 %>%
  filter(state == "CA") %>% # filter(str_detect(acfrs_original_name == "huntington beach city"))
  mutate(
    name = str_remove_all(name, "(county office of education)|(schools district)|(county superintendent of schools)|(county education office)|(union school distict)|(union elementary)"),
    name = str_remove_all(name, "( valley)|( joint)$"),
    # The overrides below compare against the exact original name, so it MUST
    # be squished first.
    acfrs_original_name = str_squish(acfrs_original_name),
    # map individual districts to the key used on the census side
    name = ifelse(
      acfrs_original_name == "galt joint union elementary school district",
      "galt elementary",
      name
    ),
    name = ifelse(
      acfrs_original_name == "galt joint union high school district",
      "galt high",
      name
    ),
    name = ifelse(
      acfrs_original_name == "gold oak union elementary school district",
      "gold oak element",
      name
    ),
    name = case_when(
      acfrs_original_name == "huntington beach city school district" ~ "huntington elementary",
      acfrs_original_name == "huntington beach union high school district" ~ "huntington high",
      acfrs_original_name == "rim of the world unified school district" ~ "rim world",
      acfrs_original_name == "san rafael city elementary school district" ~ "san rafael elementary",
      acfrs_original_name == "san rafael city high school district" ~ "san rafael high",
      name == "three rivers" ~ "three river",
      TRUE ~ name
    ),
    name = str_squish(name)
  ) # %>% filter(str_detect(name, "galt"))
```
```{r}
# California, census side: unmatched California districts with
# California-specific abbreviations stripped from the matching key, followed
# by hand-written overrides that align individual districts with the ACFR side.
ca_census <- census_sd_4 %>%
  filter(state == "CA") %>% # filter(str_detect(name, "union high"))
  mutate(
    name = str_remove_all(name, "(joint unified)|(un high school dist)|(union elem school dist)|(co spl schs oper by co supt)|(co spl sch oper by co supt)|(jt unif school district)|(valley jt unified sch dist)|(co off of education)|(valley unified sch dt)|(unified school d)|(county office of education)|(valley jt unif sch dist)|(elem school dist)|(county special schools operated by co supt)"),
    name = str_remove_all(name, "(jt high)|(union elem)|(elementary sch)|(joint union high)|(jt union high sch)|(union high)|(un school dist)|(unified school dst)|(co office of education)|(union sch dt)|(cmty unif sch dist)|(co spl schs)"),
    name = str_remove_all(name, "(union elementary)|(school dist)|(valley elem)|(co special schools)|(elem sch)|(elem school dist)|(elementary sch dist)|(county special schools)|(unified sch dt)|(joint union)|(unif sch dist)|(county selpa)"),
    # Fixed: the pattern used to read "( valley)||( el)$". The doubled "|"
    # created an empty alternative that matches at every position, so the
    # trailing "( el)$" alternative was never reached and a final " el" was
    # never removed.
    name = str_remove_all(name, "( valley)|( el)$"),
    # NOTE(review): in the patterns below "$" anchors only the last
    # alternative; earlier alternatives are removed anywhere in the name.
    name = str_remove_all(name, "( jt)|( ctr)|(em sch)|( rict)$"),
    name = str_remove_all(name, "( val)|(em sch)|( county)|( cy)$"),
    name = str_remove_all(name, "( j t)|( sch)|(school dist)$"),
    name = str_remove_all(name, "( vly)|( ist)|( un)$"),
    # The overrides below compare against the exact original name, so squish
    # it first.
    gov_unit_original_name = str_squish(gov_unit_original_name),
    # map individual districts to the key used on the ACFR side
    name = ifelse(gov_unit_original_name == "galt jt union elem sch dist", "galt elementary", name),
    name = ifelse(gov_unit_original_name == "galt jt union high school dist", "galt high", name),
    name = ifelse(gov_unit_original_name == "gen shafter elem sch dist", "general shafter", name),
    name = ifelse(gov_unit_original_name == "gold oak un elem sch dist", "gold oak element", name),
    name = ifelse(gov_unit_original_name == "calexico unif sch dist", "calexico", name),
    name = ifelse(gov_unit_original_name == "el segundo uni sch dist", "e l segundo", name),
    name = ifelse(gov_unit_original_name == "grass valley elem school dist", "grass", name),
    name = case_when(
      gov_unit_original_name == "howell mt elem sch dist" ~ "howell mountain",
      gov_unit_original_name == "huntington bch city elem school dist" ~ "huntington elementary",
      gov_unit_original_name == "huntington beach uhs dist" ~ "huntington high",
      # NOTE(review): "an francisco" looks like a typo for "san francisco"
      # unless it mirrors the ACFR-side key - confirm before changing.
      gov_unit_original_name == "san francisco unif sch dist" ~ "an francisco",
      # same key in two counties: disambiguate with the county name
      name == "pacific" & county_nces == "Fresno County" ~ "pacific (fresno county)",
      name == "pacific" & county_nces == "Humboldt County" ~ "pacific (humboldt county)",
      gov_unit_original_name == "san rafael elementary district" ~ "san rafael elementary",
      gov_unit_original_name == "san rafael high school district" ~ "san rafael high",
      gov_unit_original_name == "victor valley jt union high school dist" ~ "victor",
      gov_unit_original_name == "south bay union elem sch dist" & county_nces == "Humboldt County" ~ "south bay (humboldt county)",
      gov_unit_original_name == "south bay union school district" & county_nces == "San Diego County" ~ "south bay (san diego county)",
      TRUE ~ name
    )
  ) %>%
  # collapse the whitespace left behind by the removals
  mutate(name = str_squish(name)) %>%
  arrange(name) # %>% filter(str_detect(name, "willow"))
```
```{r}
# California matches: ACFR rows that found a census ID via the columns the two
# tables share (the join has no explicit `by`).
ca_matched <- drop_na(left_join(ca_acfrs, ca_census), censusid)
# Slim version used to label the census table in the export below.
ca_acfrs_matched <- select(ca_matched, state, acfrs_original_name, name)
# Show both original names next to the shared key for a visual check.
select(ca_matched, acfrs_original_name, gov_unit_original_name, name)
```
```{r}
# Export every remaining California census district, with the matched ACFR
# name attached where there is one (NA otherwise).
ca_census %>%
  select(
    state, nces_original_name, gov_unit_original_name, name,
    ncesid, censusid, student
  ) %>%
  left_join(ca_acfrs_matched) %>%
  # keep only the first row for each censusid
  filter(!duplicated(censusid)) %>%
  write_csv("CA_match_unmatched.csv")
```
## OH
```{r}
# Ohio, ACFR side: unmatched Ohio districts, sorted by key, with a few
# Ohio-specific fixes applied to the matching key.
oh_acfrs <- acfrs_sd_4 %>%
  filter(state == "OH") %>%
  arrange(name) %>%
  mutate(
    name = str_remove_all(name, "heights university heights"),
    #name = str_remove_all(name, ""),
    name = str_remove_all(name, "(central)$"),
    name = str_replace_all(name, "mc donald", "mcdonald"),
    # placeholder for ACFR-to-census name overrides; as written it changes nothing
    name = ifelse(name == "", "", name),
    name = str_squish(name)
  )
```
```{r}
# Ohio, census side: unmatched Ohio districts, sorted by key, with
# Ohio-specific abbreviations stripped and individual names aligned with the
# ACFR side.
oh_census <- census_sd_4 %>%
  filter(state == "OH") %>%
  arrange(name) %>%
  mutate(
    name = str_remove_all(name, "(hgts univ hgts)|(city schools)|(city sch dis t)|(ex vill school dist)|(cent loc sch dist)|(city school dist)|(loc sch dist)|(city school dist)|(local sch dt)"),
    #name = str_remove_all(name, ""),
    # NOTE(review): "$" anchors only the last alternative in the two patterns
    # below; e.g. " lo" is removed wherever it occurs.
    name = str_remove_all(name, "( loc)|( exem)|( loc)|( ex village)$"),
    name = str_remove_all(name, "(ex vil)|(city sch)|( lo)|( cty)|(cal sch dt)$"),
    name = str_replace_all(name, "hgts", "heights"),
    # overrides keyed on the original government-unit name
    name = case_when(
      gov_unit_original_name == "buckeye cent loc sch dist" ~ "buckeye",
      gov_unit_original_name == "pymatuning vall loc sch dist" ~ "pymatuning valley",
      gov_unit_original_name == "reading cmnty city sch dist" ~ "reading community",
      gov_unit_original_name == "ripley union lewis local sch dist" ~ "ripley union lewis huntington",
      TRUE ~ name
    ),
    # spelling corrections keyed on the cleaned name
    name = case_when(
      name == "north olmstead" ~ "north olmsted",
      name == "washington courthouse" ~ "washington court house",
      name == "yellow spgs" ~ "yellow springs",
      TRUE ~ name
    ),
    name = str_squish(name)
  ) # %>% filter(str_detect(name, "tri"))
```
```{r}
# OH ACFRS rows that picked up a census id from the (implicit-key) join.
oh_matched <- oh_acfrs %>%
  left_join(oh_census) %>%
  drop_na(censusid)

# Show the matched name pairs.
oh_matched %>%
  select(acfrs_original_name, gov_unit_original_name, name)
```
```{r}
# OH ACFRS districts that are absent from the matched set.
oh_acfrs %>%
  filter(!(acfrs_original_name %in% oh_matched$acfrs_original_name))
```
```{r}
# OH census districts that are absent from the matched set, sorted by name.
oh_census %>%
  filter(!(gov_unit_original_name %in% oh_matched$gov_unit_original_name)) %>%
  arrange(name)
```
```{r}
# Slim lookup of the matched OH ACFRS names.
oh_acfrs_matched <- oh_matched %>%
  select(state, acfrs_original_name, name)

# Export all OH census districts with their ACFRS match (NA when unmatched),
# keeping only the first row per census id.
oh_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(oh_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("OH_match_unmatched.csv")
```
## NE
```{r}
# Nebraska ACFRS districts, with `name` normalised for joining against the
# census names.
ne_acfrs <- acfrs_sd_4 %>%
  filter(state == "NE") %>%
  arrange(name) %>%
  mutate(
    # Plain phrase only: a trailing `|` here would add an empty alternative
    # that matches the empty string at every position.
    name = str_remove_all(name, "(public schools)"),
    name = str_remove_all(name, "(public school)"),
    # drop a trailing one- or two-digit district number
    name = str_remove_all(name, "[0-9]{1,2}$"),
    name = str_replace_all(name, "mc donald", "mcdonald"),
    # squish the original name so the exact comparisons below are not
    # defeated by stray whitespace
    acfrs_original_name = str_squish(acfrs_original_name),
    # change acfrs name to census name
    name = case_when(
      acfrs_original_name == "blue hill school district no. 91-0074" ~ "blue hill 74",
      acfrs_original_name == "don iphan-trumbu ll public schools district no. 40-0126" ~ "doniphan-trumbull",
      acfrs_original_name == "dorchester school district no. 44" ~ "dorchester 44",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# Nebraska census districts. The join name is rebuilt from the NCES name
# (lower-cased) rather than from the gov-unit name.
ne_census <- census_sd_4 %>%
  filter(state == "NE") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphens become spaces
    name = str_replace_all(name, "-", " "),
    name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)"),
    # empty placeholder pattern: removes only the zero-width end-of-string match
    name = str_remove_all(name, "$"),
    name = str_remove_all(name, "(schools)$"),
    # manual overrides keyed on the gov-unit name
    name = case_when(
      gov_unit_original_name == "blue hill vill school dist 74" ~ "blue hill 74",
      gov_unit_original_name == "doniphan-trumbull public schools" ~ "doniphan-trumbull",
      gov_unit_original_name == "dorchester vlg sch di 44" ~ "dorchester 44",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# NE ACFRS rows that picked up a census id from the (implicit-key) join.
ne_matched <- ne_acfrs %>%
  left_join(ne_census) %>%
  drop_na(censusid)

# Show the matched name pairs.
ne_matched %>%
  select(acfrs_original_name, gov_unit_original_name, name)
```
```{r}
# NE ACFRS districts that are absent from the matched set.
ne_acfrs %>%
  filter(!(acfrs_original_name %in% ne_matched$acfrs_original_name))
```
```{r}
# NE census districts (by NCES name) absent from the matched set, sorted.
ne_census %>%
  filter(!(nces_original_name %in% ne_matched$nces_original_name)) %>%
  arrange(name)
```
```{r}
# Slim lookup of the matched NE ACFRS names.
ne_acfrs_matched <- ne_matched %>%
  select(state, acfrs_original_name, name)

# Export all NE census districts with their ACFRS match (NA when unmatched),
# keeping only the first row per census id.
ne_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(ne_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("NE_match_unmatched.csv")
```
## OK
```{r}
# Oklahoma ACFRS districts, with `name` normalised for joining against the
# census names.
ok_acfrs <- acfrs_sd_4 %>%
  filter(state == "OK") %>%
  arrange(name) %>%
  mutate(
    name = str_remove_all(name, "(public schools)|(school district no. i-095)"),
    # district-number fragments seen in the OK names
    name = str_remove_all(name, "(public school)|(i 95)|(i 11)|(c 29 pottawatomie)|(i 27)|(i 2)|( i 10)|(55 c029)|(i 90)|(no i 365)|(d 29)|(i 51)|(c 32)|(60 i 103)"),
    # NOTE(review): `$` binds only to "( c 9)"; "(i 4)" is removed anywhere
    # in the name -- confirm that is intended.
    name = str_remove_all(name, "(i 4)|( c 9)$"),
    name = str_replace_all(name, "mc donald", "mcdonald"),
    # squish the original name so the exact comparison below is not defeated
    # by stray whitespace
    acfrs_original_name = str_squish(acfrs_original_name),
    # change acfrs name to census name (only Oklahoma entries belong here)
    name = case_when(
      acfrs_original_name == "davidson school district no. c-9" ~ "davidson",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# Oklahoma census districts. The join name is rebuilt from the NCES name
# (lower-cased) rather than from the gov-unit name.
ok_census <- census_sd_4 %>%
  filter(state == "OK") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphens become spaces
    name = str_replace_all(name, "-", " "),
    name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)|(high school)"),
    # empty placeholder pattern: removes only the zero-width end-of-string match
    name = str_remove_all(name, "$"),
    name = str_remove_all(name, "(schools)$"),
    # blank placeholder for manual overrides
    name = case_when(
      gov_unit_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# OK ACFRS rows that picked up a census id from the (implicit-key) join.
ok_matched <- ok_acfrs %>%
  left_join(ok_census) %>%
  drop_na(censusid)

# Show the matched original-name pairs.
ok_matched %>%
  select(acfrs_original_name, gov_unit_original_name)
```
```{r}
# OK ACFRS districts that are absent from the matched set.
ok_acfrs %>%
  filter(!(acfrs_original_name %in% ok_matched$acfrs_original_name))
```
```{r}
# OK census districts (by NCES name) absent from the matched set, sorted.
ok_census %>%
  filter(!(nces_original_name %in% ok_matched$nces_original_name)) %>%
  arrange(name)
```
```{r}
# Slim lookup of the matched OK ACFRS names.
ok_acfrs_matched <- ok_matched %>%
  select(state, acfrs_original_name, name)

# Export all OK census districts with their ACFRS match (NA when unmatched),
# keeping only the first row per census id.
ok_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(ok_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("OK_match_unmatched.csv")
```
## MO
```{r}
# Missouri ACFRS districts, with `name` normalised for joining against the
# census names.
mo_acfrs <- acfrs_sd_4 %>%
  filter(state == "MO") %>%
  arrange(name) %>%
  # successive removal passes; each works on the previous result
  mutate(name = str_remove_all(name, "(public schools)|(school district no. i-095)|(consolidated no)")) %>%
  mutate(name = str_remove_all(name, "(county)|(reorganized 2)|(124)|(reorganized r 2)")) %>%
  mutate(name = str_remove_all(name, "(schools)|( 81)$")) %>%
  # spelling alignments (first occurrence only)
  mutate(name = str_replace(name, "de soto 73", "desoto 73")) %>%
  mutate(name = str_replace(name, "salem r80", "salem r 80")) %>%
  # squish the original name before the exact comparisons below
  mutate(acfrs_original_name = str_squish(acfrs_original_name)) %>%
  # change acfrs name to census name
  mutate(
    name = case_when(
      acfrs_original_name == "campbell reorganized school district no.2" ~ "campbell r ii",
      acfrs_original_name == "fredericktown r-1 school district" ~ "fredericktown r i",
      acfrs_original_name == "hayti reorganized school district no.2" ~ "hayti r ii",
      TRUE ~ name
    )
  ) %>%
  mutate(name = str_squish(name))
```
```{r}
# Missouri census districts. The join name is rebuilt from the NCES name
# (lower-cased) rather than from the gov-unit name.
mo_census <- census_sd_4 %>%
  filter(state == "MO") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphens and dots become spaces
    name = str_replace_all(name, "-|\\.", " "),
    # Dots are already gone at this point, so the county abbreviation is
    # matched as the whole word "co". An unescaped "co." would instead match
    # "co" plus any character and eat pieces of words such as "columbia".
    name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(\\bco\\b)|(florissant r ii)"),
    # NOTE(review): `$` binds only to the last alternative in the next two
    # patterns -- confirm that is intended.
    name = str_remove_all(name, "( 101)|( 58)$"),
    name = str_remove_all(name, "(schools)|( of warren)$"),
    # manual override keyed on the gov-unit name
    name = case_when(
      gov_unit_original_name == "puxico sch dist r 8" ~ "puxico r viii",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# MO ACFRS rows that picked up a census id from the (implicit-key) join.
mo_matched <- mo_acfrs %>%
  left_join(mo_census) %>%
  drop_na(censusid)

# Show the matched original names from all three sources.
mo_matched %>%
  select(acfrs_original_name, gov_unit_original_name, nces_original_name)
```
```{r}
# MO ACFRS districts that are absent from the matched set.
mo_acfrs %>%
  filter(!(acfrs_original_name %in% mo_matched$acfrs_original_name))
```
```{r}
# MO census districts (by NCES name) absent from the matched set, sorted.
mo_census %>%
  filter(!(nces_original_name %in% mo_matched$nces_original_name)) %>%
  arrange(name)
```
```{r}
# Slim lookup of the matched MO ACFRS names.
mo_acfrs_matched <- mo_matched %>%
  select(state, acfrs_original_name, name)

# Export all MO census districts with their ACFRS match (NA when unmatched),
# keeping only the first row per census id.
mo_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(mo_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("MO_match_unmatched.csv")
```
## OR
```{r}
# Oregon ACFRS districts, with `name` normalised for joining against the
# census names.
or_acfrs <- acfrs_sd_4 %>%
  filter(state == "OR") %>%
  arrange(name) %>%
  mutate(name = str_remove_all(name, "(public schools)|(consolidated no)")) %>%
  # trailing "jt", then assorted district-number fragments / a trailing "j"
  mutate(name = str_remove_all(name, "(jt)$")) %>%
  mutate(name = str_remove_all(name, "(29j)|(10jt)|( 2 c)|j$")) %>%
  # NOTE(review): these two replacements are identical to the ones in the MO
  # chunk -- confirm they are needed for Oregon.
  mutate(name = str_replace(name, "de soto 73", "desoto 73")) %>%
  mutate(name = str_replace(name, "salem r80", "salem r 80")) %>%
  # squish the original name before the exact comparisons below
  mutate(acfrs_original_name = str_squish(acfrs_original_name)) %>%
  # change acfrs name to census name
  mutate(
    name = case_when(
      acfrs_original_name == "centennial school district no. 28jt" ~ "centennial 28",
      acfrs_original_name == "north santiam school district no. 29j" ~ "north santiam 29",
      acfrs_original_name == "pendleton school district 16r" ~ "pendleton 16",
      TRUE ~ name
    )
  ) %>%
  mutate(name = str_squish(name))
```
```{r}
# Oregon census districts. The join name is rebuilt from the NCES name
# (lower-cased) rather than from the gov-unit name.
or_census <- census_sd_4 %>%
  filter(state == "OR") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphens and dots become spaces
    name = str_replace_all(name, "-|\\.", " "),
    # successive removal passes; each works on the previous result
    name = str_remove_all(name, "(co unit)|(school dist 10j)|(school dist)|(bay )|(unified 7)|(county)"),
    name = str_remove_all(name, "( sd)"),
    name = str_remove_all(name, "( 2c)|j$"),
    # NOTE(review): `$` binds only to "(county)"; the numeric fragments are
    # removed anywhere in the name -- confirm that is intended.
    name = str_remove_all(name, "( 57)|( 40)|(511)|( 8)|( city)|(10)|(county)$"),
    # manual overrides keyed on the gov-unit name
    name = case_when(
      gov_unit_original_name == "corvallis sch dist 509-j" ~ "corvallis 509",
      gov_unit_original_name == "crow-apple gate-lorane school dist 66" ~ "crow applegate lorane 66",
      gov_unit_original_name == "grant admin school dist 3" ~ "grant 3",
      gov_unit_original_name == "harney county school dist 4" ~ "harney 4",
      gov_unit_original_name == "helix school district #1r" ~ "helix 1",
      gov_unit_original_name == "jewell sch dist 8" ~ "jewell",
      gov_unit_original_name == "morrow co sch dist 1" ~ "morrow",
      gov_unit_original_name == "perrydale sch dist 21" ~ "perrydale",
      gov_unit_original_name == "pilot rock sch dist 2" ~ "pilot rock",
      gov_unit_original_name == "scio school dist 95" ~ "scio",
      gov_unit_original_name == "seaside sch dist 10" ~ "seaside",
      gov_unit_original_name == "silver falls school district 4j" ~ "silver falls",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# OR ACFRS rows that picked up a census id from the (implicit-key) join.
or_matched <- or_acfrs %>%
  left_join(or_census) %>%
  drop_na(censusid)

# Show the matched names from all sources next to the join name.
or_matched %>%
  select(acfrs_original_name, nces_original_name, gov_unit_original_name, name)
```
```{r}
# OR ACFRS districts that are absent from the matched set.
or_acfrs %>%
  filter(!(acfrs_original_name %in% or_matched$acfrs_original_name))
```
```{r}
# OR census districts (by NCES name) absent from the matched set, sorted.
or_census %>%
  filter(!(nces_original_name %in% or_matched$nces_original_name)) %>%
  arrange(name)
```
```{r}
# Slim lookup of the matched OR ACFRS names.
or_acfrs_matched <- or_matched %>%
  select(state, acfrs_original_name, name)

# Export all OR census districts with their ACFRS match (NA when unmatched),
# keeping only the first row per census id.
or_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(or_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("OR_match_unmatched.csv")
```
## NJ
```{r}
# New Jersey ACFRS districts, with `name` normalised for joining against the
# census names.
nj_acfrs <- acfrs_sd_4 %>%
  filter(state == "NJ") %>%
  arrange(name) %>%
  # successive removal passes; each works on the previous result
  mutate(name = str_remove_all(name,"(school district of the borough of)|(school district of)|(school district of town of )|(of the)")) %>%
  mutate(name = str_remove_all(name, "(public schools)|(borough of)|(school distrcit)|(township of )")) %>%
  mutate(name = str_remove_all(name, "(county)|(township)")) %>%
  # leading "city of", then a leading "of " or any "town of "
  mutate(name = str_remove_all(name, "^(city of)")) %>%
  mutate(name = str_remove_all(name, "(^(of )|(town of ))")) %>%
  mutate(name = str_remove_all(name, "(schools)|(school)|(borough)$")) %>%
  # squish the original name before the exact comparisons below
  mutate(acfrs_original_name = str_squish(acfrs_original_name)) %>%
  # change acfrs name to census name
  mutate(
    name = case_when(
      acfrs_original_name == "caldwell-west caldwell school district" ~ "caldwell west",
      acfrs_original_name == "matawan-abredeen regional school district"~ "matawan aberdeen regional",
      acfrs_original_name == "passaic board of education-passaic public schools" ~ "passaic",
      acfrs_original_name == "scotch plains-fanwood regional school district" ~ "scotch plains fanwood regional",
      acfrs_original_name == "south orange and maplewood school district board of education" ~ "south orange maplewood",
      TRUE ~ name
    )
  ) %>%
  mutate(name = str_squish(name))
```
```{r}
# New Jersey census districts. The join name is rebuilt from the NCES name
# (lower-cased) rather than from the gov-unit name.
nj_census <- census_sd_4 %>%
  filter(state == "NJ") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphens and dots become spaces
    name = str_replace_all(name, "-|\\.", " "),
    # successive removal passes; each works on the previous result
    name = str_remove_all(name, "(township school district)|(city school district)|(boro school district)|(public school district)|(borough school district)|(high school district)|(school district of )"),
    name = str_remove_all(name, "( 101)|( 58)|(high school)|(borough)$"),
    name = str_remove_all(name, "(schools)|(school district)|(township)|( city)|(public)$"),
    # manual override keyed on the gov-unit name
    name = case_when(
      gov_unit_original_name == "paulsboro boro sch dist" ~ "paulsboro",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# NJ ACFRS rows that picked up a census id from the (implicit-key) join.
nj_matched <- nj_acfrs %>%
  left_join(nj_census) %>%
  drop_na(censusid)

# Spot-check: show only the matched rows whose join name contains "paulsboro".
nj_matched %>%
  select(acfrs_original_name, gov_unit_original_name, nces_original_name, name) %>%
  filter(str_detect(name, "paulsboro"))
```
```{r}
# NJ ACFRS districts that are absent from the matched set, sorted by name.
nj_acfrs %>%
  filter(!(acfrs_original_name %in% nj_matched$acfrs_original_name)) %>%
  arrange(name)
```
```{r}
# NJ census districts (by NCES name) absent from the matched set, sorted.
nj_census %>%
  filter(!(nces_original_name %in% nj_matched$nces_original_name)) %>%
  arrange(name)
```
```{r}
# Slim lookup of the matched NJ ACFRS names.
nj_acfrs_matched <- nj_matched %>%
  select(state, acfrs_original_name, name)

# Export all NJ census districts with their ACFRS match (NA when unmatched),
# keeping only the first row per census id.
nj_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(nj_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("NJ_match_unmatched.csv")
```
## MI
```{r}
# Michigan ACFRS districts, with `name` normalised for joining against the
# census names.
mi_acfrs <- acfrs_sd_4 %>%
  filter(state == "MI") %>%
  arrange(name) %>%
  # hyphen, dot, hash and underscore become spaces
  mutate(name = str_replace_all(name, "-|\\.|#|_", " ")) %>%
  # successive removal passes; each works on the previous result
  mutate(name = str_remove_all(name, "(school district of the city of )|(union schools district)")) %>%
  mutate(name = str_remove_all(name, "(public school)|(community schools)|(community district)|(school system)|(area schools of gogebic county)")) %>%
  mutate(name = str_remove_all(name, "(schools)|(school)$")) %>%
  # squish the original name before the exact comparisons below
  mutate(acfrs_original_name = str_squish(acfrs_original_name)) %>%
  # change acfrs name to census name; the "" branches are blank placeholders
  mutate(
    name = case_when(
      acfrs_original_name == "" ~ "",
      acfrs_original_name == "detroit public schools community district" ~ "detroit community district",
      acfrs_original_name == "" ~ "",
      TRUE ~ name
    )
  ) %>%
  mutate(name = str_squish(name))
```
```{r}
# Michigan census districts. The join name is rebuilt from the NCES name
# (lower-cased) rather than from the gov-unit name.
mi_census <- census_sd_4 %>%
  filter(state == "MI") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphen, dot and hash become spaces; parentheses are deleted outright
    name = str_replace_all(name, "-|\\.|#", " "),
    name = str_replace_all(name, "\\(|\\)", ""),
    # successive removal passes; each works on the previous result
    name = str_remove_all(name, "(township school district)|(public school district)|(community schools)|(in the counties of oakland and lapee)|(area schools of gogebic county)|(union schools district )"),
    name = str_remove_all(name, "(school district)|(public schools)|(s/d )"),
    name = str_remove_all(name, "(schools)$"),
    # blank placeholder for manual overrides
    name = case_when(
      gov_unit_original_name == "" ~ "",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# MI ACFRS rows that picked up a census id from the (implicit-key) join.
mi_matched <- mi_acfrs %>%
  left_join(mi_census) %>%
  drop_na(censusid)
```
```{r}
# MI ACFRS districts that are absent from the matched set.
mi_acfrs %>%
  filter(!(acfrs_original_name %in% mi_matched$acfrs_original_name))
```
```{r}
# MI census districts (by NCES name) absent from the matched set, sorted.
mi_census %>%
  filter(!(nces_original_name %in% mi_matched$nces_original_name)) %>%
  arrange(name)
```
```{r}
# Slim lookup of the matched MI ACFRS names.
mi_acfrs_matched <- mi_matched %>%
  select(state, acfrs_original_name, name)

# Export all MI census districts with their ACFRS match (NA when unmatched),
# keeping only the first row per census id.
mi_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(mi_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("MI_match_unmatched.csv")
```
## AR
```{r}
# Arkansas ACFRS districts, with `name` normalised for joining against the
# census names.
ar_acfrs <- acfrs_sd_4 %>%
  filter(state == "AR") %>%
  arrange(name) %>%
  # delete every run of digits (unanchored, up to two at a time)
  mutate(name = str_remove_all(name, "[0-9]{1,2}")) %>%
  mutate(name = str_remove_all(name, "consolidated")) %>%
  # squish the original name before the exact comparisons below
  mutate(acfrs_original_name = str_squish(acfrs_original_name)) %>%
  # change acfrs name to census name; the "" branches are blank placeholders
  mutate(
    name = case_when(
      acfrs_original_name == "cave city school district no. 2a" ~ "cave 2a",
      acfrs_original_name == "" ~ "",
      acfrs_original_name == "" ~ "",
      TRUE ~ name
    )
  ) %>%
  mutate(name = str_squish(name))
```
```{r}
# Arkansas census districts. The join name is rebuilt from the NCES name
# (lower-cased) rather than from the gov-unit name.
ar_census <- census_sd_4 %>%
  filter(state == "AR") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphen, dot and slash become spaces
    name = str_replace_all(name, "-|\\.|\\/", " "),
    # successive removal passes; each works on the previous result
    name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(central sch dist)|(city school district)|(county school dist)|(is central sch dist)"),
    name = str_remove_all(name, "(school district)|(sch dist)|(school dist)|(cons school dist)"),
    # delete every run of digits (unanchored, up to two at a time)
    name = str_remove_all(name, "[0-9]{1,2}"),
    # manual overrides; two of them also require a specific county
    name = case_when(
      gov_unit_original_name == "cave city sch dist 2 a" ~ "cave 2a",
      gov_unit_original_name == "south conway co sch dist" ~ "south conway",
      gov_unit_original_name == "south side school district" & county_nces == "Van Buren County" ~ "south side (van buren)",
      gov_unit_original_name == "texarkana sch dist 7" & county_nces == "Miller County" ~ "texarkana (miller county)",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# AR ACFRS rows that picked up a census id from the (implicit-key) join.
ar_matched <- ar_acfrs %>%
  left_join(ar_census) %>%
  drop_na(censusid)

# Spot-check: show only the matched rows whose join name contains "south side".
ar_matched %>%
  select(acfrs_original_name, gov_unit_original_name, nces_original_name, name) %>%
  filter(str_detect(name, "south side"))
```
```{r}
# AR ACFRS districts that are absent from the matched set.
ar_acfrs %>%
  filter(!(acfrs_original_name %in% ar_matched$acfrs_original_name))
```
```{r}
# AR census districts (by NCES name) absent from the matched set, sorted.
ar_census %>%
  filter(!(nces_original_name %in% ar_matched$nces_original_name)) %>%
  arrange(name)
```
```{r}
# Slim lookup of the matched AR ACFRS names.
ar_acfrs_matched <- ar_matched %>%
  select(state, acfrs_original_name, name)

# Export all AR census districts with their ACFRS match (NA when unmatched),
# keeping only the first row per census id.
ar_census %>%
  select(
    state, nces_original_name, gov_unit_original_name,
    name, ncesid, censusid, student
  ) %>%
  left_join(ar_acfrs_matched) %>%
  filter(!duplicated(censusid)) %>%
  write_csv("AR_match_unmatched.csv")
```
## MN
```{r}
# Minnesota ACFRS districts, with `name` normalised for joining against the
# census names.
mn_acfrs <- acfrs_sd_4 %>%
  filter(state == "MN") %>%
  arrange(name) %>%
  mutate(name = str_remove_all(name, "(independent school dist)|(community schools)")) %>%
  # strip a leading and a trailing run of up to four digits
  mutate(name = str_remove_all(name, "^[0-9]{1,4}")) %>%
  mutate(name = str_remove_all(name, "[0-9]{1,4}$")) %>%
  # squish the original name before the exact comparisons below
  mutate(acfrs_original_name = str_squish(acfrs_original_name)) %>%
  # change acfrs name to census name; the "" branch is a blank placeholder
  mutate(
    name = case_when(
      acfrs_original_name == "" ~ "",
      acfrs_original_name == "burnsville-eagan-savage independent school district 191" ~ "burnsville",
      acfrs_original_name == "eastern carver county schools independent school district no. 112" ~ "eastern 112",
      TRUE ~ name
    )
  ) %>%
  mutate(name = str_squish(name))
```
```{r}
# Minnesota census districts. The join name is rebuilt from the lower-cased
# NCES name; the gov-unit name is only used for the manual overrides.
mn_census <- census_sd_4 %>%
  filter(state == "MN") %>%
  arrange(name) %>%
  mutate(
    name = str_to_lower(nces_original_name),
    # hyphens and dots become spaces
    name = str_replace_all(name, "-|\\.", " "),
    # successive removal passes; each works on the previous result
    name = str_remove_all(name, "county public school"),
    name = str_remove_all(name, "(public schools)|(comm schools)|(community schools)|(public schs)|(high school)|(public school district)|(ind school dist)|(public school dist)|(school district)|(public sch)|(public sc)|(ind sch dist)"),
    name = str_remove_all(name, "(school dist)|(sch dist)|(isd)"),
    # empty placeholder pattern: removes only the zero-width end-of-string match
    name = str_remove_all(name, "$"),
    # manual overrides keyed on the gov-unit name.
    # NOTE(review): "fairbault 656" is spelled differently from the
    # "faribault" key -- confirm it mirrors the ACFRS spelling.
    name = case_when(
      gov_unit_original_name == "bagley sch district 162" ~ "bagley 162",
      gov_unit_original_name == "dassel-cokato public sch district 466" ~ "dassel cokato 466",
      gov_unit_original_name == "eastern carver county isd 112" ~ "eastern 112",
      gov_unit_original_name == "faribault sch dist 656" ~ "fairbault 656",
      TRUE ~ name
    ),
    name = str_squish(name)
  )
```
```{r}
# MN ACFRS rows that picked up a census id from the (implicit-key) join.
mn_matched <- mn_acfrs %>%
  left_join(mn_census) %>%
  drop_na(censusid)
```
```{r}
# MN ACFRS districts that are absent from the matched set, sorted by name.
mn_acfrs %>%
  filter(!(acfrs_original_name %in% mn_matched$acfrs_original_name)) %>%
  arrange(name)
```
```{r}
# MN census districts (by NCES name) absent from the matched set, sorted.
mn_census %>%
  filter(!(nces_original_name %in% mn_matched$nces_original_name)) %>%
  arrange(name)