-
Notifications
You must be signed in to change notification settings - Fork 1
/
data.py
391 lines (351 loc) · 16.8 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
import functools
import os
import pandas as pd
publicDataRoot = os.path.join(os.path.split(__file__)[0], "OSPI_publicschools/")
privateDataRoot = os.path.join(os.path.split(__file__)[0], "WA_privateschools/")
@functools.cache
def publicData():
# 2022-23 For Bellevue only. One school at a time. *Sigh*
# https://washingtonstatereportcard.ospi.k12.wa.us/ReportCard/ViewSchoolOrDistrict/101387
# CAVEAT -- the 2022 disagree with
# https://washingtonstatereportcard.ospi.k12.wa.us/ReportCard/ViewSchoolOrDistrict/100019
# The figures here show:
# 22-23: 19354
# def bellevue2023():
# def gen():
# for filename in glob.glob(f"{publicDataRoot}/OSPI_publicschools/2022_Bellevue/*.csv"):
# df = pd.read_csv(filename)
# df = df.rename(
# columns={'Organization Name': 'SchoolName', 'School Year': "Year", 'Gradelevel': 'GradeLevel',
# 'Number Of Students': 'All Students'})
# df['DistrictName'] = 'Bellevue'
# # print(df['School Name'].unique(), filename)
# yield df
#
# return pd.concat(gen())
# Public school data: https://www.k12.wa.us/data-reporting/data-portal
year2Pubdata = {
2014: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2014-15_School_Year.csv'),
2015: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2015-16_School_Year.csv'),
2016: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2016-17_School_Year.csv'),
2017: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2017-18_School_Year.csv'),
2018: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2018-19_School_Year.csv'),
2019: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2019-20_School_Year.csv'),
2020: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2020-21_School_Year.csv'),
2021: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2021-22_School_Year.csv'),
2022: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2022-23_School_Year.csv'),
# 2023: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2023-24_School_Year_20240130.csv'),
2023: pd.read_csv('OSPI_publicschools/Report_Card_Enrollment_2023-24_School_Year_20240312.csv'),
# 2022: bellevue2023()
}
# Normalize data - just want ['Year', 'School Name', "Region", "Grade", "Total"]
# This format is common the this collections of workbooks
years = list(year2Pubdata.keys())
datacols = ['Year', 'School Name', "Region", "Grade", "Total", "County"]
grades = ['K', 'P'] + list(range(1, 13)) # NB. range in Python is exclusive of last values\
gradeMap = { # For standardize grade categories
'Pre-Kindergarten': 'P',
'Kindergarten': 'K',
'Half-day Kindergarten': 'K', # ?
'1st Grade': '1',
'2nd Grade': '2',
'3rd Grade': '3',
'4th Grade': '4',
'5th Grade': '5',
'6th Grade': '6',
'7th Grade': '7',
'8th Grade': '8',
'9th Grade': '9',
'10th Grade': '10',
'11th Grade': '11',
'12th Grade': '12',
}
# Cleanup each CSV file
def mapData(year, df):
df = df.copy()
df = df.rename(columns={'SchoolName': 'School Name', 'DistrictName': 'Region', 'All Students': 'Total'})
df['Year'] = year
df = df[df['GradeLevel'] != 'AllGrades']
df = df[df['GradeLevel'] != 'All Grades']
df['Grade'] = df['GradeLevel'].replace(gradeMap)
df = df[df['School Name'] != 'State Total']
df = df[df['School Name'] != 'District Total']
return df[datacols + list(set(df.columns) - set(datacols))]
print(years)
year2PubdataNormalized = {year: mapData(year, df) for (year, df) in year2Pubdata.items()}
pubdataNormalized = pd.concat(year2PubdataNormalized.values()).reindex()
print(f"Columns in: {pd.concat(year2Pubdata.values()).columns}")
print(f"Columns out: {pubdataNormalized.columns}")
# Santity checks
print(pubdataNormalized.columns)
print("Grade", sorted(pubdataNormalized["Grade"].unique()))
print("Year", sorted(pubdataNormalized["Year"].unique()))
print(pubdataNormalized["County"].unique())
print("County", pubdataNormalized.index.size, pubdataNormalized["County"].isna().size)
# display(pubdataNormalized[pubdataNormalized["County"].isna()])
dAll = pubdataNormalized.copy()
print("Total WA public enrollment by year")
# display(dAll.groupby("Year")["Total"].sum())
return pubdataNormalized
@functools.cache
def privateData():
gradeMap = {'Total PreK': 'P',
'Total KG': 'K',
'Total G1': '1',
'Total G2': '2',
'Total G3': '3',
'Total G4': '4',
'Total G5': '5',
'Total G6': '6',
'Total G7': '7',
'Total G8': '8',
'Total G9': '9',
'Total G10': '10',
'Total G11': '11',
'Total G12': '12',
'PK Total': 'P',
# 'PK Total.1',
'KG Total': 'K',
'G1 Total': '1',
'G2 Total': '2',
'G3 Total': '3',
'G4 Total': '4',
'G5 Total': '5',
'G6 Total': '6',
'G7 Total': '7',
'G8 Total': '8',
'G9 Total': '9',
'G10 Total': '10',
'G11 Total': '11',
'G12 Total': '12',
'PreK TOTAL': 'P',
'KG TOTAL': 'K',
'G1 TOTAL': '1',
'G2 TOTAL': '2',
'G3 TOTAL': '3',
'G4 TOTAL': '4',
'G5 TOTAL': '5',
'G6 TOTAL': '6',
'G7 TOTAL': '7',
'G8 TOTAL': '8',
'G9 TOTAL': '9',
'G10 TOTAL': '10',
'GI0 TOTAL': '10',
'G11 TOTAL': '11',
'G12 TOTAL': '12',
'PreK': 'P',
'KG': 'K',
'Grade 1': '1',
'Grade2': '2',
'Grade 2': '2',
'Grade 3': '3',
'Grade 4': '4',
'Grade 5': '5',
'Grade 6': '6',
'Grade 7': '7',
'Grade 8': '8',
'Grade 9': '9',
'Grade 10': '10',
'Grade 11': '11',
'Grade 12': '12',
1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '10', 11: '11', 12: '12'}
# Final columns, except Grade, Total
datacols = [
'School Name',
# 'Street Address', # Missing in some
# Some are "District Name", some are "City"
# Will use to hold both and search for substrings: "Bellevue"
"Region",
# 'State',
# 'Zipcode', Missing in 2018
# 'County'
'Year', # Added
]
allFinalCols = datacols + ["Grade", "Total"]
# Final grade categories
grades = ["P", "K"] + [str(g) for g in range(1, 13)]
import pandas as pd
pd.set_option('display.max_columns', None) # set the max columns to display to none
# Private data source: https://www.sbe.wa.gov/our-work/private-schools
d2018 = pd.read_excel('WA_privateschools/Final Excel P105B Combined Data for 201819.xlsx')
d2019 = pd.read_excel('WA_privateschools/Copy of 2019-20 Private School Enrollment_073120.xlsx', sheet_name=1,
skiprows=[0, 1])
# d2020 = pd.read_excel('WA_privateschools/Copy of 2020-21 Private School Enrollment x Grade_for the website.xlsx')
d2020 = pd.read_excel(
'WA_privateschools/Fixed - Copy of 2020-21 Private School Enrollment x Grade_for the website.xlsx')
d2021 = pd.read_excel('WA_privateschools/2021-22 Private School Enrollment (Website).xlsx')
d2022 = pd.read_excel('WA_privateschools/2022 - 2023 Private School Enrollment (website).xlsx')
d2023 = pd.read_excel('WA_privateschools/2023 - 2024 Private School Enrollment (website).xlsx')
def d2018normalized():
# Normalize 2018 --- Not used. Format has quite a lot difference. Suspicious of the changes
# d2018_ = d2018.rename(columns={"District Name": "Region"}) # "City" is missing in this dataset. We only have "District Name"
# d2018_['Grade'] = d2018['Grade'].replace(gradeMap)
# d2018_['Year'] = 2018
# d2018_ = d2018_[d2018_['School Name'].notnull()]
# print(d2018_.shape, list(d2018_[allFinalCols].columns))
pass
def d2019normalized():
# Normalize 2019
df = d2019.rename(columns=gradeMap)
df = df.rename(columns={"Total": "_Total", "Name of School": "School Name", "City": "Region"})
# print(sorted(df.columns))
df = df.melt(id_vars=set(df.columns) - set(grades), value_vars=grades, var_name="Grade", value_name="Total")
df['Year'] = 2019
df = df[df['School Name'].notnull()]
print(df.shape, list(df[allFinalCols].columns))
checkNulls(df)
return df
def d2020normalized():
# Normalize 2020
df = d2020.rename(columns=gradeMap).rename(columns={"City": "Region"})
print(df.shape, list(df.columns))
df[grades] # The grade columns should all exist
df = df.melt(id_vars=set(df.columns) - set(grades), value_vars=grades, var_name="Grade", value_name="Total")
df['Grade'] = df['Grade'].replace(gradeMap)
df['Year'] = 2020
df = df[df['School Name'].notnull()]
checkNulls(df)
# display(df[df["Region"].isnull()])
# display(d2020["City"].isnull().sum())
print(f'null cities: {d2020["City"].isnull().sum()}')
# print(f'Schools with null cities:')
# for school in d2020.loc[d2020["City"].isnull(),"School Name"].unique():
# display(d2020[d2020["School Name"]==school])
# print(f'Schools with null totals:')
# for school in df.loc[df["Total"].isnull(),"School Name"].unique():
# display(d2020[d2020["School Name"]==school])
# display(df[df["Total"].isnull()])
# display(df[df["Region"].isnull() & (df["School Name"]=="Carden Country School")])
# display(df[df["School Name"]=="Carden Country School"])
return df
# def d2021normalized():
# # Normalize 2020
# df = d2020.rename(columns=gradeMap).rename(columns={"City": "Region"})
# print(df.shape, list(df.columns))
# df[grades] # The grade columns should all exist
# df = df.melt(id_vars=set(df.columns) - set(grades), value_vars=grades, var_name="Grade", value_name="Total")
# df['Grade'] = df['Grade'].replace(gradeMap)
# df['Year'] = 2021
# df = df[df['School Name'].notnull()]
# checkNulls(df)
# # display(df[df["Region"].isnull()])
# # display(d2020["City"].isnull().sum())
# print(f'null cities: {d2020["City"].isnull().sum()}')
# # print(f'Schools with null cities:')
# # for school in d2020.loc[d2020["City"].isnull(),"School Name"].unique():
# # display(d2020[d2020["School Name"]==school])
# # print(f'Schools with null totals:')
# # for school in df.loc[df["Total"].isnull(),"School Name"].unique():
# # display(d2020[d2020["School Name"]==school])
#
# # display(df[df["Total"].isnull()])
# # display(df[df["Region"].isnull() & (df["School Name"]=="Carden Country School")])
# # display(df[df["School Name"]=="Carden Country School"])
# return df
def checkNulls(df):
schools = set()
for col in ['Grade', 'Year', 'Total',
'School Name',
'Region',
# 'Street Address'
]:
mask = (df[col].isna() | df[col].isnull())
if mask.sum():
print(f"{col} has {len(mask)} nulls")
schools.intersection(df.loc[mask, "School Name"].unique())
for sname in schools:
display(df[df["School Name"] == sname])
# print(d2021.columns)
def d2021normalized():
df = d2021.rename(columns=gradeMap).rename(
columns={"Name of School": "School Name", 'ZIP': "Zipcode", "City": "Region"})
df[grades] # The grade columns should all exist
df = df.melt(id_vars=set(df.columns) - set(grades), value_vars=grades, var_name="Grade", value_name="Total")
df['Grade'] = df['Grade'].replace(gradeMap)
df['Year'] = 2021
df.loc[df["Total"].isna(), "Total"] = 0
df["Total"] = df["Total"].astype(int)
df = df[df['School Name'].notnull()]
checkNulls(df)
print(df.shape, list(df[allFinalCols].columns))
return df
# return _normalized2021(d2021)
# Normalize 2022
def _normalize(df, year):
df = df.rename(columns=gradeMap).rename(
columns={'ZIP': "Zipcode",
"City": "Region", # District
"Address": "Street Address"})
print(df.columns)
df[grades] # The grade columns should all exist
df = df.melt(id_vars=set(df.columns) - set(grades), value_vars=grades, var_name="Grade", value_name="Total")
df['Grade'] = df['Grade'].replace(gradeMap)
df['Year'] = year
df.loc[df["Total"].isna(), "Total"] = 0
df["Total"] = df["Total"].astype(int)
df = df[df['School Name'].notnull()]
checkNulls(df)
# print(df.shape, list(df[allFinalCols].columns))
print(df.shape, list(df.columns))
return df
# Index(['School Name', 'Street Address', 'Region', 'State', 'Zipcode', 'P',
# 'PK Total.1', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
# '11', '12', 'K12 Total Reported'],
# dtype='object')
def d2022normalized():
return _normalize(d2022, year=2022)
# Index(['School Name', 'Street Address', 'Region', 'State', 'Zipcode', 'County',
# 'District', 'PreK TOTAL', 'KG TOTAL', 'G1 TOTAL', 'G2 TOTAL',
# 'G3 TOTAL', 'G4 TOTAL', 'G5 TOTAL', 'G6 TOTAL', 'G7 TOTAL', 'G8 TOTAL',
# 'G9 TOTAL', 'GI0 TOTAL', 'G11 TOTAL', 'G12 TOTAL'],
# dtype='object')
def d2023normalized():
return _normalize(d2023, year=2023)
def normalizeAddresses(df):
df = df.copy()
def simplifyAddr(a):
try:
a = a.upper().replace(".", "").replace("AVENUE", "AVE")
a = " ".join(a.split()[:3])
return a
except:
print("Faied address", a)
raise
df["Street Address"] = df["Street Address"].apply(simplifyAddr)
return df
def fixDuplicateAddress(df):
df_ = [["Region", "School Name", "Street Address"]].sort_values(["Region", "School Name", "Street Address"]) \
.drop_duplicates() \
.reset_index()
for idx, region, school, addr in df_[df_[["Region", "School Name"]].duplicated(keep=False)].itertuples():
mask = (df["Region"] == region) & (df["School Name"] == school)
newschool = f"{school} {addr}"
print(f"Renaming {repr(school)}, {repr(addr)} -> {repr(newschool)}")
df.loc[mask, "School Name"] = newschool
# 2018 seems to have some very different reporting. Not sure it is consistent. Dropped here
dAll = pd.concat(
[d2019normalized(), d2020normalized(), d2021normalized(), d2022normalized(), d2023normalized()]).reindex()
# dAll = pd.concat([d2022_, ]).reindex()
# Place the important data in the first columns.
dAll = dAll[allFinalCols + list(set(dAll.columns) - set(allFinalCols))].sort_values(
["Year", "Grade", "School Name"])
dAll = dAll[~(dAll["School Name"].isnull())] # Zap rows without school names
dAll = dAll[~(dAll["School Name"].astype(str) == 'NaN')] # Zap rows without school names
dAll = dAll[~(dAll["School Name"].isna())] # Zap rows without school names
dAll = dAll[dAll["School Name"] != "State Total"] # Zap school total rows
dAll.loc[dAll["Total"].isna(), "Total"] = 0
dAll = dAll.astype({"Total": int})
dAll = dAll.astype({"Year": int})
dAll = dAll.reset_index()
# display(dAll[dAll["Street Address"].isna()])
# dAll = normalizeAddresses(dAll)
# dAll = fixDuplicateAddress(dAll)
# There are a number of school in the data without a district or city, or a district of 0 for some reason.
# We'll just ignore them from bellevue processing since none are in Bellevie
# dAll = dAll[dAll["Region"].astype(str)!='0'] # 0 is not a region
print(f"Years reporting {dAll['Year'].unique()}")
print("Schools reporting their city/district as 0")
# display(dAll[dAll["Region"].isnull()]["School Name"].unique())
print("\nTotal WA private enrollment by year")
# display(dAll.groupby("Year")["Total"].sum())
return dAll