#!/usr/bin/python3
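"""Build or update ancbrigadesite/static/ancs.json, the database of DC wards,
Advisory Neighborhood Commissions (ANCs), and Single Member Districts (SMDs)
used by the site.

Run with no arguments to update every data source in place, pass --reset to
rebuild the file from scratch, or pass any of --base, --terms, --gis,
--neighborhoods, or --census to update only those sections. Credentials (a
Google account and a Census API key) are read from update_anc_database_creds.py
if it exists, otherwise they are prompted for interactively.
"""
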
import sys
import csv
import json
import re, urllib.request, urllib.parse, urllib.error, os.path, io
import getpass
from collections import OrderedDict
# from cspickert, https://gist.github.com/cspickert/1650271
class GoogleDocsClient(object):
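    """Minimal client for downloading the worksheets of a Google spreadsheet as
    CSV, authenticating with Google's legacy ClientLogin API (adapted from the
    gist referenced above)."""
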
    def __init__(self, email, password):
        super(GoogleDocsClient, self).__init__()
        self.email = email
        self.password = password

    def _get_auth_token(self, email, password, source, service):
        url = "https://www.google.com/accounts/ClientLogin"
        params = {
            "Email": email, "Passwd": password,
            "service": service,
            "accountType": "HOSTED_OR_GOOGLE",
            "source": source
        }
        req = urllib.request.Request(url, urllib.parse.urlencode(params).encode("utf8"))
        return re.findall(br"Auth=(.*)", urllib.request.urlopen(req).read())[0]

    def get_auth_token(self):
        source = type(self).__name__
        return self._get_auth_token(self.email, self.password, source, service="wise")

    def download(self, spreadsheet, gid, format="csv"):
        url_format = "https://spreadsheets.google.com/feeds/download/spreadsheets/Export?key=%s&exportFormat=%s&gid=%i"
        headers = {
            b"Authorization": b"GoogleLogin auth=" + self.get_auth_token(),
            b"GData-Version": b"3.0"
        }
        req = urllib.request.Request(url_format % (spreadsheet, format, gid), headers=headers)
        return io.TextIOWrapper(urllib.request.urlopen(req), encoding="utf8")

def urlopen(url):
    # Opens a URL and decodes its content assuming UTF-8; returns a stream.
    data = urllib.request.urlopen(url)
    return io.TextIOWrapper(data, encoding="utf8")

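# Parse a CSV stream into a list of dicts, one per data row, keyed by the header row.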
def csv_file_to_dict(csv_file):
    return list(csv.DictReader(csv_file))

def get_base_data():
    # Build the base data structure (wards, ANCs, SMDs) from scratch.
    # We don't normally use this function since we try to update
    # data in place selectively, because it's a lot faster than
    # collecting everything from scratch.
    print("fetching Google spreadsheet to create base data")

    # Create Google Documents client and download the spreadsheet as a CSV for each worksheet.
    spreadsheet_id = "0AsuPWK1wtxNfdGdyVWU4Z3J3X3g3RzVyVWJ1Rkd4dXc"  # our spreadsheet
    gs = GoogleDocsClient(google_email, google_password)
    wards = csv_file_to_dict(gs.download(spreadsheet_id, 1))  # second argument is the worksheet ID
    ancs = csv_file_to_dict(gs.download(spreadsheet_id, 2))
    smds = csv_file_to_dict(gs.download(spreadsheet_id, 0))

    output = OrderedDict()

    for ward in wards:
        w = OrderedDict()
        output[ward["Ward"]] = w
        w["ward"] = int(ward["Ward"])
        w["ancs"] = OrderedDict()

    for anc in ancs:
        a = OrderedDict()
        output[anc["ANC"][0]]["ancs"][anc["ANC"][1]] = a
        a["anc"] = anc["ANC"][0:2]
        a["anc_letter"] = anc["ANC"][1]
        a["smds"] = OrderedDict()

    for smd in smds:
        s = OrderedDict()
        output[smd["smd"][0]]["ancs"][smd["smd"][1]]["smds"][smd["smd"][2:]] = s
        s["anc"] = smd["smd"][0:2]
        s["smd"] = smd["smd"]
        s["smd_number"] = smd["smd"][2:]
        s["ward"] = smd["ward"]

    return output

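# For reference, get_base_data() returns a nested structure along these lines
# (the keys and values shown are illustrative; e.g. ward "1", ANC "1A", SMD "1A01"):
#
#   {
#     "1": {
#       "ward": 1,
#       "ancs": {
#         "A": {
#           "anc": "1A",
#           "anc_letter": "A",
#           "smds": {
#             "01": {"anc": "1A", "smd": "1A01", "smd_number": "01", "ward": "1"}
#           }
#         }
#       }
#     }
#   }
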
def add_googledoc_data(output):
print("fetching Google spreadsheet")
# Don't ever run this without also running the scraperwiki
# data because scraperwiki overrides google doc data.
# Create Google Documents client and download the spreadsheet as a CSV for each worksheet.
spreadsheet_id = "0AsuPWK1wtxNfdGdyVWU4Z3J3X3g3RzVyVWJ1Rkd4dXc" # our spreadsheet
gs = GoogleDocsClient(google_email, google_password)
wards = csv_file_to_dict(gs.download(spreadsheet_id, 1)) # second argument is the worksheet ID
ancs = csv_file_to_dict(gs.download(spreadsheet_id, 2))
smds = csv_file_to_dict(gs.download(spreadsheet_id, 0))
for ward in wards:
w = output[ward["Ward"]]
w["description"] = ward["Description"]
for anc in ancs:
a = output[ anc["ANC"][0] ]["ancs"][ anc["ANC"][1] ]
a["website"] = anc["Website"]
for smd in smds:
s = output[ smd["smd"][0] ]["ancs"][ smd["smd"][1] ]["smds"][ smd["smd"][2:] ]
s.update(smd)
def add_scraperwiki_data(output):
print("adding more commissioner data")
# additional information about ANC commissioners
sw_data = urlopen("https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=json&name=dc_anc_commissioner_info_from_official_anc_website&query=select+*+from+`swdata`&apikey=")
for rec in json.load(sw_data):
smd = rec["smd"].strip()
for k, v in list(rec.items()):
if v == None: continue
output[smd[0]]["ancs"][smd[1]]["smds"][smd[2:]][k] = v.strip()
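# Note: add_term_data() below reads data/anc-terms.csv and data/anc-contestation.csv
# by column position: in anc-terms.csv, column 1 is the SMD id and column 5 the
# number of terms served; in anc-contestation.csv, column 0 is the SMD id and
# column 1 the contestation value.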
def add_term_data(output):
print("adding commissioner term information")
# Number of terms served by current ANC commissioner
term_data = csv.reader(open('data/anc-terms.csv'), delimiter=',')
con_data = csv.reader(open('data/anc-contestation.csv'), delimiter=',')
for rec in term_data:
smd = rec[1]
output[smd[0]]["ancs"][smd[1]]["smds"][smd[2:]]["terms"] = rec[5]
for rec in con_data:
smd = rec[0]
output[smd[0]]["ancs"][smd[1]]["smds"][smd[2:]]["contestation"] = rec[1]
def add_geographic_data(output):
print("adding geographic data")
# Add ANC/SMD geographic extents (bounding box) from the GIS server.
for ward in list(output.values()):
for anc in list(ward["ancs"].values()):
anc["bounds"] = json.load(urlopen("http://gis.govtrack.us/boundaries/dc-anc-2013/" + anc["anc"].lower()))["extent"]
for smd in list(anc["smds"].values()):
smd["bounds"] = json.load(urlopen("http://gis.govtrack.us/boundaries/dc-smd-2013/" + smd["smd"].lower()))["extent"]
def add_neighborhood_data(output):
print("adding neighborhood data")
# Add intersections between ANCs/SMDs and neighborhoods, and estimate the
# population in every intersection because it looks weird to say ANC 1B
# contains the "Tidal Basin". We'll handle small-population intersections
# in the front-end.
# Query the Census API for the population of every block group that intersects
# with a neighborhood.
bg_population = { }
for intsect in json.load(open("data/neighborhoods-blockgroups.json")):
bg = intsect["2012-blockgroups"]["id"]
if bg in bg_population: continue
state_fips, county_fips, tract, bg_num = bg[0:2], bg[2:5], bg[5:11], bg[11:12]
if state_fips != "11":
# not in DC! this should be a denegerate intersection of some sort
bg_population[bg] = 0
continue
url = "http://api.census.gov/data/%s?key=%s&get=%s&in=state:%s+county:%s+tract:%s&for=block+group:%s" \
% ("2010/sf1", census_api_key, "P0010001", state_fips, county_fips, tract, bg_num)
census_data = json.load(urlopen(url))
bg_population[bg] = int(census_data[1][0])
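
    # (The Census API returns a JSON array whose first row lists the requested
    # field names and whose subsequent rows hold the values, so census_data[1][0]
    # above is the P0010001 total-population count for this block group.)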

    # For each neighborhood, sum the populations of the block groups it intersects with,
    # weighted by the proportion of the block group that intersects with the neighborhood.
    hood_population = { }
    for intsect in json.load(open("data/neighborhoods-blockgroups.json")):
        h = intsect["dc-neighborhoods"]["id"]
        bg = intsect["2012-blockgroups"]["id"]
        intsect_pop = bg_population[bg] * intsect["2012-blockgroups"]["ratio"]
        hood_population[h] = hood_population.get(h, 0) + intsect_pop

    # Clear out existing data.
    for ward in output.values():
        for anc in ward["ancs"].values():
            anc["neighborhoods"] = []
            for smd in anc["smds"].values():
                smd["neighborhoods"] = []

    # Now store the neighborhood data in the output, once for ANCs as a whole
    # and once for the SMDs.
    for anc_smd in ("anc", "smd"):
        dat = json.load(open("data/%s-neighborhood.json" % anc_smd))
        for ix in dat:
            ward = ix[anc_smd]["id"][0]
            anc = ix[anc_smd]["id"][1]
            if anc_smd == "anc":
                feature = output[ward]["ancs"][anc]
            else:
                smd = ix[anc_smd]["id"][2:]
                feature = output[ward]["ancs"][anc]["smds"][smd]
            feature["neighborhoods"].append({
                "name": ix["neighborhood"]["name"],
                "part-of-" + anc_smd: ix[anc_smd]["ratio"],
                "part-of-neighborhood": ix["neighborhood"]["ratio"],
                "population": int(round(hood_population[ix["neighborhood"]["id"]] * ix["neighborhood"]["ratio"])),
            })

    # Sort so that JSON output is consistent from run to run.
    for ward in output.values():
        for anc in ward["ancs"].values():
            anc["neighborhoods"].sort(key=lambda h: -h["part-of-anc"])
            for smd in anc["smds"].values():
                smd["neighborhoods"].sort(key=lambda h: -h["part-of-smd"])

def add_census_data(output):
    # Estimate ANC/SMD Census data by aggregating across the tracts that each
    # ANC and SMD intersects with. Tracts appear to be the lowest level at which
    # interesting data is uniformly available from the 2011 American Community Survey.
    # We can get more precise population information from block groups, though.

    # Pre-load the relevant data points from the Census API.
    census_fields = {
        "2010/sf1": [
            ("P0010001", int, "count"),  # Total Population
            ("P0180001", int, "count"),  # Households
            ("P0180002", int, "count"),  # Family Households
            ("H0040001", int, "count"),  # Occupied Housing Units
            ("H0050001", int, "count"),  # Vacant Housing Units
        ],

        # http://www.census.gov/developers/data/acs_5yr_2011_var.xml
        "2011/acs5": [
            ("B01003_001E", int, "count"),     # Total Population
            ("B01002_001E", float, "median"),  # Median Age
            ("B07001_001E", int, "count"),     # Geographic Mobility in the Past Year Total Population
            ("B07001_017E", int, "count"),     # .... Same House One Year Ago
            ("B07001_033E", int, "count"),     # .... Moved Within Same County
            ("B07001_065E", int, "count"),     # .... Moved From Different State
            ("B07001_081E", int, "count"),     # .... Moved From Abroad
            ("B19019_001E", float, "median"),  # Median Household Income
            ("B16002_001E", int, "count"),     # No One Age 14+ in Household Speaks English
        ],
    }

    census_data = { }
    for division in ("tract", "blockgroup"):
        dat = json.load(open("data/smd-%s.json" % division))
        for intsect in dat:
            #if intsect["smd"]["id"] != "1A02": continue

            # get the ID of this tract or blockgroup
            id = intsect[division]["id"]
            if id in census_data: continue
            census_data[id] = { }

            # query Census API
            state_fips, county_fips, tract, bg_num = id[0:2], id[2:5], id[5:11], (id + "?")[11:12]
            for census_table in census_fields:
                field_list = ",".join(f[0] for f in census_fields[census_table])
                if division == "tract":
                    url = "http://api.census.gov/data/%s?key=%s&get=%s&in=state:%s+county:%s&for=tract:%s" \
                        % (census_table, census_api_key, field_list, state_fips, county_fips, tract)
                else:
                    url = "http://api.census.gov/data/%s?key=%s&get=%s&in=state:%s+county:%s+tract:%s&for=blockgroup:%s" \
                        % (census_table, census_api_key, field_list, state_fips, county_fips, tract, bg_num)
                data_from_api = json.load(urlopen(url))
                for i, (fieldname, datatype, sum_mode) in enumerate(census_fields[census_table]):
                    v = data_from_api[1][i]
                    if v is not None: v = datatype(v)
                    census_data[id][fieldname] = v

    # Clear out existing data.
    for ward in output.values():
        for anc in ward["ancs"].values():
            anc["census"] = { "by-blockgroup": { }, "by-tract": { } }
            for smd in anc["smds"].values():
                smd["census"] = { "by-blockgroup": { }, "by-tract": { } }

    # Estimate values for the ANCs and SMDs as a whole.
    for division1 in ("smd", "anc"):
        for division2 in ("tract", "blockgroup"):
            dat = json.load(open("data/%s-%s.json" % (division1, division2)))
            for ix in dat:
                # Here's an intersection between an ANC/SMD and a blockgroup/tract.
                # Get the dict that represents the ANC or SMD's census info.
                ward = ix[division1]["id"][0]
                anc = ix[division1]["id"][1]
                if division1 == "anc":
                    feature = output[ward]["ancs"][anc]
                else:
                    smd = ix[division1]["id"][2:]
                    feature = output[ward]["ancs"][anc]["smds"][smd]
                feature = feature["census"]["by-" + division2]

                # Get the id of the blockgroup or tract.
                census_id = ix[division2]["id"]
                #if census_id not in census_data: continue # only for testing

                # Estimate each field for the ANC/SMD as a whole.
                for census_table in census_fields:
                    for fieldname, datatype, summode in census_fields[census_table]:
                        if summode == "count":
                            # This value may not be available from the Census (some blockgroup ACS data), so
                            # mark the feature as not-computable.
                            if census_data[census_id][fieldname] is None:
                                feature[fieldname] = "missing-data"
                                continue
                            if feature.get(fieldname) == "missing-data":
                                # Already marked this field as non-computable because data is missing.
                                continue

                            # This field is a count, so we estimate the whole by adding the
                            # field values weighted by the ratio of the intersection area to
                            # the area of the blockgroup or tract.
                            feature.setdefault(fieldname, { "value": 0, "type": summode })
                            feature[fieldname]["value"] += census_data[census_id][fieldname] * ix[division2]["ratio"]

                        elif summode == "median":
                            # For medians, we take the average across all of the intersections,
                            # weighted by the population in each intersection.
                            feature.setdefault(fieldname, { "value": 0.0, "type": summode, "weight": 0.0, "missing_weight": 0.0 })
                            w = census_data[census_id]["B01003_001E"] * ix[division2]["ratio"]

                            # This value may not be available from the Census (some blockgroup ACS data). It seems
                            # like even tracts are missing some of these values, but we want to estimate as
                            # best as possible, so we'll estimate from those tracts where values exist, and we'll
                            # record the proportion of the population in this ANC/SMD that did not contribute
                            # to the estimate.
                            if census_data[census_id][fieldname] is None:
                                feature[fieldname]["missing_weight"] += w
                                continue
                            feature[fieldname]["value"] += census_data[census_id][fieldname] * w
                            feature[fieldname]["weight"] += w

    # Finish up.
    def clean_up(f):
        # Prefer blockgroup statistics if available.
        # Round some values.
        # For medians, divide by the total weight.
        for division in ("tract", "blockgroup"):
            for k, v in list(f["by-" + division].items()):
                if v == "missing-data": continue
                if v["type"] == "count":
                    f[k] = v
                    f[k]["source"] = division
                    v["value"] = int(round(v["value"]))
                elif v["type"] == "median" and v["weight"] > 0:
                    # Skip this if we're missing more data as a blockgroup than as a tract,
                    # or if we're just missing a lot of data (more than 33%).
                    v["missing_pct"] = v["missing_weight"] / (v["weight"] + v["missing_weight"])
                    if v["missing_pct"] > .33 or (k in f and f[k]["missing_pct"] < v["missing_pct"]): continue
                    f[k] = v
                    f[k]["source"] = division
                    v["value"] = round(v["value"] / v["weight"])
                    del v["weight"]
                    del v["missing_weight"]
                del v["type"]
            del f["by-" + division]

    for ward in output.values():
        for anc in ward["ancs"].values():
            clean_up(anc["census"])
            for smd in anc["smds"].values():
                clean_up(smd["census"])

if __name__ == "__main__":
    if not os.path.exists("update_anc_database_creds.py"):
        print("This program downloads our public Google Doc with Ward/ANC/SMD information.")
        print("But we need your Google login and Census API key info to get it...")
        google_email = input("google account email> ")
        google_password = getpass.getpass("google account password> ")
        census_api_key = getpass.getpass("census api key> ")
    else:
        # Alternatively, create a file named "update_anc_database_creds.py" and put
        # your Google credentials and Census API key in it (see the README).
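        # For example (these are exactly the variable names the script expects;
        # the values are placeholders):
        #   google_email = "you@example.com"
        #   google_password = "..."
        #   census_api_key = "..."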
        exec(compile(open("update_anc_database_creds.py").read(), "update_anc_database_creds.py", 'exec'))

    if "--reset" in sys.argv:
        # Re-create the file from scratch.
        output = get_base_data()
    else:
        # Update the file in place.
        output = json.load(open("ancbrigadesite/static/ancs.json"))

    def should(argname):
        # Should we process this selective update? Yes if:
        # * no command line arguments were given (i.e. process all)
        # * --reset was specified (must process all)
        # * --argname was specified
        return len(sys.argv) == 1 or "--reset" in sys.argv or ("--" + argname) in sys.argv

    if should("base"):
        # always do these two together
        add_googledoc_data(output)
        add_scraperwiki_data(output)
    if should("terms"): add_term_data(output)
    if should("gis"): add_geographic_data(output)
    if should("neighborhoods"): add_neighborhood_data(output)
    if should("census"): add_census_data(output)

    # Output.
    output = json.dumps(output, indent=True, ensure_ascii=False, sort_keys=True)

    ## for the old static-file site
    # with open("www/ancs.jsonp", "w") as f:
    #     f.write("anc_data = ")
    #     f.write(output)

    # for the new Django-backed site
    with open("ancbrigadesite/static/ancs.json", "w") as f:
        f.write(output)