-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpreprocessing.py
486 lines (437 loc) · 21.1 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
import os
import random
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

try:
    # Newer scikit-learn releases no longer ship sklearn.externals.joblib;
    # prefer the standalone package and fall back to the vendored copy.
    import joblib
except ImportError:
    from sklearn.externals import joblib
def nc_race(x):
    """
    Derive the numeric race code for one voter record.
    :param x: record exposing string attributes `ethnic_code` and `race_code`
              (e.g. a DataFrame row passed by `DataFrame.apply(..., axis=1)`)
    :return: int, 4 when ethnic_code is 'HL'; otherwise 5 for race_code 'W',
             3 for 'B', 2 for 'A' and 6 for anything else
    """
    # The ethnic code takes precedence over the race code.
    if x.ethnic_code.strip() == 'HL':
        return 4
    code_by_race = {'W': 5, 'B': 3, 'A': 2}
    return code_by_race.get(x.race_code.strip(), 6)
def transform_output(x):
    """
    Translate an ethnicity name produced by predict_ethnic into the numeric
    code used in the voter's file.
    :param x: string, one of 'white', 'black', 'asian', 'hispanic', 'other'
    :return: int
    :raises Exception: if x is not one of the names above
    """
    name_code_pairs = (
        ('white', 5),
        ('black', 3),
        ('asian', 2),
        ('hispanic', 4),
        ('other', 6),
    )
    for ethnic_name, ethnic_code in name_code_pairs:
        if x == ethnic_name:
            return ethnic_code
    raise Exception('Undefined ethnic %s' % x)
def preprocess_surname(file_loc):
    """
    Preprocess the surname list csv file and return a DataFrame containing
    ethnicity probability conditioned on surname.
    :param file_loc: string, location of the surname list csv file. It must
        contain the columns 'name', 'prop100k', 'pctwhite', 'pctblack',
        'pctapi', 'pctaian', 'pct2prace' and 'pcthispanic'.
    :return: DataFrame indexed by surname, having
        column=['perc', 'white', 'black', 'asian', 'hispanic', 'other']
        'perc' : proportion of this name in total population (prop100k / 100000)
        'white' : proportion Non-Hispanic White
        'black' : proportion Non-Hispanic Black
        'asian' : proportion Non-Hispanic Asian and Pacific Islander
        'hispanic' : proportion Hispanic Origin
        'other' : proportion Non-Hispanic American Indian and Alaskan Native
                  plus proportion Non-Hispanic of Two or More Races
        Values that cannot be parsed as numbers become NaN.
    :raises Exception: if the csv file cannot be read
    """
    try:
        name_prob = pd.read_csv(file_loc)
    except Exception:
        raise Exception('Cannot open surname list csv file')

    source_cols = ['name', 'prop100k', 'pctwhite', 'pctblack', 'pctapi',
                   'pctaian', 'pct2prace', 'pcthispanic']
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the raw file.
    name_prob = name_prob[source_cols].copy()
    name_prob.columns = [
        'name', 'perc', 'white', 'black', 'api', 'aian', '2race', 'hispanic']

    # Coerce every column except the surname itself to numbers; entries that
    # are not numeric turn into NaN (replacement for the removed
    # DataFrame.convert_objects(convert_numeric=True)).
    for col in name_prob.columns:
        if col != 'name':
            name_prob[col] = pd.to_numeric(name_prob[col], errors='coerce')

    # 'other' groups American Indian / Alaskan Native with two-or-more races.
    name_prob['other'] = name_prob['aian'] + name_prob['2race']

    # Percentages -> proportions.
    for race in ['white', 'black', 'api', 'other', 'hispanic']:
        name_prob[race] = name_prob[race] / float(100)
    name_prob['perc'] = name_prob['perc'] / float(100000)

    name_prob = name_prob.rename(columns={'api': 'asian'})
    name_prob.index = name_prob['name']
    return name_prob.drop(['name', 'aian', '2race'], axis=1)
def read_census(file_loc, census_type='group'):
    """
    Read a census csv file. For block group level data the ethnicity, age and
    geography columns are extracted and renamed. For block level data the csv
    file is returned as read. This function is used in preprocess_census().
    :param file_loc: string, dataset file location
    :param census_type: string, specify what level census data is provided
        ('group' or 'block')
    :return: DataFrame, every value read as a string (dtype=object)
        For census block group data, it has column=['total', 'white', 'black',
        'indian_alaska', 'asian', 'hawaiian_islander', 'other', '2race',
        'hispanic', 'TRACT', 'BLKGRP', 'COUNTY', 'y5', 'y9', 'y14', 'y17',
        'y24', 'y34', 'y44', 'y54', 'y64', 'y74', 'y84', 'y85o']
        'total': total number of people in this census block group
        'white', 'black', 'indian_alaska', 'asian', 'hawaiian_islander',
        'hispanic': number of people in this ethnic in this census block group
        '2race': number of people having two or more ethnicity
        'other': number of people having other ethnicity not described above
        'TRACT', 'BLKGRP', 'COUNTY': geographical code describing this census
            block group
        'y5', 'y9', 'y14', 'y17', 'y24', 'y34', 'y44', 'y54', 'y64', 'y74',
        'y84': number of people within an interval of age in this census
            block group.
            eg: 'y9' represents number of people from age 5 to age 9
                'y14' represents number of people from age 10 to age 14
        'y85o': number of people 85 years and over
    :raises Exception: if census_type is neither 'group' nor 'block', or if
        the file cannot be read / lacks the expected columns
    """
    # Reject unknown types up front; otherwise no branch below would bind
    # `census` and the function would fail with an unrelated NameError.
    if census_type not in ('group', 'block'):
        raise Exception('Undefined census type %s' % census_type)

    # (source column, output column) in output order.
    column_pairs = [
        ('SE_T015_001', 'total'), ('SE_T015_003', 'white'),
        ('SE_T015_004', 'black'), ('SE_T015_005', 'indian_alaska'),
        ('SE_T015_006', 'asian'), ('SE_T015_007', 'hawaiian_islander'),
        ('SE_T015_008', 'other'), ('SE_T015_009', '2race'),
        ('SE_T015_010', 'hispanic'),
        ('Geo_TRACT', 'TRACT'), ('Geo_BLKGRP', 'BLKGRP'),
        ('Geo_COUNTY', 'COUNTY'),
        ('SE_T008_002', 'y5'), ('SE_T008_003', 'y9'), ('SE_T008_004', 'y14'),
        ('SE_T008_005', 'y17'), ('SE_T008_006', 'y24'), ('SE_T008_007', 'y34'),
        ('SE_T008_008', 'y44'), ('SE_T008_009', 'y54'), ('SE_T008_010', 'y64'),
        ('SE_T008_011', 'y74'), ('SE_T008_012', 'y84'), ('SE_T008_013', 'y85o'),
    ]

    try:
        # dtype=object keeps geographic codes such as '047' intact.
        census = pd.read_csv(file_loc, dtype=object)
        if census_type == 'block':
            return census
        census = census[[source for source, _ in column_pairs]]
    except Exception:
        raise Exception('Cannot open census csv file. Please make sure to '
                        'choose correct census_type, and download correct '
                        'census data. eg: census block group level from: '
                        'http://old.socialexplorer.com/pub/reportdata/'
                        'CsvResults.aspx?reportid=R10950075 '
                        'or census block level from NHGIS '
                        'https://www.nhgis.org')

    # rename() returns a new frame, so nothing is written into a slice.
    return census.rename(columns=dict(column_pairs))
def create_cbg2000(census_df, transform=False):
    """
    Create the cbg2000 geocode from county (or borough code), tract and
    block group in census block group data.
    :param census_df: DataFrame, cleaned census data output by read_census();
        must hold string columns 'COUNTY', 'TRACT' and 'BLKGRP'. It is not
        modified.
    :param transform: Boolean, transform=True when NY's census block group
        data and voter's file is used, else transform=False. When True the
        county code is replaced by a one-digit borough code
        ('047'->'3', '081'->'4', '061'->'1', '005'->'2', '085'->'5');
        any other county contributes an empty prefix.
    :return: DataFrame, a copy of census_df with a new column called 'cbg2000'
    """
    # Work on a copy so the caller's frame is left untouched.
    census = census_df.copy()
    if transform:
        boro_by_county = {'047': '3', '081': '4', '061': '1',
                          '005': '2', '085': '5'}
        prefix = census['COUNTY'].map(boro_by_county).fillna('')
    else:
        prefix = census['COUNTY']
    census['cbg2000'] = prefix + census['TRACT'] + census['BLKGRP']
    return census
def preprocess_census(file_loc, transform=False, census_type='group'):
    """
    Preprocess census data. It combines ethnicity information to proportion
    of people in white, black, asian, hispanic and other. For block group
    data it also gives proportion of people in the age ranges described in
    read_census().
    :param file_loc: string, location of census data
    :param transform: Boolean, transform=True when NY's census block group data
                      and voter's file is used, else transform=False
    :param census_type: string, specify what level census data is provided
                        ('group' or 'block')
    :return: DataFrame indexed by location geocode.
        For 'group': column=['total', 'white', 'black', 'asian',
        'hawaiian_islander', 'other', 'hispanic', 'y5', 'y9', 'y14', 'y17',
        'y24', 'y34', 'y44', 'y54', 'y64', 'y74', 'y84', 'y85o', 'perc'],
        index is 'cbg2000'.
        For 'block': column=['GISJOIN', 'white', 'black', 'indian_alaska',
        'asian', 'hawaiian_islander', 'other', 'hispanic', 'total', 'perc'],
        index is 'GISJOIN'.
        'perc' is the location's share of the summed 'total'.
    :raises Exception: for an undefined census_type or an unrecognised block
        level file
    """
    # Validate first so an unknown type gets this message rather than
    # whatever read_census() happens to raise.
    if census_type not in ('group', 'block'):
        raise Exception('Undefined census type %s' % census_type)

    census = read_census(file_loc, census_type=census_type)
    ethnic_cols = ['white', 'black', 'asian', 'other', 'hispanic']

    if census_type == 'group':
        # For census block group data
        census = create_cbg2000(census, transform)
        age_cols = ['y5', 'y9', 'y14', 'y17', 'y24', 'y34', 'y44',
                    'y54', 'y64', 'y74', 'y84', 'y85o']
        count_cols = ['total', 'white', 'black', 'indian_alaska', 'asian',
                      'hawaiian_islander', 'other', '2race',
                      'hispanic'] + age_cols
        census[count_cols] = census[count_cols].astype(float)
        # combining asian and hawaiian islander to be asian and pacific
        # islander
        census['asian'] = census['asian'] + census['hawaiian_islander']
        # combining indian_alaska and two or more races into other
        census['other'] = (census['other'] + census['indian_alaska']
                           + census['2race'])
        census = census.drop(
            ['indian_alaska', '2race', 'TRACT', 'BLKGRP', 'COUNTY'], axis=1)
        # normalize count to proportion of the location's total
        for col in ethnic_cols + age_cols:
            census[col] = census[col] / census['total']
        census['perc'] = census['total'] / census['total'].sum()
        # set_index moves 'cbg2000' out of the columns and into the index
        return census.set_index('cbg2000')

    # Block level data: pick the column codes by the file layout.
    if 'FX1001' in census.columns:
        # Census 2000
        raw_cols = ['FX1001', 'FX1002', 'FX1003', 'FX1004', 'FX1005',
                    'FX1006', 'FXZ001']
    elif 'H7Z010' in census.columns:
        # Census 2010
        raw_cols = ['H7Z003', 'H7Z004', 'H7Z005', 'H7Z006', 'H7Z007',
                    'H7Z008', 'H7Z010']
    else:
        raise Exception('Unknown census file')
    # Both layouts list the groups in this same order.
    group_names = ['white', 'black', 'indian_alaska', 'asian',
                   'hawaiian_islander', 'other', 'hispanic']

    census = census[['GISJOIN'] + raw_cols].copy()
    census[raw_cols] = census[raw_cols].astype(float)
    # Sum only the count columns; 'GISJOIN' is a string identifier.
    census['total'] = census[raw_cols].sum(axis=1)
    census = census.rename(columns=dict(zip(raw_cols, group_names)))

    census['asian'] = census['asian'] + census['hawaiian_islander']
    census['other'] = census['other'] + census['indian_alaska']
    for col in ethnic_cols:
        census[col] = census[col] / census['total']
    census['perc'] = census['total'] / census['total'].sum()
    # 'GISJOIN' stays as a column and is also used as the index.
    census.index = census['GISJOIN']
    return census
def create_location_prob(cleaned_census_df):
    """
    Pull the ethnicity-given-location columns out of a cleaned census
    DataFrame.
    :param cleaned_census_df: DataFrame, output from preprocess_census()
    :return: DataFrame, having column=['total', 'white', 'black', 'asian',
             'other', 'hispanic', 'perc'], with the same index as the input
             (the location geocode).
    """
    wanted_cols = ['total', 'white', 'black', 'asian', 'other', 'hispanic',
                   'perc']
    return cleaned_census_df[wanted_cols]
def create_age_prob(cleaned_census_df):
    """
    Pull the age-given-location columns out of a cleaned census DataFrame.
    :param cleaned_census_df: DataFrame, output from preprocess_census()
    :return: DataFrame, having column=['total', 'y5', 'y9', 'y14', 'y17',
             'y24', 'y34', 'y44', 'y54', 'y64', 'y74', 'y84', 'y85o', 'perc'],
             with the same index as the input.
    """
    age_buckets = ['y5', 'y9', 'y14', 'y17', 'y24', 'y34', 'y44', 'y54',
                   'y64', 'y74', 'y84', 'y85o']
    wanted_cols = ['total'] + age_buckets + ['perc']
    return cleaned_census_df[wanted_cols]
def create_location_ethnic_prob(cleaned_census_df, return_ethnic_perc=False):
    """
    Build a DataFrame holding location probability conditioned on ethnicity,
    P(location | ethnicity).
    For each ethnicity the joint term P(ethnicity | location) * P(location)
    is divided by its sum over all locations.
    :param cleaned_census_df: DataFrame, from output of preprocess_census()
    :param return_ethnic_perc: boolean, if True also return the per-ethnicity
        sums used as denominators
    :return location_ethnic_prob: DataFrame, having column=['total', 'white',
                'black', 'asian', 'other', 'hispanic', 'perc']
            ethnic_perc: Series, containing percentage of each ethnicity
                (only when return_ethnic_perc is True)
    """
    ethnic_cols = ['white', 'black', 'asian', 'other', 'hispanic']
    given_location = cleaned_census_df[['total'] + ethnic_cols + ['perc']]
    given_ethnic = given_location.copy()

    overall_share = {}
    for ethnic in ethnic_cols:
        joint = given_location[ethnic] * given_location['perc']
        share = joint.sum()
        overall_share[ethnic] = share
        given_ethnic.loc[:, ethnic] = joint / share
    overall_share = pd.Series(overall_share)

    if not return_ethnic_perc:
        return given_ethnic
    return given_ethnic, overall_share
def validate_input(lastname, cbg2000):
    """
    Normalise lastname and cbg2000 to lists and check that they have the
    same length. A single string is wrapped into a one-element list. The
    length check is skipped when the cbg2000 argument itself is empty.
    :param lastname: string or list
    :param cbg2000: string or list
    :return: lastname_list: list
             cbg2000_list: list
    :raises Exception: if the two lists differ in length and cbg2000 is
        not empty
    """
    lastname_list = [lastname] if isinstance(lastname, str) else lastname
    cbg2000_list = [cbg2000] if isinstance(cbg2000, str) else cbg2000

    lengths_differ = len(cbg2000_list) != len(lastname_list)
    # The emptiness test looks at the argument as passed in, not at the
    # wrapped list.
    if lengths_differ and len(cbg2000) > 0:
        raise Exception(
            'Input lastname list and cbg2000 list should have same length')
    return lastname_list, cbg2000_list
def read_voter(file_loc):
    """
    Read voter's file and return a DataFrame
    :param file_loc: string, location of voter's file. The text after the
        last '.' selects the reader, ignoring case: 'csv' or 'dta'.
    :return: DataFrame. csv files are read with every column as string
        (dtype=object); dta files are read without dtype preservation,
        categorical conversion or date conversion.
    :raises Exception: if file_loc is not a string or the extension is
        neither csv nor dta
    """
    if not isinstance(file_loc, str):
        raise Exception("Please input string as file location")

    # Lower-case the extension so 'VOTERS.CSV' / 'voters.DTA' are accepted.
    file_type = file_loc.rsplit('.', 1)[-1].lower()
    if file_type == 'csv':
        return pd.read_csv(file_loc, dtype=object)
    if file_type == 'dta':
        return pd.read_stata(file_loc, preserve_dtypes=False,
                             convert_categoricals=False, convert_dates=False)
    raise Exception("Can not open voter's file, please input a csv or dta file")
def preprocess_voter(file_loc, census_type='group', sample=0, remove_name=True,
                     surname_file='./data/surname_list/app_c.csv'):
    """
    Preprocess voter's file. It keeps only the columns needed for the chosen
    census_type and drops rows with na in any of them. If census_type='group',
    it also creates a geocode column 'bctcb2000' from county, tract and
    blkgroup to match census block group file.
    :param file_loc: string, location of voter's file (csv or dta)
    :param census_type: string, 'group' or 'block'
    :param sample: int, if greater than 0, it will sample that many rows from
        voter file (drawn with random.sample, so random.seed applies)
    :param remove_name: boolean, if True, it will remove voter whose surname
        is not in census name list.
    :param surname_file: string, location of the census surname list csv used
        when remove_name is True
    :return: DataFrame. 'lastname' is upper-cased and stripped; when a 'race'
        column is present it is converted to int with codes 7, 1 and 9
        replaced by 6.
    :raises Exception: if census_type is neither 'group' nor 'block'
    """
    # Fail before reading a potentially large file.
    if census_type not in ('group', 'block'):
        raise Exception('Undefined type %s' % census_type)

    test = read_voter(file_loc)
    print("Finish reading from file")

    if sample > 0:
        # random.sample needs a real sequence, and .loc replaces the removed
        # .ix indexer for label-based selection.
        rows = random.sample(list(test.index), sample)
        test = test.loc[rows]

    if census_type == 'group':
        if 'race' in test.columns:
            id_use = ['voter_id', 'county', 'tract', 'blkgroup', 'lastname',
                      'firstname', 'gender', 'race', 'birth_date', '_merge']
        else:
            id_use = ['voter_id', 'county', 'tract', 'blkgroup', 'lastname',
                      'firstname', 'gender', 'birth_date', '_merge']
        geo_cols = ['county', 'tract', 'blkgroup']
        test = test[id_use].dropna(axis=0)
        # Round-trip through int to normalise the codes, then left-pad
        # county to 3 and tract to 6 characters.
        test[geo_cols] = test[geo_cols].astype(int).astype(str)
        test['county'] = test['county'].str.rjust(3, '0')
        test['tract'] = test['tract'].str.rjust(6, '0')
        test['bctcb2000'] = test['county'] + test['tract'] + test['blkgroup']
    else:
        try:
            if 'race' in test.columns:
                id_use = ['voter_id', 'gisjoin10', 'gisjoin00', 'lastname',
                          'firstname', 'gender', 'race', 'birth_date']
            else:
                # NOTE(review): this branch asks for 'birthday' while every
                # other layout uses 'birth_date' -- confirm which is intended.
                id_use = ['voter_id', 'gisjoin10', 'gisjoin00', 'lastname',
                          'firstname', 'gender', 'birthday']
            test = test[id_use].dropna(axis=0)
        except KeyError:
            # Columns above are missing: fall back to the alternative layout
            # that carries ethnic_code / race_code.
            id_use = ['voter_reg_num', 'gisjoin10', 'gisjoin00',
                      'last_name', 'first_name', 'sex_code', 'ethnic_code',
                      'race_code']
            test = test[id_use].dropna(axis=0)
            test = test.rename(columns={'voter_reg_num': 'voter_id',
                                        'last_name': 'lastname',
                                        'first_name': 'firstname'})

    # Derive a numeric 'race' column when only ethnic/race codes are given.
    if 'ethnic_code' in test.columns:
        print("Starting applying race")
        test['race'] = test.apply(nc_race, axis=1)

    test['lastname'] = test['lastname'].str.upper().str.strip()

    # remove rows having lastname not in census name list
    if remove_name:
        name_prob = preprocess_surname(surname_file)
        test = test[test['lastname'].isin(name_prob.index)].copy()

    # combine some ethnics to 'other'
    if 'race' in test.columns:
        test['race'] = test['race'].astype(float).astype(int)
        test['race'] = test['race'].replace({7: 6, 1: 6, 9: 6})
    return test
def create_name_predictor(file_loc, n_gram=(2,5), save=True,
                          model_dir='./model/'):
    """
    Using character level n_gram and logistic regression to train a
    classification model to predict ethnicity based on surname only.
    Each surname is labelled with the ethnicity that has the highest
    probability in the surname list.
    :param file_loc: string, surname list file location
    :param n_gram: tuple (min_n, max_n), The lower and upper boundary of the
        range of n-values for different n-grams to be extracted. All values
        of n such that min_n <= n <= max_n will be used.
    :param save: boolean, if True, it will save models to model_dir as
        'n_gram.pkl' and 'classifier.pkl'
    :param model_dir: string, directory the models are saved to; created if
        it does not exist
    :return: n_gram_model to vectorize string and classifier to do
        classification
    """
    name_prob = preprocess_surname(file_loc).fillna(0)
    # Drop rows whose surname (the index) is missing.
    name_prob = name_prob[name_prob.index.notnull()]
    # .copy() so adding 'label' below does not write into a slice.
    name_prob = name_prob[
        ['white', 'black', 'asian', 'hispanic', 'other']].copy()
    name_prob['label'] = name_prob.idxmax(axis=1)

    n_gram_model = CountVectorizer(analyzer='char', ngram_range=n_gram)
    train_x = n_gram_model.fit_transform(name_prob.index.tolist())

    # NOTE(review): recent scikit-learn releases deprecate `multi_class`;
    # confirm against the installed version before changing it.
    classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    classifier.fit(train_x, name_prob['label'])

    if save:
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        joblib.dump(n_gram_model, os.path.join(model_dir, 'n_gram.pkl'))
        joblib.dump(classifier, os.path.join(model_dir, 'classifier.pkl'))
    return n_gram_model, classifier
def n_gram_name_prob(n_gram_model, classifier, surname):
    """
    Build the surname-ethnicity probability DataFrame from a fitted
    n_gram_model and classifier.
    :param n_gram_model: saved n_gram_model or returned by create_name_predictor
    :param classifier: saved classifier_model or returned by create_name_predictor
    :param surname: list, list of surname
    :return: DataFrame, one row per surname (used as index) and one column
             per entry of classifier.classes_, holding the output of
             classifier.predict_proba
    """
    features = n_gram_model.transform(surname)
    probabilities = classifier.predict_proba(features)
    return pd.DataFrame(probabilities, index=surname,
                        columns=classifier.classes_)
if __name__ == '__main__':
    # Smoke run: print the first three rows of the surname table and of the
    # location-given-ethnicity table built from the census file below.
    surname_table = preprocess_surname('./data/surname_list/app_c.csv')
    print(surname_table.head(3))

    census_table = preprocess_census(
        './data/Census2000_BG/C2000_NY.csv',
        census_type='group',
        transform=True,
    )
    print(create_location_ethnic_prob(census_table).head(3))