From cbb04918d338a54f0195899b6c89b379c1f84927 Mon Sep 17 00:00:00 2001
From: unknown <eddie.janowicz@gmail.com>
Date: Thu, 16 Jul 2015 10:09:08 -0700
Subject: [PATCH] pums slicing script to use fixed-length puma ids

---
 scripts/dl_and_slice_pums.py        |  13 +-
 scripts/synth_example.py            |   2 +-
 synthpop/recipes/starter.py         |   8 +-
 synthpop/recipes/starter2.py        |  12 +-
 synthpop/recipes/starter3.py        | 324 ----------------------------
 synthpop/test/test_censushelpers.py |   2 +-
 6 files changed, 21 insertions(+), 340 deletions(-)
 delete mode 100644 synthpop/recipes/starter3.py

diff --git a/scripts/dl_and_slice_pums.py b/scripts/dl_and_slice_pums.py
index 54780bf..6b5a0c3 100644
--- a/scripts/dl_and_slice_pums.py
+++ b/scripts/dl_and_slice_pums.py
@@ -36,9 +36,6 @@
         with zipfile.ZipFile(filepath, "r") as z:
             z.extractall(loader.get_path('pums'))
 
-pums_file = 'ss13pusb.csv'
-pums = pd.read_csv(os.path.join(loader.get_path('pums'), pums_file))
-
 for pums_file in ['ss13husa.csv', 'ss13husb.csv', 
                   'ss13husc.csv', 'ss13husd.csv',
                   'ss13pusa.csv', 'ss13pusb.csv',
@@ -47,14 +44,21 @@
     pums = pd.read_csv(os.path.join(loader.get_path('pums'), pums_file))
 
     for state_id in np.unique(pums['ST']):
-        '    Processing pums for state %s' % state_id
+        print '    Processing pums for state %s' % state_id
         pum_state = pums[pums['ST'] == state_id]
+        state_id = '{:>02}'.format(state_id)
+        if pums_file[4] == 'h':
+            pums_state_filename = 'puma_h_%s.csv' % (state_id)
+        elif pums_file[4] == 'p':   
+            pums_state_filename = 'puma_p_%s.csv' % (state_id)
+        pum_state.to_csv(os.path.join(loader.get_path('pums'), pums_state_filename), index = False)
 
         print '        Slicing up pums files by 2000 pumas'
         for puma00 in np.unique(pum_state['PUMA00']):
             if puma00 != -9:
                 print puma00
                 df = pum_state[pum_state['PUMA00'] == puma00]
+                puma00 = '{:>05}'.format(puma00)
                 if pums_file[4] == 'h':
                     output_filename = 'puma00_h_%s_%s.csv' % (state_id, puma00)
                 elif pums_file[4] == 'p':   
@@ -66,6 +70,7 @@
             if puma10 != -9:
                 print puma10
                 df = pum_state[pum_state['PUMA10'] == puma10]
+                puma10 = '{:>05}'.format(puma10)
                 if pums_file[4] == 'h':
                     output_filename = 'puma10_h_%s_%s.csv' % (state_id, puma10)
                 elif pums_file[4] == 'p':   
diff --git a/scripts/synth_example.py b/scripts/synth_example.py
index 13837fc..76c9978 100644
--- a/scripts/synth_example.py
+++ b/scripts/synth_example.py
@@ -1,4 +1,4 @@
-from synthpop.recipes.starter3 import Starter
+from synthpop.recipes.starter2 import Starter
 from synthpop.synthesizer import synthesize_all, enable_logging 
 import os
 
diff --git a/synthpop/recipes/starter.py b/synthpop/recipes/starter.py
index ee37f31..6977948 100644
--- a/synthpop/recipes/starter.py
+++ b/synthpop/recipes/starter.py
@@ -138,9 +138,9 @@ def get_household_joint_dist_for_geography(self, ind):
         puma10, puma00 = c.tract_to_puma(ind.state, ind.county, ind.tract)
         # this is cached so won't download more than once
         if type(puma00) == str:
-            h_pums = self.c.download_household_pums(int(ind.state), int(puma10), int(puma00))
+            h_pums = self.c.download_household_pums(ind.state, puma10, puma00)
         elif np.isnan(puma00): # only puma10 available
-            h_pums = self.c.download_household_pums(int(ind.state), int(puma10), None)
+            h_pums = self.c.download_household_pums(ind.state, puma10, None)
 
         def cars_cat(r):
             if r.VEH == 0:
@@ -184,9 +184,9 @@ def get_person_joint_dist_for_geography(self, ind):
         puma10, puma00 = c.tract_to_puma(ind.state, ind.county, ind.tract)
         # this is cached so won't download more than once
         if type(puma00) == str:
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), int(puma00))
+            p_pums = self.c.download_population_pums(ind.state, puma10, puma00)
         elif np.isnan(puma00): # only puma10 available
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), None)
+            p_pums = self.c.download_population_pums(ind.state, puma10, None)
 
         def age_cat(r):
             if r.AGEP <= 19:
diff --git a/synthpop/recipes/starter2.py b/synthpop/recipes/starter2.py
index 4984ab4..6493cce 100644
--- a/synthpop/recipes/starter2.py
+++ b/synthpop/recipes/starter2.py
@@ -179,11 +179,11 @@ def get_household_joint_dist_for_geography(self, ind):
         puma10, puma00 = c.tract_to_puma(ind.state, ind.county, ind.tract)
         # this is cached so won't download more than once
         if type(puma00) == str:
-            h_pums = self.c.download_household_pums(int(ind.state), int(puma10), int(puma00))
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), int(puma00))
+            h_pums = self.c.download_household_pums(ind.state, puma10, puma00)
+            p_pums = self.c.download_population_pums(ind.state, puma10, puma00)
         elif np.isnan(puma00): # only puma10 available
-            h_pums = self.c.download_household_pums(int(ind.state), int(puma10), None)
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), None)
+            h_pums = self.c.download_household_pums(ind.state, puma10, None)
+            p_pums = self.c.download_population_pums(ind.state, puma10, None)
             
         h_pums = h_pums.set_index('serialno')
 
@@ -294,9 +294,9 @@ def get_person_joint_dist_for_geography(self, ind):
         puma10, puma00 = c.tract_to_puma(ind.state, ind.county, ind.tract)
         # this is cached so won't download more than once
         if type(puma00) == str:
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), int(puma00))
+            p_pums = self.c.download_population_pums(ind.state, puma10, puma00)
         elif np.isnan(puma00): # only puma10 available
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), None)
+            p_pums = self.c.download_population_pums(ind.state, puma10, None)
 
         def age_cat(r):
             if r.AGEP <= 19:
diff --git a/synthpop/recipes/starter3.py b/synthpop/recipes/starter3.py
deleted file mode 100644
index 6f22625..0000000
--- a/synthpop/recipes/starter3.py
+++ /dev/null
@@ -1,324 +0,0 @@
-from .. import categorizer as cat
-from ..census_helpers import Census
-import pandas as pd, numpy as np
-
-
-# TODO DOCSTRINGS!!
-class Starter:
-    """
-    This is a recipe for getting the marginals and joint distributions to use
-    to pass to the synthesizer using simple categories - population, age,
-    race, and sex for people, and children, income, cars, and workers for
-    households.  This module is responsible for
-
-    Parameters
-    ----------
-    c : object
-        census_helpers.Census object
-    state : string
-        FIPS code the state
-    county : string
-        FIPS code for the county
-    tract : string, optional
-        FIPS code for a specific track or None for all tracts in the county
-
-    Returns
-    -------
-    household_marginals : DataFrame
-        Marginals per block group for the household data (from ACS)
-    person_marginals : DataFrame
-        Marginals per block group for the person data (from ACS)
-    household_jointdist : DataFrame
-        joint distributions for the households (from PUMS), one joint
-        distribution for each PUMA (one row per PUMA)
-    person_jointdist : DataFrame
-        joint distributions for the persons (from PUMS), one joint
-        distribution for each PUMA (one row per PUMA)
-    tract_to_puma_map : dictionary
-        keys are tract ids and pumas are puma ids
-    """
-    def __init__(self, key, state, county, tract=None):
-        self.c = c = Census(key)
-        self.state = state
-        self.county = county
-        self.tract = tract
-        
-        age_of_head_columns = ['B25007_0%02dE' % i for i in range(1, 22)]
-        race_of_head_columns = ['B25006_0%02dE' % i for i in range(1, 11)]
-        hispanic_head_columns = ['B25003I_0%02dE' % i for i in range(1, 4)]
-        hh_size_columns = ['B25009_0%02dE' % i for i in range(1, 18)]
-        income_columns = ['B19001_0%02dE' % i for i in range(1, 18)]
-        vehicle_columns = ['B08201_0%02dE' % i for i in range(1, 7)]
-        workers_columns = ['B08202_0%02dE' % i for i in range(1, 6)]
-        presence_of_children_columns = ['B11005_001E', 'B11005_002E', 'B11005_011E']
-        tenure_mover_columns = ['B25038_0%02dE' % i for i in range(1, 16)]
-        block_group_columns = income_columns + presence_of_children_columns + \
-                              tenure_mover_columns + hh_size_columns + age_of_head_columns + \
-                              race_of_head_columns + hispanic_head_columns
-        tract_columns = vehicle_columns + workers_columns
-        h_acs = c.block_group_and_tract_query(block_group_columns,
-                                              tract_columns, state, county,
-                                              merge_columns=['tract', 'county',
-                                                             'state'],
-                                              block_group_size_attr="B11005_001E",
-                                              tract_size_attr="B08201_001E",
-                                              tract=tract)
-        self.h_acs = h_acs
-
-        self.h_acs_cat = cat.categorize(h_acs, {
-            ("hh_age_of_head", "lt35"): "B25007_003E + B25007_004E + B25007_013E + B25007_014E",
-            ("hh_age_of_head", "gt35-lt65"): "B25007_005E + B25007_006E + B25007_007E + B25007_008E + "
-                                          "B25007_015E + B25007_016E + B25007_017E + B25007_018E",
-            ("hh_age_of_head", "gt65"): "B25007_009E + B25007_010E + B25007_011E + "
-                                     "B25007_019E + B25007_020E + B25007_021E",
-            ("hh_race_of_head", "black"): "B25006_003E",
-            ("hh_race_of_head", "white"): "B25006_002E",
-            ("hh_race_of_head", "asian"): "B25006_005E",
-            ("hh_race_of_head", "other"): "B25006_004E + B25006_006E + B25006_007E + B25006_008E ",
-            ("hispanic_head", "yes"): "B25003I_001E",
-            ("hispanic_head", "no"): "B11005_001E - B25003I_001E",
-            ("hh_children", "yes"): "B11005_002E",
-            ("hh_children", "no"): "B11005_011E",
-            ("hh_income", "lt30"): "B19001_002E + B19001_003E + B19001_004E + "
-                                "B19001_005E + B19001_006E",
-            ("hh_income", "gt30-lt60"): "B19001_007E + B19001_008E + B19001_009E + "
-                                      "B19001_010E + B19001_011E",
-            ("hh_income", "gt60-lt100"): "B19001_012E + B19001_013E",
-            ("hh_income", "gt100-lt150"): "B19001_014E + B19001_015E",
-            ("hh_income", "gt150"): "B19001_016E + B19001_017E",    
-            ("hh_cars", "none"): "B08201_002E",
-            ("hh_cars", "one"): "B08201_003E",
-            ("hh_cars", "two or more"): "B08201_004E + B08201_005E + B08201_006E",
-            ("hh_workers", "none"): "B08202_002E",
-            ("hh_workers", "one"): "B08202_003E",
-            ("hh_workers", "two or more"): "B08202_004E + B08202_005E",
-            ("tenure_mover", "own recent"): "B25038_003E",
-            ("tenure_mover", "own not recent"): "B25038_002E - B25038_003E",
-            ("tenure_mover", "rent recent"): "B25038_010E",
-            ("tenure_mover", "rent not recent"): "B25038_009E - B25038_010E",
-            ("hh_size", "one"): "B25009_003E + B25009_011E",
-            ("hh_size", "two"): "B25009_004E + B25009_012E",
-            ("hh_size", "three"): "B25009_005E + B25009_013E",
-            ("hh_size", "four or more"): "B25009_006E + B25009_014E + "
-                                         "B25009_007E + B25009_015E + "
-                                         "B25009_008E + B25009_016E + "
-                                         "B25009_009E + B25009_017E",
-        }, index_cols=['state', 'county', 'tract', 'block group'])
-        
-        #gq_population = ['B26001_001E']
-        hh_population = ['B11002_001E'] #HH population, for the hhpop/totalpop adjustment
-        population = ['B01001_001E'] #This includes GQ
-        hispanic = ['B03003_002E', 'B03003_003E']
-        sex = ['B01001_002E', 'B01001_026E']
-        race = ['B02001_0%02dE' % i for i in range(1, 11)]
-        male_age_columns = ['B01001_0%02dE' % i for i in range(3, 26)]
-        female_age_columns = ['B01001_0%02dE' % i for i in range(27, 50)]
-        all_columns = population + sex + race + male_age_columns + \
-            female_age_columns + hh_population + hispanic
-        p_acs = c.block_group_query(all_columns, state, county, tract=tract)
-
-        self.p_acs_cat = cat.categorize(p_acs, {
-            ("person_age", "19 and under"): "(B01001_003E + B01001_004E + B01001_005E + "
-                                     "B01001_006E + B01001_007E + B01001_027E + "
-                                     "B01001_028E + B01001_029E + B01001_030E + "
-                                     "B01001_031E) * B11002_001E*1.0/B01001_001E",
-            ("person_age", "20 to 35"): "(B01001_008E + B01001_009E + B01001_010E + "
-                                 "B01001_011E + B01001_012E + B01001_032E + "
-                                 "B01001_033E + B01001_034E + B01001_035E + "
-                                 "B01001_036E) * B11002_001E*1.0/B01001_001E",
-            ("person_age", "35 to 60"): "(B01001_013E + B01001_014E + B01001_015E + "
-                                 "B01001_016E + B01001_017E + B01001_037E + "
-                                 "B01001_038E + B01001_039E + B01001_040E + "
-                                 "B01001_041E) * B11002_001E*1.0/B01001_001E",
-            ("person_age", "above 60"): "(B01001_018E + B01001_019E + B01001_020E + "
-                                 "B01001_021E + B01001_022E + B01001_023E + "
-                                 "B01001_024E + B01001_025E + B01001_042E + "
-                                 "B01001_043E + B01001_044E + B01001_045E + "
-                                 "B01001_046E + B01001_047E + B01001_048E + "
-                                 "B01001_049E) * B11002_001E*1.0/B01001_001E",
-            ("race", "white"):   "(B02001_002E) * B11002_001E*1.0/B01001_001E",
-            ("race", "black"):   "(B02001_003E) * B11002_001E*1.0/B01001_001E",
-            ("race", "asian"):   "(B02001_005E) * B11002_001E*1.0/B01001_001E",
-            ("race", "other"):   "(B02001_004E + B02001_006E + B02001_007E + "
-                                 "B02001_008E) * B11002_001E*1.0/B01001_001E",
-            ("person_sex", "male"):     "(B01001_002E) * B11002_001E*1.0/B01001_001E",
-            ("person_sex", "female"):   "(B01001_026E) * B11002_001E*1.0/B01001_001E",
-            ("hispanic", "yes"):     "(B03003_003E) * B11002_001E*1.0/B01001_001E",
-            ("hispanic", "no"):   "(B03003_002E) * B11002_001E*1.0/B01001_001E",
-        }, index_cols=['state', 'county', 'tract', 'block group'])
-
-    def get_geography_name(self):
-        # this synthesis is at the block group level for most variables
-        return "block_group"
-
-    def get_num_geographies(self):
-        return len(self.p_acs_cat)
-
-    def get_available_geography_ids(self):
-        # return the ids of the geographies, in this case a state, county,
-        # tract, block_group id tuple
-        for tup in self.p_acs_cat.index:
-            yield pd.Series(tup, index=self.p_acs_cat.index.names)
-
-    def get_household_marginal_for_geography(self, ind):
-        return self.h_acs_cat.loc[tuple(ind.values)]
-
-    def get_person_marginal_for_geography(self, ind):
-        return self.p_acs_cat.loc[tuple(ind.values)]
-
-    def get_household_joint_dist_for_geography(self, ind):
-        c = self.c
-
-        puma10, puma00 = c.tract_to_puma(ind.state, ind.county, ind.tract)
-        # this is cached so won't download more than once
-        if type(puma00) == str:
-            h_pums = self.c.download_household_pums(int(ind.state), int(puma10), int(puma00))
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), int(puma00))
-        elif np.isnan(puma00):
-            print 'This geography not mapped to puma00, use puma10 pums records only'
-            h_pums = self.c.download_household_pums(int(ind.state), int(puma10), None)
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), None)
-        else:
-            import pdb; pdb.set_trace()
-            
-        h_pums = h_pums.set_index('serialno')
-
-        # get person pums, join to households, calculate needed household-level variables
-        age_of_head = p_pums[p_pums.RELP == 0].groupby('serialno').AGEP.max()
-        num_workers =  p_pums[p_pums.ESR.isin([1, 2, 4, 5])].groupby('serialno').size()
-        h_pums['race_of_head'] = p_pums[p_pums.RELP == 0].groupby('serialno').RAC1P.max()
-        h_pums['hispanic_head'] = p_pums[p_pums.RELP == 0].groupby('serialno').HISP.max()
-        h_pums['age_of_head'] = age_of_head
-        h_pums['workers'] = num_workers
-        h_pums.workers = h_pums.workers.fillna(0)
-        h_pums = h_pums.reset_index()
-
-        def age_of_head_cat(r):
-            if r.age_of_head < 35:
-                return "lt35"
-            elif r.age_of_head >= 65:
-                return "gt65"
-            return "gt35-lt65"
-            
-        def race_of_head_cat(r):
-            if r.race_of_head == 1:
-                return "white"
-            elif r.race_of_head == 2:
-                return "black"
-            elif r.race_of_head == 6:
-                return "asian"
-            return "other"
-            
-        def hispanic_head_cat(r):
-            if r.hispanic_head == 1:
-                return "no"
-            return "yes"
-
-        def hh_size_cat(r):
-            if r.NP == 1:
-                return "one"
-            elif r.NP == 2:
-                return "two"
-            elif r.NP == 3:
-                return "three"
-            return "four or more"
-
-        def cars_cat(r):
-            if r.VEH == 0:
-                return "none"
-            elif r.VEH == 1:
-                return "one"
-            return "two or more"
-
-        def children_cat(r):
-            if r.R18 == 1:
-                return "yes"
-            return "no"
-
-        def income_cat(r):
-            if r.HINCP >= 150000:
-                return "gt150"
-            elif (r.HINCP >= 100000) & (r.HINCP < 150000):
-                return "gt100-lt150"
-            elif (r.HINCP >= 60000) & (r.HINCP < 100000):
-                return "gt60-lt100"
-            elif (r.HINCP >= 30000) & (r.HINCP < 60000):
-                return "gt30-lt60"
-            return "lt30"
-
-        def workers_cat(r):
-            if r.workers >= 2:
-                return "two or more"
-            elif r.workers == 1:
-                return "one"
-            return "none"
-            
-        def tenure_mover_cat(r):
-            if (r.MV < 4) & (r.TEN < 3):
-                return "own recent"
-            elif (r.MV >= 4) & (r.TEN < 3):
-                return "own not recent"
-            elif (r.MV < 4) & (r.TEN >=3):
-                return "rent recent"
-            return "rent not recent"
-
-        h_pums, jd_households = cat.joint_distribution(
-            h_pums,
-            cat.category_combinations(self.h_acs_cat.columns),
-            {"hh_cars": cars_cat, "hh_children": children_cat,
-             "hh_income": income_cat, "hh_workers": workers_cat,
-             "tenure_mover": tenure_mover_cat,
-             "hh_size":hh_size_cat, "hh_age_of_head":age_of_head_cat,
-             "hh_race_of_head":race_of_head_cat,
-             "hispanic_head":hispanic_head_cat}
-        )
-        return h_pums, jd_households
-
-    def get_person_joint_dist_for_geography(self, ind):
-        c = self.c
-
-        puma10, puma00 = c.tract_to_puma(ind.state, ind.county, ind.tract)
-        # this is cached so won't download more than once
-        if type(puma00) == str:
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), int(puma00))
-        elif np.isnan(puma00):
-            print 'This geography not mapped to puma00, use puma10 pums records only'
-            p_pums = self.c.download_population_pums(int(ind.state), int(puma10), None)
-        else:
-            import pdb; pdb.set_trace()
-
-        def age_cat(r):
-            if r.AGEP <= 19:
-                return "19 and under"
-            elif r.AGEP <= 35:
-                return "20 to 35"
-            elif r.AGEP <= 60:
-                return "35 to 60"
-            return "above 60"
-
-        def race_cat(r):
-            if r.RAC1P == 1:
-                return "white"
-            elif r.RAC1P == 2:
-                return "black"
-            elif r.RAC1P == 6:
-                return "asian"
-            return "other"
-
-        def sex_cat(r):
-            if r.SEX == 1:
-                return "male"
-            return "female"
-
-        def hispanic_cat(r):
-            if r.HISP == 1:
-                return "no"
-            return "yes"
-
-        p_pums, jd_persons = cat.joint_distribution(
-            p_pums,
-            cat.category_combinations(self.p_acs_cat.columns),
-            {"person_age": age_cat, "race": race_cat, "person_sex": sex_cat,
-             "hispanic": hispanic_cat}
-        )
-        return p_pums, jd_persons
diff --git a/synthpop/test/test_censushelpers.py b/synthpop/test/test_censushelpers.py
index 3ef7780..bb35cd9 100644
--- a/synthpop/test/test_censushelpers.py
+++ b/synthpop/test/test_censushelpers.py
@@ -63,7 +63,7 @@ def test_wide_block_group_query(c):
 
 
 def test_tract_to_puma(c):
-    puma = c.tract_to_puma("06", "075", "030600")
+    puma = c.tract_to_puma("06", "075", "030600")[0]
     assert puma == "07506"