From b0bca6d94c2004b5b057f010019b5c33ab07ef65 Mon Sep 17 00:00:00 2001 From: AndreFCruz Date: Tue, 18 Jun 2024 17:04:17 +0200 Subject: [PATCH] added ACS target columns --- folktexts/_utils.py | 4 +- folktexts/acs/acs_columns.py | 103 ++++++++++++++++++++++++++++++----- 2 files changed, 92 insertions(+), 15 deletions(-) diff --git a/folktexts/_utils.py b/folktexts/_utils.py index 50a41bb..c1e0f96 100644 --- a/folktexts/_utils.py +++ b/folktexts/_utils.py @@ -91,7 +91,7 @@ def standardize_path(path: str | Path) -> str: return Path(path).expanduser().resolve().as_posix() -def get_thresholded_column_name(column_name: str, threshold: float | int) -> str: +def get_thresholded_column_name(column_name: str, threshold: float | int, *, op: str = ">") -> str: """Standardizes naming of thresholded columns.""" threshold_str = f"{threshold:.2f}".replace(".", "_") if isinstance(threshold, float) else str(threshold) - return f"{column_name}_binary_{threshold_str}" + return f"{column_name}_{op}_{threshold_str}" diff --git a/folktexts/acs/acs_columns.py b/folktexts/acs/acs_columns.py index 3fbac7b..d29570f 100644 --- a/folktexts/acs/acs_columns.py +++ b/folktexts/acs/acs_columns.py @@ -14,13 +14,14 @@ ACS_ST_FILE = Path(__file__).parent / "data" / "ST-codes-acs.txt" -# Describing ACS columns and corresponding questions +# AGEP: Age acs_age = ColumnToText( "AGEP", short_description="age", value_map=lambda x: f"{int(x)} years old", ) +# COW: Class of Worker acs_class_of_worker = ColumnToText( "COW", short_description="class of worker", @@ -37,6 +38,7 @@ }, ) +# SCHL: Educational Attainment acs_schooling = ColumnToText( "SCHL", short_description="highest educational attainment", @@ -68,6 +70,7 @@ }, ) +# MAR: Marital Status acs_marital_status = ColumnToText( "MAR", short_description="marital status", @@ -80,6 +83,7 @@ }, ) +# OCCP: Occupation acs_occupation = ColumnToText( "OCCP", short_description="occupation", @@ -90,17 +94,19 @@ ), ) +# POBP: Place of Birth acs_place_of_birth = ColumnToText( "POBP", short_description="place of birth", value_map=partial(parse_pums_code, file=ACS_POBP_FILE), ) +# RELP: Relationship to Reference Person acs_relationship = ColumnToText( "RELP", - short_description="relationship to the reference person in the household", + short_description="relationship to the reference person in the survey", value_map={ - 0: "The 'reference person' itself", + 0: "The reference person itself", 1: "Husband/wife", 2: "Biological son or daughter", 3: "Adopted son or daughter", @@ -121,6 +127,7 @@ }, ) +# WKHP: Usual Hours Worked per Week acs_work_hours = ColumnToText( "WKHP", short_description="usual number of hours worked per week", @@ -128,6 +135,7 @@ value_map=lambda x: f"{int(x)} hours", ) +# SEX: Sex acs_sex = ColumnToText( "SEX", short_description="sex", @@ -137,6 +145,7 @@ }, ) +# RAC1P: Race acs_race = ColumnToText( "RAC1P", short_description="race", @@ -156,6 +165,7 @@ }, ) +# PINCP: Yearly Income acs_income = ColumnToText( "PINCP", short_description="yearly income", @@ -163,7 +173,7 @@ value_map=lambda x: f"${int(x):,}", ) -acs_income_binary_qa = MultipleChoiceQA( +acs_income_qa = MultipleChoiceQA( column=get_thresholded_column_name("PINCP", 50_000), text="What is this person's estimated yearly income?", choices=[ @@ -182,13 +192,14 @@ num_forward_passes=2, ) -acs_income = ColumnToText( +acs_income_target_col = ColumnToText( name=get_thresholded_column_name("PINCP", 50_000), short_description="yearly income", missing_value_fill="N/A (less than 15 years old)", - question=acs_income_binary_qa, + question=acs_income_qa, ) +""" acs_income_brackets = ColumnToText( "PINCP_brackets", short_description="yearly income", @@ -204,27 +215,30 @@ ], ), ) +""" -# Note: yes/no values are flipped when using PUBCOV as the label column -acs_pubcov_binary_qa = MultipleChoiceQA( +# PUBCOV: Public Health Coverage +# NOTE: in folktables the negative choice has value `0` instead of `2` +acs_pubcov_qa = MultipleChoiceQA( column="PUBCOV", text="Does this person have public health insurance coverage?", choices=[ - Choice("No, individual is not covered by public health insurance", 0), - Choice("Yes, individual is covered by public health insurance", 1), + Choice("Yes, person is covered by public health insurance", 1), + Choice("No, person is not covered by public health insurance", 2), ], ) -acs_pubcov = ColumnToText( +acs_pubcov_target_col = ColumnToText( "PUBCOV", short_description="public health coverage status", value_map={ 1: "Covered by public health insurance", 2: "Not covered by public health insurance", }, - question=acs_pubcov_binary_qa, + question=acs_pubcov_qa, ) +# DIS: Disability Status acs_disability = ColumnToText( "DIS", short_description="disability status", @@ -234,6 +248,7 @@ }, ) +# ESP: Employment Status of Parents acs_emp_parents = ColumnToText( "ESP", short_description="employment status of parents", @@ -250,6 +265,7 @@ missing_value_fill="N/A (not own child of householder, and not child in subfamily)", ) +# CIT: Citizenship Status acs_citizenship = ColumnToText( "CIT", short_description="citizenship status", @@ -262,6 +278,7 @@ }, ) +# MIG: Mobility Status acs_mobility = ColumnToText( "MIG", short_description="mobility status over the last year", @@ -272,6 +289,23 @@ }, ) +acs_mobility_qa = MultipleChoiceQA( + column=get_thresholded_column_name("MIG", 1, op="=="), + text="Has this person moved in the last year?", + choices=[ + Choice("No, person has lived in the same house for the last year", 1), + Choice("Yes, person has moved in the last year", 0), + ], +) + +acs_mobility_target_col = ColumnToText( + name=get_thresholded_column_name("MIG", 1, op="=="), + short_description="mobility status over the last year", + question=acs_mobility_qa, + use_value_map_only=True, +) + +# MIL: Military Service Status acs_military = ColumnToText( "MIL", short_description="military service status", @@ -284,6 +318,7 @@ missing_value_fill="N/A (less than 17 years old)", ) +# ANC: Ancestry acs_ancestry = ColumnToText( "ANC", short_description="ancestry", @@ -295,6 +330,7 @@ }, ) +# NATIVITY: Nativity acs_nativity = ColumnToText( "NATIVITY", short_description="nativity", @@ -304,6 +340,7 @@ }, ) +# DEAR: Hearing Status acs_hearing = ColumnToText( "DEAR", short_description="hearing status", @@ -313,6 +350,7 @@ }, ) +# DEYE: Vision Status acs_vision = ColumnToText( "DEYE", short_description="vision status", @@ -322,6 +360,7 @@ }, ) +# DREM: Cognitive Status acs_cognitive = ColumnToText( "DREM", short_description="cognition status", @@ -332,6 +371,7 @@ missing_value_fill="N/A (less than 5 years old)", ) +# ESR: Employment Status acs_employment = ColumnToText( "ESR", short_description="employment status", @@ -346,9 +386,26 @@ missing_value_fill="N/A (less than 16 years old)", ) +acs_employment_qa = MultipleChoiceQA( + column=get_thresholded_column_name("ESR", 1, op="=="), + text="What is this person's employment status?", + choices=[ + Choice("Employed civilian", 1), + Choice("Unemployed or in the military", 0), + ], +) + +acs_employment_target_col = ColumnToText( + name=get_thresholded_column_name("ESR", 1, op="=="), + short_description="employment status", + question=acs_employment_qa, + use_value_map_only=True, +) + +# ST: State acs_state = ColumnToText( "ST", - short_description="state", + short_description="resident state", value_map=partial( parse_pums_code, file=ACS_ST_FILE, @@ -356,6 +413,7 @@ ), ) +# FER: Parenthood Status acs_parenthood = ColumnToText( "FER", short_description="person has given birth within the last year", @@ -367,6 +425,7 @@ missing_value_fill="N/A (less than 15 years old, or greater than 50 years old, or male)", ) +# JWMNP: Commute Time acs_commute_time = ColumnToText( "JWMNP", short_description="commute time", @@ -374,6 +433,23 @@ missing_value_fill="N/A (not a worker, or worker who worked at home)", ) +acs_commute_time_qa = MultipleChoiceQA( + column=get_thresholded_column_name("JWMNP", 20), + text="What is this person's commute time?", + choices=[ + Choice("Longer than 20 minutes", 1), + Choice("Less than 20 minutes", 0), + ], +) + +acs_travel_time_target_col = ColumnToText( + name=get_thresholded_column_name("JWMNP", 20), + short_description="commute time", + question=acs_commute_time_qa, + use_value_map_only=True, +) + +# JWTR: Commute Method acs_commute_method = ColumnToText( "JWTR", short_description="means of transportation to work", @@ -393,6 +469,7 @@ }, ) +# POVPIP: Income-to-Poverty Ratio acs_poverty_ratio = ColumnToText( "POVPIP", short_description="income-to-poverty ratio",