-
Notifications
You must be signed in to change notification settings - Fork 36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added new census distribution over buildings UDF #97
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import geopandas as gpd | ||
|
||
@fused.udf
def udf(
    bbox: fused.types.TileGDF = None,
    release: str = "2024-03-12-alpha-0",
    polygon: gpd.GeoDataFrame = None,
    resolution: int = 10,
):
    """Join ACS census counts onto Overture building hexagons for one tile.

    Args:
        bbox: Tile passed through to ``get_buildings_h3`` and ``get_census``.
        release: Overture release identifier forwarded to ``get_buildings_h3``.
        polygon: Accepted for interface compatibility; not used by this UDF.
        resolution: H3 resolution forwarded to ``get_buildings_h3``.

    Returns:
        The building GeoDataFrame left-joined with the census GeoDataFrame,
        with missing ``cnt`` values filled with 0, or ``None`` when no
        building data could be loaded for the tile.
    """
    import geopandas as gpd
    from utils import get_buildings_h3, get_census

    # Overture buildings, one H3 hexagon per building.
    building_data = get_buildings_h3(bbox, release, resolution)
    if building_data is None:
        # get_buildings_h3 returns None when no table part could be read.
        return None

    # Census data, one H3 hexagon per census geometry.
    census_df = get_census(
        bbox,
        census_variable="Total Pop",
        scale_factor=200,
        is_density=True,
        year=2022,
    )
    if census_df is None:
        # No census geometry for this tile: mirror what the left join would
        # produce for unmatched buildings (cnt == 0) instead of crashing.
        return building_data.assign(cnt=0)

    # Left spatial join so that every building is kept. `predicate` replaces
    # the `op` keyword, which newer geopandas releases no longer accept.
    joined_gdf = gpd.sjoin(
        building_data, census_df, how="left", predicate="intersects"
    )

    # Buildings without a matching census hexagon get a count of 0 so the
    # visualization can always read `cnt`.
    joined_gdf["cnt"] = joined_gdf["cnt"].fillna(0)

    return joined_gdf
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
<!--fused:preview--> | ||
<p align="center"><img src="https://raw.githubusercontent.com/fusedio/udfs/main/public/Census_Distribution_across_Buildings/fused-screenshot-overture-ACS.png" width="600" alt="UDF preview image"></p> | ||
|
||
<!--fused:tags--> | ||
Tags: `population` `buildings` `overture` `census` `usa` `h3` | ||
|
||
<!--fused:readme--> | ||
## Overview | ||
|
||
This UDF calculates and displays the approximate population distribution from the ACS (American Community Survey) population census dataset across building footprints in the United States, using the Overture Maps Building Footprint Dataset. By integrating the H3 index, it spatially joins census population data with building footprint geometries to provide a detailed analysis of population distribution at a granular level. | ||
|
||
## External links | ||
|
||
- [ACS official website](https://www.census.gov/programs-surveys/acs/) | ||
- Building footprints: [Overture Maps](https://overturemaps.org/) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
{ | ||
"version": "0.0.3", | ||
"job_config": { | ||
"version": "0.0.3", | ||
"name": null, | ||
"steps": [ | ||
{ | ||
"type": "udf", | ||
"udf": { | ||
"type": "geopandas_v2", | ||
"name": "Census_Distribution_across_Buildings", | ||
"entrypoint": "udf", | ||
"parameters": {}, | ||
"metadata": { | ||
"fused:defaultParameters": [ | ||
{ | ||
"parameter": "release", | ||
"value": "", | ||
"type": "string", | ||
"suggestedValues": [ | ||
"2024-02-15-alpha-0", | ||
"2024-03-12-alpha-0" | ||
] | ||
} | ||
], | ||
"fused:defaultViewState": { | ||
"enable": true, | ||
"latitude": 40.779884303572246, | ||
"longitude": -73.96453426313244, | ||
"zoom": 12.08646398175605, | ||
"pitch": 0, | ||
"bearing": 0 | ||
}, | ||
"fused:tags": [ | ||
{ | ||
"id": "population", | ||
"label": "population", | ||
"isCreatable": true | ||
}, | ||
{ | ||
"id": "buildings", | ||
"label": "buildings" | ||
}, | ||
{ | ||
"id": "overture", | ||
"label": "overture", | ||
"isCreatable": true | ||
}, | ||
{ | ||
"id": "census", | ||
"label": "census", | ||
"isCreatable": true | ||
}, | ||
{ | ||
"id": "usa", | ||
"label": "usa", | ||
"isCreatable": true | ||
}, | ||
{ | ||
"id": "h3", | ||
"label": "h3", | ||
"isCreatable": true | ||
} | ||
], | ||
"fused:description": "## Overview\n\nThis UDF calculates and displays the approximate population distribution from the ACS (American Community Survey) population census dataset across building footprints in the United States, using the Overture Maps Building Footprint Dataset. By integrating the H3 index, it spatially joins census population data with building footprint geometries to provide a detailed analysis of population distribution at a granular level.\n\n## External links\n\n- [ACS official website](https://www.census.gov/programs-surveys/acs/)\n- Building footprints: [Overture Maps](https://overturemaps.org/) \n", | ||
"fused:assetUrl": "https://raw.githubusercontent.com/fusedio/udfs/main/public/Census_Distribution_across_Buildings/fused-screenshot-overture-ACS.png", | ||
"fused:vizConfig": { | ||
"tileLayer": { | ||
"@@type": "TileLayer", | ||
"minZoom": 0, | ||
"maxZoom": 19, | ||
"tileSize": 256, | ||
"pickable": true | ||
}, | ||
"rasterLayer": { | ||
"@@type": "BitmapLayer", | ||
"pickable": true | ||
}, | ||
"vectorLayer": { | ||
"opacity": 5, | ||
"@@type": "GeoJsonLayer", | ||
"stroked": false, | ||
"filled": true, | ||
"pickable": true, | ||
"getRadius": 10, | ||
"getFillColor": "@@=[properties.cnt/3+200, properties.cnt/5, properties.cnt/20]" | ||
} | ||
}, | ||
"fused:udfType": "auto", | ||
"fused:slug": "Census_Distribution_across_Buildings", | ||
"fused:name": "Census_Distribution_across_Buildings", | ||
"fused:id": null | ||
}, | ||
"source": "Census_Distribution_across_Buildings.py", | ||
"headers": [ | ||
{ | ||
"module_name": "utils", | ||
"source_file": "utils.py" | ||
} | ||
] | ||
} | ||
} | ||
], | ||
"metadata": null | ||
} | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,188 @@ | ||||||
import geopandas as gpd | ||||||
import concurrent.futures | ||||||
import fused | ||||||
import geopandas as gpd | ||||||
import shapely | ||||||
from shapely import Point, Polygon | ||||||
import h3 | ||||||
|
||||||
# Function to Geometry to H3 polygon from centroid location | ||||||
def geometry_to_hexagon(geom, resolution=10):
    """Return the H3 cell polygon that contains the centroid of ``geom``.

    Args:
        geom: Geometry exposing a ``centroid`` with ``x`` / ``y`` attributes.
        resolution: H3 resolution of the cell to look up.

    Returns:
        A shapely ``Polygon`` built from the boundary of the H3 cell.
    """
    center = geom.centroid
    cell = h3.latlng_to_cell(center.y, center.x, resolution)
    # The boundary pairs are swapped so the polygon is built as (x, y),
    # matching the (y, x) order used for the cell lookup above.
    ring = [(second, first) for first, second in h3.cell_to_boundary(cell)]
    return Polygon(ring)
|
||||||
# Converting Overture Maps Buildings to Hexagons | ||||||
@fused.cache
def get_buildings_h3(
    bbox: fused.types.TileGDF = None,
    release: str = "2024-03-12-alpha-0",
    resolution: int = 10,
):
    """Load Overture buildings for a tile and attach one H3 hexagon per building.

    Args:
        bbox: Tile forwarded to the common ``table_to_tile`` helper.
        release: Overture release identifier used to build the table path.
        resolution: H3 resolution used for the per-building hexagon.

    Returns:
        A GeoDataFrame whose active geometry is the ``hexagon`` column when
        the loaded data has a ``geometry`` column, a plain GeoDataFrame of
        the loaded data otherwise, or ``None`` when no part could be read.
    """
    import pandas as pd

    utils = fused.load(
        "https://github.com/fusedio/udfs/tree/f8f0c0f/public/common/"
    ).utils

    theme = "buildings"
    overture_type = "building"
    min_zoom = 12
    num_parts = 5

    table_path = f"s3://us-west-2.opendata.source.coop/fused/overture/{release}/theme={theme}/type={overture_type}"

    def get_part(part):
        part_path = f"{table_path}/part={part}/" if num_parts != 1 else table_path
        try:
            return utils.table_to_tile(bbox, table=part_path, min_zoom=min_zoom)
        except ValueError:
            # A part that cannot be read is skipped rather than failing the tile.
            return None

    if num_parts > 1:
        # Read the table parts concurrently.
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_parts) as pool:
            dfs = list(pool.map(get_part, range(num_parts)))
    else:
        dfs = [get_part(0)]

    dfs = [df for df in dfs if df is not None]
    if not dfs:
        print("Failed to get any data")
        return None
    gdf = pd.concat(dfs)

    if "geometry" in gdf.columns:
        # Honour the requested resolution; previously the helper's default
        # was always used and the `resolution` argument was ignored.
        gdf["hexagon"] = gdf["geometry"].apply(
            lambda geom: geometry_to_hexagon(geom, resolution)
        )
        hex_gdf = gpd.GeoDataFrame(gdf, geometry="hexagon", crs="epsg:4326")
    else:
        hex_gdf = gpd.GeoDataFrame(gdf)

    return hex_gdf
|
||||||
|
||||||
# Census Data UDF Functions | ||||||
@fused.cache
def acs_5yr_bbox(bounds, census_variable='population', suffix='simplify_01', year=2022):
    """Load census geometries inside ``bounds`` merged with one ACS 5-year table.

    Args:
        bounds: Four bounding-box values unpacked into ``shapely.box``.
        census_variable: Text searched for in ACS table titles.
        suffix: Optional suffix appended to the geometry table path.
        year: ACS year; only 2021 and 2022 are accepted.

    Returns:
        The geometries merged with the ACS estimate columns (the first
        estimate column is renamed to ``cnt``), or an empty GeoDataFrame when
        no geometry intersects ``bounds``.

    Raises:
        ValueError: If ``year`` is not 2021 or 2022.
    """
    if int(year) not in (2021, 2022):
        raise ValueError('The only available years are 2021 and 2022')

    import shapely
    import geopandas as gpd

    bbox = gpd.GeoDataFrame(geometry=[shapely.box(*bounds)], crs=4326)
    table_to_tile = fused.utils.common.table_to_tile
    fused.utils.common.import_env()

    # Resolve the ACS table and keep only the GEOID plus the '_E' columns.
    tid = search_title(census_variable)
    df = acs_5yr_table(tid, year=year)
    df['GEOID'] = df.GEO_ID.map(lambda x: x.split('US')[-1])
    df = df[['GEOID'] + [i for i in df.columns if '_E' in i]]

    # Replace the coded column names with their full titles.
    name_dict = acs_5yr_meta(short=False).set_index('Unique ID').to_dict()['Full Title']
    df.columns = ['GEOID'] + [name_dict[i.replace('_E', "_")] for i in df.columns[1:]]
    df = df.rename(columns={df.columns[1]: 'cnt'})  # First estimate column becomes 'cnt'

    table_path = 's3://fused-asset/infra/census_bg_us'
    if suffix:
        table_path += f'_{suffix}'
    gdf = table_to_tile(bbox, table_path, use_columns=['GEOID', 'geometry'], min_zoom=12)

    # Bail out before any per-row work when nothing intersects the bbox.
    if len(gdf) == 0:
        print('No geometry is intersecting with the given bbox.')
        return gpd.GeoDataFrame({})

    gdf['h3_index'] = gdf['geometry'].apply(
        lambda x: h3.latlng_to_cell(x.centroid.y, x.centroid.x, 11)
    )
    return gdf.merge(df)
|
||||||
@fused.cache
def acs_5yr_meta(short=True):
    """Download ACS table-shell metadata and add a ``Full Title`` column.

    Args:
        short: When true, keep only the first row of each ``Table ID``;
            otherwise keep every row of the table shells.

    Returns:
        A DataFrame of table-shell rows whose ``Table ID`` appears, without
        a geography restriction, in the ACS 2021 appendices workbook.
    """
    import pandas as pd

    # Filter only records with census block group data
    # (NOTE(review): inferred from the original comment; the code keeps rows
    # whose 'Geography Restrictions' cell is empty).
    tmp = pd.read_excel('https://www2.census.gov/programs-surveys/acs/summary_file/2021/sequence-based-SF/documentation/tech_docs/ACS_2021_SF_5YR_Appendices.xlsx')
    table_ids_cbgs = tmp[tmp['Geography Restrictions'].isna()]['Table Number']

    # Get the list of tables; in short mode keep only the first row of each table.
    df_tables = pd.read_csv('https://www2.census.gov/programs-surveys/acs/summary_file/2022/table-based-SF/documentation/ACS20225YR_Table_Shells.txt', delimiter='|')
    if short:
        df_tables2 = df_tables.drop_duplicates('Table ID')
    else:
        # Copy so that adding a column does not modify the freshly read frame.
        df_tables2 = df_tables.copy()

    df_tables2['Full Title'] = df_tables2['Label'] + ' | ' + df_tables2['Title'] + ' | ' + df_tables2['Unique ID']
    df_tables2 = df_tables2[df_tables2['Table ID'].isin(table_ids_cbgs)]
    return df_tables2
|
||||||
|
||||||
@fused.cache
def acs_5yr_table(tid, year=2022):
    """Read one ACS 5-year data table from census.gov.

    Args:
        tid: Table ID; lower-cased when building the file name.
        year: ACS year used in the URL.

    Returns:
        The pipe-delimited ``.dat`` file parsed into a DataFrame.
    """
    import pandas as pd

    url = f'https://www2.census.gov/programs-surveys/acs/summary_file/{year}/table-based-SF/data/5YRData/acsdt5y{year}-{tid.lower()}.dat'
    table = pd.read_csv(url, sep='|')
    return table
|
||||||
def search_title(title):
    """Return the ID of the first ACS table whose title contains ``title``.

    The match is a case-insensitive substring test against the ``Title``
    column of ``acs_5yr_meta()``.

    Args:
        title: Text to look for in the table titles.

    Returns:
        The ``Table ID`` of the first matching table.

    Raises:
        ValueError: If no table title contains ``title``.
    """
    df_meta = acs_5yr_meta()
    # Search for the title in the list of tables.
    search_column = 'Title'
    meta_dict = df_meta[['Table ID', search_column]].set_index(search_column).to_dict()['Table ID']

    needle = title.lower()
    matches = [
        [table_id, table_title]
        for table_title, table_id in meta_dict.items()
        if needle in table_title.lower()
    ]
    if not matches:
        # Fail with an explicit message instead of a bare IndexError.
        raise ValueError(f'No ACS table title contains {title!r}')

    print(f'Chosen: {matches[0]}\nfrom: {matches[:20]}')
    return matches[0][0]
|
||||||
|
||||||
import geopandas as gpd | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please run |
||||||
|
||||||
@fused.cache
def get_census(bbox, census_variable='Total Pop', scale_factor=200, is_density=True, year=2022):
    """Load ACS census data for a tile and attach one H3 hexagon per geometry.

    Args:
        bbox: Tile exposing ``z`` (zoom levels) and ``total_bounds``.
        census_variable: Text searched for in ACS table titles.
        scale_factor: Accepted for interface compatibility; not used here.
        is_density: Accepted for interface compatibility; not used here.
        year: ACS year forwarded to ``acs_5yr_bbox``.

    Returns:
        A GeoDataFrame whose active geometry is the ``hexagon`` column when
        the census data has a ``geometry`` column, a plain GeoDataFrame of
        the census data otherwise, or ``None`` when no census geometry was
        found.
    """
    # Different geometry detail per zoom level.
    zoom = bbox.z[0]
    if zoom > 12:
        suffix = None
    elif zoom > 10:
        suffix = 'simplify_0001'
    elif zoom > 8:
        suffix = 'simplify_001'
    elif zoom > 5:
        suffix = 'simplify_01'
    else:
        suffix = 'centroid'

    # Read the variables. The zoom-dependent suffix is now passed through;
    # previously it was computed but never used.
    gdf = acs_5yr_bbox(
        bbox.total_bounds,
        census_variable=census_variable,
        suffix=suffix,
        year=year,
    )
    if len(gdf) == 0:
        return None

    # Shorten the column names: keep only the first and last '|' segments.
    gdf.columns = gdf.columns.map(
        lambda x: (str(x.split('|')[0]) + str(x.split('|')[-1])) if '|' in x else x
    )

    if 'geometry' in gdf.columns:
        # Reuse the module-level helper; its default resolution is 10, the
        # value the former nested copy hard-coded.
        gdf['hexagon'] = gdf['geometry'].apply(geometry_to_hexagon)
        hex_gdf = gpd.GeoDataFrame(gdf, geometry='hexagon', crs='epsg:4326')
    else:
        hex_gdf = gpd.GeoDataFrame(gdf)

    return hex_gdf
|
||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In UDFs, import statement should go inside the UDF. You already have this import in line 11.