-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into curriculum_docs_update
- Loading branch information
Showing
67 changed files
with
1,728 additions
and
722 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
29 changes: 29 additions & 0 deletions
29
airflow/dags/create_external_tables/state_geoportal/state_highway_network.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
operator: operators.ExternalTable | ||
bucket: gs://calitp-state-geoportal-scrape | ||
source_objects: | ||
- "state_highway_network_geodata/*.jsonl.gz" | ||
source_format: NEWLINE_DELIMITED_JSON | ||
use_bq_client: true | ||
hive_options: | ||
mode: CUSTOM | ||
require_partition_filter: false | ||
source_uri_prefix: "state_highway_network_geodata/{dt:DATE}/{execution_ts:TIMESTAMP}/" | ||
destination_project_dataset_table: "external_state_geoportal.state_highway_network" | ||
prefix_bucket: false | ||
post_hook: | | ||
SELECT * | ||
FROM `{{ get_project_id() }}`.external_state_geoportal.state_highway_network | ||
LIMIT 1; | ||
schema_fields: | ||
- name: Route | ||
type: INTEGER | ||
- name: County | ||
type: STRING | ||
- name: District | ||
type: INTEGER | ||
- name: RouteType | ||
type: STRING | ||
- name: Direction | ||
type: STRING | ||
- name: wkt_coordinates | ||
type: GEOGRAPHY |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
description: "Scrape State Highway Network from State Geoportal" | ||
schedule_interval: "0 4 1 * *" # 4am UTC first day of every month | ||
tags: | ||
- all_gusty_features | ||
default_args: | ||
owner: airflow | ||
depends_on_past: False | ||
catchup: False | ||
start_date: "2024-09-15" | ||
email: | ||
- "[email protected]" | ||
email_on_failure: True | ||
email_on_retry: False | ||
retries: 1 | ||
retry_delay: !timedelta 'minutes: 2' | ||
concurrency: 50 | ||
#sla: !timedelta 'hours: 2' | ||
wait_for_defaults: | ||
timeout: 3600 |
7 changes: 7 additions & 0 deletions
7
airflow/dags/scrape_state_geoportal/state_highway_network.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
operator: operators.StateGeoportalAPIOperator | ||
|
||
root_url: 'https://caltrans-gis.dot.ca.gov/arcgis/rest/services/' | ||
service: "CHhighway/SHN_Lines" | ||
layer: "0" | ||
product: 'state_highway_network' | ||
resultRecordCount: 2000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
6 changes: 4 additions & 2 deletions
6
...a_xlsx/ridership_historical/complete_monthly_ridership_with_adjustments_and_estimates.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
operator: operators.NtdDataProductXLSXOperator | ||
|
||
product: 'complete_monthly_ridership_with_adjustments_and_estimates' | ||
xlsx_file_url: 'https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-11/September%202024%20Complete%20Monthly%20Ridership%20%28with%20adjustments%20and%20estimates%29_241101.xlsx' | ||
year: 'historical' | ||
xlsx_file_url: 'https://www.transit.dot.gov/ntd/data-product/monthly-module-raw-data-release' # placeholder for scraped url from scrape_ntd_ridership_xlsx_url task | ||
year: 'historical' # one of: 'historical' (long history), 'multi-year' (select history), or a specific year (ex: 2022) | ||
dependencies: | ||
- scrape_ntd_ridership_xlsx_url |
42 changes: 42 additions & 0 deletions
42
airflow/dags/sync_ntd_data_xlsx/scrape_ntd_ridership_xlsx_url.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# --- | ||
# python_callable: scrape_ntd_ridership_xlsx_url | ||
# provide_context: true | ||
# --- | ||
import logging | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
from pydantic import HttpUrl, parse_obj_as | ||
|
||
|
||
def push_url_to_xcom(scraped_url, context):
    """Push the scraped URL to XCom under the key ``current_url``.

    Args:
        scraped_url: The value to publish.
        context: Task context mapping; its ``"ti"`` entry must provide
            ``xcom_push(key=..., value=...)``.

    Raises:
        KeyError: If ``context`` has no ``"ti"`` entry.
    """
    task_instance = context["ti"]
    task_instance.xcom_push(key="current_url", value=scraped_url)
|
||
|
||
def href_matcher(href):
    """Tell whether *href* points at an ``.xlsx`` file under ``/sites/fta.dot.gov/files/``.

    Intended as an attribute filter (e.g. ``soup.find("a", href=href_matcher)``),
    so it must tolerate anchors that have no ``href`` at all.

    Args:
        href: The anchor's ``href`` value, or ``None``/empty when absent.

    Returns:
        ``True`` when *href* starts with ``/sites/fta.dot.gov/files/`` and ends
        with ``.xlsx``; ``False`` otherwise (including for a missing href).
    """
    if not href:
        return False
    return href.startswith("/sites/fta.dot.gov/files/") and href.endswith(".xlsx")
|
||
|
||
def scrape_ntd_ridership_xlsx_url(**context):
    """Scrape the current ridership ``.xlsx`` download URL and push it to XCom.

    Fetches the monthly-module raw-data-release page, takes the first anchor
    accepted by ``href_matcher``, turns its site-relative href into an absolute
    URL, validates it as an HTTP URL, logs it, and publishes it via
    ``push_url_to_xcom``.

    Args:
        **context: Task context passed through to ``push_url_to_xcom``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the page does not respond in time.
        ValueError: If no matching ``.xlsx`` link is present on the page, or
            the assembled URL fails validation.
    """
    # page to find download URL
    url = "https://www.transit.dot.gov/ntd/data-product/monthly-module-raw-data-release"

    # Bound the request so a stalled server cannot hang the task, and refuse
    # to parse an error page as if it were the real listing.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    link = soup.find("a", href=href_matcher)

    # Fail loudly instead of building "https://www.transit.dot.govNone" and
    # handing a bogus URL to downstream tasks.
    if link is None:
        raise ValueError(f"No .xlsx download link found on {url}")

    file_link = link["href"]
    updated_url = f"https://www.transit.dot.gov{file_link}"

    validated_url = parse_obj_as(HttpUrl, updated_url)

    logging.info(f"Validated URL: {validated_url}.")

    # Push a plain string so the XCom value does not depend on a pydantic type.
    push_url_to_xcom(scraped_url=str(validated_url), context=context)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.