dag to scrape and save the current ridership URL from the NTD portal (#3545)

* dag to scrape the current ridership URL from the NTD portal
* fix naming and add some descriptions
* reconfigured airflow dag setup for dependencies and special handling
* test storing variables in xcoms
* cleaned up imports
* rebase
* remove and reorganize some lingering and unnecessary code and test
* linter not working
* refactor lambda for flake8
* flake8 config change
* flake8 config change again
* create function of url finder
* add comment for flake8 suppression
* accidentally pushed copy file
* suppress whitespace after colon error
* last pass at configuration changes
* suppress whitespace after colon error
* remove testing comments, clean up changed files
1 parent 4f07ab5 · commit 4a63342
Showing 5 changed files with 77 additions and 11 deletions.
...a_xlsx/ridership_historical/complete_monthly_ridership_with_adjustments_and_estimates.yml (6 changes: 4 additions & 2 deletions)
@@ -1,5 +1,7 @@
 operator: operators.NtdDataProductXLSXOperator

 product: 'complete_monthly_ridership_with_adjustments_and_estimates'
-xlsx_file_url: 'https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-11/September%202024%20Complete%20Monthly%20Ridership%20%28with%20adjustments%20and%20estimates%29_241101.xlsx'
-year: 'historical'
+xlsx_file_url: 'https://www.transit.dot.gov/ntd/data-product/monthly-module-raw-data-release' # placeholder for scraped url from the scrape_ntd_ridership_xlsx_url task
+year: 'historical' # one of: 'historical' (long history), 'multi-year' (select history), or a specific year (ex: 2022)
+dependencies:
+  - scrape_ntd_ridership_xlsx_url
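
For context, the URL scraped by the dependency above is read back from XCom by whatever runs downstream. A minimal sketch of that read side is shown below; the helper name fetch_scraped_url is hypothetical and the actual NtdDataProductXLSXOperator may resolve the placeholder differently, but the task id matches the dependencies block above and the key matches push_url_to_xcom in the scraper file below.

# Hypothetical sketch: reading the URL pushed by the scraper task.
# The real NtdDataProductXLSXOperator wiring may differ.
def fetch_scraped_url(**context):
    ti = context["ti"]
    return ti.xcom_pull(
        task_ids="scrape_ntd_ridership_xlsx_url",  # task named in the dependencies block above
        key="current_url",  # key used by push_url_to_xcom in the scraper task
    )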
airflow/dags/sync_ntd_data_xlsx/scrape_ntd_ridership_xlsx_url.py (42 changes: 42 additions & 0 deletions)
@@ -0,0 +1,42 @@
+# ---
+# python_callable: scrape_ntd_ridership_xlsx_url
+# provide_context: true
+# ---
+import logging
+
+import requests
+from bs4 import BeautifulSoup
+from pydantic import HttpUrl, parse_obj_as
+
+
+# pushes the scraped URL value to XCom
+def push_url_to_xcom(scraped_url, context):
+    task_instance = context["ti"]
+    task_instance.xcom_push(key="current_url", value=scraped_url)
+
+
+# Look for an anchor tag where the href ends with '.xlsx' and starts with '/sites/fta.dot.gov/files/'
+def href_matcher(href):
+    return (
+        href and href.startswith("/sites/fta.dot.gov/files/") and href.endswith(".xlsx")
+    )
+
+
+def scrape_ntd_ridership_xlsx_url(**context):
+    # page to find download URL
+    url = "https://www.transit.dot.gov/ntd/data-product/monthly-module-raw-data-release"
+    req = requests.get(url)
+    soup = BeautifulSoup(req.text, "html.parser")
+
+    link = soup.find("a", href=href_matcher)
+
+    # Extract the href if the link is found
+    file_link = link["href"] if link else None
+
+    updated_url = f"https://www.transit.dot.gov{file_link}"
+
+    validated_url = parse_obj_as(HttpUrl, updated_url)
+
+    logging.info(f"Validated URL: {validated_url}.")
+
+    push_url_to_xcom(scraped_url=validated_url, context=context)
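
To sanity-check the link-matching logic outside of Airflow, href_matcher can be exercised against a small HTML snippet. The fragment and file name below are made up for illustration; the matcher is copied verbatim from the new DAG file above.

from bs4 import BeautifulSoup

# Copied from scrape_ntd_ridership_xlsx_url.py above.
def href_matcher(href):
    return (
        href and href.startswith("/sites/fta.dot.gov/files/") and href.endswith(".xlsx")
    )

# Hypothetical HTML fragment mimicking the NTD page's download link markup.
sample_html = """
<a href="/ntd/data-product/monthly-module-raw-data-release">Landing page</a>
<a href="/sites/fta.dot.gov/files/2024-11/example_ridership.xlsx">Download XLSX</a>
"""

soup = BeautifulSoup(sample_html, "html.parser")
link = soup.find("a", href=href_matcher)
print(link["href"])  # -> /sites/fta.dot.gov/files/2024-11/example_ridership.xlsx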