From 9c22a30a02674fa54f2f23e8621e62910c22ef5e Mon Sep 17 00:00:00 2001
From: mikepal2 <40579649+mikepal2@users.noreply.github.com>
Date: Sun, 2 Jun 2024 18:25:45 -0700
Subject: [PATCH] use exceptions instead of None; use single async download
 method

---
 src/ferry_planner/schedule.py | 264 ++++++++++++++++------------------
 1 file changed, 121 insertions(+), 143 deletions(-)

diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py
index 07bcb7c..e20329b 100644
--- a/src/ferry_planner/schedule.py
+++ b/src/ferry_planner/schedule.py
@@ -3,7 +3,7 @@
 import itertools
 import os
 import time
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from datetime import datetime, timedelta
 from pathlib import Path
 from threading import Thread
@@ -17,6 +17,13 @@
 from ferry_planner.location import LocationId
 from ferry_planner.utils import datetime_to_timedelta
 
+MONTHS = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
+WEEKDAY_NAMES = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
+NO_SAILINGS_MESSAGES = [
+    "Seasonal schedules have not been posted for these dates",
+    "Schedules for your selected date and route are currently unavailable",
+]
+
 
 class FerrySailing(BaseModel):
     departure: datetime
@@ -25,8 +32,8 @@ class FerrySailing(BaseModel):
     arrival: datetime
     duration: int
     """Duration in seconds."""
     # TODO: price: float  # noqa: FIX002
     """Price in Canadian dollars (CAD)."""
-    notes: Sequence[str] | None
-    """Any notes/comments posted about this sailing"""
+    notes: tuple[str, ...] = ()
+    """Notes or comments posted about this sailing."""
 
     def __hash__(self) -> int:
         return hash((self.departure, self.arrival, self.duration, self.notes))
@@ -38,7 +45,7 @@ class FerrySchedule(BaseModel):
     destination: LocationId
     sailings: tuple[FerrySailing, ...]
     url: str
-    notes: Sequence[str] | None
+    notes: tuple[str, ...]
     """Any notes/comments posted about this schedule"""
 
 
@@ -54,14 +61,29 @@ def __call__(
 
 
 class HtmlParseResult:
-    redirect_url: str | None = None
-    sailings: Sequence[FerrySailing] | None = None
-    notes: list[str] | None = None
+    redirect_url: str = ""
+    sailings: tuple[FerrySailing, ...] = ()
+    notes: tuple[str, ...] = ()
+    """Any notes/comments/errors posted about this schedule"""
+
+    @classmethod
+    def redirect(cls, redirect_url: str) -> "HtmlParseResult":
+        result = HtmlParseResult()
+        result.redirect_url = redirect_url
+        return result
+
+    @classmethod
+    def from_sailings(cls, sailings: Sequence[FerrySailing], notes: Sequence[str]) -> "HtmlParseResult":
+        result = HtmlParseResult()
+        result.sailings = tuple(sailings)
+        result.notes = tuple(notes)
+        return result
+
 
-    def add_note(self, note: str) -> None:
-        if self.notes is None:
-            self.notes = []
-        self.notes.append(note)
+class DownloadScheduleError(Exception):
+    def __init__(self, url: str, msg: str, *args: Iterable) -> None:
+        self.url = url
+        super().__init__(f"Error downloading {url}: {msg}", *args)
 
 
 class ScheduleDB:
@@ -82,6 +104,9 @@ def __init__(  # noqa: PLR0913
         self._refresh_thread = Thread(target=self._refresh_task, daemon=True)
         self._mem_cache = {}
         self.cache_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+        timeout = httpx.Timeout(30.0, pool=None)
+        limits = httpx.Limits(max_connections=5)
+        self._client = httpx.AsyncClient(timeout=timeout, limits=limits, follow_redirects=True)
 
     def _get_download_url(
         self,
@@ -144,48 +169,12 @@ def download_schedule(
         *,
         date: datetime,
     ) -> FerrySchedule | None:
-        url = self._get_download_url(origin_id, destination_id, date=date)
-        route = f"{origin_id}-{destination_id}"
-        print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}")
-        max_requests_count = 3
-        requests_count = 0
-        while requests_count < max_requests_count:
-            requests_count += 1
-            try:
-                response = httpx.get(url, follow_redirects=True, timeout=30.0)
-            except httpx.HTTPError as exc:
-                print(
-                    f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n"
-                    f"{exc!r}\n"
-                    f"{url}",
-                )
-                return None
-            if not httpx.codes.is_success(response.status_code):
-                print(
-                    f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}"
-                    f" status {response.status_code}",
-                )
-                return None
-            print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}")
-            result = parse_schedule_html(response, date)
-            if result.redirect_url:
-                url = result.redirect_url
-                continue
-            if result.sailings is None:
-                break
-            return FerrySchedule(
-                date=date,
-                origin=origin_id,
-                destination=destination_id,
-                sailings=tuple(result.sailings),
-                url=url,
-                notes=result.notes,
-            )
-        print(
-            f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}"
-            f" - too many redirects",
-        )
-        return None
+        coro = self.download_schedule_async(origin_id, destination_id, date=date)
+        try:
+            loop = asyncio.get_running_loop()
+            return loop.run_until_complete(coro)
+        except RuntimeError:
+            return asyncio.run(coro)
 
     async def download_schedule_async(
         self,
@@ -194,37 +183,46 @@ async def download_schedule_async(
         /,
         *,
         date: datetime,
-        client: httpx.AsyncClient,
     ) -> FerrySchedule | None:
+        try:
+            return await self._download_schedule_async(origin_id, destination_id, date=date)
+        except (DownloadScheduleError, httpx.HTTPError) as exc:
+            url = exc.request.url if isinstance(exc, httpx.HTTPError) else exc.url
+            print(
+                f"[{self.__class__.__name__}:ERROR] failed to download schedule: "
+                f"{origin_id}-{destination_id}:{date.date()}\n"
+                f"\t{exc!r}\n"
+                f"\tUrl: {url}",
+            )
+            return None
+
+    async def _download_schedule_async(
+        self,
+        origin_id: LocationId,
+        destination_id: LocationId,
+        /,
+        *,
+        date: datetime,
+    ) -> FerrySchedule:
         url = self._get_download_url(origin_id, destination_id, date=date)
         route = f"{origin_id}-{destination_id}"
         print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}")
-        max_requests_count = 3
-        requests_count = 0
-        while requests_count < max_requests_count:
-            requests_count += 1
-            try:
-                response = await client.get(url, follow_redirects=True, timeout=30.0)
-            except httpx.HTTPError as exc:
-                print(
-                    f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n"
-                    f"{exc!r}\n"
-                    f"{url}",
-                )
-                return None
+        max_redirects_count = 3
+        redirects = []
+        while True:
+            response = await self._client.get(url)
             if not httpx.codes.is_success(response.status_code):
-                print(
-                    f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}"
-                    f" status {response.status_code}",
-                )
-                return None
+                raise DownloadScheduleError(url, f"Status {response.status_code}")
             print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}")
             result = parse_schedule_html(response, date)
             if result.redirect_url:
+                if len(redirects) > max_redirects_count:
+                    raise DownloadScheduleError(url, "Too many redirects")
+                if url in redirects:
+                    raise DownloadScheduleError(url, "Redirects loop")
                 url = result.redirect_url
+                redirects.append(url)
                 continue
-            if result.sailings is None:
-                break
             return FerrySchedule(
                 date=date,
                 origin=origin_id,
@@ -233,11 +231,6 @@ async def download_schedule_async(
                 url=url,
                 notes=result.notes,
             )
-        print(
-            f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}"
-            f" - too many redirects",
-        )
-        return None
 
     async def _download_and_save_schedule(
         self,
@@ -246,13 +239,11 @@ async def _download_and_save_schedule(
         /,
         *,
         date: datetime,
-        client: httpx.AsyncClient,
     ) -> bool:
         schedule = await self.download_schedule_async(
             origin_id,
             destination_id,
             date=date,
-            client=client,
         )
         if schedule is not None:
             self.put(schedule)
@@ -272,28 +263,24 @@ async def refresh_cache(self) -> None:
         self._mem_cache = {}
         # download new schedules
         tasks = []
-        timeout = httpx.Timeout(30.0, pool=None)
-        limits = httpx.Limits(max_connections=5)
-        async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
-            for connection in self.ferry_connections:
-                for date in dates:
-                    filepath = self._get_filepath(
-                        connection.origin.id,
-                        connection.destination.id,
-                        date=date,
-                    )
-                    if not filepath.exists():
-                        tasks.append(
-                            asyncio.create_task(
-                                self._download_and_save_schedule(
-                                    connection.origin.id,
-                                    connection.destination.id,
-                                    date=date,
-                                    client=client,
-                                ),
+        for connection in self.ferry_connections:
+            for date in dates:
+                filepath = self._get_filepath(
+                    connection.origin.id,
+                    connection.destination.id,
+                    date=date,
+                )
+                if not filepath.exists():
+                    tasks.append(
+                        asyncio.create_task(
+                            self._download_and_save_schedule(
+                                connection.origin.id,
+                                connection.destination.id,
+                                date=date,
                             ),
-                        )
-            downloaded_schedules = sum(await asyncio.gather(*tasks))
+                        ),
+                    )
+        downloaded_schedules = sum(await asyncio.gather(*tasks))
         print(
             f"[{self.__class__.__name__}:INFO] finished refreshing cache, "
             f"downloaded {downloaded_schedules} schedules",
         )
@@ -309,47 +296,40 @@ def _refresh_task(self) -> None:
 
 
 def parse_schedule_html(response: httpx.Response, date: datetime) -> HtmlParseResult:
-    result = HtmlParseResult()
     html = response.text.replace("\u2060", "")
     soup = BeautifulSoup(markup=html, features="html.parser")
     table_tag = soup.find("table", id="dailyScheduleTableOnward")
     daterange_tag = soup.find("div", id="dateRangeModal")  # for seasonal
-    rows: Sequence[Tag] | None = None
+    rows: Sequence[Tag] = []
     if table_tag and isinstance(table_tag, Tag) and table_tag.tbody:
         rows = table_tag.tbody.find_all("tr")
     elif daterange_tag and isinstance(daterange_tag, Tag):
         hrefs = [a["href"] for a in daterange_tag.find_all("a")]
         index = get_seasonal_schedule_daterange_index(hrefs, date)
         if index < 0:
-            pass  # date is out of range
-        else:
-            url = response.url.scheme + "://" + response.url.host + hrefs[index]
-            if index == 0 or url == str(response.url):
-                rows = get_seasonal_schedule_rows(soup, date)
-            else:
-                result.redirect_url = url
-                return result
-    result.sailings = parse_sailings_from_html_rows(rows, date)
-    if result.sailings is None:
-        for note in [
-            "Seasonal schedules have not been posted for these dates",
-            "Schedules for your selected date and route are currently unavailable",
-        ]:
-            if note in html:
-                result.add_note(note)
-                result.sailings = []
-                return result
-        print(f"No sailings found at {response.url}")
-    return result
-
-
-def parse_sailings_from_html_rows(rows: Sequence[Tag] | None, date: datetime) -> Sequence[FerrySailing] | None:
-    if rows is None:
-        return None
+            raise DownloadScheduleError(str(response.url), f"Date {date} is out of seasonal schedules range")
+        url = response.url.scheme + "://" + response.url.host + hrefs[index]
+        if index > 0 and url != str(response.url):
+            return HtmlParseResult.redirect(url)
+        rows = get_seasonal_schedule_rows(str(response.url), soup, date)
+    sailings = parse_sailings_from_html_rows(rows, date)
+    notes = []
+    if not sailings:
+        err = "No sailings found"
+        for msg in NO_SAILINGS_MESSAGES:
+            if msg in html:
+                err = msg
+                break
+        notes.append(err)
+        print(f"{err} at {response.url}")
+    return HtmlParseResult.from_sailings(sailings, notes)
+
+
+def parse_sailings_from_html_rows(rows: Sequence[Tag], date: datetime) -> Sequence[FerrySailing]:
     sailing_row_min_td_count = 3
-    sailings: Sequence[FerrySailing] = []
-    notes = None
+    sailings = []
     for row in rows:
+        notes = []
         tds = row.find_all("td")
         if (
             len(tds) < sailing_row_min_td_count
@@ -386,7 +366,7 @@ def parse_sailings_from_html_rows(rows: Sequence[Tag] | None, date: datetime) ->
             departure=departure,
             arrival=arrival,
             duration=duration,
-            notes=notes,
+            notes=tuple(notes),
         )
         sailings.append(sailing)
     return sailings
@@ -407,13 +387,12 @@ def parse_sailig_comment(comment: str) -> list[str]:
     return notes
 
 
-def get_seasonal_schedule_rows(soup: BeautifulSoup, date: datetime) -> Sequence[Tag] | None:
+def get_seasonal_schedule_rows(url: str, soup: BeautifulSoup, date: datetime) -> Sequence[Tag]:
     rows: Sequence[Tag] = []
     form = soup.find("form", id="seasonalSchedulesForm")
     if not isinstance(form, Tag):
-        return None
-    weekday_names = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
-    weekday = weekday_names[date.weekday()]
+        raise DownloadScheduleError(url, "seasonalSchedulesForm not found")
+    weekday = WEEKDAY_NAMES[date.weekday()]
     for thead in form.find_all("thead"):
         if thead.text.lower().strip().startswith(weekday):
             rows = [x for x in itertools.takewhile(lambda t: t.name != "thead", thead.next_siblings) if x.name == "tr"]
@@ -452,7 +431,6 @@ def is_schedule_excluded_on_date(schedule_comment: str, date: datetime) -> bool:
 
 
 def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool:
-    months: Sequence[str] = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
     month: int | None = None
     schedule_dates = schedule_dates.upper()
     for c in [".", "&", " ON ", " ON:"]:
@@ -460,8 +438,8 @@ def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool:
     tokens = [x.strip() for x in schedule_dates.split(",")]
     tokens = [x for x in tokens if x and x not in ["ONLY", "EXCEPT", "NOT AVAILABLE"]]
     for token in tokens:
-        if token in months:
-            month = months.index(token) + 1
+        if token in MONTHS:
+            month = MONTHS.index(token) + 1
             continue
         _date: datetime
         if token.isnumeric():
@@ -472,12 +450,12 @@ def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool:
         else:
             dt = token.split(" ")
             expected_tokens_count = 2
-            if len(dt) == expected_tokens_count and dt[0].isnumeric() and dt[1] in months:
+            if len(dt) == expected_tokens_count and dt[0].isnumeric() and dt[1] in MONTHS:
                 # 01 JAN, 02 JAN, 05 FEB, 06 FEB
-                _date = datetime(year=date.year, month=months.index(dt[1]) + 1, day=int(dt[0]))
-            elif len(dt) == expected_tokens_count and dt[1].isnumeric() and dt[0] in months:
+                _date = datetime(year=date.year, month=MONTHS.index(dt[1]) + 1, day=int(dt[0]))
+            elif len(dt) == expected_tokens_count and dt[1].isnumeric() and dt[0] in MONTHS:
                 # Jan 1, 2, Feb 5 & 6
-                month = months.index(dt[0]) + 1
+                month = MONTHS.index(dt[0]) + 1
                 _date = datetime(year=date.year, month=month, day=int(dt[1]))
             else:
                 print(f"Failed to parse schedule dates: Unknown word '{token}' in '{schedule_dates}")
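
Usage note: with this patch, the public download methods still return
FerrySchedule | None and log failures, while the internal helpers
(_download_schedule_async, parse_schedule_html, get_seasonal_schedule_rows)
signal failures by raising DownloadScheduleError. A minimal sketch of how a
caller might exercise the new async entry point; `db` stands for an
already-constructed ScheduleDB, and the route IDs shown are hypothetical
placeholders:

    import asyncio
    from datetime import datetime

    async def print_schedule(db, origin_id: str, destination_id: str) -> None:
        # download_schedule_async catches DownloadScheduleError and
        # httpx.HTTPError internally and returns None on failure, so the
        # caller only needs a null-check.
        schedule = await db.download_schedule_async(
            origin_id,
            destination_id,
            date=datetime(2024, 6, 2),
        )
        if schedule is None:
            print(f"no schedule for {origin_id}-{destination_id}")
        else:
            print(f"{len(schedule.sailings)} sailings, notes: {schedule.notes}")

    # asyncio.run(print_schedule(db, "origin-id", "destination-id"))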