From 6cb45fe96cf88281cca473514f2afe7fcec3cff6 Mon Sep 17 00:00:00 2001 From: Ali Moghimi Date: Sun, 22 Oct 2023 21:46:47 +0000 Subject: [PATCH] feat: add validation function --- scraper/engine.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scraper/engine.py b/scraper/engine.py index 2764c42..e69d95e 100644 --- a/scraper/engine.py +++ b/scraper/engine.py @@ -17,8 +17,9 @@ def fetch(self) -> pd.DataFrame: f"Successfully fetched content from {self.url}. Now parsing the content." ) df = self.parse_html(content) - logger.info(f"Parsed content from {self.url}. Extracted {len(df)} rows.") logger.debug(f"\n{df}") + self.validate_data(df) + logger.info(f"Parsed content from {self.url}. Extracted {len(df)} rows.") return df def get_content(self): @@ -34,7 +35,7 @@ def parse_html(self, content: str) -> pd.DataFrame: try: dfs = pd.read_html(content) except Exception as e: - logger.error(f"Error reading HTML via pandas. Error: {e}") + logger.error(f"Error parsing HTML via pandas. Error: {e}") raise e if len(dfs): @@ -44,3 +45,10 @@ def parse_html(self, content: str) -> pd.DataFrame: return dfs[0] else: raise ValueError(f"No data found when parsing content from {self.url}.") + + def validate_data(self, df: pd.DataFrame) -> None: + if df["Clearing Date"].str.contains("No data found").any() or not len(df): + logger.error(f"Data validation failed") + raise ValueError("No price data provided by the provider as of now.") + + logger.debug("Data validation succeeded.")