
Commit

Merge pull request #125 from BU-Spark/poc_2024_03_30
Added documentation and refactored the web scraper ETL from an ipynb notebook to Python code.
WilliamLee101 authored Apr 2, 2024
2 parents 638fd7c + a75fc78 commit e90bc34
Showing 8 changed files with 195 additions and 0 deletions.
Binary file added Spring2024/ocr/Herbaria_POC_DocumentAI.docx
Binary file not shown.
Binary file added Spring2024/ocr/Herbaria_POC_DocumentAI.pdf
Binary file not shown.
10 changes: 10 additions & 0 deletions Spring2024/ocr/README_ocr.md
@@ -0,0 +1,10 @@
# Documentation of OCR modules
Documentation for the project's OCR pipeline. The pipeline can be broken down into three modules:
* TrOCR model built and maintained by the SU23 and previous semesters' teams
* AzureOCR/GPT4 chain by the FA23 team
* GCP Document AI by the SP24 team

# Google Document AI
@mkaramb
Please refer to Herbaria_POC_DocumentAI.docx for detailed instructions.
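The .docx above is the authoritative walkthrough. For quick reference, the snippet below is a minimal, hypothetical sketch of calling a Document AI OCR processor from Python with the `google-cloud-documentai` client; the function name, project ID, location, processor ID, and file path are placeholders, not values from this project.

```python
from google.cloud import documentai


def ocr_with_document_ai(project_id, location, processor_id, file_path):
    """Run a scanned herbarium sheet through a Document AI OCR processor and return its text."""
    # The client must point at the regional Document AI endpoint (e.g. "us" or "eu").
    client = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )
    processor_name = client.processor_path(project_id, location, processor_id)

    # Read the scanned sheet and wrap it as a raw document; PDFs and common image types are accepted.
    with open(file_path, "rb") as f:
        raw_document = documentai.RawDocument(content=f.read(), mime_type="application/pdf")

    result = client.process_document(
        request=documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
    )
    # The full OCR text; result.document also carries per-token layout and confidence scores.
    return result.document.text
```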

16 changes: 16 additions & 0 deletions Spring2024/poc.md
@@ -0,0 +1,16 @@
# POC documentation
The Proof of Concept is structured as follows. Please refer to the documents below for details.

# OCR Proof of Concept
@mkaramb

@Aeronyx

Leveraged GCP's Document AI to efficiently perform the OCR task.
Please see ./ocr/README_ocr.md and ./ocr/Herbaria_POC_DocumentAI.docx.

# Scraper Proof of Concept
@palmpalm7

Leveraged Selenium to create a scraping and ETL pipeline for the CVH datasets.
Please see ./scraper/README_scraper.md.
25 changes: 25 additions & 0 deletions Spring2024/scraper/README_scraper.md
@@ -0,0 +1,25 @@
# Documentation for the scraper functionalities
@PalmPalm7

# How to use
* Run the ./scraper/src/run.sh shell script to start the CVH scraping ETL pipeline.
* The pipeline mainly uses Selenium for automation. Please modify the path variables accordingly.

# Progress Report - March 31st, 2024
@PalmPalm7
## Progress
* Finished implementing a web-scraping pipeline for the CVH datasets
* Scraped 180 entities in the first batch
* Set up a scraping automation pipeline with crontab and a shell script to perform the ETL process.

## Problems
* Error handling for the scraper code
* Headless option so the code can run on a VM (see the sketch after this list)
* Currently the CVH index is not meaningful; scraping from the start yields only a small subset.
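As a possible starting point for the headless/VM problem above, the sketch below shows Chrome options that are commonly needed when running headless on a display-less VM or in a container. It is an untested variant of `setup_driver()` in `cvh_scraper.py`, not the current implementation.

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def setup_vm_driver():
    # Sketch: flags often required for headless Chrome on a VM without a display.
    options = Options()
    options.add_argument("--headless=new")           # use plain "--headless" on older Chrome builds
    options.add_argument("--no-sandbox")             # commonly needed when Chrome runs as root
    options.add_argument("--disable-dev-shm-usage")  # avoid crashes when /dev/shm is small
    options.add_argument("--window-size=1920,1080")  # give pages a real viewport
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
```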

## Plan
* Apply software-engineering best practices to the scraper code
* Complete the headless-option scraper
* Identify handwritten datasets (by year, maybe? see the sketch after this list)
* Scrape more samples (10,000)
* Containerize it with Docker.
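For the handwritten-dataset item, one hypothetical first pass (not implemented) is to use the scraped `Collection Date` column as a rough proxy, on the assumption that older specimens are more likely to carry handwritten labels; the year cutoff below is an arbitrary placeholder, not a validated threshold.

```python
import pandas as pd

# Load the scraped results produced by cvh_scraper.py.
results = pd.read_csv("./scraper_results/results.csv", encoding="utf-8-sig")

# "Collection Date" is verbatim text; pull out a 4-digit year where one exists.
results["Year"] = pd.to_numeric(
    results["Collection Date"].astype(str).str.extract(r"(\d{4})")[0], errors="coerce"
)

# Placeholder heuristic: treat pre-1950 collections as likely handwritten candidates.
likely_handwritten = results[results["Year"] < 1950]
likely_handwritten.to_csv("./scraper_results/likely_handwritten.csv",
                          index=False, encoding="utf-8-sig")
```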
Empty file.
142 changes: 142 additions & 0 deletions Spring2024/scraper/src/cvh_scraper.py
@@ -0,0 +1,142 @@
import argparse
import concurrent
import logging
import os
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from tqdm.auto import tqdm


def setup_driver(headless=True):
    """
    Set up a Selenium WebDriver, using the --headless option by default.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def wait_for_element(driver, by_method, value, timeout=10, retry_interval=5, max_retries=3):
    """
    Wait for an element to be present on the page, retrying up to max_retries times.
    Returns the WebElement, or None if the element is still missing after all retries.
    """
    retries = 0
    while retries < max_retries:
        try:
            return WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((by_method, value)))
        except TimeoutException:
            retries += 1
            print(f"Retry {retries}/{max_retries} for element {value} after timeout.")
            time.sleep(retry_interval)
    # raise TimeoutException(f"Element {value} not found after {max_retries} retries.")
    return None  # Element not found after all retries

def fetch_data(collection_id):
    """Fetch the metadata fields of a single CVH collection detail page."""
    driver = setup_driver()

    def text_of(element_id):
        # Return the element's text, or a placeholder when it never appears.
        element = wait_for_element(driver, By.ID, element_id, max_retries=5)
        return element.text if element is not None else "Missing Value or Timeout"

    try:
        print(f"Fetching data for collection: {collection_id}")
        driver.get(f"https://www.cvh.ac.cn/spms/detail.php?id={collection_id}")

        # Fetch specific details from the detail page
        image = wait_for_element(driver, By.ID, "spm_image", max_retries=5)
        data = {
            'Collection ID': collection_id,
            'Image Link': image.get_attribute('src') if image is not None else "Missing Value or Timeout",
            'Phylum (门)': text_of("taxon_phy_c"),
            'Order (目)': text_of("taxon_ord_c"),
            'Family (科)': text_of("taxon_fam_c"),
            'Genus (属)': text_of("taxon_gen_c"),
            'Scientific Name': text_of("formattedName"),
            'Chinese Name': text_of("chineseName"),
            'Identified By': text_of("identifiedBy"),
            'Date Identified': text_of("dateIdentified"),
            'Collector': text_of("recordedBy"),
            'Collection Number': text_of("recordNumber"),
            'Collection Date': text_of("verbatimEventDate"),
            'Collection Location': text_of("locality"),
            'Altitude': text_of("elevation"),
            'Habitat': text_of("habitat"),
            'Phenology': text_of("reproductiveCondition")
        }
        return data
    finally:
        driver.quit()

def fetch_collection_data_concurrently(collection_ids, max_workers=5):
    results = []

    # Initialize a CSV that holds only the column names, so partial results can be
    # appended as they arrive even if some of the first calls fail.
    temp_path = "./scraper_results/temp_results.csv"
    os.makedirs(os.path.dirname(temp_path), exist_ok=True)
    pd.DataFrame(columns=['Collection ID', 'Image Link', 'Phylum (门)', 'Order (目)', 'Family (科)',
                          'Genus (属)', 'Scientific Name', 'Chinese Name', 'Identified By',
                          'Date Identified', 'Collector', 'Collection Number', 'Collection Date',
                          'Collection Location', 'Altitude', 'Habitat', 'Phenology']).to_csv(
        temp_path, mode='w', index=False, encoding='utf-8-sig')

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_data, cid) for cid in collection_ids]
        for future in tqdm(as_completed(futures), total=len(collection_ids)):
            result = future.result()
            if result:
                results.append(result)
                # Append the result to the temporary CSV file as a checkpoint.
                try:
                    pd.DataFrame([result]).to_csv(temp_path, mode='a', header=False,
                                                  index=False, encoding='utf-8-sig')
                except PermissionError:
                    continue

    df = pd.DataFrame(results)
    return df

def main(output_dir, offset=0):
    """Main function to orchestrate the web scraping process."""
    # Set up Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # URL to scrape
    url = f"https://www.cvh.ac.cn/spms/list.php?&offset={offset}"

    # Open the URL
    driver.get(url)

    # Wait for the dynamic content to load
    time.sleep(5)  # Adjust the sleep time according to your internet speed and website response

    # Find all elements with the specified class
    rows = driver.find_elements(By.CLASS_NAME, 'spms-row')

    # Extract data-collection-id from each row
    data_collection_ids = [row.get_attribute('data-collection-id') for row in rows]

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame
    data_collection_ids = pd.DataFrame(data_collection_ids, columns=['Data Collection ID'])

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    sample_collections = list(data_collection_ids['Data Collection ID'])
    results = fetch_collection_data_concurrently(sample_collections, 30)
    results_path = os.path.join(output_dir, "results.csv")
    results.to_csv(results_path, header=True, index=False, encoding='utf-8-sig')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Web Scraper for CVH Data')
    parser.add_argument('--output_dir', required=True, help='Directory to save the scraping results.')
    parser.add_argument('--offset', required=False, default=0, help='Offset into the CVH specimen list.')
    args = parser.parse_args()

    main(args.output_dir, args.offset)
2 changes: 2 additions & 0 deletions Spring2024/scraper/src/run.sh
@@ -0,0 +1,2 @@
#!/bin/bash
# Start the CVH scraping ETL pipeline and write results to ./scraper_results.
python cvh_scraper.py --output_dir ./scraper_results
