
Commit

Merge pull request #125 from BU-Spark/poc_2024_03_30
Added documentation and refactored the web scraper ETL from an ipynb notebook to Python code.
WilliamLee101 authored Apr 2, 2024
2 parents 638fd7c + a75fc78 commit e90bc34
Showing 8 changed files with 195 additions and 0 deletions.
Binary file added Spring2024/ocr/Herbaria_POC_DocumentAI.docx
Binary file not shown.
Binary file added Spring2024/ocr/Herbaria_POC_DocumentAI.pdf
Binary file not shown.
10 changes: 10 additions & 0 deletions Spring2024/ocr/README_ocr.md
@@ -0,0 +1,10 @@
# Documentation of OCR modules
Documentation for the project's OCR pipeline. The pipeline can be broken down into three modules:
* TrOCR model built and maintained by the SU23 and previous semesters' teams
* AzureOCR/GPT4 chain by the FA23 team
* GCP Document AI by the SP24 team

# Google Document AI
@mkaramb
Please refer to Herbaria_POC_DocumentAI.docx for detailed instructions.
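The .docx above is the authoritative walkthrough. For quick reference, the snippet below is a minimal, hypothetical sketch of calling a Document AI OCR processor from Python with the `google-cloud-documentai` client; the function name, project ID, location, processor ID, and file path are placeholders, not values from this project.

```python
from google.cloud import documentai


def ocr_with_document_ai(project_id, location, processor_id, file_path):
    """Run a scanned herbarium sheet through a Document AI OCR processor and return its text."""
    # The client must point at the regional Document AI endpoint (e.g. "us" or "eu").
    client = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )
    processor_name = client.processor_path(project_id, location, processor_id)

    # Read the scanned sheet and wrap it as a raw document; PDFs and common image types are accepted.
    with open(file_path, "rb") as f:
        raw_document = documentai.RawDocument(content=f.read(), mime_type="application/pdf")

    result = client.process_document(
        request=documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
    )
    # The full OCR text; result.document also carries per-token layout and confidence scores.
    return result.document.text
```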

16 changes: 16 additions & 0 deletions Spring2024/poc.md
@@ -0,0 +1,16 @@
# POC documentation
The Proof of Concept is structured as follows. Please refer to the documents below for details.

# OCR Proof of Concept
@mkaramb

@Aeronyx

Leveraged GCP's Document AI to efficiently perform the OCR task.
Please see ./ocr/README_ocr.md and ./ocr/Herbaria_POC_DocumentAI.docx.

# Scraper Proof of Concept
@palmpalm7

Leveraged Selenium to create a scraping and ETL pipeline for the CVH datasets.
Please see ./scraper/README_scraper.md.
25 changes: 25 additions & 0 deletions Spring2024/scraper/README_scraper.md
@@ -0,0 +1,25 @@
# Documentation for the scraper functionalities
@PalmPalm7

# How to use
* Run the ./scraper/src/run.sh shell script to start the CVH scraping ETL pipeline.
* The pipeline mainly uses Selenium for automation. Please modify the path variables accordingly.

# Progress Report - March 31st, 2024
@PalmPalm7
## Progress
* Finished implementing a web-scraping pipeline for the CVH datasets
* Scraped 180 entities in the first batch
* Set up a scraping automation pipeline with crontab and a shell script to perform the ETL process.

## Problems
* Error handling for the scraper code
* Headless option so the code can run on a VM (see the sketch after this list)
* Currently the CVH index is not meaningful; scraping from the start yields only a small subset.
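As a possible starting point for the headless/VM problem above, the sketch below shows Chrome options that are commonly needed when running headless on a display-less VM or in a container. It is an untested variant of `setup_driver()` in `cvh_scraper.py`, not the current implementation.

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def setup_vm_driver():
    # Sketch: flags often required for headless Chrome on a VM without a display.
    options = Options()
    options.add_argument("--headless=new")           # use plain "--headless" on older Chrome builds
    options.add_argument("--no-sandbox")             # commonly needed when Chrome runs as root
    options.add_argument("--disable-dev-shm-usage")  # avoid crashes when /dev/shm is small
    options.add_argument("--window-size=1920,1080")  # give pages a real viewport
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
```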

## Plan
* Apply software-engineering best practices to the scraper code
* Complete the headless-option scraper
* Identify handwritten datasets (by year, maybe? see the sketch after this list)
* Scrape more samples (10,000)
* Containerize it with Docker.
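For the handwritten-dataset item, one hypothetical first pass (not implemented) is to use the scraped `Collection Date` column as a rough proxy, on the assumption that older specimens are more likely to carry handwritten labels; the year cutoff below is an arbitrary placeholder, not a validated threshold.

```python
import pandas as pd

# Load the scraped results produced by cvh_scraper.py.
results = pd.read_csv("./scraper_results/results.csv", encoding="utf-8-sig")

# "Collection Date" is verbatim text; pull out a 4-digit year where one exists.
results["Year"] = pd.to_numeric(
    results["Collection Date"].astype(str).str.extract(r"(\d{4})")[0], errors="coerce"
)

# Placeholder heuristic: treat pre-1950 collections as likely handwritten candidates.
likely_handwritten = results[results["Year"] < 1950]
likely_handwritten.to_csv("./scraper_results/likely_handwritten.csv",
                          index=False, encoding="utf-8-sig")
```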
Empty file.
142 changes: 142 additions & 0 deletions Spring2024/scraper/src/cvh_scraper.py
@@ -0,0 +1,142 @@
import argparse
import concurrent
import logging
import os
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from tqdm.auto import tqdm


def setup_driver(headless=True):
    """
    Set up a Selenium WebDriver, using the --headless option by default.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def wait_for_element(driver, by_method, value, timeout=10, retry_interval=5, max_retries=3):
    """
    Wait for an element to be present on the page, retrying up to max_retries times.
    Returns the WebElement, or None if the element is still missing after all retries.
    """
    retries = 0
    while retries < max_retries:
        try:
            return WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((by_method, value)))
        except TimeoutException:
            retries += 1
            print(f"Retry {retries}/{max_retries} for element {value} after timeout.")
            time.sleep(retry_interval)
    # raise TimeoutException(f"Element {value} not found after {max_retries} retries.")
    return None  # Element not found after all retries

def fetch_data(collection_id):
    """Fetch the metadata fields of a single CVH collection detail page."""
    driver = setup_driver()

    def text_of(element_id):
        # Return the element's text, or a placeholder when it never appears.
        element = wait_for_element(driver, By.ID, element_id, max_retries=5)
        return element.text if element is not None else "Missing Value or Timeout"

    try:
        print(f"Fetching data for collection: {collection_id}")
        driver.get(f"https://www.cvh.ac.cn/spms/detail.php?id={collection_id}")

        # Fetch specific details from the detail page
        image = wait_for_element(driver, By.ID, "spm_image", max_retries=5)
        data = {
            'Collection ID': collection_id,
            'Image Link': image.get_attribute('src') if image is not None else "Missing Value or Timeout",
            'Phylum (门)': text_of("taxon_phy_c"),
            'Order (目)': text_of("taxon_ord_c"),
            'Family (科)': text_of("taxon_fam_c"),
            'Genus (属)': text_of("taxon_gen_c"),
            'Scientific Name': text_of("formattedName"),
            'Chinese Name': text_of("chineseName"),
            'Identified By': text_of("identifiedBy"),
            'Date Identified': text_of("dateIdentified"),
            'Collector': text_of("recordedBy"),
            'Collection Number': text_of("recordNumber"),
            'Collection Date': text_of("verbatimEventDate"),
            'Collection Location': text_of("locality"),
            'Altitude': text_of("elevation"),
            'Habitat': text_of("habitat"),
            'Phenology': text_of("reproductiveCondition")
        }
        return data
    finally:
        driver.quit()

def fetch_collection_data_concurrently(collection_ids, max_workers=5):
    results = []

    # Initialize a CSV that holds only the column names, so partial results can be
    # appended as they arrive even if some of the first calls fail.
    temp_path = "./scraper_results/temp_results.csv"
    os.makedirs(os.path.dirname(temp_path), exist_ok=True)
    pd.DataFrame(columns=['Collection ID', 'Image Link', 'Phylum (门)', 'Order (目)', 'Family (科)',
                          'Genus (属)', 'Scientific Name', 'Chinese Name', 'Identified By',
                          'Date Identified', 'Collector', 'Collection Number', 'Collection Date',
                          'Collection Location', 'Altitude', 'Habitat', 'Phenology']).to_csv(
        temp_path, mode='w', index=False, encoding='utf-8-sig')

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_data, cid) for cid in collection_ids]
        for future in tqdm(as_completed(futures), total=len(collection_ids)):
            result = future.result()
            if result:
                results.append(result)
                # Append the result to the temporary CSV file as a checkpoint.
                try:
                    pd.DataFrame([result]).to_csv(temp_path, mode='a', header=False,
                                                  index=False, encoding='utf-8-sig')
                except PermissionError:
                    continue

    df = pd.DataFrame(results)
    return df

def main(output_dir, offset=0):
    """Main function to orchestrate the web scraping process."""
    # Set up Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # URL to scrape
    url = f"https://www.cvh.ac.cn/spms/list.php?&offset={offset}"

    # Open the URL
    driver.get(url)

    # Wait for the dynamic content to load
    time.sleep(5)  # Adjust the sleep time according to your internet speed and website response

    # Find all elements with the specified class
    rows = driver.find_elements(By.CLASS_NAME, 'spms-row')

    # Extract data-collection-id from each row
    data_collection_ids = [row.get_attribute('data-collection-id') for row in rows]

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame
    data_collection_ids = pd.DataFrame(data_collection_ids, columns=['Data Collection ID'])

    # Save results
    os.makedirs(output_dir, exist_ok=True)
    sample_collections = list(data_collection_ids['Data Collection ID'])
    results = fetch_collection_data_concurrently(sample_collections, 30)
    results_path = os.path.join(output_dir, "results.csv")
    results.to_csv(results_path, header=True, index=False, encoding='utf-8-sig')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Web Scraper for CVH Data')
    parser.add_argument('--output_dir', required=True, help='Directory to save the scraping results.')
    parser.add_argument('--offset', required=False, default=0, help='Offset into the CVH specimen list.')
    args = parser.parse_args()

    main(args.output_dir, args.offset)
2 changes: 2 additions & 0 deletions Spring2024/scraper/src/run.sh
@@ -0,0 +1,2 @@
#!/bin/bash
# Start the CVH scraping ETL pipeline and write results to ./scraper_results.
python cvh_scraper.py --output_dir ./scraper_results
