Merge pull request #125 from BU-Spark/poc_2024_03_30
Added documentation and refactored the web scraper ETL from an ipynb notebook to Python code.
Showing 8 changed files with 195 additions and 0 deletions.
@@ -0,0 +1,10 @@
# Documentation of OCR modules
This document describes the project's OCR pipeline. The pipeline can be broken down into three modules:
* TrOCR model built and maintained by the SU23 team and previous semesters' teams (see the sketch below)
* AzureOCR/GPT-4 chain by the FA23 team
* GCP Document AI by the SP24 team

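For reference, here is a minimal TrOCR inference sketch using the Hugging Face `transformers` API. The `microsoft/trocr-base-handwritten` checkpoint and the `specimen_label.jpg` path are placeholders, not the team's fine-tuned model; swap in the maintained weights when running the real pipeline.

```python
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Placeholder checkpoint: substitute the team's fine-tuned TrOCR weights.
CHECKPOINT = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)

# A cropped label image from a herbarium sheet (placeholder path).
image = Image.open("specimen_label.jpg").convert("RGB")

pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(text)
```
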
# Google Document AI
@mkaramb
Please refer to Herbaria_POC_DocumentAI.docx for detailed instructions.

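Below is a minimal sketch of sending an image through GCP Document AI with the `google-cloud-documentai` Python client. The processor resource name and file path are placeholder assumptions; the .docx above remains the authoritative setup guide.

```python
from google.cloud import documentai

# Placeholder processor resource name: use the processor created per the .docx instructions.
PROCESSOR_NAME = "projects/your-project/locations/us/processors/your-processor-id"

client = documentai.DocumentProcessorServiceClient()

# Read a herbarium sheet image (placeholder path) and wrap it as a raw document.
with open("specimen_sheet.jpg", "rb") as f:
    raw_document = documentai.RawDocument(content=f.read(), mime_type="image/jpeg")

request = documentai.ProcessRequest(name=PROCESSOR_NAME, raw_document=raw_document)
result = client.process_document(request=request)

# The full OCR text extracted from the image.
print(result.document.text)
```
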
@@ -0,0 +1,16 @@
# POC documentation
The Proof of Concept is structured as follows. Please refer to the documents below for reference.

# OCR Proof of Concept
@mkaramb
@Aeronyx

Leveraged GCP's Document AI to perform the OCR task efficiently.
Please see ./ocr/README_ocr.md and ./ocr/Herbaria_POC_DocumentAI.docx

# Scraper Proof of Concept
@palmpalm7

Leveraged Selenium to create a scraping and ETL pipeline for the CVH datasets.
Please see ./scraper/README_scraper.md
@@ -0,0 +1,25 @@
# Documentation for the scraper functionalities
@PalmPalm7

# How to use
* Run the ./scraper/src/run.sh shell script to start the CVH scraping ETL pipeline.
* The pipeline mainly uses Selenium for automation. Please modify the path variables accordingly.

# Progress Report - March 31st, 2024
@PalmPalm7
## Progress
* Finished implementing a web-scraping pipeline for the CVH datasets
* Scraped 180 entities in the first batch
* Set up a scraping automation pipeline with crontab and a shell script to perform the ETL process.

## Problems
* Error handling for the scraper code
* Headless option so the code can run on a VM
* Currently the CVH index is not meaningful on its own; scraping from the start yields only a small subset.

## Plan
* Apply software engineering best practices to the scraper code
* Complete the headless-option scraper
* Identify handwritten datasets (possibly by year)
* Scrape more samples (10,000)
* Containerize it with Docker.
Empty file.
@@ -0,0 +1,142 @@
import argparse
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from tqdm.auto import tqdm


def setup_driver(headless=True):
    """
    Set up a Selenium WebDriver, using the --headless option by default.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver


def wait_for_element(driver, by_method, value, timeout=10, retry_interval=5, max_retries=3):
    """
    Wait for an element to appear on the page, retrying up to max_retries times.
    Returns the element, or None if it never appears.
    """
    retries = 0
    while retries < max_retries:
        try:
            return WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((by_method, value))
            )
        except TimeoutException:
            retries += 1
            print(f"Retry {retries}/{max_retries} for element {value} after timeout.")
            time.sleep(retry_interval)
    return None  # Element not found after max_retries attempts


def fetch_data(collection_id):
    """Fetch the metadata fields of a single CVH specimen detail page."""
    driver = setup_driver()

    def field_text(element_id):
        # Return the element's text, or an empty string if the element is missing.
        element = wait_for_element(driver, By.ID, element_id, max_retries=5)
        return element.text if element is not None else ""

    try:
        print(f"Fetching data for collection: {collection_id}")
        driver.get(f"https://www.cvh.ac.cn/spms/detail.php?id={collection_id}")

        # Fetch specific details from the detail page by element ID
        image_element = wait_for_element(driver, By.ID, "spm_image", max_retries=5)
        data = {
            'Collection ID': collection_id,
            'Image Link': image_element.get_attribute('src') if image_element is not None else "",
            'Phylum (门)': field_text("taxon_phy_c"),
            'Order (目)': field_text("taxon_ord_c"),
            'Family (科)': field_text("taxon_fam_c"),
            'Genus (属)': field_text("taxon_gen_c"),
            'Scientific Name': field_text("formattedName"),
            'Chinese Name': field_text("chineseName"),
            'Identified By': field_text("identifiedBy"),
            'Date Identified': field_text("dateIdentified"),
            'Collector': field_text("recordedBy"),
            'Collection Number': field_text("recordNumber"),
            'Collection Date': field_text("verbatimEventDate"),
            'Collection Location': field_text("locality"),
            'Altitude': field_text("elevation"),
            'Habitat': field_text("habitat"),
            'Phenology': field_text("reproductiveCondition")
        }
        return data
    finally:
        driver.quit()


def fetch_collection_data_concurrently(collection_ids, max_workers=5, output_dir="./scraper_results"):
    """Scrape every collection ID in parallel and return the results as a DataFrame."""
    results = []

    # Write an empty CSV with the expected columns first, so partial results can be
    # appended incrementally even if early calls fail.
    os.makedirs(output_dir, exist_ok=True)
    temp_path = os.path.join(output_dir, "temp_results.csv")
    columns = ['Collection ID', 'Image Link', 'Phylum (门)', 'Order (目)', 'Family (科)',
               'Genus (属)', 'Scientific Name', 'Chinese Name', 'Identified By',
               'Date Identified', 'Collector', 'Collection Number', 'Collection Date',
               'Collection Location', 'Altitude', 'Habitat', 'Phenology']
    pd.DataFrame(columns=columns).to_csv(temp_path, mode='w', index=False, encoding='utf-8-sig')

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_data, cid) for cid in collection_ids]
        for future in tqdm(as_completed(futures), total=len(collection_ids)):
            try:
                result = future.result()
            except Exception as exc:
                print(f"Scraping a collection failed: {exc}")
                continue
            if result:
                results.append(result)
                # Append the result to the temporary CSV file as a checkpoint
                try:
                    pd.DataFrame([result]).to_csv(temp_path, mode='a', header=False,
                                                  index=False, encoding='utf-8-sig')
                except PermissionError:
                    continue

    df = pd.DataFrame(results)
    return df


def main(output_dir, offset=0):
    """Main function to orchestrate the web scraping process."""
    os.makedirs(output_dir, exist_ok=True)

    # Set up Selenium WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # URL of the CVH specimen list page to scrape
    url = f"https://www.cvh.ac.cn/spms/list.php?&offset={offset}"

    # Open the URL
    driver.get(url)

    # Wait for the dynamic content to load
    time.sleep(5)  # Adjust the sleep time according to your internet speed and website response

    # Find all specimen rows on the list page
    rows = driver.find_elements(By.CLASS_NAME, 'spms-row')

    # Extract data-collection-id from each row
    data_collection_ids = [row.get_attribute('data-collection-id') for row in rows]

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame of the collection IDs
    data_collection_ids = pd.DataFrame(data_collection_ids, columns=['Data Collection ID'])

    # Scrape each collection page concurrently and save the results
    sample_collections = list(data_collection_ids['Data Collection ID'])
    results = fetch_collection_data_concurrently(sample_collections, max_workers=30, output_dir=output_dir)
    results_path = os.path.join(output_dir, "results.csv")
    results.to_csv(results_path, header=True, index=False, encoding='utf-8-sig')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Web Scraper for CVH Data')
    parser.add_argument('--output_dir', required=True, help='Directory to save the scraping results.')
    parser.add_argument('--offset', required=False, type=int, default=0, help='Offset into the CVH specimen list.')
    args = parser.parse_args()

    main(args.output_dir, args.offset)
@@ -0,0 +1,2 @@
#
python cvh_scraper.py --output_dir ./scraper_results