Skip to content

Commit

Permalink
Implemented threading lock
Browse files Browse the repository at this point in the history
  • Loading branch information
mahinth1 committed Nov 4, 2024
1 parent fac299f commit 5e8013b
Showing 1 changed file with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions stages/02_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
import pypdf
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import threading

dotenv.load_dotenv()
scraperapi_key = os.getenv("SCRAPERAPI_KEY")

lock = threading.Lock()
##### Functions #####

# download pdf from url
Expand Down Expand Up @@ -55,12 +57,10 @@ def download_pdf(url, file_output_dir, downloaded_hashes, session=None):
if len(reader.pages) > 0:
return outfile_path, content_hash
except Exception:
if outfile_path.exists():
outfile_path.unlink()
return None, None

if outfile_path.exists():
outfile_path.unlink()
with lock:
if outfile_path.exists():
outfile_path.unlink()
return None, None

return None, None

Expand All @@ -82,7 +82,7 @@ def download_pdf(url, file_output_dir, downloaded_hashes, session=None):

# process each file
for file in input_dir.glob('*.parquet'):
df = pd.read_parquet(file)[:150000]
df = pd.read_parquet(file)[:160000]

file_stem = file.stem

Expand All @@ -108,7 +108,7 @@ def download_pdf(url, file_output_dir, downloaded_hashes, session=None):

# Execute download using multiple threads (I/O bound operation)
results_list = []
with ThreadPoolExecutor(max_workers=40) as executor:
with ThreadPoolExecutor(max_workers=96) as executor:
results = list(tqdm(executor.map(
lambda doi_url: (
doi_url[1],
Expand Down

0 comments on commit 5e8013b

Please sign in to comment.