[Improve concurrent downloading capacity of script. eellak#18]
In this update, I increased the number of PDFs downloaded per minute while making sure we do not overwhelm the server. The changes are in `scraping/download_and_extract_scripts/downloader.py`: concurrent downloads are now managed more carefully through semaphore limits and sleep intervals between task submissions. I also considered strategies to avoid getting blocked, such as running multiple downloaders through `torify` if necessary.
Sadique982 committed Nov 2, 2024
1 parent 06d7beb commit e4e5d68
Showing 1 changed file with 26 additions and 4 deletions.
30 changes: 26 additions & 4 deletions scraping/download_and_extract_scripts/downloader.py
@@ -1,3 +1,18 @@

# [Changed by @Sadique982]
# Probe a URL through a proxy (e.g., the Tor SOCKS port) and report whether
# the request got through without block indicators (HTTP 403/429).
# Note: aiohttp's `proxy=` argument only supports HTTP proxies, so a
# socks5:// URL needs a SOCKS-capable connector such as aiohttp-socks.
async def use_proxy(session, url, headers, proxy_url='socks5://127.0.0.1:9050'):
    try:
        async with session.get(url, headers=headers, proxy=proxy_url) as response:
            if response.status in (403, 429):  # Check for block indicators
                logging.warning("Blocked response detected through proxy")
                return False
            return True
    except Exception as e:
        logging.error(f"Proxy error: {e}")
        return False


import aiohttp
import asyncio
import os
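A caveat on the function above: aiohttp's `proxy=` keyword only speaks HTTP proxies, so the `socks5://` default would fail at request time. Below is a minimal sketch of a SOCKS-capable alternative, assuming the third-party aiohttp-socks package; the `fetch_via_tor` helper and its defaults are illustrative, not from this repo.

# Minimal sketch, not part of the commit; assumes `pip install aiohttp-socks`.
import aiohttp
from aiohttp_socks import ProxyConnector

async def fetch_via_tor(url, headers=None, proxy_url='socks5://127.0.0.1:9050'):
    # The connector tunnels every request on this session through the SOCKS5 proxy.
    connector = ProxyConnector.from_url(proxy_url)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url, headers=headers) as response:
            return response.status, await response.read()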
@@ -48,6 +63,8 @@ async def download_pdfs(metadata_dict, semaphore, visited, indexes, args, progre
        task = asyncio.create_task(
            download_pdf(index, metadata, url, semaphore, args, next(user_agent_gen))
        )
        # [Changed by @Sadique982]
        await asyncio.sleep(0.5)  # Rate limit
        tasks.append(task)
        i += 1
    results = await asyncio.gather(*tasks)
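For context, the throttling pattern the commit message describes — a semaphore capping in-flight downloads plus a sleep between task submissions — reduces to roughly the following minimal sketch; `fetch_one`, `fetch_all`, and `MAX_CONCURRENT` are illustrative names, not from the repo.

import asyncio

MAX_CONCURRENT = 5  # assumed cap; at most five downloads in flight at once

async def fetch_one(url, semaphore):
    async with semaphore:  # the slot is released automatically on exit
        await asyncio.sleep(1)  # stand-in for the actual PDF download
        return url

async def fetch_all(urls):
    semaphore = asyncio.Semaphore(MAX_CONCURRENT)
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(fetch_one(url, semaphore)))
        await asyncio.sleep(0.5)  # ~2 submissions per second, matching the diff
    return await asyncio.gather(*tasks)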
@@ -74,15 +91,20 @@ async def get_base_url(url):
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    return base_url

# Function for the initialization of session headers
async def setup_session(session, url, headers):
    """ Initialize the session with base headers. """
    base_url = await get_base_url(url)
    initial_url = f"{base_url}"
    # [Changed by @Sadique982]
    if not await use_proxy(session, initial_url, headers):
        logging.info('Proxy probe failed or was blocked; skipping session warm-up.')
    else:
        async with session.get(initial_url, headers=headers) as response:
            await response.text()  # Warm up the session with a direct request
    return headers
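A hypothetical call site for the updated setup_session, only to show the flow; the URL and headers are placeholders, not from the repo.

import aiohttp

async def demo():
    headers = {'User-Agent': 'Mozilla/5.0'}
    async with aiohttp.ClientSession() as session:
        # Probe the proxy, optionally warm up the session, and reuse the headers.
        headers = await setup_session(session, 'https://example.org/docs/sample.pdf', headers)
    return headers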


# Function that arranges the concurrent download of a PDF given pdf_url, then returns download status, metadata, and filename as a tuple.
async def download_pdf(index, metadata, pdf_url, semaphore, args, user_agent, referer=None):

@@ -261,4 +283,4 @@ async def main():

# Entry point of the downloader
if __name__ == "__main__":
    asyncio.run(main())
