
Comparing changes

base repository: eellak/glossAPI (base: master)
head repository: Sadique982/glossAPI (compare: master)

Able to merge. These branches can be automatically merged.
  • 1 commit
  • 1 file changed
  • 1 contributor

Commits on Nov 2, 2024

  1. Improve concurrent downloading capacity of script. #18

    In this update, I have increased the number of PDFs downloaded per minute while ensuring that we do not overwhelm the server. The changes can be found in `scraping/download_and_extract_scripts/downloader.py`. I have implemented methods to manage concurrent downloads more effectively, including semaphore limits and sleep intervals (see the sketch after this commit entry). Additionally, I have considered strategies to avoid getting blocked, such as running multiple downloaders with `torify` if necessary.
    Sadique982 committed Nov 2, 2024

    Verified: this commit was created on GitHub.com and signed with GitHub’s verified signature.
    Commit e4e5d68
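
    The commit message describes a general pattern: bound the number of in-flight downloads with a semaphore and pace task creation with short sleeps. The following is a minimal, self-contained sketch of that pattern only; SEM_LIMIT, SUBMIT_DELAY, fetch_pdf, and fetch_all are illustrative names, not part of downloader.py.

    import asyncio
    import aiohttp

    SEM_LIMIT = 5        # at most 5 downloads in flight at once (illustrative value)
    SUBMIT_DELAY = 0.5   # pause between task submissions, i.e. ~120 submissions per minute

    async def fetch_pdf(session, semaphore, url):
        # The semaphore bounds how many requests run concurrently;
        # the caller's sleep bounds how fast new tasks are queued.
        async with semaphore:
            async with session.get(url) as response:
                return url, response.status, await response.read()

    async def fetch_all(urls):
        semaphore = asyncio.Semaphore(SEM_LIMIT)
        async with aiohttp.ClientSession() as session:
            tasks = []
            for url in urls:
                tasks.append(asyncio.create_task(fetch_pdf(session, semaphore, url)))
                await asyncio.sleep(SUBMIT_DELAY)  # throttle task creation
            return await asyncio.gather(*tasks)

    if __name__ == "__main__":
        urls = ["https://example.org/a.pdf", "https://example.org/b.pdf"]
        print(asyncio.run(fetch_all(urls)))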
Showing with 26 additions and 4 deletions.
  1. +26 −4 scraping/download_and_extract_scripts/downloader.py
30 changes: 26 additions & 4 deletions scraping/download_and_extract_scripts/downloader.py
@@ -1,3 +1,18 @@

# [Changed by @Sadique982]
# Function to dynamically set a proxy for requests (e.g., Tor network)
async def use_proxy(session, url, headers, proxy_url='socks5://127.0.0.1:9050'):
    try:
        async with session.get(url, headers=headers, proxy=proxy_url) as response:
            if response.status in (403, 429):  # Check for block indicators
                logging.warning("Blocked response detected, switching to proxy")
                return False
            return True
    except Exception as e:
        logging.error(f"Proxy error: {e}")
        return False


import aiohttp
import asyncio
import os
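
A practical note on the socks5:// proxy URL in the hunk above: aiohttp's proxy= argument is designed for HTTP proxies, so requests routed through Tor's SOCKS5 port are usually set up with a dedicated connector such as aiohttp_socks.ProxyConnector. A minimal sketch of that approach follows; aiohttp_socks is an extra dependency and fetch_via_tor is a hypothetical helper, neither of which is used in this diff.

import aiohttp
from aiohttp_socks import ProxyConnector  # pip install aiohttp-socks

async def fetch_via_tor(url, headers=None):
    # Route the whole session through the local Tor SOCKS5 port.
    connector = ProxyConnector.from_url("socks5://127.0.0.1:9050")
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url, headers=headers) as response:
            return response.status, await response.read()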
@@ -48,6 +63,8 @@ async def download_pdfs(metadata_dict, semaphore, visited, indexes, args, progre
        task = asyncio.create_task(
            download_pdf(index, metadata, url, semaphore, args, next(user_agent_gen))
        )
        # [Changed by @Sadique982]
        await asyncio.sleep(0.5)  # Rate limit: pause briefly between task submissions
        tasks.append(task)
        i += 1
    results = await asyncio.gather(*tasks)
@@ -74,15 +91,20 @@ async def get_base_url(url):
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    return base_url

# Function for the initialization of session headers
async def setup_session(session, url, headers):
    """ Initialize the session with base headers. """
    base_url = await get_base_url(url)
    initial_url = f"{base_url}"
    # [Changed by @Sadique982]
    if not await use_proxy(session, initial_url, headers):
        logging.info('Using proxy for download.')
    else:
        async with session.get(initial_url, headers=headers) as response:
            await response.text()
    return headers


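The setup_session change probes the base URL with use_proxy and logs when it falls back to the proxy path. Below is a self-contained sketch of the same probe-then-retry idea, assuming an HTTP proxy endpoint (aiohttp's proxy= expects an HTTP proxy; for Tor's SOCKS5 port see the connector sketch earlier). get_with_fallback and the proxy address are illustrative, not part of this diff.

import logging
import aiohttp

async def get_with_fallback(session, url, headers,
                            proxy_url="http://127.0.0.1:8118"):
    # Hypothetical helper: try a direct request first; on a 403/429 block
    # indicator, retry the same URL through the proxy and raise if that fails too.
    async with session.get(url, headers=headers) as response:
        if response.status not in (403, 429):
            return await response.read()
    logging.warning("Blocked response for %s, retrying through proxy", url)
    async with session.get(url, headers=headers, proxy=proxy_url) as response:
        response.raise_for_status()
        return await response.read()
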
# Function that arranges the concurrent download of a PDF given pdf_url, then returns download status, metadata, and filename as a tuple.
async def download_pdf(index, metadata, pdf_url, semaphore, args, user_agent, referer=None):

@@ -261,4 +283,4 @@ async def main():

# Entry point of Downloader
if __name__ == "__main__":
    asyncio.run(main())