Add no_js option and add proxy rotation example
thefakequake committed Dec 27, 2024
1 parent 9349d78 commit ddc0515
Showing 3 changed files with 60 additions and 8 deletions.
37 changes: 35 additions & 2 deletions README.md
@@ -94,6 +94,38 @@ async def get_parts():
 asyncio.run(get_parts())
 ```
 
+Proxy rotation with a `response_retriever` override:
+
+```py
+import pypartpicker
+import requests_html
+from itertools import cycle
+
+# replace with your own list of proxies
+list_proxy = [
+    "socks5://Username:Password@IP1:20000",
+    "socks5://Username:Password@IP2:20000",
+    "socks5://Username:Password@IP3:20000",
+    "socks5://Username:Password@IP4:20000",
+]
+
+proxy_cycle = cycle(list_proxy)
+session = requests_html.HTMLSession()
+
+
+def response_retriever(url):
+    proxy = next(proxy_cycle)
+    return session.get(url, proxies={"http": proxy, "https": proxy})
+
+
+client = pypartpicker.Client(response_retriever=response_retriever)
+
+res = client.get_part_search("cpu")
+for result in res.parts:
+    part = client.get_part(result.url)
+    print(part.specs)
+```
+
 # Documentation
 
 <h2 id="client">Client</h2>
@@ -105,8 +137,9 @@ Represents a client for interacting with parts-related data and making HTTP requests
 - **`max_retries`**: `int` – The maximum number of retries for requests. Default is `3`.
 - **`retry_delay`**: `int` – The delay between retries in seconds. Default is `0`.
 - **`cookies`**: `Optional[dict]` – Cookies to include in requests.
-- **`response_retriever`**: `Optional[Callable]` – A custom function to perform a request, overriding the default one.
-  Can be used to implement proxy rotation and custom scraping measures.
+- **`response_retriever`**: `Optional[Callable]` – A custom function to perform a request, overriding the default one.
+  Can be used to implement proxy rotation and custom scraping measures.
+- **`no_js`**: `bool` – Disables pyppeteer JS rendering. Default is `False`.
 
 ---
 
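A minimal sketch of the new `no_js` option (the constructor and methods are the ones shown in the README above; the search query is arbitrary):

```py
import pypartpicker

# With no_js=True the client never falls back to pyppeteer JS rendering;
# Cloudflare-checked responses should instead surface as errors once the
# retry limit is reached.
client = pypartpicker.Client(no_js=True)

res = client.get_part_search("cpu")
part = client.get_part(res.parts[0].url)
print(part.specs)
```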
29 changes: 24 additions & 5 deletions pypartpicker/client.py
@@ -1,3 +1,4 @@
+import asyncio
 from .scraper import Scraper
 from .types import Part, PartList, PartSearchResult, PartReviewsResult
 from .errors import CloudflareException, RateLimitException
@@ -9,13 +10,19 @@
 
 class Client:
     def __init__(
-        self, max_retries=3, retry_delay=0, response_retriever=None, cookies=None
+        self,
+        max_retries=3,
+        retry_delay=0,
+        response_retriever=None,
+        no_js=False,
+        cookies=None,
     ):
         self.__scraper = Scraper()
         self.__session = HTMLSession()
         self.max_retries = max_retries
         self.retry_delay = retry_delay
         self.cookies = cookies
+        self.no_js = no_js
 
         self.__get_response = (
             response_retriever
@@ -33,11 +40,14 @@ def __default_response_retriever(self, url: str, retries=0) -> Response:
 
         # Check if we are being Cloudflare checked
         if self.__scraper.is_cloudflare(res):
+            if self.no_js:
+                return self.__default_response_retriever(url, self.max_retries)
+
             res.html.render()
 
             if self.__scraper.is_cloudflare(res):
                 time.sleep(self.retry_delay)
-                return self.__get_response(url, retries + 1)
+                return self.__default_response_retriever(url, retries + 1)
         elif self.__scraper.is_rate_limit(res):
             raise RateLimitException(f"PCPP rate limit encountered: {url}")
 
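With `no_js` enabled, the retriever re-enters itself with `retries` already equal to `max_retries`, which presumably trips the retry-limit guard (outside this hunk) and raises `CloudflareException` immediately instead of rendering JS. Retrying via `self.__default_response_retriever` rather than `self.__get_response` also keeps the retry loop inside the default retriever instead of re-invoking a user-supplied `response_retriever`. The async client below mirrors this logic.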
@@ -84,13 +94,19 @@ def get_part_reviews(
 
 class AsyncClient:
     def __init__(
-        self, max_retries=3, retry_delay=0, response_retriever=None, cookies=None
+        self,
+        max_retries=3,
+        retry_delay=0,
+        response_retriever=None,
+        cookies=None,
+        no_js=False,
     ):
         self.__scraper = Scraper()
         self.__session = None
         self.max_retries = max_retries
         self.retry_delay = retry_delay
         self.cookies = cookies
+        self.no_js = no_js
 
         self.__get_response = (
             response_retriever
@@ -117,11 +133,14 @@ async def __default_response_retriever(
 
         # Check if we are being Cloudflare checked
         if self.__scraper.is_cloudflare(res):
+            if self.no_js:
+                return await self.__default_response_retriever(url, self.max_retries)
+
             await res.html.arender()
 
             if self.__scraper.is_cloudflare(res):
-                time.sleep(self.retry_delay)
-                return self.__get_response(url, retries + 1)
+                await asyncio.sleep(self.retry_delay)
+                return await self.__default_response_retriever(url, retries + 1)
         elif self.__scraper.is_rate_limit(res):
             raise RateLimitException(f"PCPP rate limit encountered: {url}")
 
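If the async client mirrors the sync method names (an assumption; only the sync `Client` methods appear in this excerpt), the new flag is passed the same way:

```py
import asyncio
import pypartpicker

async def main():
    # assumed: AsyncClient exposes coroutine versions of the sync methods
    client = pypartpicker.AsyncClient(no_js=True)
    res = await client.get_part_search("cpu")
    part = await client.get_part(res.parts[0].url)
    print(part.specs)

asyncio.run(main())
```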
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pypartpicker"
-version = "2.0.3"
+version = "2.0.4"
 description = "A PCPartPicker data extractor for Python."
 authors = ["QuaKe"]
 readme = "README.md"