Skip to content

Commit bbbc97a

Browse files
authored
Merge pull request #16 from D4Vinci/dev
v0.2.6
2 parents e94c503 + 438867e commit bbbc97a

File tree

7 files changed

+31
-20
lines changed

7 files changed

+31
-20
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,9 +290,11 @@ Using this Fetcher class, you can make requests with:
290290
* Mimics some of the real browsers' properties by injecting several JS files and using custom options.
291291
* Using custom flags on launch to hide Playwright even more and make it faster.
292292
* Generates real browser headers of the same type and for the same user OS, then appends them to the request's headers.
293-
3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
293+
3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
294294
4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
295295
296+
> Note that using the `real_chrome` argument requires that you have the Chrome browser installed on your device.
297+
296298
Add that to a lot of controlling/hiding options as you will see in the arguments list below.
297299
298300
<details><summary><strong>Expand this for the complete list of arguments</strong></summary>
@@ -314,6 +316,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
314316
| hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
315317
| disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
316318
| stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
319+
| real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
317320
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
318321
| nstbrowser_mode | Enables NSTBrowser mode, **it has to be used with the `cdp_url` argument or it will be completely ignored.** | ✔️ |
319322
| nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |

scrapling/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from scrapling.core.custom_types import TextHandler, AttributesHandler
55

66
__author__ = "Karim Shoair ([email protected])"
7-
__version__ = "0.2.5"
7+
__version__ = "0.2.6"
88
__copyright__ = "Copyright (c) 2024 Karim Shoair"
99

1010

scrapling/engines/pw.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,12 @@ def __init__(
2727
page_action: Callable = do_nothing,
2828
wait_selector: Optional[str] = None,
2929
wait_selector_state: Optional[str] = 'attached',
30-
stealth: bool = False,
31-
hide_canvas: bool = True,
32-
disable_webgl: bool = False,
30+
stealth: Optional[bool] = False,
31+
real_chrome: Optional[bool] = False,
32+
hide_canvas: Optional[bool] = False,
33+
disable_webgl: Optional[bool] = False,
3334
cdp_url: Optional[str] = None,
34-
nstbrowser_mode: bool = False,
35+
nstbrowser_mode: Optional[bool] = False,
3536
nstbrowser_config: Optional[Dict] = None,
3637
google_search: Optional[bool] = True,
3738
extra_headers: Optional[Dict[str, str]] = None,
@@ -51,6 +52,7 @@ def __init__(
5152
:param wait_selector: Wait for a specific css selector to be in a specific state.
5253
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
5354
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
55+
:param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
5456
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
5557
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
5658
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
@@ -67,6 +69,7 @@ def __init__(
6769
self.stealth = bool(stealth)
6870
self.hide_canvas = bool(hide_canvas)
6971
self.disable_webgl = bool(disable_webgl)
72+
self.real_chrome = bool(real_chrome)
7073
self.google_search = bool(google_search)
7174
self.extra_headers = extra_headers or {}
7275
self.proxy = construct_proxy_dict(proxy)
@@ -119,7 +122,8 @@ def fetch(self, url: str) -> Response:
119122
:param url: Target url.
120123
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
121124
"""
122-
if not self.stealth:
125+
if not self.stealth or self.real_chrome:
126+
# Because rebrowser_playwright doesn't play well with real browsers
123127
from playwright.sync_api import sync_playwright
124128
else:
125129
from rebrowser_playwright.sync_api import sync_playwright
@@ -130,8 +134,8 @@ def fetch(self, url: str) -> Response:
130134
extra_headers = {}
131135
useragent = self.useragent
132136
else:
133-
extra_headers = generate_headers(browser_mode=True)
134-
useragent = extra_headers.get('User-Agent')
137+
extra_headers = {}
138+
useragent = generate_headers(browser_mode=True).get('User-Agent')
135139

136140
# Prepare the flags before diving
137141
flags = DEFAULT_STEALTH_FLAGS
@@ -146,9 +150,11 @@ def fetch(self, url: str) -> Response:
146150
browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
147151
else:
148152
if self.stealth:
149-
browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True)
153+
browser = p.chromium.launch(
154+
headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
155+
)
150156
else:
151-
browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
157+
browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')
152158

153159
# Creating the context
154160
if self.stealth:

scrapling/engines/toolbelt/fingerprints.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
6767
# So we don't raise any inconsistency red flags while websites fingerprinting us
6868
os_name = get_os_name()
6969
return HeaderGenerator(
70-
browser=[Browser(name='chrome', min_version=128)],
70+
browser=[Browser(name='chrome', min_version=130)],
7171
os=os_name, # None is ignored
7272
device='desktop'
7373
).generate()

scrapling/fetchers.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,20 +138,20 @@ class PlayWrightFetcher(BaseFetcher):
138138
2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
139139
3) Using custom flags on launch to hide Playwright even more and make it faster.
140140
4) Generates real browser headers of the same type and for the same user OS, then appends them to the request.
141-
- Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
141+
- Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
142142
- NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
143143
144144
> Note that these are the main options with PlayWright, but they can be mixed together.
145145
"""
146146
def fetch(
147147
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
148148
useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
149-
page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
150-
hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
149+
page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
150+
hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
151151
proxy: Optional[Union[str, Dict[str, str]]] = None,
152-
stealth: bool = False,
152+
stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
153153
cdp_url: Optional[str] = None,
154-
nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
154+
nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
155155
) -> Response:
156156
"""Opens up a browser and do your request based on your chosen options below.
157157
@@ -167,6 +167,7 @@ def fetch(
167167
:param wait_selector: Wait for a specific css selector to be in a specific state.
168168
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
169169
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
170+
:param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
170171
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
171172
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
172173
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
@@ -184,6 +185,7 @@ def fetch(
184185
cdp_url=cdp_url,
185186
headless=headless,
186187
useragent=useragent,
188+
real_chrome=real_chrome,
187189
page_action=page_action,
188190
hide_canvas=hide_canvas,
189191
network_idle=network_idle,

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = scrapling
3-
version = 0.2.5
3+
version = 0.2.6
44
author = Karim Shoair
55
author_email = [email protected]
66
description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name="scrapling",
9-
version="0.2.5",
9+
version="0.2.6",
1010
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
1111
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
1212
impressive speed improvements over many popular scraping tools.""",
@@ -55,7 +55,7 @@
5555
"orjson>=3",
5656
"tldextract",
5757
'httpx[brotli,zstd]',
58-
'playwright',
58+
'playwright==1.48', # Temporary pin: the libraries that provide CDP patches don't support playwright 1.49 yet
5959
'rebrowser-playwright',
6060
'camoufox>=0.3.10',
6161
'browserforge',

0 commit comments

Comments
 (0)