
Commit 50cd40c (2 parents: 105ec5b + 0c6e770)

Merge pull request #9 from D4Vinci/dev

v0.2.2

File tree

10 files changed (+41 lines, -18 lines)


README.md

Lines changed: 12 additions & 3 deletions

````diff
@@ -6,9 +6,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
 
 # Fetch websites' source under the radar!
->> page = StealthyFetcher().fetch('https://example.com', headless=True, network_idle=True)
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
 200
 >> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
@@ -211,12 +211,21 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little confused by now, so let me clear things up. All fetcher-type classes are imported in the same way:
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
 
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+Then use it right away without initializing, like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
 Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
````
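
A quick aside on those initialization arguments: the fetchers forward them to the `Adaptor` object behind every response, so parsing behavior is configured at fetcher-construction time. Here is a hedged sketch of the explicit (non-defaults) style; the argument values are illustrative, and it assumes `Fetcher` exposes a `get()` mirroring the one shown in `scrapling/engines/static.py` below:

```python
# A minimal sketch, not from the README: configure a fetcher explicitly
# instead of importing the pre-built instances from scrapling.defaults.
from scrapling import Fetcher

# These keyword arguments are forwarded to the generated Adaptor object.
fetcher = Fetcher(auto_match=True, keep_comments=False, debug=False)

# Assumption: Fetcher.get() mirrors StaticEngine.get() from static.py below.
page = fetcher.get('https://example.com', stealthy_headers=True)
print(page.status, page.reason)  # Response adds these on top of Adaptor
```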

scrapling/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair ([email protected])"
-__version__ = "0.2.1"
+__version__ = "0.2.2"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
```

scrapling/defaults.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -0,0 +1,6 @@
+from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+
+# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
+Fetcher = Fetcher()
+StealthyFetcher = StealthyFetcher()
+PlayWrightFetcher = PlayWrightFetcher()
```
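
Because each name is rebound to an instance at import time, anything importing from `scrapling.defaults` receives ready-made objects, which is why the README can call `StealthyFetcher.fetch(...)` without a constructor call. A tiny self-contained illustration of the same trick, using a hypothetical `Greeter` class that is not part of Scrapling:

```python
# greeters.py - a hypothetical module showing the pre-instantiation trick
class Greeter:
    def __init__(self, greeting: str = 'hello'):
        self.greeting = greeting

    def greet(self, name: str) -> str:
        return f'{self.greeting}, {name}!'


# Shadow the class with a ready-to-use instance, as scrapling/defaults.py does
Greeter = Greeter()

# Elsewhere, `from greeters import Greeter` now yields the instance:
print(Greeter.greet('world'))  # works without calling Greeter() first
```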

scrapling/engines/camo.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -114,14 +114,14 @@ def fetch(self, url: str) -> Response:
         response = Response(
             url=res.url,
             text=page.content(),
-            content=res.body(),
+            body=res.body(),
             status=res.status,
             reason=res.status_text,
             encoding=encoding,
             cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
             headers=res.all_headers(),
             request_headers=res.request.all_headers(),
-            adaptor_arguments=self.adaptor_arguments
+            **self.adaptor_arguments
         )
         page.close()
 
```

scrapling/engines/pw.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -224,14 +224,14 @@ def fetch(self, url: str) -> Response:
         response = Response(
             url=res.url,
             text=page.content(),
-            content=res.body(),
+            body=res.body(),
             status=res.status,
             reason=res.status_text,
             encoding=encoding,
             cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
             headers=res.all_headers(),
             request_headers=res.request.all_headers(),
-            adaptor_arguments=self.adaptor_arguments
+            **self.adaptor_arguments
         )
         page.close()
         return response
```

scrapling/engines/static.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -53,14 +53,14 @@ def _prepare_response(self, response: httpxResponse) -> Response:
         return Response(
             url=str(response.url),
             text=response.text,
-            content=response.content,
+            body=response.content,
             status=response.status_code,
             reason=response.reason_phrase,
             encoding=response.encoding or 'utf-8',
             cookies=dict(response.cookies),
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
-            adaptor_arguments=self.adaptor_arguments
+            **self.adaptor_arguments
         )
 
     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
```

scrapling/engines/toolbelt/custom.py

Lines changed: 3 additions & 4 deletions

```diff
@@ -12,15 +12,14 @@
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    def __init__(self, url: str, text: str, content: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, adaptor_arguments: Dict, encoding: str = 'utf-8'):
+    def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
-        super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
-
         self.status = status
         self.reason = reason
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For backward compatibility
         self.adaptor = self
 
@@ -31,7 +30,7 @@ def __init__(self, url: str, text: str, content: bytes, status: int, reason: str
 class BaseFetcher:
     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
+            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
             automatch_domain: Optional[str] = None,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
```

scrapling/parser.py

Lines changed: 11 additions & 2 deletions

```diff
@@ -32,6 +32,7 @@ def __init__(
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
             debug: Optional[bool] = True,
+            **kwargs
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
         with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -117,6 +118,10 @@ def __init__(
         self.__attributes = None
         self.__tag = None
         self.__debug = debug
+        # No need to check if all response attributes exist or not because if `status` exists, then the rest exist (saves some CPU cycles for speed)
+        self.__response_data = {
+            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+        } if hasattr(self, 'status') else {}
 
     # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
     @staticmethod
@@ -138,10 +143,14 @@ def __get_correct_result(
             return TextHandler(str(element))
         else:
             if issubclass(type(element), html.HtmlMixin):
+
                 return self.__class__(
-                    root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
+                    root=element,
+                    text='', body=b'',  # Since the root argument is provided, both `text` and `body` will be ignored, so this is just filler
+                    url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
                     keep_comments=True,  # if the comments are already removed in initialization, no need to try to delete them in sub-elements
-                    huge_tree=self.__huge_tree_enabled, debug=self.__debug
+                    huge_tree=self.__huge_tree_enabled, debug=self.__debug,
+                    **self.__response_data
                 )
             return element
 
```
setup.cfg

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.2.1
+version = 0.2.2
 author = Karim Shoair
 author_email = [email protected]
 description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
```

setup.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -6,7 +6,7 @@
 
 setup(
     name="scrapling",
-    version="0.2.1",
+    version="0.2.2",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
     simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
     impressive speed improvements over many popular scraping tools.""",
```
