Skip to content

Commit e94c503

Browse files
authored
Merge pull request #15 from D4Vinci/docs
Doc adjustments to use Sphinx soon
2 parents 4cea8c9 + 1929d9b commit e94c503

File tree

5 files changed

+40
-27
lines changed

5 files changed

+40
-27
lines changed

scrapling/core/custom_types.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,8 @@ def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entiti
129129

130130

131131
class AttributesHandler(Mapping):
132-
"""A read-only mapping to use instead of the standard dictionary for the speed boost but
133-
at the same time I use it to add more functionalities.
134-
If standard dictionary is needed, just convert this class to dictionary with `dict` function
132+
"""A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
133+
If standard dictionary is needed, just convert this class to dictionary with `dict` function
135134
"""
136135
__slots__ = ('_data',)
137136

scrapling/core/translator.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
"""
22
Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
3-
To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format
4-
which will be important in future releases but most importantly...
5-
so you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
6-
> if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
3+
4+
To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
5+
6+
So you don't have to learn a new selectors/api method like what bs4 did with soupsieve :)
7+
8+
if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
79
"""
810

911
import re

scrapling/engines/static.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, f
2323
@staticmethod
2424
def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
2525
"""Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
26-
finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
26+
finally generates a referer header that looks as if this request came from Google's search of the current URL's domain.
2727
2828
:param headers: Current headers in the request if the user passed any
2929
:param url: The Target URL.
@@ -65,6 +65,7 @@ def _prepare_response(self, response: httpxResponse) -> Response:
6565

6666
def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
6767
"""Make basic HTTP GET request for you but with some added flavors.
68+
6869
:param url: Target url.
6970
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
7071
create a referer header as if this request had come from Google's search of this URL's domain.
@@ -77,6 +78,7 @@ def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict)
7778

7879
def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
7980
"""Make basic HTTP POST request for you but with some added flavors.
81+
8082
:param url: Target url.
8183
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
8284
create a referer header as if this request had come from Google's search of this URL's domain.
@@ -89,6 +91,7 @@ def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict
8991

9092
def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
9193
"""Make basic HTTP DELETE request for you but with some added flavors.
94+
9295
:param url: Target url.
9396
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
9497
create a referer header as if this request had come from Google's search of this URL's domain.
@@ -101,6 +104,7 @@ def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Di
101104

102105
def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
103106
"""Make basic HTTP PUT request for you but with some added flavors.
107+
104108
:param url: Target url.
105109
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
106110
create a referer header as if this request had come from Google's search of this URL's domain.

scrapling/engines/toolbelt/custom.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212

1313

1414
class ResponseEncoding:
15-
DEFAULT_ENCODING = "utf-8"
16-
ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
15+
__DEFAULT_ENCODING = "utf-8"
16+
__ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
1717

1818
@classmethod
1919
@cache(maxsize=None)
@@ -43,17 +43,17 @@ def get_value(cls, content_type: Optional[str]) -> str:
4343
"""Determine the appropriate character encoding from a content-type header.
4444
4545
The encoding is determined by these rules in order:
46-
1. If no content-type is provided, use UTF-8
47-
2. If charset parameter is present, use that encoding
48-
3. If content-type is text/*, use ISO-8859-1 per HTTP/1.1 spec
49-
4. If content-type is application/json, use UTF-8 per RFC 4627
50-
5. Default to UTF-8 if nothing else matches
46+
1. If no content-type is provided, use UTF-8
47+
2. If charset parameter is present, use that encoding
48+
3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
49+
4. If content-type is application/json, use UTF-8 per RFC 4627
50+
5. Default to UTF-8 if nothing else matches
5151
5252
:param content_type: Content-Type header value or None
5353
:return: String naming the character encoding
5454
"""
5555
if not content_type:
56-
return cls.DEFAULT_ENCODING
56+
return cls.__DEFAULT_ENCODING
5757

5858
try:
5959
content_type, params = cls.__parse_content_type(content_type)
@@ -65,16 +65,16 @@ def get_value(cls, content_type: Optional[str]) -> str:
6565
return encoding
6666

6767
# Apply content-type specific rules
68-
if content_type in cls.ISO_8859_1_CONTENT_TYPES:
68+
if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
6969
return "ISO-8859-1"
7070

7171
if content_type == "application/json":
72-
return cls.DEFAULT_ENCODING
72+
return cls.__DEFAULT_ENCODING
7373

74-
return cls.DEFAULT_ENCODING
74+
return cls.__DEFAULT_ENCODING
7575

7676
except (ValueError, LookupError, UnicodeEncodeError):
77-
return cls.DEFAULT_ENCODING
77+
return cls.__DEFAULT_ENCODING
7878

7979

8080
class Response(Adaptor):

scrapling/fetchers.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ class Fetcher(BaseFetcher):
1111
"""
1212
def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
1313
"""Make basic HTTP GET request for you but with some added flavors.
14+
1415
:param url: Target url.
1516
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
1617
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -24,6 +25,7 @@ def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[i
2425

2526
def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
2627
"""Make basic HTTP POST request for you but with some added flavors.
28+
2729
:param url: Target url.
2830
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
2931
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -37,19 +39,22 @@ def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[
3739

3840
def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
3941
"""Make basic HTTP PUT request for you but with some added flavors.
42+
4043
:param url: Target url
4144
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
4245
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
4346
:param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
44-
create a referer header as if this request came from Google's search of this URL's domain.
47+
create a referer header as if this request came from Google's search of this URL's domain.
4548
:param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
49+
4650
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
4751
"""
4852
response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
4953
return response_object
5054

5155
def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
5256
"""Make basic HTTP DELETE request for you but with some added flavors.
57+
5358
:param url: Target url
5459
:param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
5560
:param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -77,6 +82,7 @@ def fetch(
7782
) -> Response:
7883
"""
7984
Opens up a browser and do your request based on your chosen options below.
85+
8086
:param url: Target url.
8187
:param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
8288
:param block_images: Prevent the loading of images through Firefox preferences.
@@ -127,14 +133,15 @@ class PlayWrightFetcher(BaseFetcher):
127133
Using this Fetcher class, you can do requests with:
128134
- Vanilla Playwright without any modifications other than the ones you chose.
129135
- Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
130-
Some of the things stealth mode does include:
131-
1) Patches the CDP runtime fingerprint.
132-
2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
133-
3) Using custom flags on launch to hide Playwright even more and make it faster.
134-
4) Generates real browser's headers of the same type and same user OS then append it to the request.
136+
Some of the things stealth mode does include:
137+
1) Patches the CDP runtime fingerprint.
138+
2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
139+
3) Using custom flags on launch to hide Playwright even more and make it faster.
140+
4) Generates real browser's headers of the same type and same user OS then append it to the request.
135141
- Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
136142
- NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
137-
> Note that these are the main options with PlayWright but it can be mixed together.
143+
144+
> Note that these are the main options with PlayWright, but they can be mixed together.
138145
"""
139146
def fetch(
140147
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
@@ -147,6 +154,7 @@ def fetch(
147154
nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
148155
) -> Response:
149156
"""Opens up a browser and do your request based on your chosen options below.
157+
150158
:param url: Target url.
151159
:param headless: Run the browser in headless/hidden (default), or headful/visible mode.
152160
:param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.

0 commit comments

Comments
 (0)