Skip to content

Commit

Permalink
Adding keep_cdata argument for Adaptor and Response classes
Browse files Browse the repository at this point in the history
This optionally forces lxml to keep CDATA sections while parsing HTML (disabled by default).
  • Loading branch information
D4Vinci committed Dec 10, 2024
1 parent 5f9c398 commit b4f9061
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 5 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
```python
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
```
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
```python
Expand Down
4 changes: 3 additions & 1 deletion scrapling/engines/toolbelt/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,15 @@ class BaseFetcher:
def __init__(
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
automatch_domain: Optional[str] = None,
automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
):
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
are detected and passed automatically from the Fetcher based on the response for accessibility.
:param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
:param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
priority over all auto-match related arguments/functions in the class.
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
Expand All @@ -127,6 +128,7 @@ def __init__(
self.adaptor_arguments = dict(
huge_tree=huge_tree,
keep_comments=keep_comments,
keep_cdata=keep_cdata,
auto_match=auto_match,
storage=storage,
storage_args=storage_args,
Expand Down
10 changes: 7 additions & 3 deletions scrapling/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class Adaptor(SelectorsGeneration):
__slots__ = (
'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
'__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
'__keep_cdata',
)

def __init__(
Expand All @@ -36,6 +37,7 @@ def __init__(
huge_tree: bool = True,
root: Optional[html.HtmlElement] = None,
keep_comments: Optional[bool] = False,
keep_cdata: Optional[bool] = False,
auto_match: Optional[bool] = True,
storage: Any = SQLiteStorageSystem,
storage_args: Optional[Dict] = None,
Expand All @@ -59,6 +61,7 @@ def __init__(
:param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
Don't use it unless you know what you are doing!
:param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
:param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
:param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
priority over all auto-match related arguments/functions in the class.
:param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
Expand All @@ -84,8 +87,8 @@ def __init__(

# https://lxml.de/api/lxml.etree.HTMLParser-class.html
parser = html.HTMLParser(
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
compact=True, huge_tree=huge_tree, default_doctype=True
recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
)
self._root = etree.fromstring(body, parser=parser, base_url=url)
if is_jsonable(text or body.decode()):
Expand Down Expand Up @@ -119,6 +122,7 @@ def __init__(
self._storage = storage(**storage_args)

self.__keep_comments = keep_comments
self.__keep_cdata = keep_cdata
self.__huge_tree_enabled = huge_tree
self.encoding = encoding
self.url = url
Expand Down Expand Up @@ -156,7 +160,7 @@ def __get_correct_result(
root=element,
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
huge_tree=self.__huge_tree_enabled, debug=self.__debug,
**self.__response_data
)
Expand Down

0 comments on commit b4f9061

Please sign in to comment.