Adding keep_cdata argument for Adaptor and Response classes

When enabled, this makes lxml keep CDATA sections while parsing HTML (they are stripped by default).
D4Vinci · Dec 10, 2024 · b4f9061 · b4f9061
1 parent 5f9c398
commit b4f9061
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -219,7 +219,7 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
 
 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python

diff --git a/scrapling/engines/toolbelt/custom.py b/scrapling/engines/toolbelt/custom.py
@@ -105,14 +105,15 @@ class BaseFetcher:
     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
-            automatch_domain: Optional[str] = None,
+            automatch_domain: Optional[str] = None, keep_cdata: Optional[bool] = False,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
         are detected and passed automatically from the Fetcher based on the response for accessibility.
 
         :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
             libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
@@ -127,6 +128,7 @@ def __init__(
         self.adaptor_arguments = dict(
             huge_tree=huge_tree,
             keep_comments=keep_comments,
+            keep_cdata=keep_cdata,
             auto_match=auto_match,
             storage=storage,
             storage_args=storage_args,

diff --git a/scrapling/parser.py b/scrapling/parser.py
@@ -25,6 +25,7 @@ class Adaptor(SelectorsGeneration):
     __slots__ = (
         'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
+        '__keep_cdata',
     )
 
     def __init__(
@@ -36,6 +37,7 @@ def __init__(
             huge_tree: bool = True,
             root: Optional[html.HtmlElement] = None,
             keep_comments: Optional[bool] = False,
+            keep_cdata: Optional[bool] = False,
             auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
@@ -59,6 +61,7 @@ def __init__(
         :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
             Don't use it unless you know what you are doing!
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
@@ -84,8 +87,8 @@ def __init__(
 
             # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
-                recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
-                compact=True, huge_tree=huge_tree, default_doctype=True
+                recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
+                compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
             if is_jsonable(text or body.decode()):
@@ -119,6 +122,7 @@ def __init__(
             self._storage = storage(**storage_args)
 
         self.__keep_comments = keep_comments
+        self.__keep_cdata = keep_cdata
         self.__huge_tree_enabled = huge_tree
         self.encoding = encoding
         self.url = url
@@ -156,7 +160,7 @@ def __get_correct_result(
                     root=element,
                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                    keep_comments=True,  # if the comments are already removed in initialization, no need to try to delete them in sub-elements
+                    keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
                     huge_tree=self.__huge_tree_enabled, debug=self.__debug,
                     **self.__response_data
                 )