Skip to content

Commit

Permalink
Fixing the way Response object handles sub items in some edge cases
Browse files Browse the repository at this point in the history
+ Do it with the least impact on performance
  • Loading branch information
D4Vinci committed Nov 16, 2024
1 parent 105ec5b commit 572df6b
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 11 deletions.
4 changes: 2 additions & 2 deletions scrapling/engines/camo.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@ def fetch(self, url: str) -> Response:
response = Response(
url=res.url,
text=page.content(),
content=res.body(),
body=res.body(),
status=res.status,
reason=res.status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
headers=res.all_headers(),
request_headers=res.request.all_headers(),
adaptor_arguments=self.adaptor_arguments
**self.adaptor_arguments
)
page.close()

Expand Down
4 changes: 2 additions & 2 deletions scrapling/engines/pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,14 +224,14 @@ def fetch(self, url: str) -> Response:
response = Response(
url=res.url,
text=page.content(),
content=res.body(),
body=res.body(),
status=res.status,
reason=res.status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
headers=res.all_headers(),
request_headers=res.request.all_headers(),
adaptor_arguments=self.adaptor_arguments
**self.adaptor_arguments
)
page.close()
return response
4 changes: 2 additions & 2 deletions scrapling/engines/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,14 @@ def _prepare_response(self, response: httpxResponse) -> Response:
return Response(
url=str(response.url),
text=response.text,
content=response.content,
body=response.content,
status=response.status_code,
reason=response.reason_phrase,
encoding=response.encoding or 'utf-8',
cookies=dict(response.cookies),
headers=dict(response.headers),
request_headers=dict(response.request.headers),
adaptor_arguments=self.adaptor_arguments
**self.adaptor_arguments
)

def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
Expand Down
5 changes: 2 additions & 3 deletions scrapling/engines/toolbelt/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@
class Response(Adaptor):
"""This class is returned by all engines as a way to unify response type between different libraries."""

def __init__(self, url: str, text: str, content: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, adaptor_arguments: Dict, encoding: str = 'utf-8'):
def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)

self.status = status
self.reason = reason
self.cookies = cookies
self.headers = headers
self.request_headers = request_headers
super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
# For backward compatibility
self.adaptor = self

Expand Down
13 changes: 11 additions & 2 deletions scrapling/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
storage: Any = SQLiteStorageSystem,
storage_args: Optional[Dict] = None,
debug: Optional[bool] = True,
**kwargs
):
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
Expand Down Expand Up @@ -117,6 +118,10 @@ def __init__(
self.__attributes = None
self.__tag = None
self.__debug = debug
# No need to check if all response attributes exist or not because if `status` exists, then the rest exist (Save some CPU cycles for speed)
self.__response_data = {
key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
} if hasattr(self, 'status') else {}

# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
@staticmethod
Expand All @@ -138,10 +143,14 @@ def __get_correct_result(
return TextHandler(str(element))
else:
if issubclass(type(element), html.HtmlMixin):

return self.__class__(
root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
root=element,
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
huge_tree=self.__huge_tree_enabled, debug=self.__debug
huge_tree=self.__huge_tree_enabled, debug=self.__debug,
**self.__response_data
)
return element

Expand Down

0 comments on commit 572df6b

Please sign in to comment.