Skip to content

Commit

Permalink
Fixing the way Response object handles sub items in some edge cases
Browse files Browse the repository at this point in the history
+ Do it with the least impact on performance
  • Loading branch information
D4Vinci committed Nov 16, 2024
1 parent 105ec5b commit 572df6b
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 11 deletions.
4 changes: 2 additions & 2 deletions scrapling/engines/camo.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@ def fetch(self, url: str) -> Response:
response = Response(
url=res.url,
text=page.content(),
content=res.body(),
body=res.body(),
status=res.status,
reason=res.status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
headers=res.all_headers(),
request_headers=res.request.all_headers(),
adaptor_arguments=self.adaptor_arguments
**self.adaptor_arguments
)
page.close()

Expand Down
4 changes: 2 additions & 2 deletions scrapling/engines/pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,14 +224,14 @@ def fetch(self, url: str) -> Response:
response = Response(
url=res.url,
text=page.content(),
content=res.body(),
body=res.body(),
status=res.status,
reason=res.status_text,
encoding=encoding,
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
headers=res.all_headers(),
request_headers=res.request.all_headers(),
adaptor_arguments=self.adaptor_arguments
**self.adaptor_arguments
)
page.close()
return response
4 changes: 2 additions & 2 deletions scrapling/engines/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,14 @@ def _prepare_response(self, response: httpxResponse) -> Response:
return Response(
url=str(response.url),
text=response.text,
content=response.content,
body=response.content,
status=response.status_code,
reason=response.reason_phrase,
encoding=response.encoding or 'utf-8',
cookies=dict(response.cookies),
headers=dict(response.headers),
request_headers=dict(response.request.headers),
adaptor_arguments=self.adaptor_arguments
**self.adaptor_arguments
)

def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
Expand Down
5 changes: 2 additions & 3 deletions scrapling/engines/toolbelt/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@
class Response(Adaptor):
"""This class is returned by all engines as a way to unify response type between different libraries."""

def __init__(self, url: str, text: str, content: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, adaptor_arguments: Dict, encoding: str = 'utf-8'):
def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)

self.status = status
self.reason = reason
self.cookies = cookies
self.headers = headers
self.request_headers = request_headers
super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
# For backward compatibility
self.adaptor = self

Expand Down
13 changes: 11 additions & 2 deletions scrapling/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
storage: Any = SQLiteStorageSystem,
storage_args: Optional[Dict] = None,
debug: Optional[bool] = True,
**kwargs
):
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
Expand Down Expand Up @@ -117,6 +118,10 @@ def __init__(
self.__attributes = None
self.__tag = None
self.__debug = debug
# No need to check if all response attributes exist or not because if `status` exists, then the rest exist (Save some CPU cycles for speed)
self.__response_data = {
key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
} if hasattr(self, 'status') else {}

# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
@staticmethod
Expand All @@ -138,10 +143,14 @@ def __get_correct_result(
return TextHandler(str(element))
else:
if issubclass(type(element), html.HtmlMixin):

return self.__class__(
root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
root=element,
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
huge_tree=self.__huge_tree_enabled, debug=self.__debug
huge_tree=self.__huge_tree_enabled, debug=self.__debug,
**self.__response_data
)
return element

Expand Down

0 comments on commit 572df6b

Please sign in to comment.