From 2006be217364e1eb2306303a8f3ffd247193bc5a Mon Sep 17 00:00:00 2001 From: Karim shoair Date: Thu, 26 Dec 2024 00:29:42 +0200 Subject: [PATCH] fix(StealthyFetcher): Use more dependable response data --- scrapling/engines/camo.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/scrapling/engines/camo.py b/scrapling/engines/camo.py index f1d8f95..6d28634 100644 --- a/scrapling/engines/camo.py +++ b/scrapling/engines/camo.py @@ -89,7 +89,7 @@ def fetch(self, url: str) -> Response: def handle_response(finished_response): nonlocal final_response - if finished_response.request.resource_type == "document": + if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request(): final_response = finished_response with Camoufox( @@ -133,7 +133,6 @@ def handle_response(finished_response): if self.network_idle: page.wait_for_load_state('networkidle') - response_bytes = final_response.body() if final_response else page.content().encode('utf-8') # In case we didn't catch a document type somehow final_response = final_response if final_response else first_response # This will be parsed inside `Response` @@ -142,15 +141,15 @@ def handle_response(finished_response): status_text = final_response.status_text or StatusText.get(final_response.status) response = Response( - url=final_response.url, + url=page.url, text=page.content(), - body=response_bytes, + body=page.content().encode('utf-8'), status=final_response.status, reason=status_text, encoding=encoding, cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()}, - headers=final_response.all_headers(), - request_headers=final_response.request.all_headers(), + headers=first_response.all_headers(), + request_headers=first_response.request.all_headers(), **self.adaptor_arguments ) page.close() @@ -169,7 +168,7 @@ async def async_fetch(self, url: str) -> Response: async def handle_response(finished_response): nonlocal final_response - if finished_response.request.resource_type == "document": + if finished_response.request.resource_type == "document" and finished_response.request.is_navigation_request(): final_response = finished_response async with AsyncCamoufox( @@ -213,7 +212,6 @@ async def handle_response(finished_response): if self.network_idle: await page.wait_for_load_state('networkidle') - response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8') # In case we didn't catch a document type somehow final_response = final_response if final_response else first_response # This will be parsed inside `Response` @@ -222,15 +220,15 @@ async def handle_response(finished_response): status_text = final_response.status_text or StatusText.get(final_response.status) response = Response( - url=final_response.url, + url=page.url, text=await page.content(), - body=response_bytes, + body=(await page.content()).encode('utf-8'), status=final_response.status, reason=status_text, encoding=encoding, cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()}, - headers=await final_response.all_headers(), - request_headers=await final_response.request.all_headers(), + headers=await first_response.all_headers(), + request_headers=await first_response.request.all_headers(), **self.adaptor_arguments ) await page.close()