diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index dd40482d4..43808135a 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -1,10 +1,12 @@ from __future__ import annotations +import base64 import codecs -import pickle +import json from logging import getLogger from typing import Any, cast +from pydantic import BaseModel from scrapy import Request as ScrapyRequest from scrapy import Spider from scrapy.http.headers import Headers @@ -17,6 +19,45 @@ logger = getLogger(__name__) +# Sentinel key used to wrap bytes values in JSON-safe dicts. +_BYTES_SENTINEL = '__b64__' + + +def _encode_for_json(obj: Any) -> Any: + """Recursively encode an object so it is JSON-serializable. + + ``bytes`` values are replaced with ``{"__b64__": ""}``. + ``bytes`` dict keys are decoded to UTF-8 strings (Scrapy headers use bytes keys). + Pydantic ``BaseModel`` instances are converted via ``model_dump()``. + Enum members are replaced with their ``.value``. + """ + if isinstance(obj, bytes): + return {_BYTES_SENTINEL: base64.b64encode(obj).decode('ascii')} + if isinstance(obj, BaseModel): + return _encode_for_json(obj.model_dump(by_alias=True)) + if isinstance(obj, dict): + return {(k.decode('utf-8') if isinstance(k, bytes) else k): _encode_for_json(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [_encode_for_json(v) for v in obj] + # Handle enum values (e.g. RequestState) + if hasattr(obj, 'value') and not isinstance(obj, (str, int, float, bool)): + return obj.value + return obj + + +def _decode_from_json(obj: Any) -> Any: + """Reverse :func:`_encode_for_json`. + + Dicts of the form ``{"__b64__": ""}`` are decoded back to ``bytes``. + """ + if isinstance(obj, dict): + if _BYTES_SENTINEL in obj and len(obj) == 1: + return base64.b64decode(obj[_BYTES_SENTINEL]) + return {k: _decode_from_json(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_decode_from_json(v) for v in obj] + return obj + def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None: """Convert a Scrapy request to an Apify request. @@ -78,11 +119,12 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ apify_request = ApifyRequest.from_url(**request_kwargs) # Serialize the Scrapy ScrapyRequest and store it in the apify_request. - # - This process involves converting the Scrapy ScrapyRequest object into a dictionary, encoding it to base64, + # - This process involves converting the Scrapy ScrapyRequest object into a dictionary, + # JSON-encoding it (with bytes values base64-wrapped), then base64-encoding the result, # and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request. - # - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/. scrapy_request_dict = scrapy_request.to_dict(spider=spider) - scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode() + scrapy_request_json = json.dumps(_encode_for_json(scrapy_request_dict)) + scrapy_request_dict_encoded = codecs.encode(scrapy_request_json.encode('utf-8'), 'base64').decode() apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded except Exception as exc: @@ -123,7 +165,15 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ if not isinstance(scrapy_request_dict_encoded, str): raise TypeError('scrapy_request_dict_encoded must be a string') - scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64')) + scrapy_request_json = codecs.decode(scrapy_request_dict_encoded.encode(), 'base64').decode('utf-8') + scrapy_request_dict = _decode_from_json(json.loads(scrapy_request_json)) + + # Scrapy's request_from_dict expects bytes keys in the headers dict. + if 'headers' in scrapy_request_dict and isinstance(scrapy_request_dict['headers'], dict): + scrapy_request_dict['headers'] = { + k.encode('utf-8') if isinstance(k, str) else k: v for k, v in scrapy_request_dict['headers'].items() + } + if not isinstance(scrapy_request_dict, dict): raise TypeError('scrapy_request_dict must be a dictionary') diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index fadb25249..93285ddf9 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -1,6 +1,9 @@ from __future__ import annotations import binascii +import codecs +import json +import pickle import pytest from scrapy import Request, Spider @@ -8,7 +11,7 @@ from crawlee._types import HttpHeaders from apify import Request as ApifyRequest -from apify.scrapy.requests import to_scrapy_request +from apify.scrapy.requests import to_apify_request, to_scrapy_request class DummySpider(Spider): @@ -21,6 +24,23 @@ def spider() -> DummySpider: return DummySpider() +# JSON-encoded (safe) fixture matching the previous pickle-encoded test data. +_SCRAPY_REQUEST_JSON_ENCODED = ( + 'eyJ1cmwiOiJodHRwczovL2FwaWZ5LmNvbSIsImNhbGxiYWNrIjpudWxsLCJlcnJiYWNrIjpudWxs' + 'LCJoZWFkZXJzIjp7IkFjY2VwdCI6W3siX19iNjRfXyI6ImRHVjRkQzlvZEcxc0xHRndjR3hwWTJG' + 'MGFXOXVMM2hvZEcxc0szaHRiQ3hoY0hCc2FXTmhkR2x2Ymk5NGJXdzdjVDB3TGprc0tpOHFPM0U5' + 'TUM0NCJ9XSwiQWNjZXB0LUxhbmd1YWdlIjpbeyJfX2I2NF9fIjoiWlc0PSJ9XSwiVXNlci1BZ2Vu' + 'dCI6W3siX19iNjRfXyI6IlUyTnlZWEI1THpJdU1URXVNQ0FvSzJoMGRIQnpPaTh2YzJOeVlYQjVM' + 'bTl5WnlrPSJ9XSwiQWNjZXB0LUVuY29kaW5nIjpbeyJfX2I2NF9fIjoiWjNwcGNDd2daR1ZtYkdG' + 'MFpRPT0ifV19LCJtZXRob2QiOiJHRVQiLCJib2R5Ijp7Il9fYjY0X18iOiIifSwiY29va2llcyI6' + 'e30sIm1ldGEiOnsiYXBpZnlfcmVxdWVzdF9pZCI6ImZ2d3NjTzJVSkxkcjEwQiIsImFwaWZ5X3Jl' + 'cXVlc3RfdW5pcXVlX2tleSI6Imh0dHBzOi8vYXBpZnkuY29tIiwiZG93bmxvYWRfdGltZW91dCI6' + 'MTgwLjAsImRvd25sb2FkX3Nsb3QiOiJhcGlmeS5jb20iLCJkb3dubG9hZF9sYXRlbmN5IjowLjA4' + 'Mzk4MTk5MDgxNDIwODk4fSwiZW5jb2RpbmciOiJ1dGYtOCIsInByaW9yaXR5IjowLCJkb250X2Zp' + 'bHRlciI6ZmFsc2UsImZsYWdzIjpbXSwiY2Jfa3dhcmdzIjp7fX0=\n' +) + + def test_without_reconstruction(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request apify_request = ApifyRequest( @@ -62,13 +82,13 @@ def test_without_reconstruction_with_optional_fields(spider: Spider) -> None: def test_with_reconstruction(spider: Spider) -> None: - # With reconstruction of encoded Scrapy request + # With reconstruction of JSON-encoded Scrapy request apify_request = ApifyRequest( url='https://apify.com', method='GET', unique_key='https://apify.com', user_data={ - 'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501 + 'scrapy_request': _SCRAPY_REQUEST_JSON_ENCODED, }, ) @@ -82,7 +102,7 @@ def test_with_reconstruction(spider: Spider) -> None: def test_with_reconstruction_with_optional_fields(spider: Spider) -> None: - # With reconstruction of encoded Scrapy request + # With reconstruction of JSON-encoded Scrapy request apify_request = ApifyRequest( url='https://apify.com', method='GET', @@ -90,7 +110,7 @@ def test_with_reconstruction_with_optional_fields(spider: Spider) -> None: headers=HttpHeaders({'Authorization': 'Bearer access_token'}), user_data={ 'some_user_data': 'hello', - 'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501 + 'scrapy_request': _SCRAPY_REQUEST_JSON_ENCODED, }, ) @@ -119,3 +139,84 @@ def test_invalid_request_for_reconstruction(spider: Spider) -> None: with pytest.raises(binascii.Error): to_scrapy_request(apify_request, spider) + + +def test_pickle_payload_rejected(spider: Spider) -> None: + """Verify that a pickle-serialized payload is rejected (CWE-502 mitigation). + + Previously, scrapy_request data was serialized with pickle, which allows + arbitrary code execution when deserializing untrusted data from the + Apify Request Queue. The new implementation uses JSON and must reject + pickle payloads. + """ + # Build a pickle payload (like the old code produced) + scrapy_request_dict = { + 'url': 'https://evil.com', + 'callback': None, + 'errback': None, + 'headers': {}, + 'method': 'GET', + 'body': b'', + 'cookies': {}, + 'meta': {}, + 'encoding': 'utf-8', + 'priority': 0, + 'dont_filter': False, + 'flags': [], + 'cb_kwargs': {}, + } + pickle_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode() + + apify_request = ApifyRequest( + url='https://evil.com', + method='GET', + unique_key='https://evil.com', + user_data={'scrapy_request': pickle_encoded}, + ) + + # The new JSON-based deserialization must reject the pickle payload + with pytest.raises((json.JSONDecodeError, UnicodeDecodeError, ValueError)): + to_scrapy_request(apify_request, spider) + + +def test_roundtrip_serialization(spider: Spider) -> None: + """Verify that to_apify_request -> to_scrapy_request roundtrip works with JSON encoding.""" + original_request = Request( + url='https://example.com/test', + method='POST', + body=b'test body content', + headers={'Content-Type': 'application/json', 'X-Custom': 'value'}, + meta={'userData': {'custom_key': 'custom_value'}}, + ) + + apify_request = to_apify_request(original_request, spider) + assert apify_request is not None + + # Verify the encoded data is valid JSON (not pickle) + encoded = apify_request.user_data['scrapy_request'] + decoded_bytes = codecs.decode(encoded.encode(), 'base64') + decoded_json = json.loads(decoded_bytes.decode('utf-8')) + assert isinstance(decoded_json, dict) + assert decoded_json['url'] == 'https://example.com/test' + + # Reconstruct the Scrapy request + restored = to_scrapy_request(apify_request, spider) + assert isinstance(restored, Request) + assert restored.url == original_request.url + assert restored.method == original_request.method + assert restored.body == original_request.body + + +def test_no_pickle_in_serialized_output(spider: Spider) -> None: + """Confirm that to_apify_request never produces pickle-serialized output.""" + scrapy_request = Request(url='https://example.com') + apify_request = to_apify_request(scrapy_request, spider) + assert apify_request is not None + + encoded = apify_request.user_data['scrapy_request'] + raw_bytes = codecs.decode(encoded.encode(), 'base64') + + # Pickle protocol 4 starts with b'\x80\x04'; JSON starts with b'{' + assert not raw_bytes.startswith(b'\x80'), 'Output must not be pickle-serialized' + # Verify it's valid JSON + json.loads(raw_bytes.decode('utf-8'))