Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 55 additions & 5 deletions src/apify/scrapy/requests.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from __future__ import annotations

import base64
import codecs
import pickle
import json
from logging import getLogger
from typing import Any, cast

from pydantic import BaseModel
from scrapy import Request as ScrapyRequest
from scrapy import Spider
from scrapy.http.headers import Headers
Expand All @@ -17,6 +19,45 @@

logger = getLogger(__name__)

# Sentinel key used to wrap bytes values in JSON-safe dicts.
_BYTES_SENTINEL = '__b64__'


def _encode_for_json(obj: Any) -> Any:
"""Recursively encode an object so it is JSON-serializable.

``bytes`` values are replaced with ``{"__b64__": "<base64-ascii>"}``.
``bytes`` dict keys are decoded to UTF-8 strings (Scrapy headers use bytes keys).
Pydantic ``BaseModel`` instances are converted via ``model_dump()``.
Enum members are replaced with their ``.value``.
"""
if isinstance(obj, bytes):
return {_BYTES_SENTINEL: base64.b64encode(obj).decode('ascii')}
if isinstance(obj, BaseModel):
return _encode_for_json(obj.model_dump(by_alias=True))
if isinstance(obj, dict):
return {(k.decode('utf-8') if isinstance(k, bytes) else k): _encode_for_json(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [_encode_for_json(v) for v in obj]
# Handle enum values (e.g. RequestState)
if hasattr(obj, 'value') and not isinstance(obj, (str, int, float, bool)):
return obj.value
return obj


def _decode_from_json(obj: Any) -> Any:
"""Reverse :func:`_encode_for_json`.

Dicts of the form ``{"__b64__": "<data>"}`` are decoded back to ``bytes``.
"""
if isinstance(obj, dict):
if _BYTES_SENTINEL in obj and len(obj) == 1:
return base64.b64decode(obj[_BYTES_SENTINEL])
return {k: _decode_from_json(v) for k, v in obj.items()}
if isinstance(obj, list):
return [_decode_from_json(v) for v in obj]
return obj


def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequest | None:
"""Convert a Scrapy request to an Apify request.
Expand Down Expand Up @@ -78,11 +119,12 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
apify_request = ApifyRequest.from_url(**request_kwargs)

# Serialize the Scrapy ScrapyRequest and store it in the apify_request.
# - This process involves converting the Scrapy ScrapyRequest object into a dictionary, encoding it to base64,
# - This process involves converting the Scrapy ScrapyRequest object into a dictionary,
# JSON-encoding it (with bytes values base64-wrapped), then base64-encoding the result,
# and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
# - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
scrapy_request_dict = scrapy_request.to_dict(spider=spider)
scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
scrapy_request_json = json.dumps(_encode_for_json(scrapy_request_dict))
scrapy_request_dict_encoded = codecs.encode(scrapy_request_json.encode('utf-8'), 'base64').decode()
apify_request.user_data['scrapy_request'] = scrapy_request_dict_encoded

except Exception as exc:
Expand Down Expand Up @@ -123,7 +165,15 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ
if not isinstance(scrapy_request_dict_encoded, str):
raise TypeError('scrapy_request_dict_encoded must be a string')

scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
scrapy_request_json = codecs.decode(scrapy_request_dict_encoded.encode(), 'base64').decode('utf-8')
scrapy_request_dict = _decode_from_json(json.loads(scrapy_request_json))

# Scrapy's request_from_dict expects bytes keys in the headers dict.
if 'headers' in scrapy_request_dict and isinstance(scrapy_request_dict['headers'], dict):
scrapy_request_dict['headers'] = {
k.encode('utf-8') if isinstance(k, str) else k: v for k, v in scrapy_request_dict['headers'].items()
}

if not isinstance(scrapy_request_dict, dict):
raise TypeError('scrapy_request_dict must be a dictionary')

Expand Down
111 changes: 106 additions & 5 deletions tests/unit/scrapy/requests/test_to_scrapy_request.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
from __future__ import annotations

import binascii
import codecs
import json
import pickle

import pytest
from scrapy import Request, Spider

from crawlee._types import HttpHeaders

from apify import Request as ApifyRequest
from apify.scrapy.requests import to_scrapy_request
from apify.scrapy.requests import to_apify_request, to_scrapy_request


class DummySpider(Spider):
Expand All @@ -21,6 +24,23 @@ def spider() -> DummySpider:
return DummySpider()


# JSON-encoded (safe) fixture matching the previous pickle-encoded test data.
_SCRAPY_REQUEST_JSON_ENCODED = (
'eyJ1cmwiOiJodHRwczovL2FwaWZ5LmNvbSIsImNhbGxiYWNrIjpudWxsLCJlcnJiYWNrIjpudWxs'
'LCJoZWFkZXJzIjp7IkFjY2VwdCI6W3siX19iNjRfXyI6ImRHVjRkQzlvZEcxc0xHRndjR3hwWTJG'
'MGFXOXVMM2hvZEcxc0szaHRiQ3hoY0hCc2FXTmhkR2x2Ymk5NGJXdzdjVDB3TGprc0tpOHFPM0U5'
'TUM0NCJ9XSwiQWNjZXB0LUxhbmd1YWdlIjpbeyJfX2I2NF9fIjoiWlc0PSJ9XSwiVXNlci1BZ2Vu'
'dCI6W3siX19iNjRfXyI6IlUyTnlZWEI1THpJdU1URXVNQ0FvSzJoMGRIQnpPaTh2YzJOeVlYQjVM'
'bTl5WnlrPSJ9XSwiQWNjZXB0LUVuY29kaW5nIjpbeyJfX2I2NF9fIjoiWjNwcGNDd2daR1ZtYkdG'
'MFpRPT0ifV19LCJtZXRob2QiOiJHRVQiLCJib2R5Ijp7Il9fYjY0X18iOiIifSwiY29va2llcyI6'
'e30sIm1ldGEiOnsiYXBpZnlfcmVxdWVzdF9pZCI6ImZ2d3NjTzJVSkxkcjEwQiIsImFwaWZ5X3Jl'
'cXVlc3RfdW5pcXVlX2tleSI6Imh0dHBzOi8vYXBpZnkuY29tIiwiZG93bmxvYWRfdGltZW91dCI6'
'MTgwLjAsImRvd25sb2FkX3Nsb3QiOiJhcGlmeS5jb20iLCJkb3dubG9hZF9sYXRlbmN5IjowLjA4'
'Mzk4MTk5MDgxNDIwODk4fSwiZW5jb2RpbmciOiJ1dGYtOCIsInByaW9yaXR5IjowLCJkb250X2Zp'
'bHRlciI6ZmFsc2UsImZsYWdzIjpbXSwiY2Jfa3dhcmdzIjp7fX0=\n'
)


def test_without_reconstruction(spider: Spider) -> None:
# Without reconstruction of encoded Scrapy request
apify_request = ApifyRequest(
Expand Down Expand Up @@ -62,13 +82,13 @@ def test_without_reconstruction_with_optional_fields(spider: Spider) -> None:


def test_with_reconstruction(spider: Spider) -> None:
# With reconstruction of encoded Scrapy request
# With reconstruction of JSON-encoded Scrapy request
apify_request = ApifyRequest(
url='https://apify.com',
method='GET',
unique_key='https://apify.com',
user_data={
'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501
'scrapy_request': _SCRAPY_REQUEST_JSON_ENCODED,
},
)

Expand All @@ -82,15 +102,15 @@ def test_with_reconstruction(spider: Spider) -> None:


def test_with_reconstruction_with_optional_fields(spider: Spider) -> None:
# With reconstruction of encoded Scrapy request
# With reconstruction of JSON-encoded Scrapy request
apify_request = ApifyRequest(
url='https://apify.com',
method='GET',
unique_key='https://apify.com',
headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
user_data={
'some_user_data': 'hello',
'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501
'scrapy_request': _SCRAPY_REQUEST_JSON_ENCODED,
},
)

Expand Down Expand Up @@ -119,3 +139,84 @@ def test_invalid_request_for_reconstruction(spider: Spider) -> None:

with pytest.raises(binascii.Error):
to_scrapy_request(apify_request, spider)


def test_pickle_payload_rejected(spider: Spider) -> None:
"""Verify that a pickle-serialized payload is rejected (CWE-502 mitigation).

Previously, scrapy_request data was serialized with pickle, which allows
arbitrary code execution when deserializing untrusted data from the
Apify Request Queue. The new implementation uses JSON and must reject
pickle payloads.
"""
# Build a pickle payload (like the old code produced)
scrapy_request_dict = {
'url': 'https://evil.com',
'callback': None,
'errback': None,
'headers': {},
'method': 'GET',
'body': b'',
'cookies': {},
'meta': {},
'encoding': 'utf-8',
'priority': 0,
'dont_filter': False,
'flags': [],
'cb_kwargs': {},
}
pickle_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()

apify_request = ApifyRequest(
url='https://evil.com',
method='GET',
unique_key='https://evil.com',
user_data={'scrapy_request': pickle_encoded},
)

# The new JSON-based deserialization must reject the pickle payload
with pytest.raises((json.JSONDecodeError, UnicodeDecodeError, ValueError)):
to_scrapy_request(apify_request, spider)


def test_roundtrip_serialization(spider: Spider) -> None:
"""Verify that to_apify_request -> to_scrapy_request roundtrip works with JSON encoding."""
original_request = Request(
url='https://example.com/test',
method='POST',
body=b'test body content',
headers={'Content-Type': 'application/json', 'X-Custom': 'value'},
meta={'userData': {'custom_key': 'custom_value'}},
)

apify_request = to_apify_request(original_request, spider)
assert apify_request is not None

# Verify the encoded data is valid JSON (not pickle)
encoded = apify_request.user_data['scrapy_request']
decoded_bytes = codecs.decode(encoded.encode(), 'base64')
decoded_json = json.loads(decoded_bytes.decode('utf-8'))
assert isinstance(decoded_json, dict)
assert decoded_json['url'] == 'https://example.com/test'

# Reconstruct the Scrapy request
restored = to_scrapy_request(apify_request, spider)
assert isinstance(restored, Request)
assert restored.url == original_request.url
assert restored.method == original_request.method
assert restored.body == original_request.body


def test_no_pickle_in_serialized_output(spider: Spider) -> None:
"""Confirm that to_apify_request never produces pickle-serialized output."""
scrapy_request = Request(url='https://example.com')
apify_request = to_apify_request(scrapy_request, spider)
assert apify_request is not None

encoded = apify_request.user_data['scrapy_request']
raw_bytes = codecs.decode(encoded.encode(), 'base64')

# Pickle protocol 4 starts with b'\x80\x04'; JSON starts with b'{'
assert not raw_bytes.startswith(b'\x80'), 'Output must not be pickle-serialized'
# Verify it's valid JSON
json.loads(raw_bytes.decode('utf-8'))