diff --git a/pyproject.toml b/pyproject.toml
index 699d68dffe..6d56a9fec9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ dependencies = [
     "pydantic>=2.11.0",
     "pyee>=9.0.0",
     "tldextract>=5.1.0",
-    "typing-extensions>=4.1.0",
+    "typing-extensions>=4.10.0",
     "yarl>=1.18.0",
 ]
diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py
index 1e35510340..d999149e7e 100644
--- a/src/crawlee/_types.py
+++ b/src/crawlee/_types.py
@@ -27,9 +27,7 @@
     from crawlee.storage_clients import StorageClient
     from crawlee.storages import KeyValueStore
 
-    # Workaround for https://github.com/pydantic/pydantic/issues/9445
-    J = TypeVar('J', bound='JsonSerializable')
-    JsonSerializable = list[J] | dict[str, J] | str | bool | int | float | None
+    JsonSerializable = dict[str, 'JsonSerializable'] | list['JsonSerializable'] | str | int | float | bool | None
 else:
     from pydantic import JsonValue as JsonSerializable
@@ -198,7 +196,7 @@ class PushDataKwargs(TypedDict):
 
 
 class PushDataFunctionCall(PushDataKwargs):
-    data: list[dict[str, Any]] | dict[str, Any]
+    data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]
     dataset_id: str | None
     dataset_name: str | None
     dataset_alias: str | None
@@ -300,7 +298,7 @@ async def add_requests(
 
     async def push_data(
         self,
-        data: list[dict[str, Any]] | dict[str, Any],
+        data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
@@ -392,7 +390,7 @@ def __call__(
         selector: str | None = None,
         attribute: str | None = None,
         label: str | None = None,
-        user_data: dict[str, Any] | None = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         rq_id: str | None = None,
         rq_name: str | None = None,
@@ -417,7 +415,7 @@ def __call__(
         selector: str | None = None,
         attribute: str | None = None,
         label: str | None = None,
-        user_data: dict[str, Any] | None = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
         rq_id: str | None = None,
@@ -465,7 +463,7 @@ def __call__(
         selector: str = 'a',
         attribute: str = 'href',
         label: str | None = None,
-        user_data: dict[str, Any] | None = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, list[Request]]:
@@ -543,7 +541,7 @@ class PushDataFunction(Protocol):
 
     def __call__(
        self,
-        data: list[dict[str, Any]] | dict[str, Any],
+        data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py
index 1d297fa724..b3ee5662f4 100644
--- a/src/crawlee/_utils/file.py
+++ b/src/crawlee/_utils/file.py
@@ -10,12 +10,12 @@ from typing import TYPE_CHECKING, overload
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
+    from collections.abc import AsyncIterator, Mapping
     from typing import Any, TextIO
 
     from typing_extensions import Unpack
 
-    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs
+    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs, JsonSerializable
 
 
 if sys.platform == 'win32':
@@ -150,7 +150,7 @@ async def atomic_write(
 
 
 async def export_json_to_stream(
-    iterator: AsyncIterator[dict[str, Any]],
+    iterator: AsyncIterator[Mapping[str, JsonSerializable]],
     dst: TextIO,
     **kwargs: Unpack[ExportDataJsonKwargs],
 ) -> None:
@@ -159,7 +159,7 @@ async def export_json_to_stream(
 
 
 async def export_csv_to_stream(
-    iterator: AsyncIterator[dict[str, Any]],
+    iterator: AsyncIterator[Mapping[str, JsonSerializable]],
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
index 56046f0b64..c2fe8f5c74 100644
--- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
+++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -4,7 +4,7 @@
 import logging
 from abc import ABC
 from datetime import timedelta
-from typing import TYPE_CHECKING, Any, Generic
+from typing import TYPE_CHECKING, Generic
 
 from more_itertools import partition
 from pydantic import ValidationError
@@ -26,7 +26,7 @@
     from typing_extensions import Unpack
 
     from crawlee import RequestTransformAction
-    from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction
+    from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction, JsonSerializable
 
     from ._abstract_http_parser import AbstractHttpParser
@@ -200,7 +200,7 @@ async def extract_links(
         selector: str = 'a',
         attribute: str = 'href',
         label: str | None = None,
-        user_data: dict[str, Any] | None = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
         | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 1c79c4eadd..207a1968ad 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -80,7 +80,7 @@
 if TYPE_CHECKING:
     import re
 
-    from collections.abc import Iterator
+    from collections.abc import Iterator, Mapping
     from contextlib import AbstractAsyncContextManager
 
     from crawlee._types import (
@@ -941,7 +941,7 @@ async def export_data(
 
     async def _push_data(
         self,
-        data: list[dict[str, Any]] | dict[str, Any],
+        data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
@@ -1015,7 +1015,7 @@ async def enqueue_links(
         selector: str | None = None,
         attribute: str | None = None,
         label: str | None = None,
-        user_data: dict[str, Any] | None = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
         | None = None,
         requests: Sequence[str | Request] | None = None,
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
index 19b98c79c3..73e4d26d36 100644
--- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py
+++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -53,6 +53,7 @@
         HttpHeaders,
         HttpMethod,
         HttpPayload,
+        JsonSerializable,
     )
 
     from crawlee.browsers._types import BrowserType
@@ -384,7 +385,7 @@ async def extract_links(
         selector: str = 'a',
         attribute: str = 'href',
         label: str | None = None,
-        user_data: dict | None = None,
+        user_data: dict[str, JsonSerializable] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
         | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
diff --git a/src/crawlee/sessions/_models.py b/src/crawlee/sessions/_models.py
index 2f5b4a0483..c2e17ad439 100644
--- a/src/crawlee/sessions/_models.py
+++ b/src/crawlee/sessions/_models.py
@@ -13,6 +13,8 @@
     computed_field,
 )
 
+from crawlee._types import JsonSerializable
+
 from ._cookies import CookieParam
 from ._session import Session
@@ -24,7 +26,7 @@ class SessionModel(BaseModel):
 
     id: Annotated[str, Field(alias='id')]
     max_age: Annotated[timedelta, Field(alias='maxAge')]
-    user_data: Annotated[dict, Field(alias='userData')]
+    user_data: Annotated[dict[str, JsonSerializable], Field(alias='userData')]
     max_error_score: Annotated[float, Field(alias='maxErrorScore')]
     error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')]
     created_at: Annotated[datetime, Field(alias='createdAt')]
diff --git a/src/crawlee/sessions/_session.py b/src/crawlee/sessions/_session.py
index b36d1b4970..6663d43cfe 100644
--- a/src/crawlee/sessions/_session.py
+++ b/src/crawlee/sessions/_session.py
@@ -11,8 +11,10 @@
 from crawlee.sessions._cookies import CookieParam, SessionCookies
 
 if TYPE_CHECKING:
+    from collections.abc import Mapping, MutableMapping
     from http.cookiejar import CookieJar
 
+    from crawlee._types import JsonSerializable
     from crawlee.sessions._models import SessionModel
 
 logger = getLogger(__name__)
@@ -36,7 +38,7 @@ def __init__(
         *,
         id: str | None = None,
         max_age: timedelta = timedelta(minutes=50),
-        user_data: dict | None = None,
+        user_data: Mapping[str, JsonSerializable] | None = None,
         max_error_score: float = 3.0,
         error_score_decrement: float = 0.5,
         created_at: datetime | None = None,
@@ -63,7 +65,7 @@
         """
         self._id = id or crypto_random_object_id(length=10)
         self._max_age = max_age
-        self._user_data = user_data or {}
+        self._user_data: dict[str, JsonSerializable] = dict(user_data) if user_data is not None else {}
         self._max_error_score = max_error_score
         self._error_score_decrement = error_score_decrement
         self._created_at = created_at or datetime.now(timezone.utc)
@@ -117,7 +119,7 @@ def id(self) -> str:
         return self._id
 
     @property
-    def user_data(self) -> dict:
+    def user_data(self) -> MutableMapping[str, JsonSerializable]:
         """Get the user data."""
         return self._user_data
diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py
index eb1099708d..68d3a6a3df 100644
--- a/src/crawlee/storage_clients/_base/_dataset_client.py
+++ b/src/crawlee/storage_clients/_base/_dataset_client.py
@@ -4,9 +4,11 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
-    from typing import Any
+    from collections.abc import AsyncIterator, Mapping, Sequence
 
+    from typing_extensions import TypeIs
+
+    from crawlee._types import JsonSerializable
     from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
@@ -42,7 +44,7 @@ async def purge(self) -> None:
         """
 
     @abstractmethod
-    async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
+    async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
         """Push data to the dataset.
 
         The backend method for the `Dataset.push_data` call.
@@ -82,7 +84,7 @@ async def iterate_items(
         unwind: list[str] | None = None,
         skip_empty: bool = False,
         skip_hidden: bool = False,
-    ) -> AsyncIterator[dict[str, Any]]:
+    ) -> AsyncIterator[Mapping[str, JsonSerializable]]:
         """Iterate over the dataset items with filtering options.
 
         The backend method for the `Dataset.iterate_items` call.
@@ -91,3 +93,9 @@
         raise NotImplementedError
         if False:
             yield {}
+
+    @staticmethod
+    def _is_list_of_items(
+        data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
+    ) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]:
+        return isinstance(data, list)
diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py
index b970a98928..38c0ede0f1 100644
--- a/src/crawlee/storage_clients/_file_system/_dataset_client.py
+++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py
@@ -3,6 +3,7 @@
 import asyncio
 import json
 import shutil
+from collections.abc import Mapping
 from datetime import datetime, timezone
 from logging import getLogger
 from pathlib import Path
@@ -12,6 +13,7 @@
 from typing_extensions import Self, override
 
 from crawlee._consts import METADATA_FILENAME
+from crawlee._types import JsonSerializable
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
 from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
@@ -19,7 +21,7 @@
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
+    from collections.abc import AsyncIterator, Sequence
 
     from crawlee.configuration import Configuration
@@ -220,10 +222,10 @@ async def purge(self) -> None:
         )
 
     @override
-    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
+    async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
         async with self._lock:
             new_item_count = self._metadata.item_count
-            if isinstance(data, list):
+            if self._is_list_of_items(data):
                 for item in data:
                     new_item_count += 1
                     await self._push_item(item, new_item_count)
@@ -304,7 +306,7 @@ async def get_data(
             selected_files = selected_files[:limit]
 
         # Read and parse each data file.
-        items = list[dict[str, Any]]()
+        items = list[Mapping[str, JsonSerializable]]()
         for file_path in selected_files:
             try:
                 file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
@@ -441,7 +443,7 @@ async def _update_metadata(
         data = await json_dumps(self._metadata.model_dump())
         await atomic_write(self.path_to_metadata, data)
 
-    async def _push_item(self, item: dict[str, Any], item_id: int) -> None:
+    async def _push_item(self, item: Mapping[str, JsonSerializable], item_id: int) -> None:
         """Push a single item to the dataset.
 
         This method writes the item as a JSON file with a zero-padded numeric filename
diff --git a/src/crawlee/storage_clients/_memory/_dataset_client.py b/src/crawlee/storage_clients/_memory/_dataset_client.py
index 67abc6f6dc..f98e1f5296 100644
--- a/src/crawlee/storage_clients/_memory/_dataset_client.py
+++ b/src/crawlee/storage_clients/_memory/_dataset_client.py
@@ -1,18 +1,21 @@
 from __future__ import annotations
 
+from collections.abc import Mapping
 from datetime import datetime, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING, Any
 
 from typing_extensions import Self, override
 
+from crawlee._types import JsonSerializable
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
+    from collections.abc import AsyncIterator, Sequence
+
 
 logger = getLogger(__name__)
@@ -41,7 +44,7 @@ def __init__(
         """
         self._metadata = metadata
 
-        self._records = list[dict[str, Any]]()
+        self._records = list[Mapping[str, JsonSerializable]]()
         """List to hold dataset items. Each item is a dictionary representing a record."""
 
     @override
@@ -113,11 +116,11 @@ async def purge(self) -> None:
         )
 
     @override
-    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
+    async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
         metadata = await self.get_metadata()
         new_item_count = metadata.item_count
 
-        if isinstance(data, list):
+        if self._is_list_of_items(data):
             for item in data:
                 new_item_count += 1
                 await self._push_item(item)
@@ -203,7 +206,7 @@ async def iterate_items(
         unwind: list[str] | None = None,
         skip_empty: bool = False,
         skip_hidden: bool = False,
-    ) -> AsyncIterator[dict[str, Any]]:
+    ) -> AsyncIterator[Mapping[str, JsonSerializable]]:
         # Check for unsupported arguments and log a warning if found
         unsupported_args: dict[str, Any] = {
             'clean': clean,
@@ -260,7 +263,7 @@ async def _update_metadata(
         if new_item_count is not None:
             self._metadata.item_count = new_item_count
 
-    async def _push_item(self, item: dict[str, Any]) -> None:
+    async def _push_item(self, item: Mapping[str, JsonSerializable]) -> None:
         """Push a single item to the dataset.
 
         Args:
diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py
index db2b6375c6..5777dc2cd5 100644
--- a/src/crawlee/storage_clients/_redis/_dataset_client.py
+++ b/src/crawlee/storage_clients/_redis/_dataset_client.py
@@ -14,11 +14,13 @@
 from ._utils import await_redis_response
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
+    from collections.abc import AsyncIterator, Mapping, Sequence
 
     from redis.asyncio import Redis
     from redis.asyncio.client import Pipeline
 
+    from crawlee._types import JsonSerializable
+
 
 logger = getLogger(__name__)
@@ -126,8 +128,8 @@ async def purge(self) -> None:
 
     @retry_on_error(RedisError)
     @override
-    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
-        if isinstance(data, dict):
+    async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
+        if not self._is_list_of_items(data):
             data = [data]
 
         async with self._get_pipeline() as pipe:
diff --git a/src/crawlee/storage_clients/_sql/_dataset_client.py b/src/crawlee/storage_clients/_sql/_dataset_client.py
index d17031ca89..239ac16623 100644
--- a/src/crawlee/storage_clients/_sql/_dataset_client.py
+++ b/src/crawlee/storage_clients/_sql/_dataset_client.py
@@ -17,12 +17,14 @@
 from ._db_models import DatasetItemDb, DatasetMetadataBufferDb, DatasetMetadataDb
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
+    from collections.abc import AsyncIterator, Mapping, Sequence
 
     from sqlalchemy import Select
     from sqlalchemy.ext.asyncio import AsyncSession
     from typing_extensions import NotRequired
 
+    from crawlee._types import JsonSerializable
+
     from ._storage_client import SqlStorageClient
@@ -144,8 +146,8 @@ async def purge(self) -> None:
 
     @retry_on_error(SQLAlchemyError)
     @override
-    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
-        if not isinstance(data, list):
+    async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
+        if not self._is_list_of_items(data):
             data = [data]
 
         db_items = [{'dataset_id': self._id, 'data': item} for item in data]
diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py
index 2ebd65914d..1725340a53 100644
--- a/src/crawlee/storage_clients/models.py
+++ b/src/crawlee/storage_clients/models.py
@@ -6,10 +6,13 @@
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
 from typing_extensions import TypeVar
 
-from crawlee._types import HttpMethod
+from crawlee._types import HttpMethod, JsonSerializable
 from crawlee._utils.docs import docs_group
 from crawlee._utils.urls import validate_http_url
 
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence
+
 KvsValueType = TypeVar('KvsValueType', default=Any)
@@ -129,7 +132,7 @@ class DatasetItemsListPage(BaseModel):
 
     # Workaround for Pydantic and type checkers when using Annotated with default_factory
     if TYPE_CHECKING:
-        items: list[dict] = []
+        items: Sequence[Mapping[str, JsonSerializable]] = []
         """The list of dataset items returned on this page."""
     else:
         items: Annotated[list[dict], Field(default_factory=list)]
diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py
index a0436aa576..67c201cfd4 100644
--- a/src/crawlee/storages/_dataset.py
+++ b/src/crawlee/storages/_dataset.py
@@ -15,12 +15,12 @@
 from ._utils import validate_storage_name
 
 if TYPE_CHECKING:
-    from collections.abc import AsyncIterator
+    from collections.abc import AsyncIterator, Mapping, Sequence
     from typing import Any, Literal
 
     from typing_extensions import Unpack
 
-    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs
+    from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs, JsonSerializable
     from crawlee.configuration import Configuration
     from crawlee.storage_clients import StorageClient
     from crawlee.storage_clients._base import DatasetClient
@@ -134,7 +134,7 @@ async def drop(self) -> None:
 
     async def purge(self) -> None:
         await self._client.purge()
 
-    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
+    async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
         """Store an object or an array of objects to the dataset.
 
         The size of the data is limited by the receiving API and therefore `push_data()` will only
@@ -210,7 +210,7 @@ async def iterate_items(
         unwind: list[str] | None = None,
         skip_empty: bool = False,
         skip_hidden: bool = False,
-    ) -> AsyncIterator[dict[str, Any]]:
+    ) -> AsyncIterator[Mapping[str, JsonSerializable]]:
         """Iterate over items in the dataset according to specified filters and sorting.
 
         This method allows for asynchronously iterating through dataset items while applying various filters such as
@@ -258,7 +258,7 @@ async def list_items(
         unwind: list[str] | None = None,
         skip_empty: bool = False,
         skip_hidden: bool = False,
-    ) -> list[dict[str, Any]]:
+    ) -> list[Mapping[str, JsonSerializable]]:
         """Retrieve a list of all items from the dataset according to specified filters and sorting.
 
         This method collects all dataset items into a list while applying various filters such as
diff --git a/uv.lock b/uv.lock
index a9e945e853..b63298c0c7 100644
--- a/uv.lock
+++ b/uv.lock
@@ -956,7 +956,7 @@ requires-dist = [
     { name = "sqlalchemy", extras = ["asyncio"], marker = "extra == 'sql-sqlite'", specifier = ">=2.0.0,<3.0.0" },
     { name = "tldextract", specifier = ">=5.1.0" },
     { name = "typer", marker = "extra == 'cli'", specifier = ">=0.12.0" },
-    { name = "typing-extensions", specifier = ">=4.1.0" },
+    { name = "typing-extensions", specifier = ">=4.10.0" },
    { name = "wrapt", marker = "extra == 'otel'", specifier = ">=1.17.0" },
     { name = "yarl", specifier = ">=1.18.0" },
 ]
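
Note: the sketch below is not part of the patch; it is a minimal, self-contained illustration of the two typing features the change leans on, i.e. the recursive `JsonSerializable` alias from `_types.py` and `TypeIs`-based narrowing (`TypeIs` first shipped in typing_extensions 4.10.0, which is presumably why the dependency floor moves from 4.1.0). The names `is_list_of_items` and `normalize_items` are hypothetical stand-ins for the `_is_list_of_items` helper and the `push_data` call sites above.

from __future__ import annotations

from collections.abc import Mapping, Sequence
from typing import TYPE_CHECKING

from typing_extensions import TypeIs

if TYPE_CHECKING:
    # Recursive alias, as in _types.py: a JSON value is a mapping or list of
    # further JSON values, or a scalar. Pydantic's JsonValue is the runtime twin.
    JsonSerializable = dict[str, 'JsonSerializable'] | list['JsonSerializable'] | str | int | float | bool | None
else:
    from pydantic import JsonValue as JsonSerializable


def is_list_of_items(
    data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]:
    # Unlike TypeGuard, TypeIs also narrows the negative branch: in the `else`
    # arm the checker knows `data` is a Mapping. A Mapping is never a `list`,
    # so the runtime check keeps the two union arms apart.
    return isinstance(data, list)


def normalize_items(
    data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
) -> list[Mapping[str, JsonSerializable]]:
    # The same "one item or many" normalization the dataset clients perform,
    # with no cast needed after the narrowing helper.
    if not is_list_of_items(data):
        data = [data]  # narrowed to a single Mapping here
    return list(data)


print(normalize_items({'url': 'https://example.com', 'depth': 1}))
print(normalize_items([{'a': 1}, {'b': [True, None, 'x']}]))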