Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ dependencies = [
"pydantic>=2.11.0",
"pyee>=9.0.0",
"tldextract>=5.1.0",
"typing-extensions>=4.1.0",
"typing-extensions>=4.10.0",
"yarl>=1.18.0",
]

Expand Down
16 changes: 7 additions & 9 deletions src/crawlee/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@
from crawlee.storage_clients import StorageClient
from crawlee.storages import KeyValueStore

# Workaround for https://github.com/pydantic/pydantic/issues/9445
J = TypeVar('J', bound='JsonSerializable')
JsonSerializable = list[J] | dict[str, J] | str | bool | int | float | None
JsonSerializable = dict[str, 'JsonSerializable'] | list['JsonSerializable'] | str | int | float | bool | None
else:
from pydantic import JsonValue as JsonSerializable
Comment on lines -30 to 32
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering whether we could now use Pydantic's `JsonValue` type for type checking as well, instead of maintaining a separate alias — have you tried it? 🙂


Expand Down Expand Up @@ -198,7 +196,7 @@ class PushDataKwargs(TypedDict):


class PushDataFunctionCall(PushDataKwargs):
data: list[dict[str, Any]] | dict[str, Any]
data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]
dataset_id: str | None
dataset_name: str | None
dataset_alias: str | None
Expand Down Expand Up @@ -300,7 +298,7 @@ async def add_requests(

async def push_data(
self,
data: list[dict[str, Any]] | dict[str, Any],
data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
dataset_id: str | None = None,
dataset_name: str | None = None,
dataset_alias: str | None = None,
Expand Down Expand Up @@ -392,7 +390,7 @@ def __call__(
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
rq_id: str | None = None,
rq_name: str | None = None,
Expand All @@ -417,7 +415,7 @@ def __call__(
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
requests: Sequence[str | Request] | None = None,
rq_id: str | None = None,
Expand Down Expand Up @@ -465,7 +463,7 @@ def __call__(
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
user_data: dict[str, Any] | None = None,
user_data: dict[str, JsonSerializable] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, list[Request]]:
Expand Down Expand Up @@ -543,7 +541,7 @@ class PushDataFunction(Protocol):

def __call__(
self,
data: list[dict[str, Any]] | dict[str, Any],
data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
dataset_id: str | None = None,
dataset_name: str | None = None,
dataset_alias: str | None = None,
Expand Down
8 changes: 4 additions & 4 deletions src/crawlee/_utils/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
from typing import TYPE_CHECKING, overload

if TYPE_CHECKING:
from collections.abc import AsyncIterator
from collections.abc import AsyncIterator, Mapping
from typing import Any, TextIO

from typing_extensions import Unpack

from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs
from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs, JsonSerializable

if sys.platform == 'win32':

Expand Down Expand Up @@ -150,7 +150,7 @@ async def atomic_write(


async def export_json_to_stream(
iterator: AsyncIterator[dict[str, Any]],
iterator: AsyncIterator[Mapping[str, JsonSerializable]],
Comment thread
vdusek marked this conversation as resolved.
dst: TextIO,
**kwargs: Unpack[ExportDataJsonKwargs],
) -> None:
Expand All @@ -159,7 +159,7 @@ async def export_json_to_stream(


async def export_csv_to_stream(
iterator: AsyncIterator[dict[str, Any]],
iterator: AsyncIterator[Mapping[str, JsonSerializable]],
dst: TextIO,
**kwargs: Unpack[ExportDataCsvKwargs],
) -> None:
Expand Down
6 changes: 3 additions & 3 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
from abc import ABC
from datetime import timedelta
from typing import TYPE_CHECKING, Any, Generic
from typing import TYPE_CHECKING, Generic

from more_itertools import partition
from pydantic import ValidationError
Expand All @@ -26,7 +26,7 @@
from typing_extensions import Unpack

from crawlee import RequestTransformAction
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, ExtractLinksFunction, JsonSerializable

from ._abstract_http_parser import AbstractHttpParser

Expand Down Expand Up @@ -200,7 +200,7 @@ async def extract_links(
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
user_data: dict[str, Any] | None = None,
user_data: dict[str, JsonSerializable] | None = None,
Comment thread
vdusek marked this conversation as resolved.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be `Mapping` / `MutableMapping` rather than `dict`?

transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
Expand Down
6 changes: 3 additions & 3 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@

if TYPE_CHECKING:
import re
from collections.abc import Iterator
from collections.abc import Iterator, Mapping
from contextlib import AbstractAsyncContextManager

from crawlee._types import (
Expand Down Expand Up @@ -941,7 +941,7 @@ async def export_data(

async def _push_data(
self,
data: list[dict[str, Any]] | dict[str, Any],
data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
Comment thread
vdusek marked this conversation as resolved.
dataset_id: str | None = None,
dataset_name: str | None = None,
dataset_alias: str | None = None,
Expand Down Expand Up @@ -1015,7 +1015,7 @@ async def enqueue_links(
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
user_data: dict[str, JsonSerializable] | None = None,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be `Mapping` / `MutableMapping` rather than `dict`?

transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
requests: Sequence[str | Request] | None = None,
Expand Down
3 changes: 2 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
HttpHeaders,
HttpMethod,
HttpPayload,
JsonSerializable,
)
from crawlee.browsers._types import BrowserType

Expand Down Expand Up @@ -384,7 +385,7 @@ async def extract_links(
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
user_data: dict | None = None,
user_data: dict[str, JsonSerializable] | None = None,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be `Mapping` / `MutableMapping` rather than `dict`?

transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
Expand Down
4 changes: 3 additions & 1 deletion src/crawlee/sessions/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
computed_field,
)

from crawlee._types import JsonSerializable

from ._cookies import CookieParam
from ._session import Session

Expand All @@ -24,7 +26,7 @@ class SessionModel(BaseModel):

id: Annotated[str, Field(alias='id')]
max_age: Annotated[timedelta, Field(alias='maxAge')]
user_data: Annotated[dict, Field(alias='userData')]
user_data: Annotated[dict[str, JsonSerializable], Field(alias='userData')]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be `Mapping` / `MutableMapping` rather than `dict`?

max_error_score: Annotated[float, Field(alias='maxErrorScore')]
error_score_decrement: Annotated[float, Field(alias='errorScoreDecrement')]
created_at: Annotated[datetime, Field(alias='createdAt')]
Expand Down
8 changes: 5 additions & 3 deletions src/crawlee/sessions/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
from crawlee.sessions._cookies import CookieParam, SessionCookies

if TYPE_CHECKING:
from collections.abc import Mapping, MutableMapping
from http.cookiejar import CookieJar

from crawlee._types import JsonSerializable
from crawlee.sessions._models import SessionModel

logger = getLogger(__name__)
Expand All @@ -36,7 +38,7 @@ def __init__(
*,
id: str | None = None,
max_age: timedelta = timedelta(minutes=50),
user_data: dict | None = None,
user_data: Mapping[str, JsonSerializable] | None = None,
max_error_score: float = 3.0,
error_score_decrement: float = 0.5,
created_at: datetime | None = None,
Expand All @@ -63,7 +65,7 @@ def __init__(
"""
self._id = id or crypto_random_object_id(length=10)
self._max_age = max_age
self._user_data = user_data or {}
self._user_data: dict[str, JsonSerializable] = dict(user_data) if user_data is not None else {}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be annotated as `MutableMapping`, no?

Suggested change
self._user_data: dict[str, JsonSerializable] = dict(user_data) if user_data is not None else {}
self._user_data: MutableMapping[str, JsonSerializable] = dict(user_data) if user_data is not None else {}

self._max_error_score = max_error_score
self._error_score_decrement = error_score_decrement
self._created_at = created_at or datetime.now(timezone.utc)
Expand Down Expand Up @@ -117,7 +119,7 @@ def id(self) -> str:
return self._id

@property
def user_data(self) -> dict:
def user_data(self) -> MutableMapping[str, JsonSerializable]:
"""Get the user data."""
return self._user_data

Expand Down
16 changes: 12 additions & 4 deletions src/crawlee/storage_clients/_base/_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import AsyncIterator
from typing import Any
from collections.abc import AsyncIterator, Mapping, Sequence

from typing_extensions import TypeIs

from crawlee._types import JsonSerializable
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata


Expand Down Expand Up @@ -42,7 +44,7 @@ async def purge(self) -> None:
"""

@abstractmethod
async def push_data(self, data: list[Any] | dict[str, Any]) -> None:
async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
"""Push data to the dataset.

The backend method for the `Dataset.push_data` call.
Expand Down Expand Up @@ -82,7 +84,7 @@ async def iterate_items(
unwind: list[str] | None = None,
skip_empty: bool = False,
skip_hidden: bool = False,
) -> AsyncIterator[dict[str, Any]]:
) -> AsyncIterator[Mapping[str, JsonSerializable]]:
"""Iterate over the dataset items with filtering options.

The backend method for the `Dataset.iterate_items` call.
Expand All @@ -91,3 +93,9 @@ async def iterate_items(
raise NotImplementedError
if False:
yield {}

@staticmethod
def _is_list_of_items(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: maybe we can name it "is sequence" instead of "is list"? 🙂

data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable],
) -> TypeIs[Sequence[Mapping[str, JsonSerializable]]]:
return isinstance(data, list)
12 changes: 7 additions & 5 deletions src/crawlee/storage_clients/_file_system/_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import asyncio
import json
import shutil
from collections.abc import Mapping
from datetime import datetime, timezone
from logging import getLogger
from pathlib import Path
Expand All @@ -12,14 +13,15 @@
from typing_extensions import Self, override

from crawlee._consts import METADATA_FILENAME
from crawlee._types import JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import atomic_write, json_dumps
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

if TYPE_CHECKING:
from collections.abc import AsyncIterator
from collections.abc import AsyncIterator, Sequence

from crawlee.configuration import Configuration

Expand Down Expand Up @@ -220,10 +222,10 @@ async def purge(self) -> None:
)

@override
async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
async with self._lock:
new_item_count = self._metadata.item_count
if isinstance(data, list):
if self._is_list_of_items(data):
for item in data:
new_item_count += 1
await self._push_item(item, new_item_count)
Expand Down Expand Up @@ -304,7 +306,7 @@ async def get_data(
selected_files = selected_files[:limit]

# Read and parse each data file.
items = list[dict[str, Any]]()
items = list[Mapping[str, JsonSerializable]]()
for file_path in selected_files:
try:
file_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8')
Expand Down Expand Up @@ -441,7 +443,7 @@ async def _update_metadata(
data = await json_dumps(self._metadata.model_dump())
await atomic_write(self.path_to_metadata, data)

async def _push_item(self, item: dict[str, Any], item_id: int) -> None:
async def _push_item(self, item: Mapping[str, JsonSerializable], item_id: int) -> None:
"""Push a single item to the dataset.

This method writes the item as a JSON file with a zero-padded numeric filename
Expand Down
15 changes: 9 additions & 6 deletions src/crawlee/storage_clients/_memory/_dataset_client.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
from __future__ import annotations

from collections.abc import Mapping
from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any

from typing_extensions import Self, override

from crawlee._types import JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

if TYPE_CHECKING:
from collections.abc import AsyncIterator
from collections.abc import AsyncIterator, Sequence


logger = getLogger(__name__)

Expand Down Expand Up @@ -41,7 +44,7 @@ def __init__(
"""
self._metadata = metadata

self._records = list[dict[str, Any]]()
self._records = list[Mapping[str, JsonSerializable]]()
"""List to hold dataset items. Each item is a dictionary representing a record."""

@override
Expand Down Expand Up @@ -113,11 +116,11 @@ async def purge(self) -> None:
)

@override
async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
async def push_data(self, data: Sequence[Mapping[str, JsonSerializable]] | Mapping[str, JsonSerializable]) -> None:
metadata = await self.get_metadata()
new_item_count = metadata.item_count

if isinstance(data, list):
if self._is_list_of_items(data):
for item in data:
new_item_count += 1
await self._push_item(item)
Expand Down Expand Up @@ -203,7 +206,7 @@ async def iterate_items(
unwind: list[str] | None = None,
skip_empty: bool = False,
skip_hidden: bool = False,
) -> AsyncIterator[dict[str, Any]]:
) -> AsyncIterator[Mapping[str, JsonSerializable]]:
# Check for unsupported arguments and log a warning if found
unsupported_args: dict[str, Any] = {
'clean': clean,
Expand Down Expand Up @@ -260,7 +263,7 @@ async def _update_metadata(
if new_item_count is not None:
self._metadata.item_count = new_item_count

async def _push_item(self, item: dict[str, Any]) -> None:
async def _push_item(self, item: Mapping[str, JsonSerializable]) -> None:
"""Push a single item to the dataset.

Args:
Expand Down
Loading
Loading