diff --git a/haystack/dataclasses/document.py b/haystack/dataclasses/document.py index 6f6853d8e1..d1b6d09ebe 100644 --- a/haystack/dataclasses/document.py +++ b/haystack/dataclasses/document.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import hashlib +import json from dataclasses import asdict, dataclass, field, fields from typing import Any @@ -113,7 +114,9 @@ def _create_id(self) -> str: dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed blob = self.blob.data if self.blob is not None else None mime_type = self.blob.mime_type if self.blob is not None else None - meta = self.meta or {} + # Sort keys so meta order doesn't affect the hash. Keep "{}" for empty meta + # so existing IDs stay stable. + meta = json.dumps(self.meta, sort_keys=True, default=str) if self.meta else "{}" embedding = self.embedding if self.embedding is not None else None sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else "" data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}" diff --git a/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml new file mode 100644 index 0000000000..e2d55f8484 --- /dev/null +++ b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml @@ -0,0 +1,27 @@ +--- +upgrade: + - | + The hash used to auto-generate ``Document.id`` is now computed from a + canonical (key-sorted) JSON serialization of ``meta``. Documents with + empty ``meta`` are unaffected, but most other documents will get different + IDs than they did before: + + * documents with non-empty ``meta`` (the serialization changes from + ``dict``'s repr to JSON); + * documents whose ``meta`` contains non-JSON-serializable values such as + ``datetime`` or custom classes (these are now serialized via ``str(...)`` + rather than ``repr(...)``, e.g. ``"2024-01-01 00:00:00"`` instead of + ``"datetime.datetime(2024, 1, 1, 0, 0)"``). + + If you rely on auto-generated IDs to match documents already persisted in a + ``DocumentStore``, you will need to re-ingest the affected documents (or + pass the previous ``id`` explicitly when constructing the ``Document``). +fixes: + - | + ``Document.id`` is now deterministic regardless of the insertion order of + keys in ``meta``. Previously the hash was built from ``dict``'s repr, which + reflects insertion order, so two documents with the same content and the + same ``meta`` could get different IDs depending on how the ``meta`` dict was + constructed. This silently broke ``DuplicatePolicy.SKIP`` / ``FAIL`` and + any cache or dedup table keyed on the document ID whenever upstream code + produced ``meta`` in different orders. diff --git a/test/core/pipeline/features/test_run.py b/test/core/pipeline/features/test_run.py index 57383b53be..51ec43c61c 100644 --- a/test/core/pipeline/features/test_run.py +++ b/test/core/pipeline/features/test_run.py @@ -3104,7 +3104,7 @@ def run(self, query: str) -> dict[str, list[Document]]: ("rag_prompt", 1): { "documents": [ Document( - id="969664d0cf76e52b0ffb719d00d3e5a6b1c90bb29e56f6107dfd87bf2f5388ed", + id="366a10745500c26f1177f434c74513daacaa7f9d2e09ba892cfcd48652eb80c1", content="This is a document potentially answering the question.", meta={"access_group": 1}, ) @@ -4381,7 +4381,7 @@ def pipeline_that_converts_files(pipeline_class): content="Some test content", meta={ "file_type": "json", - "source_id": "0c6c5951d18da2935c7af3e24d417a9f94ca85403866dcfee1de93922504e1e5", + "source_id": "7eead7200d4ecead81a174a1da6512d8955f3a23acdc3f8431885d4793a63a74", "page_number": 1, "split_id": 0, "split_idx_start": 0, @@ -4391,7 +4391,7 @@ def pipeline_that_converts_files(pipeline_class): content="Text file content ", meta={ "file_type": "txt", - "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42", + "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27", "page_number": 1, "split_id": 0, "split_idx_start": 0, @@ -4401,7 +4401,7 @@ def pipeline_that_converts_files(pipeline_class): content="for testing this.", meta={ "file_type": "txt", - "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42", + "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27", "page_number": 1, "split_id": 1, "split_idx_start": 18, diff --git a/test/dataclasses/test_document.py b/test/dataclasses/test_document.py index 9d7774db5a..18e6566328 100644 --- a/test/dataclasses/test_document.py +++ b/test/dataclasses/test_document.py @@ -52,7 +52,7 @@ def test_init_with_parameters(): embedding=[0.1, 0.2, 0.3], sparse_embedding=sparse_embedding, ) - assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56" + assert doc.id == "c31efd4986b1f2424e5058482c6f668ccad2309043c2346524cd81d255e159fe" assert doc.content == "test text" assert doc.blob is not None assert doc.blob.data == blob_data @@ -95,7 +95,7 @@ def test_init_with_legacy_field(): embedding=[0.1, 0.2, 0.3], meta={"date": "10-10-2023", "type": "article"}, ) - assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43" + assert doc.id == "dcd4914f727544e89ce8082f6f2e298d244dd0803a4dc167f19d24e7d43b28ac" assert doc.content == "test text" assert doc.meta == {"date": "10-10-2023", "type": "article"} assert doc.score == 0.812 @@ -123,6 +123,21 @@ def test_basic_equality_id(): assert doc1 != doc2 +def test_id_is_independent_of_meta_key_order(): + doc1 = Document(content="hello", meta={"a": 1, "b": 2}) + doc2 = Document(content="hello", meta={"b": 2, "a": 1}) + + assert doc1.meta == doc2.meta + assert doc1.id == doc2.id + + +def test_id_is_independent_of_nested_meta_key_order(): + doc1 = Document(content="hello", meta={"outer": {"a": 1, "b": 2}}) + doc2 = Document(content="hello", meta={"outer": {"b": 2, "a": 1}}) + + assert doc1.id == doc2.id + + def test_to_dict(): doc = Document() assert doc.to_dict() == {