From 987d02d9bb11eeb55c3099cbbf51c9dbe5e49225 Mon Sep 17 00:00:00 2001 From: Aarkin7 Date: Sun, 31 May 2026 23:28:25 +0530 Subject: [PATCH 1/3] fix: make Document.id deterministic regardless of meta key order The hash was built from dict's repr, which reflects insertion order, so two Documents with equal meta could get different IDs. Serialize meta with sort_keys=True before hashing. Empty-meta IDs are unchanged. --- haystack/dataclasses/document.py | 5 ++++- ...cross-meta-key-order-f0293d51712e82be.yaml | 20 +++++++++++++++++++ test/dataclasses/test_document.py | 19 ++++++++++++++++-- 3 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml diff --git a/haystack/dataclasses/document.py b/haystack/dataclasses/document.py index 6f6853d8e1..d1b6d09ebe 100644 --- a/haystack/dataclasses/document.py +++ b/haystack/dataclasses/document.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import hashlib +import json from dataclasses import asdict, dataclass, field, fields from typing import Any @@ -113,7 +114,9 @@ def _create_id(self) -> str: dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed blob = self.blob.data if self.blob is not None else None mime_type = self.blob.mime_type if self.blob is not None else None - meta = self.meta or {} + # Sort keys so meta order doesn't affect the hash. Keep "{}" for empty meta + # so existing IDs stay stable. + meta = json.dumps(self.meta, sort_keys=True, default=str) if self.meta else "{}" embedding = self.embedding if self.embedding is not None else None sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else "" data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}" diff --git a/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml new file mode 100644 index 0000000000..fa37d23489 --- /dev/null +++ b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml @@ -0,0 +1,20 @@ +--- +upgrade: + - | + The hash used to auto-generate ``Document.id`` is now computed from a + canonical (key-sorted) serialization of ``meta``. Documents created with + non-empty ``meta`` will therefore get different IDs than they did before. + Documents with empty ``meta`` are unaffected. + + If you rely on auto-generated IDs to match documents already persisted in a + ``DocumentStore``, you will need to re-ingest the affected documents (or + pass the previous ``id`` explicitly when constructing the ``Document``). +fixes: + - | + ``Document.id`` is now deterministic regardless of the insertion order of + keys in ``meta``. Previously the hash was built from ``dict``'s repr, which + reflects insertion order, so two documents with the same content and the + same ``meta`` could get different IDs depending on how the ``meta`` dict was + constructed. This silently broke ``DuplicatePolicy.SKIP`` / ``FAIL`` and + any cache or dedup table keyed on the document ID whenever upstream code + produced ``meta`` in different orders. diff --git a/test/dataclasses/test_document.py b/test/dataclasses/test_document.py index 9d7774db5a..18e6566328 100644 --- a/test/dataclasses/test_document.py +++ b/test/dataclasses/test_document.py @@ -52,7 +52,7 @@ def test_init_with_parameters(): embedding=[0.1, 0.2, 0.3], sparse_embedding=sparse_embedding, ) - assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56" + assert doc.id == "c31efd4986b1f2424e5058482c6f668ccad2309043c2346524cd81d255e159fe" assert doc.content == "test text" assert doc.blob is not None assert doc.blob.data == blob_data @@ -95,7 +95,7 @@ def test_init_with_legacy_field(): embedding=[0.1, 0.2, 0.3], meta={"date": "10-10-2023", "type": "article"}, ) - assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43" + assert doc.id == "dcd4914f727544e89ce8082f6f2e298d244dd0803a4dc167f19d24e7d43b28ac" assert doc.content == "test text" assert doc.meta == {"date": "10-10-2023", "type": "article"} assert doc.score == 0.812 @@ -123,6 +123,21 @@ def test_basic_equality_id(): assert doc1 != doc2 +def test_id_is_independent_of_meta_key_order(): + doc1 = Document(content="hello", meta={"a": 1, "b": 2}) + doc2 = Document(content="hello", meta={"b": 2, "a": 1}) + + assert doc1.meta == doc2.meta + assert doc1.id == doc2.id + + +def test_id_is_independent_of_nested_meta_key_order(): + doc1 = Document(content="hello", meta={"outer": {"a": 1, "b": 2}}) + doc2 = Document(content="hello", meta={"outer": {"b": 2, "a": 1}}) + + assert doc1.id == doc2.id + + def test_to_dict(): doc = Document() assert doc.to_dict() == { From e3d1ffe8077683d5e8ecf915283ed1b15fd8cb29 Mon Sep 17 00:00:00 2001 From: Aarkin7 Date: Mon, 1 Jun 2026 00:24:54 +0530 Subject: [PATCH 2/3] test: update stale Document IDs in pipeline BDD scenarios Two BDD scenarios pinned IDs that were computed from documents with non-empty meta, so the deterministic-id fix changes them. Recompute and update the expected values; no behavior change. --- test/core/pipeline/features/test_run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/core/pipeline/features/test_run.py b/test/core/pipeline/features/test_run.py index 57383b53be..51ec43c61c 100644 --- a/test/core/pipeline/features/test_run.py +++ b/test/core/pipeline/features/test_run.py @@ -3104,7 +3104,7 @@ def run(self, query: str) -> dict[str, list[Document]]: ("rag_prompt", 1): { "documents": [ Document( - id="969664d0cf76e52b0ffb719d00d3e5a6b1c90bb29e56f6107dfd87bf2f5388ed", + id="366a10745500c26f1177f434c74513daacaa7f9d2e09ba892cfcd48652eb80c1", content="This is a document potentially answering the question.", meta={"access_group": 1}, ) @@ -4381,7 +4381,7 @@ def pipeline_that_converts_files(pipeline_class): content="Some test content", meta={ "file_type": "json", - "source_id": "0c6c5951d18da2935c7af3e24d417a9f94ca85403866dcfee1de93922504e1e5", + "source_id": "7eead7200d4ecead81a174a1da6512d8955f3a23acdc3f8431885d4793a63a74", "page_number": 1, "split_id": 0, "split_idx_start": 0, @@ -4391,7 +4391,7 @@ def pipeline_that_converts_files(pipeline_class): content="Text file content ", meta={ "file_type": "txt", - "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42", + "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27", "page_number": 1, "split_id": 0, "split_idx_start": 0, @@ -4401,7 +4401,7 @@ def pipeline_that_converts_files(pipeline_class): content="for testing this.", meta={ "file_type": "txt", - "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42", + "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27", "page_number": 1, "split_id": 1, "split_idx_start": 18, From ff2fe994db3e593a0b8ef78a91fdf0b6d0547fef Mon Sep 17 00:00:00 2001 From: Aarkin7 Date: Tue, 2 Jun 2026 20:42:07 +0530 Subject: [PATCH 3/3] docs: clarify Document.id upgrade note for non-JSON-serializable meta --- ...stic-across-meta-key-order-f0293d51712e82be.yaml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml index fa37d23489..e2d55f8484 100644 --- a/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml +++ b/releasenotes/notes/make-document-id-deterministic-across-meta-key-order-f0293d51712e82be.yaml @@ -2,9 +2,16 @@ upgrade: - | The hash used to auto-generate ``Document.id`` is now computed from a - canonical (key-sorted) serialization of ``meta``. Documents created with - non-empty ``meta`` will therefore get different IDs than they did before. - Documents with empty ``meta`` are unaffected. + canonical (key-sorted) JSON serialization of ``meta``. Documents with + empty ``meta`` are unaffected, but most other documents will get different + IDs than they did before: + + * documents with non-empty ``meta`` (the serialization changes from + ``dict``'s repr to JSON); + * documents whose ``meta`` contains non-JSON-serializable values such as + ``datetime`` or custom classes (these are now serialized via ``str(...)`` + rather than ``repr(...)``, e.g. ``"2024-01-01 00:00:00"`` instead of + ``"datetime.datetime(2024, 1, 1, 0, 0)"``). If you rely on auto-generated IDs to match documents already persisted in a ``DocumentStore``, you will need to re-ingest the affected documents (or