From 1ee86b5041713193d927f089632ef2efa082285f Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Tue, 4 Feb 2025 05:51:06 -0800 Subject: [PATCH 1/5] fix: Fix filters to handle date times with timezones (loading and comparison) (#8800) * Fix on date time parsing with timezones. And comparing naive and aware date times. * Add release note * Add more filter tests --- .../components/routers/metadata_router.py | 10 +- haystack/utils/filters.py | 43 ++++- .../fix-date-comparison-ced1d6ef64534951.yaml | 6 + .../routers/test_metadata_router.py | 27 +++ test/utils/test_filters.py | 168 ++---------------- 5 files changed, 87 insertions(+), 167 deletions(-) create mode 100644 releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml diff --git a/haystack/components/routers/metadata_router.py b/haystack/components/routers/metadata_router.py index 7c6c007d16..f0604d912d 100644 --- a/haystack/components/routers/metadata_router.py +++ b/haystack/components/routers/metadata_router.py @@ -76,6 +76,11 @@ def __init__(self, rules: Dict[str, Dict]): ``` """ self.rules = rules + for rule in self.rules.values(): + if "operator" not in rule: + raise ValueError( + "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." + ) component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules}) def run(self, documents: List[Document]): @@ -95,11 +100,6 @@ def run(self, documents: List[Document]): for document in documents: cur_document_matched = False for edge, rule in self.rules.items(): - if "operator" not in rule: - raise ValueError( - "Invalid filter syntax. " - "See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." - ) if document_matches_filter(rule, document): output[edge].append(document) cur_document_matched = True diff --git a/haystack/utils/filters.py b/haystack/utils/filters.py index bddb422efe..d9178b62d7 100644 --- a/haystack/utils/filters.py +++ b/haystack/utils/filters.py @@ -6,6 +6,7 @@ from datetime import datetime from typing import Any, Dict, List, Optional +import dateutil.parser import pandas as pd from haystack.dataclasses import Document @@ -69,18 +70,48 @@ def _greater_than(document_value: Any, filter_value: Any) -> bool: if isinstance(document_value, str) or isinstance(filter_value, str): try: - document_value = datetime.fromisoformat(document_value) - filter_value = datetime.fromisoformat(filter_value) + document_value = _parse_date(document_value) + filter_value = _parse_date(filter_value) + document_value, filter_value = _ensure_both_dates_naive_or_aware(document_value, filter_value) + except FilterError as exc: + raise exc + if type(filter_value) in [list, pd.DataFrame]: + msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return document_value > filter_value + + +def _parse_date(value): + """Try parsing the value as an ISO format date, then fall back to dateutil.parser.""" + try: + return datetime.fromisoformat(value) + except (ValueError, TypeError): + try: + return dateutil.parser.parse(value) except (ValueError, TypeError) as exc: msg = ( "Can't compare strings using operators '>', '>=', '<', '<='. " "Strings are only comparable if they are ISO formatted dates." 
) raise FilterError(msg) from exc - if type(filter_value) in [list, pd.DataFrame]: - msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='" - raise FilterError(msg) - return document_value > filter_value + + +def _ensure_both_dates_naive_or_aware(date1: datetime, date2: datetime): + """Ensure that both dates are either naive or aware.""" + # Both naive + if date1.tzinfo is None and date2.tzinfo is None: + return date1, date2 + + # Both aware + if date1.tzinfo is not None and date2.tzinfo is not None: + return date1, date2 + + # One naive, one aware + if date1.tzinfo is None: + date1 = date1.replace(tzinfo=date2.tzinfo) + else: + date2 = date2.replace(tzinfo=date1.tzinfo) + return date1, date2 def _greater_than_equal(document_value: Any, filter_value: Any) -> bool: diff --git a/releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml b/releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml new file mode 100644 index 0000000000..b6e071ecf6 --- /dev/null +++ b/releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml @@ -0,0 +1,6 @@ +--- +enhancements: + - | + Enhancements to Date Filtering in MetadataRouter + - Improved date parsing in filter utilities by introducing `_parse_date`, which first attempts `datetime.fromisoformat(value)` for backward compatibility and then falls back to dateutil.parser.parse() for broader ISO 8601 support. + - Resolved a common issue where comparing naive and timezone-aware datetimes resulted in TypeError. Added `_ensure_both_dates_naive_or_aware`, which ensures both datetimes are either naive or aware. If one is missing a timezone, it is assigned the timezone of the other for consistency. diff --git a/test/components/routers/test_metadata_router.py b/test/components/routers/test_metadata_router.py index d9e8ce9875..2b7e964705 100644 --- a/test/components/routers/test_metadata_router.py +++ b/test/components/routers/test_metadata_router.py @@ -35,3 +35,30 @@ def test_run(self): assert output["edge_1"][0].meta["created_at"] == "2023-02-01" assert output["edge_2"][0].meta["created_at"] == "2023-05-01" assert output["unmatched"][0].meta["created_at"] == "2023-08-01" + + def test_run_wrong_filter(self): + rules = { + "edge_1": {"field": "meta.created_at", "operator": ">=", "value": "2023-01-01"}, + "wrong_filter": {"wrong_value": "meta.created_at == 2023-04-01"}, + } + with pytest.raises(ValueError): + MetadataRouter(rules=rules) + + def test_run_datetime_with_timezone(self): + rules = { + "edge_1": { + "operator": "AND", + "conditions": [{"field": "meta.created_at", "operator": ">=", "value": "2025-02-01"}], + } + } + router = MetadataRouter(rules=rules) + documents = [ + Document(meta={"created_at": "2025-02-03T12:45:46.435816Z"}), + Document(meta={"created_at": "2025-02-01T12:45:46.435816Z"}), + Document(meta={"created_at": "2025-01-03T12:45:46.435816Z"}), + ] + output = router.run(documents=documents) + assert len(output["edge_1"]) == 2 + assert output["edge_1"][0].meta["created_at"] == "2025-02-03T12:45:46.435816Z" + assert output["edge_1"][1].meta["created_at"] == "2025-02-01T12:45:46.435816Z" + assert output["unmatched"][0].meta["created_at"] == "2025-01-03T12:45:46.435816Z" diff --git a/test/utils/test_filters.py b/test/utils/test_filters.py index 708e50cb7b..7c3da7cc4e 100644 --- a/test/utils/test_filters.py +++ b/test/utils/test_filters.py @@ -485,6 +485,18 @@ True, id="NOT operator with Document matching no condition", ), + pytest.param( + {"field": "meta.date", "operator": "==", "value": 
"2025-02-03T12:45:46.435816Z"}, + Document(meta={"date": "2025-02-03T12:45:46.435816Z"}), + True, + id="== operator with ISO 8601 datetime Document value", + ), + pytest.param( + {"field": "meta.date", "operator": ">=", "value": "2025-02-01"}, + Document(meta={"date": "2025-02-03T12:45:46.435816Z"}), + True, + id=">= operator with naive and aware ISO 8601 datetime Document value", + ), ] @@ -552,159 +564,3 @@ def test_document_matches_filter_raises_error(filter): with pytest.raises(FilterError): document = Document(meta={"page": 10}) document_matches_filter(filter, document) - - -filters_data = [ - pytest.param( - { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}}, - } - }, - { - "operator": "AND", - "conditions": [ - {"field": "type", "operator": "==", "value": "article"}, - {"field": "date", "operator": ">=", "value": "2015-01-01"}, - {"field": "date", "operator": "<", "value": "2021-01-01"}, - {"field": "rating", "operator": ">=", "value": 3}, - { - "operator": "OR", - "conditions": [ - {"field": "genre", "operator": "in", "value": ["economy", "politics"]}, - {"field": "publisher", "operator": "==", "value": "nytimes"}, - ], - }, - ], - }, - id="All operators explicit", - ), - pytest.param( - { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": {"genre": ["economy", "politics"], "publisher": "nytimes"}, - }, - { - "operator": "AND", - "conditions": [ - {"field": "type", "operator": "==", "value": "article"}, - {"field": "date", "operator": ">=", "value": "2015-01-01"}, - {"field": "date", "operator": "<", "value": "2021-01-01"}, - {"field": "rating", "operator": ">=", "value": 3}, - { - "operator": "OR", - "conditions": [ - {"field": "genre", "operator": "in", "value": ["economy", "politics"]}, - {"field": "publisher", "operator": "==", "value": "nytimes"}, - ], - }, - ], - }, - id="Root $and implicit", - ), - pytest.param( - { - "$or": [ - {"Type": "News Paper", "Date": {"$lt": "2019-01-01"}}, - {"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}}, - ] - }, - { - "operator": "OR", - "conditions": [ - { - "operator": "AND", - "conditions": [ - {"field": "Type", "operator": "==", "value": "News Paper"}, - {"field": "Date", "operator": "<", "value": "2019-01-01"}, - ], - }, - { - "operator": "AND", - "conditions": [ - {"field": "Type", "operator": "==", "value": "Blog Post"}, - {"field": "Date", "operator": ">=", "value": "2019-01-01"}, - ], - }, - ], - }, - id="Root $or with list and multiple comparisons", - ), - pytest.param( - {"text": "A Foo Document 1"}, - {"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]}, - id="Implicit root $and and field $eq", - ), - pytest.param( - {"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}}, - { - "operator": "OR", - "conditions": [ - { - "operator": "OR", - "conditions": [ - {"field": "name", "operator": "==", "value": "name_0"}, - {"field": "name", "operator": "==", "value": "name_1"}, - ], - }, - {"field": "number", "operator": "<", "value": 1.0}, - ], - }, - id="Root $or with dict and field $or with list", - ), - pytest.param( - {"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]}, - { - "operator": "AND", - "conditions": [ - {"field": "number", "operator": "<=", "value": 2}, - {"field": "number", "operator": ">=", "value": 0}, - 
{"field": "name", "operator": "in", "value": ["name_0", "name_1"]}, - ], - }, - id="Implicit $and and field $in", - ), - pytest.param( - {"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}}, - { - "operator": "AND", - "conditions": [ - {"field": "number", "operator": "<=", "value": 2}, - {"field": "number", "operator": ">=", "value": 0}, - ], - }, - id="Implicit root $and and field $and with list", - ), - pytest.param( - { - "$not": { - "number": {"$lt": 1.0}, - "$and": {"name": {"$in": ["name_0", "name_1"]}, "$not": {"chapter": {"$eq": "intro"}}}, - } - }, - { - "operator": "NOT", - "conditions": [ - {"field": "number", "operator": "<", "value": 1.0}, - { - "operator": "AND", - "conditions": [ - {"field": "name", "operator": "in", "value": ["name_0", "name_1"]}, - {"operator": "NOT", "conditions": [{"field": "chapter", "operator": "==", "value": "intro"}]}, - ], - }, - ], - }, - id="Root explicit $not", - ), - pytest.param( - {"page": {"$not": 102}}, - {"operator": "NOT", "conditions": [{"field": "page", "operator": "==", "value": 102}]}, - id="Explicit $not with implicit $eq", - ), -] From f1679f1dca11355769bfec0c4acb56286155c3e5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 4 Feb 2025 15:01:54 +0100 Subject: [PATCH 2/5] build(deps): bump fossas/fossa-action from 1.4.0 to 1.5.0 (#8771) Bumps [fossas/fossa-action](https://github.com/fossas/fossa-action) from 1.4.0 to 1.5.0. - [Release notes](https://github.com/fossas/fossa-action/releases) - [Commits](https://github.com/fossas/fossa-action/compare/v1.4.0...v1.5.0) --- updated-dependencies: - dependency-name: fossas/fossa-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/license_compliance.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/license_compliance.yml b/.github/workflows/license_compliance.yml index 1bf55ee536..969b2ecbd0 100644 --- a/.github/workflows/license_compliance.yml +++ b/.github/workflows/license_compliance.yml @@ -47,7 +47,7 @@ jobs: # We keep the license inventory on FOSSA - name: Send license report to Fossa - uses: fossas/fossa-action@v1.4.0 + uses: fossas/fossa-action@v1.5.0 continue-on-error: true # not critical with: api-key: ${{ secrets.FOSSA_LICENSE_SCAN_TOKEN }} From 5ae94886b2b937a13c648c6525b17422ddbaf731 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Tue, 4 Feb 2025 19:08:37 +0100 Subject: [PATCH 3/5] fix: fix test failures with Transformers models in PRs from forks (#8809) * trigger * try pinning sentence transformers * make integr tests run right away * pin transformers instead * older transformers version * rm transformers pin * try ignoring cache * change ubuntu version * try removing token * try again * more HF_API_TOKEN local deletions * restore test priority * rm leftover * more deletions * moreee * more * deletions * restore jobs order --- .../test_zero_shot_document_classifier.py | 3 ++- .../test_sentence_transformers_text_embedder.py | 3 ++- test/components/evaluators/test_sas_evaluator.py | 15 ++++++++++----- .../generators/chat/test_hugging_face_local.py | 3 ++- .../test_hugging_face_local_generator.py | 6 ++++-- .../test_sentence_transformers_diversity.py | 9 ++++++--- test/components/readers/test_extractive.py | 9 ++++++--- .../routers/test_transformers_text_router.py | 6 ++++-- 
.../routers/test_zero_shot_text_router.py | 3 ++- 9 files changed, 38 insertions(+), 19 deletions(-) diff --git a/test/components/classifiers/test_zero_shot_document_classifier.py b/test/components/classifiers/test_zero_shot_document_classifier.py index be4d04a9fe..35948e8dda 100644 --- a/test/components/classifiers/test_zero_shot_document_classifier.py +++ b/test/components/classifiers/test_zero_shot_document_classifier.py @@ -137,7 +137,8 @@ def test_run_unit(self, hf_pipeline_mock): assert result["documents"][1].to_dict()["classification"]["label"] == "negative" @pytest.mark.integration - def test_run(self): + def test_run(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 component = TransformersZeroShotDocumentClassifier( model="cross-encoder/nli-deberta-v3-xsmall", labels=["positive", "negative"] ) diff --git a/test/components/embedders/test_sentence_transformers_text_embedder.py b/test/components/embedders/test_sentence_transformers_text_embedder.py index 195ee8efd1..293b07aada 100644 --- a/test/components/embedders/test_sentence_transformers_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_text_embedder.py @@ -261,10 +261,11 @@ def test_run_wrong_input_format(self): embedder.run(text=list_integers_input) @pytest.mark.integration - def test_run_trunc(self): + def test_run_trunc(self, monkeypatch): """ sentence-transformers/paraphrase-albert-small-v2 maps sentences & paragraphs to a 768 dimensional dense vector space """ + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 checkpoint = "sentence-transformers/paraphrase-albert-small-v2" text = "a nice text to embed" diff --git a/test/components/evaluators/test_sas_evaluator.py b/test/components/evaluators/test_sas_evaluator.py index d1dad15151..cab8581ad7 100644 --- a/test/components/evaluators/test_sas_evaluator.py +++ b/test/components/evaluators/test_sas_evaluator.py @@ -104,7 +104,8 @@ def test_run_not_warmed_up(self): evaluator.run(ground_truth_answers=ground_truths, predicted_answers=predictions) @pytest.mark.integration - def test_run_with_matching_predictions(self): + def test_run_with_matching_predictions(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 evaluator = SASEvaluator() ground_truths = [ "A construction budget of US $2.3 billion", @@ -124,7 +125,8 @@ def test_run_with_matching_predictions(self): assert result["individual_scores"] == pytest.approx([1.0, 1.0, 1.0]) @pytest.mark.integration - def test_run_with_single_prediction(self): + def test_run_with_single_prediction(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 evaluator = SASEvaluator() ground_truths = ["US $2.3 billion"] @@ -137,7 +139,8 @@ def test_run_with_single_prediction(self): assert result["individual_scores"] == pytest.approx([0.689089], abs=1e-5) @pytest.mark.integration - def test_run_with_mismatched_predictions(self): + def test_run_with_mismatched_predictions(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 evaluator = SASEvaluator() ground_truths = [ "US $2.3 billion", @@ -156,7 +159,8 @@ def test_run_with_mismatched_predictions(self): assert result["individual_scores"] == pytest.approx([0.689089, 0.870389, 0.908679], abs=1e-5) @pytest.mark.integration - def 
test_run_with_bi_encoder_model(self): + def test_run_with_bi_encoder_model(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 evaluator = SASEvaluator(model="sentence-transformers/all-mpnet-base-v2") ground_truths = [ "A construction budget of US $2.3 billion", @@ -175,7 +179,8 @@ def test_run_with_bi_encoder_model(self): assert result["individual_scores"] == pytest.approx([1.0, 1.0, 1.0]) @pytest.mark.integration - def test_run_with_cross_encoder_model(self): + def test_run_with_cross_encoder_model(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 evaluator = SASEvaluator(model="cross-encoder/ms-marco-MiniLM-L-6-v2") ground_truths = [ "A construction budget of US $2.3 billion", diff --git a/test/components/generators/chat/test_hugging_face_local.py b/test/components/generators/chat/test_hugging_face_local.py index 7c2e05df3a..1f6b478370 100644 --- a/test/components/generators/chat/test_hugging_face_local.py +++ b/test/components/generators/chat/test_hugging_face_local.py @@ -293,7 +293,8 @@ def test_messages_conversion_is_called(self, mock_convert, model_info_mock): @pytest.mark.integration @pytest.mark.flaky(reruns=3, reruns_delay=10) - def test_live_run(self): + def test_live_run(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 messages = [ChatMessage.from_user("Please create a summary about the following topic: Climate change")] llm = HuggingFaceLocalChatGenerator( diff --git a/test/components/generators/test_hugging_face_local_generator.py b/test/components/generators/test_hugging_face_local_generator.py index 8d64bc44fc..d6432a360b 100644 --- a/test/components/generators/test_hugging_face_local_generator.py +++ b/test/components/generators/test_hugging_face_local_generator.py @@ -454,8 +454,9 @@ def test_stop_words_criteria_using_hf_tokenizer(self): assert criteria(generated_text_ids, scores=None) is True @pytest.mark.integration - def test_hf_pipeline_runs_with_our_criteria(self): + def test_hf_pipeline_runs_with_our_criteria(self, monkeypatch): """Test that creating our own StopWordsCriteria and passing it to a Huggingface pipeline works.""" + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 generator = HuggingFaceLocalGenerator( model="google/flan-t5-small", task="text2text-generation", stop_words=["unambiguously"] ) @@ -466,7 +467,8 @@ def test_hf_pipeline_runs_with_our_criteria(self): @pytest.mark.integration @pytest.mark.flaky(reruns=3, reruns_delay=10) - def test_live_run(self): + def test_live_run(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 llm = HuggingFaceLocalGenerator(model="Qwen/Qwen2.5-0.5B-Instruct", generation_kwargs={"max_new_tokens": 50}) llm.warm_up() diff --git a/test/components/rankers/test_sentence_transformers_diversity.py b/test/components/rankers/test_sentence_transformers_diversity.py index 018b443987..a4dcd7e89f 100644 --- a/test/components/rankers/test_sentence_transformers_diversity.py +++ b/test/components/rankers/test_sentence_transformers_diversity.py @@ -574,10 +574,11 @@ def test_pipeline_serialise_deserialise(self): @pytest.mark.integration @pytest.mark.parametrize("similarity", ["dot_product", "cosine"]) - def test_run(self, similarity): + def test_run(self, similarity, monkeypatch): """ Tests 
that run method returns documents in the correct order """ + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 ranker = SentenceTransformersDiversityRanker( model="sentence-transformers/all-MiniLM-L6-v2", similarity=similarity ) @@ -601,7 +602,8 @@ def test_run(self, similarity): @pytest.mark.integration @pytest.mark.parametrize("similarity", ["dot_product", "cosine"]) - def test_run_real_world_use_case(self, similarity): + def test_run_real_world_use_case(self, similarity, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 ranker = SentenceTransformersDiversityRanker( model="sentence-transformers/all-MiniLM-L6-v2", similarity=similarity ) @@ -673,7 +675,8 @@ def test_run_real_world_use_case(self, similarity): @pytest.mark.integration @pytest.mark.parametrize("similarity", ["dot_product", "cosine"]) - def test_run_with_maximum_margin_relevance_strategy(self, similarity): + def test_run_with_maximum_margin_relevance_strategy(self, similarity, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 query = "renewable energy sources" docs = [ Document(content="18th-century French literature"), diff --git a/test/components/readers/test_extractive.py b/test/components/readers/test_extractive.py index a2f658b79b..2c33e51f9c 100644 --- a/test/components/readers/test_extractive.py +++ b/test/components/readers/test_extractive.py @@ -776,7 +776,8 @@ def test_deduplicate_by_overlap( @pytest.mark.integration -def test_t5(): +def test_t5(monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 reader = ExtractiveReader("sjrhuschlee/flan-t5-base-squad2") reader.warm_up() answers = reader.run(example_queries[0], example_documents[0], top_k=2)[ @@ -800,7 +801,8 @@ def test_t5(): @pytest.mark.integration -def test_roberta(): +def test_roberta(monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 reader = ExtractiveReader("deepset/tinyroberta-squad2") reader.warm_up() answers = reader.run(example_queries[0], example_documents[0], top_k=2)[ @@ -829,7 +831,8 @@ def test_roberta(): @pytest.mark.integration -def test_matches_hf_pipeline(): +def test_matches_hf_pipeline(monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 reader = ExtractiveReader( "deepset/tinyroberta-squad2", device=ComponentDevice.from_str("cpu"), overlap_threshold=None ) diff --git a/test/components/routers/test_transformers_text_router.py b/test/components/routers/test_transformers_text_router.py index 67ec163524..a539e0d3c3 100644 --- a/test/components/routers/test_transformers_text_router.py +++ b/test/components/routers/test_transformers_text_router.py @@ -172,7 +172,8 @@ def test_run_unit(self, hf_pipeline_mock, mock_auto_config_from_pretrained): assert out == {"en": "What is the color of the sky?"} @pytest.mark.integration - def test_run(self): + def test_run(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 router = TransformersTextRouter(model="papluca/xlm-roberta-base-language-detection") router.warm_up() out = router.run("What is the color of the sky?") @@ -202,7 +203,8 @@ def test_run(self): assert out == {"en": "What is the color of the sky?"} @pytest.mark.integration - def 
test_wrong_labels(self): + def test_wrong_labels(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 router = TransformersTextRouter(model="papluca/xlm-roberta-base-language-detection", labels=["en", "de"]) with pytest.raises(ValueError): router.warm_up() diff --git a/test/components/routers/test_zero_shot_text_router.py b/test/components/routers/test_zero_shot_text_router.py index 3b931c39bb..874ede64d5 100644 --- a/test/components/routers/test_zero_shot_text_router.py +++ b/test/components/routers/test_zero_shot_text_router.py @@ -106,7 +106,8 @@ def test_run_unit(self, hf_pipeline_mock): assert out == {"query": "What is the color of the sky?"} @pytest.mark.integration - def test_run(self): + def test_run(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 router = TransformersZeroShotTextRouter(labels=["query", "passage"]) router.warm_up() out = router.run("What is the color of the sky?") From 2828d9e4aeeef1f6d95928a8cfc9e47fa29a5866 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 5 Feb 2025 14:43:19 +0100 Subject: [PATCH 4/5] refactor!: `DOCXToDocument` converter - store DOCX metadata as a dict (#8804) * DOCXToDocument - store DOCX metadata as a dict * do not export DOCXMetadata to converters package --- haystack/components/converters/__init__.py | 3 +- haystack/components/converters/docx.py | 4 +- ...docxmetadata-as-dict-20cf2ef0abf7af8a.yaml | 6 + .../converters/test_docx_file_to_document.py | 136 +++++++++--------- 4 files changed, 77 insertions(+), 72 deletions(-) create mode 100644 releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py index 2c7ed33505..d0057ea33d 100644 --- a/haystack/components/converters/__init__.py +++ b/haystack/components/converters/__init__.py @@ -4,7 +4,7 @@ from haystack.components.converters.azure import AzureOCRDocumentConverter from haystack.components.converters.csv import CSVToDocument -from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument +from haystack.components.converters.docx import DOCXToDocument from haystack.components.converters.html import HTMLToDocument from haystack.components.converters.json import JSONConverter from haystack.components.converters.markdown import MarkdownToDocument @@ -28,7 +28,6 @@ "OpenAPIServiceToFunctions", "OutputAdapter", "DOCXToDocument", - "DOCXMetadata", "PPTXToDocument", "CSVToDocument", "JSONConverter", diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py index 8f9a58004d..3607ae4544 100644 --- a/haystack/components/converters/docx.py +++ b/haystack/components/converters/docx.py @@ -5,7 +5,7 @@ import csv import io import os -from dataclasses import dataclass +from dataclasses import asdict, dataclass from enum import Enum from io import StringIO from pathlib import Path @@ -189,7 +189,7 @@ def run( ) continue - docx_metadata = self._get_docx_metadata(document=docx_document) + docx_metadata = asdict(self._get_docx_metadata(document=docx_document)) merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata} if not self.store_full_path and "file_path" in bytestream.meta: diff --git a/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml b/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml new file mode 100644 index 0000000000..60f4c17ce3 --- /dev/null +++ 
b/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml @@ -0,0 +1,6 @@ +--- +upgrade: + - | + The `DOCXToDocument` converter now returns a `Document` object with DOCX metadata stored in the `meta` field as a + dictionary under the key `docx`. Previously, the metadata was represented as a `DOCXMetadata` dataclass. + This change does not impact reading from or writing to a Document Store. diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py index c013759938..2de6bfefbe 100644 --- a/test/components/converters/test_docx_file_to_document.py +++ b/test/components/converters/test_docx_file_to_document.py @@ -121,23 +121,23 @@ def test_run(self, test_files_path, docx_converter): assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { "file_path": os.path.basename(paths[0]), - "docx": DOCXMetadata( - author="Microsoft Office User", - category="", - comments="", - content_status="", - created="2024-06-09T21:17:00+00:00", - identifier="", - keywords="", - language="", - last_modified_by="Carlos Fernández Lorán", - last_printed=None, - modified="2024-06-09T21:27:00+00:00", - revision=2, - subject="", - title="", - version="", - ), + "docx": { + "author": "Microsoft Office User", + "category": "", + "comments": "", + "content_status": "", + "created": "2024-06-09T21:17:00+00:00", + "identifier": "", + "keywords": "", + "language": "", + "last_modified_by": "Carlos Fernández Lorán", + "last_printed": None, + "modified": "2024-06-09T21:27:00+00:00", + "revision": 2, + "subject": "", + "title": "", + "version": "", + }, } def test_run_with_table(self, test_files_path): @@ -153,23 +153,23 @@ def test_run_with_table(self, test_files_path): assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { "file_path": os.path.basename(paths[0]), - "docx": DOCXMetadata( - author="Saha, Anirban", - category="", - comments="", - content_status="", - created="2020-07-14T08:14:00+00:00", - identifier="", - keywords="", - language="", - last_modified_by="Saha, Anirban", - last_printed=None, - modified="2020-07-14T08:16:00+00:00", - revision=1, - subject="", - title="", - version="", - ), + "docx": { + "author": "Saha, Anirban", + "category": "", + "comments": "", + "content_status": "", + "created": "2020-07-14T08:14:00+00:00", + "identifier": "", + "keywords": "", + "language": "", + "last_modified_by": "Saha, Anirban", + "last_printed": None, + "modified": "2020-07-14T08:16:00+00:00", + "revision": 1, + "subject": "", + "title": "", + "version": "", + }, } # let's now detect that the table markdown is correctly added and that order of elements is correct content_parts = docs[0].content.split("\n\n") @@ -193,23 +193,23 @@ def test_run_with_store_full_path_false(self, test_files_path): assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { "file_path": "sample_docx_1.docx", - "docx": DOCXMetadata( - author="Microsoft Office User", - category="", - comments="", - content_status="", - created="2024-06-09T21:17:00+00:00", - identifier="", - keywords="", - language="", - last_modified_by="Carlos Fernández Lorán", - last_printed=None, - modified="2024-06-09T21:27:00+00:00", - revision=2, - subject="", - title="", - version="", - ), + "docx": { + "author": "Microsoft Office User", + "category": "", + "comments": "", + "content_status": "", + "created": "2024-06-09T21:17:00+00:00", + "identifier": "", + "keywords": "", + "language": "", + "last_modified_by": "Carlos Fernández Lorán", 
+ "last_printed": None, + "modified": "2024-06-09T21:27:00+00:00", + "revision": 2, + "subject": "", + "title": "", + "version": "", + }, } @pytest.mark.parametrize("table_format", ["markdown", "csv"]) @@ -285,23 +285,23 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter): doc = output["documents"][0] assert doc.meta == { "file_path": os.path.basename(paths[0]), - "docx": DOCXMetadata( - author="Microsoft Office User", - category="", - comments="", - content_status="", - created="2024-06-09T21:17:00+00:00", - identifier="", - keywords="", - language="", - last_modified_by="Carlos Fernández Lorán", - last_printed=None, - modified="2024-06-09T21:27:00+00:00", - revision=2, - subject="", - title="", - version="", - ), + "docx": { + "author": "Microsoft Office User", + "category": "", + "comments": "", + "content_status": "", + "created": "2024-06-09T21:17:00+00:00", + "identifier": "", + "keywords": "", + "language": "", + "last_modified_by": "Carlos Fernández Lorán", + "last_printed": None, + "modified": "2024-06-09T21:27:00+00:00", + "revision": 2, + "subject": "", + "title": "", + "version": "", + }, "language": "it", "author": "test_author", } From d2348ad46285dc435b4f7904f2e3dc5e606b7f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Orosz?= Date: Wed, 5 Feb 2025 17:09:35 +0100 Subject: [PATCH 5/5] feat: SentenceTransformersDocumentEmbedder and SentenceTransformersTextEmbedder can accept and pass any arguments to SentenceTransformer.encode (#8806) * feat: SentenceTransformersDocumentEmbedder and SentenceTransformersTextEmbedder can accept and pass any arguments to SentenceTransformer.encode * refactor: encode_kwargs parameter of SentenceTransformersDocumentEmbedder and SentenceTransformersTextEmbedder mae to be the last positional parameter for backward compatibility reasons * docs: added explanation for encode_kwargs in SentenceTransformersTextEmbedder and SentenceTransformersDocumentEmbedder * test: added tests for encode_kwargs in SentenceTransformersTextEmbedder and SentenceTransformersDocumentEmbedder * doc: removed empty lines from docstrings of SentenceTransformersTextEmbedder and SentenceTransformersDocumentEmbedder * refactor: encode_kwargs parameter of SentenceTransformersDocumentEmbedder and SentenceTransformersTextEmbedder mae to be the last positional parameter for backward compatibility (part II.) 
--- ...sentence_transformers_document_embedder.py | 8 +++++++ .../sentence_transformers_text_embedder.py | 8 +++++++ ...entence-transformers-f4d885f6c5b1706f.yaml | 6 ++++++ ...sentence_transformers_document_embedder.py | 19 ++++++++++++++++- ...est_sentence_transformers_text_embedder.py | 21 +++++++++++++++++-- 5 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/add-encode-kwargs-sentence-transformers-f4d885f6c5b1706f.yaml diff --git a/haystack/components/embedders/sentence_transformers_document_embedder.py b/haystack/components/embedders/sentence_transformers_document_embedder.py index a5eaa9ae8b..0125be3f5c 100644 --- a/haystack/components/embedders/sentence_transformers_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_document_embedder.py @@ -56,6 +56,7 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments tokenizer_kwargs: Optional[Dict[str, Any]] = None, config_kwargs: Optional[Dict[str, Any]] = None, precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32", + encode_kwargs: Optional[Dict[str, Any]] = None, ): """ Creates a SentenceTransformersDocumentEmbedder component. @@ -104,6 +105,10 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments All non-float32 precisions are quantized embeddings. Quantized embeddings are smaller and faster to compute, but may have a lower accuracy. They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks. + :param encode_kwargs: + Additional keyword arguments for `SentenceTransformer.encode` when embedding documents. + This parameter is provided for fine customization. Be careful not to clash with already set parameters and + avoid passing parameters that change the output type. """ self.model = model @@ -121,6 +126,7 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments self.model_kwargs = model_kwargs self.tokenizer_kwargs = tokenizer_kwargs self.config_kwargs = config_kwargs + self.encode_kwargs = encode_kwargs self.embedding_backend = None self.precision = precision @@ -155,6 +161,7 @@ def to_dict(self) -> Dict[str, Any]: tokenizer_kwargs=self.tokenizer_kwargs, config_kwargs=self.config_kwargs, precision=self.precision, + encode_kwargs=self.encode_kwargs, ) if serialization_dict["init_parameters"].get("model_kwargs") is not None: serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"]) @@ -232,6 +239,7 @@ def run(self, documents: List[Document]): show_progress_bar=self.progress_bar, normalize_embeddings=self.normalize_embeddings, precision=self.precision, + **(self.encode_kwargs if self.encode_kwargs else {}), ) for doc, emb in zip(documents, embeddings): diff --git a/haystack/components/embedders/sentence_transformers_text_embedder.py b/haystack/components/embedders/sentence_transformers_text_embedder.py index 0785caddbd..2cb1622e97 100644 --- a/haystack/components/embedders/sentence_transformers_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_text_embedder.py @@ -50,6 +50,7 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments tokenizer_kwargs: Optional[Dict[str, Any]] = None, config_kwargs: Optional[Dict[str, Any]] = None, precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32", + encode_kwargs: Optional[Dict[str, Any]] = None, ): """ Create a SentenceTransformersTextEmbedder component. 
@@ -94,6 +95,10 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments All non-float32 precisions are quantized embeddings. Quantized embeddings are smaller in size and faster to compute, but may have a lower accuracy. They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks. + :param encode_kwargs: + Additional keyword arguments for `SentenceTransformer.encode` when embedding texts. + This parameter is provided for fine customization. Be careful not to clash with already set parameters and + avoid passing parameters that change the output type. """ self.model = model @@ -109,6 +114,7 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments self.model_kwargs = model_kwargs self.tokenizer_kwargs = tokenizer_kwargs self.config_kwargs = config_kwargs + self.encode_kwargs = encode_kwargs self.embedding_backend = None self.precision = precision @@ -141,6 +147,7 @@ def to_dict(self) -> Dict[str, Any]: tokenizer_kwargs=self.tokenizer_kwargs, config_kwargs=self.config_kwargs, precision=self.precision, + encode_kwargs=self.encode_kwargs, ) if serialization_dict["init_parameters"].get("model_kwargs") is not None: serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"]) @@ -209,5 +216,6 @@ def run(self, text: str): show_progress_bar=self.progress_bar, normalize_embeddings=self.normalize_embeddings, precision=self.precision, + **(self.encode_kwargs if self.encode_kwargs else {}), )[0] return {"embedding": embedding} diff --git a/releasenotes/notes/add-encode-kwargs-sentence-transformers-f4d885f6c5b1706f.yaml b/releasenotes/notes/add-encode-kwargs-sentence-transformers-f4d885f6c5b1706f.yaml new file mode 100644 index 0000000000..407a1b7ae7 --- /dev/null +++ b/releasenotes/notes/add-encode-kwargs-sentence-transformers-f4d885f6c5b1706f.yaml @@ -0,0 +1,6 @@ +--- +enhancements: + - | + Enhanced `SentenceTransformersDocumentEmbedder` and `SentenceTransformersTextEmbedder` to accept + an additional parameter, which is passed directly to the underlying `SentenceTransformer.encode` method + for greater flexibility in embedding customization. 
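For quick reference, a minimal usage sketch of the new parameter (illustrative only, not part of this patch series): the model name is a placeholder, the encode kwargs mirror the values used in the tests below, and whatever keys are passed must be keyword arguments the chosen model's SentenceTransformer.encode call actually supports.

    from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder

    embedder = SentenceTransformersTextEmbedder(
        model="my-org/my-sentence-transformer",         # placeholder model name
        encode_kwargs={"task": "retrieval.query"},      # forwarded verbatim to SentenceTransformer.encode at run time
    )
    embedder.warm_up()                                   # loads the underlying SentenceTransformer model
    result = embedder.run(text="a nice text to embed")   # returns {"embedding": [...]}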
diff --git a/test/components/embedders/test_sentence_transformers_document_embedder.py b/test/components/embedders/test_sentence_transformers_document_embedder.py index d8813f36f4..205feac10f 100644 --- a/test/components/embedders/test_sentence_transformers_document_embedder.py +++ b/test/components/embedders/test_sentence_transformers_document_embedder.py @@ -1,9 +1,9 @@ # SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 +import random from unittest.mock import MagicMock, patch -import random import pytest import torch @@ -79,6 +79,7 @@ def test_to_dict(self): "truncate_dim": None, "model_kwargs": None, "tokenizer_kwargs": None, + "encode_kwargs": None, "config_kwargs": None, "precision": "float32", }, @@ -102,6 +103,7 @@ def test_to_dict_with_custom_init_parameters(self): tokenizer_kwargs={"model_max_length": 512}, config_kwargs={"use_memory_efficient_attention": True}, precision="int8", + encode_kwargs={"task": "clustering"}, ) data = component.to_dict() @@ -124,6 +126,7 @@ def test_to_dict_with_custom_init_parameters(self): "tokenizer_kwargs": {"model_max_length": 512}, "config_kwargs": {"use_memory_efficient_attention": True}, "precision": "int8", + "encode_kwargs": {"task": "clustering"}, }, } @@ -316,6 +319,20 @@ def test_embed_metadata(self): precision="float32", ) + def test_embed_encode_kwargs(self): + embedder = SentenceTransformersDocumentEmbedder(model="model", encode_kwargs={"task": "retrieval.passage"}) + embedder.embedding_backend = MagicMock() + documents = [Document(content=f"document number {i}") for i in range(5)] + embedder.run(documents=documents) + embedder.embedding_backend.embed.assert_called_once_with( + ["document number 0", "document number 1", "document number 2", "document number 3", "document number 4"], + batch_size=32, + show_progress_bar=True, + normalize_embeddings=False, + precision="float32", + task="retrieval.passage", + ) + def test_prefix_suffix(self): embedder = SentenceTransformersDocumentEmbedder( model="model", diff --git a/test/components/embedders/test_sentence_transformers_text_embedder.py b/test/components/embedders/test_sentence_transformers_text_embedder.py index 293b07aada..6d0e239a15 100644 --- a/test/components/embedders/test_sentence_transformers_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_text_embedder.py @@ -1,11 +1,11 @@ # SPDX-FileCopyrightText: 2022-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 +import random from unittest.mock import MagicMock, patch -import torch -import random import pytest +import torch from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder from haystack.utils import ComponentDevice, Secret @@ -70,6 +70,7 @@ def test_to_dict(self): "truncate_dim": None, "model_kwargs": None, "tokenizer_kwargs": None, + "encode_kwargs": None, "config_kwargs": None, "precision": "float32", }, @@ -91,6 +92,7 @@ def test_to_dict_with_custom_init_parameters(self): tokenizer_kwargs={"model_max_length": 512}, config_kwargs={"use_memory_efficient_attention": False}, precision="int8", + encode_kwargs={"task": "clustering"}, ) data = component.to_dict() assert data == { @@ -110,6 +112,7 @@ def test_to_dict_with_custom_init_parameters(self): "tokenizer_kwargs": {"model_max_length": 512}, "config_kwargs": {"use_memory_efficient_attention": False}, "precision": "int8", + "encode_kwargs": {"task": "clustering"}, }, } @@ -297,3 +300,17 @@ def test_run_quantization(self): assert len(embedding_def) == 
768
         assert all(isinstance(el, int) for el in embedding_def)
+
+    def test_embed_encode_kwargs(self):
+        embedder = SentenceTransformersTextEmbedder(model="model", encode_kwargs={"task": "retrieval.query"})
+        embedder.embedding_backend = MagicMock()
+        text = "a nice text to embed"
+        embedder.run(text=text)
+        embedder.embedding_backend.embed.assert_called_once_with(
+            [text],
+            batch_size=32,
+            show_progress_bar=True,
+            normalize_embeddings=False,
+            precision="float32",
+            task="retrieval.query",
+        )
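For reference, a standalone sketch of the comparison behaviour that patch 1 introduces in haystack/utils/filters.py (illustrative only, helper logic inlined and names shortened; requires python-dateutil). It shows why a naive filter value such as "2025-02-01" can now be compared against a timezone-aware document value.

    from datetime import datetime
    import dateutil.parser

    def parse_date(value):
        # mirrors _parse_date: try ISO 8601 parsing first, fall back to dateutil
        try:
            return datetime.fromisoformat(value)
        except (ValueError, TypeError):
            return dateutil.parser.parse(value)

    document_value = parse_date("2025-02-03T12:45:46.435816Z")  # timezone-aware
    filter_value = parse_date("2025-02-01")                      # naive

    # mirrors _ensure_both_dates_naive_or_aware: give the naive datetime the other one's tzinfo
    if (document_value.tzinfo is None) != (filter_value.tzinfo is None):
        if filter_value.tzinfo is None:
            filter_value = filter_value.replace(tzinfo=document_value.tzinfo)
        else:
            document_value = document_value.replace(tzinfo=filter_value.tzinfo)

    print(document_value >= filter_value)  # True; without the tzinfo fix this comparison raises TypeError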