
Merge branch 'main' into add-list-joiner
Amnah199 authored Feb 5, 2025
2 parents 43e313c + d2348ad commit ff1050d
Showing 23 changed files with 262 additions and 262 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/license_compliance.yml
@@ -47,7 +47,7 @@ jobs:
 
       # We keep the license inventory on FOSSA
       - name: Send license report to Fossa
-        uses: fossas/fossa-action@v1.4.0
+        uses: fossas/fossa-action@v1.5.0
         continue-on-error: true # not critical
         with:
          api-key: ${{ secrets.FOSSA_LICENSE_SCAN_TOKEN }}
3 changes: 1 addition & 2 deletions haystack/components/converters/__init__.py
@@ -4,7 +4,7 @@
 
 from haystack.components.converters.azure import AzureOCRDocumentConverter
 from haystack.components.converters.csv import CSVToDocument
-from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
+from haystack.components.converters.docx import DOCXToDocument
 from haystack.components.converters.html import HTMLToDocument
 from haystack.components.converters.json import JSONConverter
 from haystack.components.converters.markdown import MarkdownToDocument
@@ -28,7 +28,6 @@
     "OpenAPIServiceToFunctions",
     "OutputAdapter",
     "DOCXToDocument",
-    "DOCXMetadata",
     "PPTXToDocument",
     "CSVToDocument",
     "JSONConverter",
4 changes: 2 additions & 2 deletions haystack/components/converters/docx.py
@@ -5,7 +5,7 @@
 import csv
 import io
 import os
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from enum import Enum
 from io import StringIO
 from pathlib import Path
@@ -189,7 +189,7 @@ def run(
                 )
                 continue
 
-            docx_metadata = self._get_docx_metadata(document=docx_document)
+            docx_metadata = asdict(self._get_docx_metadata(document=docx_document))
             merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
 
             if not self.store_full_path and "file_path" in bytestream.meta:
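With the `asdict` change above, DOCX metadata is a plain dict under `meta["docx"]`. A minimal sketch of the new access pattern (the file name `sample.docx` is hypothetical):

```python
from haystack.components.converters import DOCXToDocument

# After this commit, DOCX metadata is stored as a plain dict under meta["docx"]
# (previously a DOCXMetadata dataclass), so nested fields use ordinary dict access.
converter = DOCXToDocument()
result = converter.run(sources=["sample.docx"])  # hypothetical local file
print(result["documents"][0].meta["docx"]["author"])
```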
haystack/components/embedders/sentence_transformers_document_embedder.py
@@ -56,6 +56,7 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
         tokenizer_kwargs: Optional[Dict[str, Any]] = None,
         config_kwargs: Optional[Dict[str, Any]] = None,
         precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
+        encode_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """
         Creates a SentenceTransformersDocumentEmbedder component.
@@ -104,6 +105,10 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
             All non-float32 precisions are quantized embeddings.
             Quantized embeddings are smaller and faster to compute, but may have a lower accuracy.
             They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
+        :param encode_kwargs:
+            Additional keyword arguments for `SentenceTransformer.encode` when embedding documents.
+            This parameter is provided for fine customization. Be careful not to clash with already set parameters and
+            avoid passing parameters that change the output type.
         """
 
         self.model = model
@@ -121,6 +126,7 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
         self.model_kwargs = model_kwargs
         self.tokenizer_kwargs = tokenizer_kwargs
         self.config_kwargs = config_kwargs
+        self.encode_kwargs = encode_kwargs
         self.embedding_backend = None
         self.precision = precision
 
@@ -155,6 +161,7 @@ def to_dict(self) -> Dict[str, Any]:
             tokenizer_kwargs=self.tokenizer_kwargs,
             config_kwargs=self.config_kwargs,
             precision=self.precision,
+            encode_kwargs=self.encode_kwargs,
         )
         if serialization_dict["init_parameters"].get("model_kwargs") is not None:
             serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
@@ -232,6 +239,7 @@ def run(self, documents: List[Document]):
             show_progress_bar=self.progress_bar,
             normalize_embeddings=self.normalize_embeddings,
             precision=self.precision,
+            **(self.encode_kwargs if self.encode_kwargs else {}),
         )
 
         for doc, emb in zip(documents, embeddings):
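A minimal sketch of the new `encode_kwargs` passthrough. `prompt` is one `SentenceTransformer.encode` argument that does not clash with the parameters the component already sets; whether it is available depends on the installed sentence-transformers version. The text embedder below accepts the same parameter.

```python
from haystack import Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
    # Forwarded verbatim to SentenceTransformer.encode at run time.
    encode_kwargs={"prompt": "passage: "},
)
embedder.warm_up()
result = embedder.run(documents=[Document(content="Berlin is the capital of Germany.")])
print(len(result["documents"][0].embedding))  # embedding dimensionality, e.g. 384
```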
haystack/components/embedders/sentence_transformers_text_embedder.py
@@ -50,6 +50,7 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
         tokenizer_kwargs: Optional[Dict[str, Any]] = None,
         config_kwargs: Optional[Dict[str, Any]] = None,
         precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
+        encode_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """
         Create a SentenceTransformersTextEmbedder component.
@@ -94,6 +95,10 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
             All non-float32 precisions are quantized embeddings.
             Quantized embeddings are smaller in size and faster to compute, but may have a lower accuracy.
             They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
+        :param encode_kwargs:
+            Additional keyword arguments for `SentenceTransformer.encode` when embedding texts.
+            This parameter is provided for fine customization. Be careful not to clash with already set parameters and
+            avoid passing parameters that change the output type.
         """
 
         self.model = model
@@ -109,6 +114,7 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
         self.model_kwargs = model_kwargs
         self.tokenizer_kwargs = tokenizer_kwargs
         self.config_kwargs = config_kwargs
+        self.encode_kwargs = encode_kwargs
         self.embedding_backend = None
         self.precision = precision
 
@@ -141,6 +147,7 @@ def to_dict(self) -> Dict[str, Any]:
             tokenizer_kwargs=self.tokenizer_kwargs,
             config_kwargs=self.config_kwargs,
             precision=self.precision,
+            encode_kwargs=self.encode_kwargs,
         )
         if serialization_dict["init_parameters"].get("model_kwargs") is not None:
             serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
@@ -209,5 +216,6 @@ def run(self, text: str):
             show_progress_bar=self.progress_bar,
             normalize_embeddings=self.normalize_embeddings,
             precision=self.precision,
+            **(self.encode_kwargs if self.encode_kwargs else {}),
         )[0]
         return {"embedding": embedding}
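The same passthrough on the text embedder, for example pairing a query prompt with the passage prompt above (same sentence-transformers version caveat as before):

```python
from haystack.components.embedders import SentenceTransformersTextEmbedder

embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"prompt": "query: "},
)
embedder.warm_up()
print(len(embedder.run(text="What is the capital of Germany?")["embedding"]))
```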
10 changes: 5 additions & 5 deletions haystack/components/routers/metadata_router.py
@@ -76,6 +76,11 @@ def __init__(self, rules: Dict[str, Dict]):
         ```
         """
         self.rules = rules
+        for rule in self.rules.values():
+            if "operator" not in rule:
+                raise ValueError(
+                    "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
+                )
         component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
 
     def run(self, documents: List[Document]):
@@ -95,11 +100,6 @@ def run(self, documents: List[Document]):
         for document in documents:
             cur_document_matched = False
             for edge, rule in self.rules.items():
-                if "operator" not in rule:
-                    raise ValueError(
-                        "Invalid filter syntax. "
-                        "See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
-                    )
                 if document_matches_filter(rule, document):
                     output[edge].append(document)
                     cur_document_matched = True
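Moving the `"operator"` check into `__init__` means malformed rules now fail at construction time rather than on the first `run` call. A small sketch of both cases:

```python
from haystack.components.routers import MetadataRouter

# Valid: each rule is a Haystack metadata filter with an "operator" key.
router = MetadataRouter(rules={"en": {"field": "meta.language", "operator": "==", "value": "en"}})

# Invalid: a rule without "operator" now raises immediately, not during run().
try:
    MetadataRouter(rules={"en": {"field": "meta.language", "value": "en"}})
except ValueError as err:
    print(err)
```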
43 changes: 37 additions & 6 deletions haystack/utils/filters.py
@@ -6,6 +6,7 @@
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 
+import dateutil.parser
 import pandas as pd
 
 from haystack.dataclasses import Document
@@ -69,18 +70,48 @@ def _greater_than(document_value: Any, filter_value: Any) -> bool:
 
     if isinstance(document_value, str) or isinstance(filter_value, str):
         try:
-            document_value = datetime.fromisoformat(document_value)
-            filter_value = datetime.fromisoformat(filter_value)
+            document_value = _parse_date(document_value)
+            filter_value = _parse_date(filter_value)
+            document_value, filter_value = _ensure_both_dates_naive_or_aware(document_value, filter_value)
+        except FilterError as exc:
+            raise exc
+    if type(filter_value) in [list, pd.DataFrame]:
+        msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
+        raise FilterError(msg)
+    return document_value > filter_value
+
+
+def _parse_date(value):
+    """Try parsing the value as an ISO format date, then fall back to dateutil.parser."""
+    try:
+        return datetime.fromisoformat(value)
+    except (ValueError, TypeError):
+        try:
+            return dateutil.parser.parse(value)
         except (ValueError, TypeError) as exc:
             msg = (
                 "Can't compare strings using operators '>', '>=', '<', '<='. "
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(filter_value) in [list, pd.DataFrame]:
-        msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
-        raise FilterError(msg)
-    return document_value > filter_value
+
+
+def _ensure_both_dates_naive_or_aware(date1: datetime, date2: datetime):
+    """Ensure that both dates are either naive or aware."""
+    # Both naive
+    if date1.tzinfo is None and date2.tzinfo is None:
+        return date1, date2
+
+    # Both aware
+    if date1.tzinfo is not None and date2.tzinfo is not None:
+        return date1, date2
+
+    # One naive, one aware
+    if date1.tzinfo is None:
+        date1 = date1.replace(tzinfo=date2.tzinfo)
+    else:
+        date2 = date2.replace(tzinfo=date1.tzinfo)
+    return date1, date2
 
 
 def _greater_than_equal(document_value: Any, filter_value: Any) -> bool:
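The practical effect of `_parse_date` and `_ensure_both_dates_naive_or_aware`, sketched through the public `document_matches_filter` helper: a naive date string can now be compared against a timezone-aware one without raising `TypeError`:

```python
from haystack.dataclasses import Document
from haystack.utils.filters import document_matches_filter

# One naive and one timezone-aware ISO date: previously a TypeError,
# now the naive side adopts the other's timezone before comparison.
doc = Document(content="report", meta={"date": "1993-11-01T12:00:00"})
filters = {"field": "meta.date", "operator": ">", "value": "1992-01-01T12:00:00+00:00"}
print(document_matches_filter(filters, doc))  # True
```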
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Enhanced `SentenceTransformersDocumentEmbedder` and `SentenceTransformersTextEmbedder` to accept
+    an additional parameter, which is passed directly to the underlying `SentenceTransformer.encode` method
+    for greater flexibility in embedding customization.
6 changes: 6 additions & 0 deletions releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml
@@ -0,0 +1,6 @@
+---
+upgrade:
+  - |
+    The `DOCXToDocument` converter now returns a `Document` object with DOCX metadata stored in the `meta` field as a
+    dictionary under the key `docx`. Previously, the metadata was represented as a `DOCXMetadata` dataclass.
+    This change does not impact reading from or writing to a Document Store.
6 changes: 6 additions & 0 deletions releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Enhancements to Date Filtering in MetadataRouter
+    - Improved date parsing in filter utilities by introducing `_parse_date`, which first attempts `datetime.fromisoformat(value)` for backward compatibility and then falls back to dateutil.parser.parse() for broader ISO 8601 support.
+    - Resolved a common issue where comparing naive and timezone-aware datetimes resulted in TypeError. Added `_ensure_both_dates_naive_or_aware`, which ensures both datetimes are either naive or aware. If one is missing a timezone, it is assigned the timezone of the other for consistency.
test/components/classifiers/test_zero_shot_document_classifier.py
@@ -137,7 +137,8 @@ def test_run_unit(self, hf_pipeline_mock):
         assert result["documents"][1].to_dict()["classification"]["label"] == "negative"
 
     @pytest.mark.integration
-    def test_run(self):
+    def test_run(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
         component = TransformersZeroShotDocumentClassifier(
             model="cross-encoder/nli-deberta-v3-xsmall", labels=["positive", "negative"]
         )