
Merge branch 'main' into add-list-joiner
Amnah199 authored Feb 5, 2025
2 parents 43e313c + d2348ad commit ff1050d
Showing 23 changed files with 262 additions and 262 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/license_compliance.yml
@@ -47,7 +47,7 @@ jobs:
 
       # We keep the license inventory on FOSSA
       - name: Send license report to Fossa
-        uses: fossas/fossa-action@v1.4.0
+        uses: fossas/fossa-action@v1.5.0
         continue-on-error: true # not critical
         with:
          api-key: ${{ secrets.FOSSA_LICENSE_SCAN_TOKEN }}
3 changes: 1 addition & 2 deletions haystack/components/converters/__init__.py
@@ -4,7 +4,7 @@
 
 from haystack.components.converters.azure import AzureOCRDocumentConverter
 from haystack.components.converters.csv import CSVToDocument
-from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
+from haystack.components.converters.docx import DOCXToDocument
 from haystack.components.converters.html import HTMLToDocument
 from haystack.components.converters.json import JSONConverter
 from haystack.components.converters.markdown import MarkdownToDocument
@@ -28,7 +28,6 @@
     "OpenAPIServiceToFunctions",
     "OutputAdapter",
     "DOCXToDocument",
-    "DOCXMetadata",
     "PPTXToDocument",
     "CSVToDocument",
     "JSONConverter",
4 changes: 2 additions & 2 deletions haystack/components/converters/docx.py
@@ -5,7 +5,7 @@
 import csv
 import io
 import os
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from enum import Enum
 from io import StringIO
 from pathlib import Path
@@ -189,7 +189,7 @@ def run(
                 )
                 continue
 
-            docx_metadata = self._get_docx_metadata(document=docx_document)
+            docx_metadata = asdict(self._get_docx_metadata(document=docx_document))
             merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
 
             if not self.store_full_path and "file_path" in bytestream.meta:
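With the `asdict` change above, DOCX metadata is a plain dict under `meta["docx"]`. A minimal sketch of the new access pattern (the file name `sample.docx` is hypothetical):

```python
from haystack.components.converters import DOCXToDocument

# After this commit, DOCX metadata is stored as a plain dict under meta["docx"]
# (previously a DOCXMetadata dataclass), so nested fields use ordinary dict access.
converter = DOCXToDocument()
result = converter.run(sources=["sample.docx"])  # hypothetical local file
print(result["documents"][0].meta["docx"]["author"])
```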
haystack/components/embedders/sentence_transformers_document_embedder.py
@@ -56,6 +56,7 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
         tokenizer_kwargs: Optional[Dict[str, Any]] = None,
         config_kwargs: Optional[Dict[str, Any]] = None,
         precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
+        encode_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """
         Creates a SentenceTransformersDocumentEmbedder component.
@@ -104,6 +105,10 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
             All non-float32 precisions are quantized embeddings.
             Quantized embeddings are smaller and faster to compute, but may have a lower accuracy.
             They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
+        :param encode_kwargs:
+            Additional keyword arguments for `SentenceTransformer.encode` when embedding documents.
+            This parameter is provided for fine customization. Be careful not to clash with already set parameters and
+            avoid passing parameters that change the output type.
         """
 
         self.model = model
@@ -121,6 +126,7 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
         self.model_kwargs = model_kwargs
         self.tokenizer_kwargs = tokenizer_kwargs
         self.config_kwargs = config_kwargs
+        self.encode_kwargs = encode_kwargs
         self.embedding_backend = None
         self.precision = precision
 
@@ -155,6 +161,7 @@ def to_dict(self) -> Dict[str, Any]:
             tokenizer_kwargs=self.tokenizer_kwargs,
             config_kwargs=self.config_kwargs,
             precision=self.precision,
+            encode_kwargs=self.encode_kwargs,
         )
         if serialization_dict["init_parameters"].get("model_kwargs") is not None:
             serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
@@ -232,6 +239,7 @@ def run(self, documents: List[Document]):
             show_progress_bar=self.progress_bar,
             normalize_embeddings=self.normalize_embeddings,
             precision=self.precision,
+            **(self.encode_kwargs if self.encode_kwargs else {}),
         )
 
         for doc, emb in zip(documents, embeddings):
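A minimal sketch of the new `encode_kwargs` passthrough. `prompt` is one `SentenceTransformer.encode` argument that does not clash with the parameters the component already sets; whether it is available depends on the installed sentence-transformers version. The text embedder below accepts the same parameter.

```python
from haystack import Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
    # Forwarded verbatim to SentenceTransformer.encode at run time.
    encode_kwargs={"prompt": "passage: "},
)
embedder.warm_up()
result = embedder.run(documents=[Document(content="Berlin is the capital of Germany.")])
print(len(result["documents"][0].embedding))  # embedding dimensionality, e.g. 384
```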
haystack/components/embedders/sentence_transformers_text_embedder.py
@@ -50,6 +50,7 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
         tokenizer_kwargs: Optional[Dict[str, Any]] = None,
         config_kwargs: Optional[Dict[str, Any]] = None,
         precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32",
+        encode_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """
         Create a SentenceTransformersTextEmbedder component.
@@ -94,6 +95,10 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
             All non-float32 precisions are quantized embeddings.
             Quantized embeddings are smaller in size and faster to compute, but may have a lower accuracy.
             They are useful for reducing the size of the embeddings of a corpus for semantic search, among other tasks.
+        :param encode_kwargs:
+            Additional keyword arguments for `SentenceTransformer.encode` when embedding texts.
+            This parameter is provided for fine customization. Be careful not to clash with already set parameters and
+            avoid passing parameters that change the output type.
         """
 
         self.model = model
@@ -109,6 +114,7 @@ def __init__(  # noqa: PLR0913 # pylint: disable=too-many-positional-arguments
         self.model_kwargs = model_kwargs
         self.tokenizer_kwargs = tokenizer_kwargs
         self.config_kwargs = config_kwargs
+        self.encode_kwargs = encode_kwargs
         self.embedding_backend = None
         self.precision = precision
 
@@ -141,6 +147,7 @@ def to_dict(self) -> Dict[str, Any]:
             tokenizer_kwargs=self.tokenizer_kwargs,
             config_kwargs=self.config_kwargs,
             precision=self.precision,
+            encode_kwargs=self.encode_kwargs,
         )
         if serialization_dict["init_parameters"].get("model_kwargs") is not None:
             serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"])
@@ -209,5 +216,6 @@ def run(self, text: str):
             show_progress_bar=self.progress_bar,
             normalize_embeddings=self.normalize_embeddings,
             precision=self.precision,
+            **(self.encode_kwargs if self.encode_kwargs else {}),
         )[0]
         return {"embedding": embedding}
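The same passthrough on the text embedder, for example pairing a query prompt with the passage prompt above (same sentence-transformers version caveat as before):

```python
from haystack.components.embedders import SentenceTransformersTextEmbedder

embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"prompt": "query: "},
)
embedder.warm_up()
print(len(embedder.run(text="What is the capital of Germany?")["embedding"]))
```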
10 changes: 5 additions & 5 deletions haystack/components/routers/metadata_router.py
@@ -76,6 +76,11 @@ def __init__(self, rules: Dict[str, Dict]):
         ```
         """
         self.rules = rules
+        for rule in self.rules.values():
+            if "operator" not in rule:
+                raise ValueError(
+                    "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
+                )
         component.set_output_types(self, unmatched=List[Document], **{edge: List[Document] for edge in rules})
 
     def run(self, documents: List[Document]):
@@ -95,11 +100,6 @@ def run(self, documents: List[Document]):
         for document in documents:
             cur_document_matched = False
             for edge, rule in self.rules.items():
-                if "operator" not in rule:
-                    raise ValueError(
-                        "Invalid filter syntax. "
-                        "See https://docs.haystack.deepset.ai/docs/metadata-filtering for details."
-                    )
                 if document_matches_filter(rule, document):
                     output[edge].append(document)
                     cur_document_matched = True
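Moving the `"operator"` check into `__init__` means malformed rules now fail at construction time rather than on the first `run` call. A small sketch of both cases:

```python
from haystack.components.routers import MetadataRouter

# Valid: each rule is a Haystack metadata filter with an "operator" key.
router = MetadataRouter(rules={"en": {"field": "meta.language", "operator": "==", "value": "en"}})

# Invalid: a rule without "operator" now raises immediately, not during run().
try:
    MetadataRouter(rules={"en": {"field": "meta.language", "value": "en"}})
except ValueError as err:
    print(err)
```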
43 changes: 37 additions & 6 deletions haystack/utils/filters.py
@@ -6,6 +6,7 @@
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 
+import dateutil.parser
 import pandas as pd
 
 from haystack.dataclasses import Document
@@ -69,18 +70,48 @@ def _greater_than(document_value: Any, filter_value: Any) -> bool:
 
     if isinstance(document_value, str) or isinstance(filter_value, str):
         try:
-            document_value = datetime.fromisoformat(document_value)
-            filter_value = datetime.fromisoformat(filter_value)
+            document_value = _parse_date(document_value)
+            filter_value = _parse_date(filter_value)
+            document_value, filter_value = _ensure_both_dates_naive_or_aware(document_value, filter_value)
+        except FilterError as exc:
+            raise exc
+    if type(filter_value) in [list, pd.DataFrame]:
+        msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
+        raise FilterError(msg)
+    return document_value > filter_value
+
+
+def _parse_date(value):
+    """Try parsing the value as an ISO format date, then fall back to dateutil.parser."""
+    try:
+        return datetime.fromisoformat(value)
+    except (ValueError, TypeError):
+        try:
+            return dateutil.parser.parse(value)
         except (ValueError, TypeError) as exc:
             msg = (
                 "Can't compare strings using operators '>', '>=', '<', '<='. "
                 "Strings are only comparable if they are ISO formatted dates."
             )
             raise FilterError(msg) from exc
-    if type(filter_value) in [list, pd.DataFrame]:
-        msg = f"Filter value can't be of type {type(filter_value)} using operators '>', '>=', '<', '<='"
-        raise FilterError(msg)
-    return document_value > filter_value
+
+
+def _ensure_both_dates_naive_or_aware(date1: datetime, date2: datetime):
+    """Ensure that both dates are either naive or aware."""
+    # Both naive
+    if date1.tzinfo is None and date2.tzinfo is None:
+        return date1, date2
+
+    # Both aware
+    if date1.tzinfo is not None and date2.tzinfo is not None:
+        return date1, date2
+
+    # One naive, one aware
+    if date1.tzinfo is None:
+        date1 = date1.replace(tzinfo=date2.tzinfo)
+    else:
+        date2 = date2.replace(tzinfo=date1.tzinfo)
+    return date1, date2
 
 
 def _greater_than_equal(document_value: Any, filter_value: Any) -> bool:
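The practical effect of `_parse_date` and `_ensure_both_dates_naive_or_aware`, sketched through the public `document_matches_filter` helper: a naive date string can now be compared against a timezone-aware one without raising `TypeError`:

```python
from haystack.dataclasses import Document
from haystack.utils.filters import document_matches_filter

# One naive and one timezone-aware ISO date: previously a TypeError,
# now the naive side adopts the other's timezone before comparison.
doc = Document(content="report", meta={"date": "1993-11-01T12:00:00"})
filters = {"field": "meta.date", "operator": ">", "value": "1992-01-01T12:00:00+00:00"}
print(document_matches_filter(filters, doc))  # True
```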
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Enhanced `SentenceTransformersDocumentEmbedder` and `SentenceTransformersTextEmbedder` to accept
+    an additional parameter, which is passed directly to the underlying `SentenceTransformer.encode` method
+    for greater flexibility in embedding customization.
6 changes: 6 additions & 0 deletions releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml
@@ -0,0 +1,6 @@
+---
+upgrade:
+  - |
+    The `DOCXToDocument` converter now returns a `Document` object with DOCX metadata stored in the `meta` field as a
+    dictionary under the key `docx`. Previously, the metadata was represented as a `DOCXMetadata` dataclass.
+    This change does not impact reading from or writing to a Document Store.
6 changes: 6 additions & 0 deletions releasenotes/notes/fix-date-comparison-ced1d6ef64534951.yaml
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Enhancements to Date Filtering in MetadataRouter
+    - Improved date parsing in filter utilities by introducing `_parse_date`, which first attempts `datetime.fromisoformat(value)` for backward compatibility and then falls back to dateutil.parser.parse() for broader ISO 8601 support.
+    - Resolved a common issue where comparing naive and timezone-aware datetimes resulted in TypeError. Added `_ensure_both_dates_naive_or_aware`, which ensures both datetimes are either naive or aware. If one is missing a timezone, it is assigned the timezone of the other for consistency.
test/components/classifiers/test_zero_shot_document_classifier.py
@@ -137,7 +137,8 @@ def test_run_unit(self, hf_pipeline_mock):
         assert result["documents"][1].to_dict()["classification"]["label"] == "negative"
 
     @pytest.mark.integration
-    def test_run(self):
+    def test_run(self, monkeypatch):
+        monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811
         component = TransformersZeroShotDocumentClassifier(
             model="cross-encoder/nli-deberta-v3-xsmall", labels=["positive", "negative"]
         )