Merge branch 'refs/heads/main' into fix/pipeline_run_issues

mathislucka committed Feb 4, 2025
2 parents ed5f5b7 + 877f826 commit 96d475c
Showing 92 changed files with 3,820 additions and 1,288 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
args: [--markdown-linebreak-ext=md]

- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.5.0
+ rev: v0.9.2
hooks:
- id: ruff
- id: ruff-format
2 changes: 1 addition & 1 deletion VERSION.txt
@@ -1 +1 @@
- 2.9.0-rc0
+ 2.10.0-rc0
2 changes: 1 addition & 1 deletion docs/pydoc/config/preprocessors_api.yml
@@ -1,7 +1,7 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/preprocessors]
modules: ["document_cleaner", "document_splitter", "text_cleaner", "nltk_document_splitter"]
modules: ["document_cleaner", "document_splitter", "recursive_splitter", "text_cleaner"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
2 changes: 1 addition & 1 deletion docs/pydoc/config/tools_api.yml
@@ -2,7 +2,7 @@ loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/tools]
modules:
["tool"]
["tool", "from_function", "component_tool"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
2 changes: 0 additions & 2 deletions haystack/__init__.py
@@ -30,5 +30,3 @@
"GeneratedAnswer",
"ExtractedAnswer",
]
-
- # FIXME: remove before merging PR
@@ -83,7 +83,7 @@ def run(self, documents: List[Document]):
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
"DocumentLanguageClassifier expects a list of Document as input. "
"In case you want to classify a text, please use the TextLanguageClassifier."
"In case you want to classify and route a text, please use the TextLanguageRouter."
)

output: Dict[str, List[Document]] = {language: [] for language in self.languages}
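For context on the corrected message, a minimal usage sketch (not part of this commit) contrasting the two components, assuming Haystack 2.x import paths and the langdetect dependency:

from haystack import Document
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.routers import TextLanguageRouter

# DocumentLanguageClassifier consumes a list of Documents.
classifier = DocumentLanguageClassifier(languages=["en", "de"])
classified = classifier.run(documents=[Document(content="Hello world")])

# TextLanguageRouter is the component to reach for with a raw string,
# which is exactly what the corrected error message now points to.
router = TextLanguageRouter(languages=["en", "de"])
routed = router.run(text="Hallo Welt")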
@@ -211,8 +211,8 @@ def run(self, documents: List[Document], batch_size: int = 1):

if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
"DocumentLanguageClassifier expects a list of documents as input. "
"In case you want to classify a text, please use the TextLanguageClassifier."
"TransformerZeroShotDocumentClassifier expects a list of documents as input. "
"In case you want to classify and route a text, please use the TransformersZeroShotTextRouter."
)

invalid_doc_ids = []
3 changes: 3 additions & 0 deletions haystack/components/converters/docx.py
@@ -23,6 +23,7 @@
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph
+ from lxml.etree import _Comment


@dataclass
@@ -210,6 +211,8 @@ def _extract_elements(self, document: "DocxDocument") -> List[str]:
"""
elements = []
for element in document.element.body:
+ if isinstance(element, _Comment):
+ continue
if element.tag.endswith("p"):
paragraph = Paragraph(element, document)
if paragraph.contains_page_break:
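A standalone sketch (not from this commit) of why the new guard is needed: lxml comment nodes expose a callable rather than a string as .tag, so the tag.endswith dispatch would raise AttributeError without the isinstance check. python-docx and lxml are assumed installed; "sample.docx" is a hypothetical file:

import docx  # python-docx
from lxml.etree import _Comment

document = docx.Document("sample.docx")  # hypothetical input
for element in document.element.body:
    if isinstance(element, _Comment):
        continue  # comment nodes have no string .tag to dispatch on
    if element.tag.endswith("p"):
        print("paragraph element")
    elif element.tag.endswith("tbl"):
        print("table element")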
1 change: 1 addition & 0 deletions haystack/components/converters/json.py
@@ -194,6 +194,7 @@ def _get_content_and_meta(self, source: ByteStream) -> List[Tuple[str, Dict[str,
source=source.meta["file_path"],
error=exc,
)
+ return []

meta_fields = self._meta_fields or set()

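The added return [] matters because the except branch above it only logs the error: without an explicit return, execution would continue into code that relies on data that was never parsed. A self-contained sketch of the pattern (illustrative names, not Haystack's code):

import json
from typing import Dict, List, Tuple

def get_content_and_meta(raw: bytes) -> List[Tuple[str, Dict]]:
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        print(f"Skipping malformed source: {exc}")
        return []  # without this, the line below would hit an unbound `data`
    return [(str(item), {}) for item in data]

# Callers can now iterate the result unconditionally:
for content, meta in get_content_and_meta(b"{not valid json"):
    print(content, meta)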
30 changes: 17 additions & 13 deletions haystack/components/converters/pdfminer.py
@@ -5,7 +5,7 @@
import io
import os
from pathlib import Path
- from typing import Any, Dict, List, Optional, Union
+ from typing import Any, Dict, Iterator, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
@@ -98,29 +98,33 @@ def __init__( # pylint: disable=too-many-positional-arguments
)
self.store_full_path = store_full_path

- def _converter(self, extractor) -> Document:
+ @staticmethod
+ def _converter(lt_page_objs: Iterator) -> str:
"""
- Extracts text from PDF pages then convert the text into Documents
+ Extracts text from PDF pages then converts the text into a single str
- :param extractor:
+ :param lt_page_objs:
Python generator that yields PDF pages.
:returns:
- PDF text converted to Haystack Document
+ PDF text converted to single str
"""
pages = []
- for page in extractor:
+ for page in lt_page_objs:
text = ""
for container in page:
# Keep text only
if isinstance(container, LTTextContainer):
- text += container.get_text()
+ container_text = container.get_text()
+ if container_text:
+ text += "\n\n"
+ text += container_text
pages.append(text)

# Add a page delimiter
concat = "\f".join(pages)
delimited_pages = "\f".join(pages)

- return Document(content=concat)
+ return delimited_pages

@component.output_types(documents=List[Document])
def run(
@@ -156,15 +160,15 @@ def run(
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue
try:
- pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
- document = self._converter(pdf_reader)
+ pages = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
+ text = self._converter(pages)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue

- if document.content is None or document.content.strip() == "":
+ if text is None or text.strip() == "":
logger.warning(
"PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
@@ -174,7 +178,7 @@ def run(

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
- document.meta = merged_metadata
+ document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
29 changes: 14 additions & 15 deletions haystack/components/converters/pypdf.py
@@ -155,22 +155,21 @@ def from_dict(cls, data):
"""
return default_from_dict(cls, data)

- def _default_convert(self, reader: "PdfReader") -> Document:
+ def _default_convert(self, reader: "PdfReader") -> str:
texts = []
for page in reader.pages:
- texts.append(
- page.extract_text(
- orientations=self.plain_mode_orientations,
- extraction_mode=self.extraction_mode.value,
- space_width=self.plain_mode_space_width,
- layout_mode_space_vertically=self.layout_mode_space_vertically,
- layout_mode_scale_weight=self.layout_mode_scale_weight,
- layout_mode_strip_rotated=self.layout_mode_strip_rotated,
- layout_mode_font_height_weight=self.layout_mode_font_height_weight,
- )
+ extracted_text = page.extract_text(
+ orientations=self.plain_mode_orientations,
+ extraction_mode=self.extraction_mode.value,
+ space_width=self.plain_mode_space_width,
+ layout_mode_space_vertically=self.layout_mode_space_vertically,
+ layout_mode_scale_weight=self.layout_mode_scale_weight,
+ layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+ layout_mode_font_height_weight=self.layout_mode_font_height_weight,
)
+ texts.append(extracted_text)
text = "\f".join(texts)
- return Document(content=text)
+ return text

@component.output_types(documents=List[Document])
def run(
@@ -205,14 +204,14 @@ def run(
continue
try:
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
- document = self._default_convert(pdf_reader)
+ text = self._default_convert(pdf_reader)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue

- if document.content is None or document.content.strip() == "":
+ if text is None or text.strip() == "":
logger.warning(
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
@@ -222,7 +221,7 @@ def run(

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
- document.meta = merged_metadata
+ document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
6 changes: 6 additions & 0 deletions haystack/components/embedders/azure_document_embedder.py
@@ -51,6 +51,8 @@ def __init__( # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-p
embedding_separator: str = "\n",
timeout: Optional[float] = None,
max_retries: Optional[int] = None,
+ *,
+ default_headers: Optional[Dict[str, str]] = None,
):
"""
Creates an AzureOpenAIDocumentEmbedder component.
@@ -95,6 +97,7 @@ def __init__( # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-p
`OPENAI_TIMEOUT` environment variable, or 30 seconds.
:param max_retries: Maximum number of retries to contact AzureOpenAI after an internal error.
If not set, defaults to either the `OPENAI_MAX_RETRIES` environment variable or to 5 retries.
+ :param default_headers: Default headers to send to the AzureOpenAI client.
"""
# if not provided as a parameter, azure_endpoint is read from the env var AZURE_OPENAI_ENDPOINT
azure_endpoint = azure_endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
@@ -119,6 +122,7 @@ def __init__( # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-p
self.embedding_separator = embedding_separator
self.timeout = timeout or float(os.environ.get("OPENAI_TIMEOUT", 30.0))
self.max_retries = max_retries or int(os.environ.get("OPENAI_MAX_RETRIES", 5))
+ self.default_headers = default_headers or {}

self._client = AzureOpenAI(
api_version=api_version,
@@ -129,6 +133,7 @@ def __init__( # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-p
organization=organization,
timeout=self.timeout,
max_retries=self.max_retries,
+ default_headers=self.default_headers,
)

def _get_telemetry_data(self) -> Dict[str, Any]:
@@ -161,6 +166,7 @@ def to_dict(self) -> Dict[str, Any]:
azure_ad_token=self.azure_ad_token.to_dict() if self.azure_ad_token is not None else None,
timeout=self.timeout,
max_retries=self.max_retries,
+ default_headers=self.default_headers,
)

@classmethod
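A construction sketch for the new keyword-only parameter (assumes AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY are set in the environment; the header name and value are illustrative):

from haystack.components.embedders import AzureOpenAIDocumentEmbedder

embedder = AzureOpenAIDocumentEmbedder(
    azure_deployment="text-embedding-ada-002",
    default_headers={"x-ms-useragent": "my-app/1.0"},  # forwarded to the AzureOpenAI client
)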
6 changes: 6 additions & 0 deletions haystack/components/embedders/azure_text_embedder.py
@@ -46,6 +46,8 @@ def __init__( # pylint: disable=too-many-positional-arguments
max_retries: Optional[int] = None,
prefix: str = "",
suffix: str = "",
+ *,
+ default_headers: Optional[Dict[str, str]] = None,
):
"""
Creates an AzureOpenAITextEmbedder component.
@@ -82,6 +84,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
A string to add at the beginning of each text.
:param suffix:
A string to add at the end of each text.
+ :param default_headers: Default headers to send to the AzureOpenAI client.
"""
# Why is this here?
# AzureOpenAI init is forcing us to use an init method that takes either base_url or azure_endpoint as not
@@ -105,6 +108,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
self.max_retries = max_retries or int(os.environ.get("OPENAI_MAX_RETRIES", 5))
self.prefix = prefix
self.suffix = suffix
+ self.default_headers = default_headers or {}

self._client = AzureOpenAI(
api_version=api_version,
@@ -115,6 +119,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
organization=organization,
timeout=self.timeout,
max_retries=self.max_retries,
+ default_headers=self.default_headers,
)

def _get_telemetry_data(self) -> Dict[str, Any]:
@@ -143,6 +148,7 @@ def to_dict(self) -> Dict[str, Any]:
azure_ad_token=self.azure_ad_token.to_dict() if self.azure_ad_token is not None else None,
timeout=self.timeout,
max_retries=self.max_retries,
+ default_headers=self.default_headers,
)

@classmethod
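Because to_dict now serializes the field, default_headers should survive a save/load round-trip. A sketch under the same environment assumptions as above:

from haystack.components.embedders import AzureOpenAITextEmbedder

embedder = AzureOpenAITextEmbedder(default_headers={"x-request-source": "pipeline-a"})
config = embedder.to_dict()
assert config["init_parameters"]["default_headers"] == {"x-request-source": "pipeline-a"}

restored = AzureOpenAITextEmbedder.from_dict(config)  # headers reapplied to the new client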
36 changes: 26 additions & 10 deletions haystack/components/embedders/hugging_face_api_document_embedder.py
@@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

- import json
+ import warnings
from typing import Any, Dict, List, Optional, Union

from tqdm import tqdm
@@ -96,8 +96,8 @@ def __init__(
token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
prefix: str = "",
suffix: str = "",
- truncate: bool = True,
- normalize: bool = False,
+ truncate: Optional[bool] = True,
+ normalize: Optional[bool] = False,
batch_size: int = 32,
progress_bar: bool = True,
meta_fields_to_embed: Optional[List[str]] = None,
@@ -124,13 +124,11 @@ def __init__(
Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
if the backend uses Text Embeddings Inference.
If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
- It is always set to `True` and cannot be changed.
:param normalize:
Normalizes the embeddings to unit length.
Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
if the backend uses Text Embeddings Inference.
If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
- It is always set to `False` and cannot be changed.
:param batch_size:
Number of documents to process at once.
:param progress_bar:
@@ -239,18 +237,36 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[
"""
Embed a list of texts in batches.
"""
+ truncate = self.truncate
+ normalize = self.normalize
+
+ if self.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
+ if truncate is not None:
+ msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored."
+ warnings.warn(msg)
+ truncate = None
+ if normalize is not None:
+ msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored."
+ warnings.warn(msg)
+ normalize = None

all_embeddings = []
for i in tqdm(
range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
):
batch = texts_to_embed[i : i + batch_size]
- response = self._client.post(
- json={"inputs": batch, "truncate": self.truncate, "normalize": self.normalize},
- task="feature-extraction",
+
+ np_embeddings = self._client.feature_extraction(
+ # this method does not officially support list of strings, but works as expected
+ text=batch, # type: ignore[arg-type]
+ truncate=truncate,
+ normalize=normalize,
)
- embeddings = json.loads(response.decode())
- all_embeddings.extend(embeddings)
+
+ if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch):
+ raise ValueError(f"Expected embedding shape ({batch_size}, embedding_dim), got {np_embeddings.shape}")
+
+ all_embeddings.extend(np_embeddings.tolist())

return all_embeddings

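A sketch of the new return-value handling: huggingface_hub's feature_extraction returns a numpy array rather than raw JSON bytes, so the embedder validates its shape and converts it back to plain Python lists. The array below stands in for a real API response, and 384 is an arbitrary embedding width:

import numpy as np

batch = ["first document", "second document"]
np_embeddings = np.random.rand(len(batch), 384)  # stand-in for feature_extraction output

# Expect one row per input text and one column per embedding dimension.
if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch):
    raise ValueError(f"Expected shape ({len(batch)}, embedding_dim), got {np_embeddings.shape}")

all_embeddings: list = []
all_embeddings.extend(np_embeddings.tolist())  # List[List[float]], one entry per text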
