Merge branch 'refs/heads/main' into fix/pipeline_run_issues

mathislucka committed Feb 4, 2025
2 parents ed5f5b7 + 877f826 commit 96d475c
Showing 92 changed files with 3,820 additions and 1,288 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
args: [--markdown-linebreak-ext=md]

- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.5.0
+ rev: v0.9.2
hooks:
- id: ruff
- id: ruff-format
2 changes: 1 addition & 1 deletion VERSION.txt
@@ -1 +1 @@
- 2.9.0-rc0
+ 2.10.0-rc0
2 changes: 1 addition & 1 deletion docs/pydoc/config/preprocessors_api.yml
@@ -1,7 +1,7 @@
loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/components/preprocessors]
modules: ["document_cleaner", "document_splitter", "text_cleaner", "nltk_document_splitter"]
modules: ["document_cleaner", "document_splitter", "recursive_splitter", "text_cleaner"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
2 changes: 1 addition & 1 deletion docs/pydoc/config/tools_api.yml
@@ -2,7 +2,7 @@ loaders:
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
search_path: [../../../haystack/tools]
modules:
["tool"]
["tool", "from_function", "component_tool"]
ignore_when_discovered: ["__init__"]
processors:
- type: filter
2 changes: 0 additions & 2 deletions haystack/__init__.py
@@ -30,5 +30,3 @@
"GeneratedAnswer",
"ExtractedAnswer",
]
-
- # FIXME: remove before merging PR
@@ -83,7 +83,7 @@ def run(self, documents: List[Document]):
if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
"DocumentLanguageClassifier expects a list of Document as input. "
"In case you want to classify a text, please use the TextLanguageClassifier."
"In case you want to classify and route a text, please use the TextLanguageRouter."
)

output: Dict[str, List[Document]] = {language: [] for language in self.languages}
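For context on the corrected message, a minimal usage sketch (not part of this commit) contrasting the two components, assuming Haystack 2.x import paths and the langdetect dependency:

from haystack import Document
from haystack.components.classifiers import DocumentLanguageClassifier
from haystack.components.routers import TextLanguageRouter

# DocumentLanguageClassifier consumes a list of Documents.
classifier = DocumentLanguageClassifier(languages=["en", "de"])
classified = classifier.run(documents=[Document(content="Hello world")])

# TextLanguageRouter is the component to reach for with a raw string,
# which is exactly what the corrected error message now points to.
router = TextLanguageRouter(languages=["en", "de"])
routed = router.run(text="Hallo Welt")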
@@ -211,8 +211,8 @@ def run(self, documents: List[Document], batch_size: int = 1):

if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
raise TypeError(
"DocumentLanguageClassifier expects a list of documents as input. "
"In case you want to classify a text, please use the TextLanguageClassifier."
"TransformerZeroShotDocumentClassifier expects a list of documents as input. "
"In case you want to classify and route a text, please use the TransformersZeroShotTextRouter."
)

invalid_doc_ids = []
3 changes: 3 additions & 0 deletions haystack/components/converters/docx.py
@@ -23,6 +23,7 @@
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph
+ from lxml.etree import _Comment


@dataclass
@@ -210,6 +211,8 @@ def _extract_elements(self, document: "DocxDocument") -> List[str]:
"""
elements = []
for element in document.element.body:
+ if isinstance(element, _Comment):
+ continue
if element.tag.endswith("p"):
paragraph = Paragraph(element, document)
if paragraph.contains_page_break:
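A standalone sketch (not from this commit) of why the new guard is needed: lxml comment nodes expose a callable rather than a string as .tag, so the tag.endswith dispatch would raise AttributeError without the isinstance check. python-docx and lxml are assumed installed; "sample.docx" is a hypothetical file:

import docx  # python-docx
from lxml.etree import _Comment

document = docx.Document("sample.docx")  # hypothetical input
for element in document.element.body:
    if isinstance(element, _Comment):
        continue  # comment nodes have no string .tag to dispatch on
    if element.tag.endswith("p"):
        print("paragraph element")
    elif element.tag.endswith("tbl"):
        print("table element")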
1 change: 1 addition & 0 deletions haystack/components/converters/json.py
@@ -194,6 +194,7 @@ def _get_content_and_meta(self, source: ByteStream) -> List[Tuple[str, Dict[str,
source=source.meta["file_path"],
error=exc,
)
+ return []

meta_fields = self._meta_fields or set()

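The added return [] matters because the except branch above it only logs the error: without an explicit return, execution would continue into code that relies on data that was never parsed. A self-contained sketch of the pattern (illustrative names, not Haystack's code):

import json
from typing import Dict, List, Tuple

def get_content_and_meta(raw: bytes) -> List[Tuple[str, Dict]]:
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        print(f"Skipping malformed source: {exc}")
        return []  # without this, the line below would hit an unbound `data`
    return [(str(item), {}) for item in data]

# Callers can now iterate the result unconditionally:
for content, meta in get_content_and_meta(b"{not valid json"):
    print(content, meta)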
30 changes: 17 additions & 13 deletions haystack/components/converters/pdfminer.py
@@ -5,7 +5,7 @@
import io
import os
from pathlib import Path
- from typing import Any, Dict, List, Optional, Union
+ from typing import Any, Dict, Iterator, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
@@ -98,29 +98,33 @@ def __init__( # pylint: disable=too-many-positional-arguments
)
self.store_full_path = store_full_path

- def _converter(self, extractor) -> Document:
+ @staticmethod
+ def _converter(lt_page_objs: Iterator) -> str:
"""
- Extracts text from PDF pages then convert the text into Documents
+ Extracts text from PDF pages then converts the text into a single str
- :param extractor:
+ :param lt_page_objs:
Python generator that yields PDF pages.
:returns:
- PDF text converted to Haystack Document
+ PDF text converted to single str
"""
pages = []
- for page in extractor:
+ for page in lt_page_objs:
text = ""
for container in page:
# Keep text only
if isinstance(container, LTTextContainer):
- text += container.get_text()
+ container_text = container.get_text()
+ if container_text:
+ text += "\n\n"
+ text += container_text
pages.append(text)

# Add a page delimiter
concat = "\f".join(pages)
delimited_pages = "\f".join(pages)

- return Document(content=concat)
+ return delimited_pages

@component.output_types(documents=List[Document])
def run(
@@ -156,15 +160,15 @@ def run(
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue
try:
- pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
- document = self._converter(pdf_reader)
+ pages = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
+ text = self._converter(pages)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue

- if document.content is None or document.content.strip() == "":
+ if text is None or text.strip() == "":
logger.warning(
"PDFMinerToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
@@ -174,7 +178,7 @@ def run(

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
- document.meta = merged_metadata
+ document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
29 changes: 14 additions & 15 deletions haystack/components/converters/pypdf.py
@@ -155,22 +155,21 @@ def from_dict(cls, data):
"""
return default_from_dict(cls, data)

- def _default_convert(self, reader: "PdfReader") -> Document:
+ def _default_convert(self, reader: "PdfReader") -> str:
texts = []
for page in reader.pages:
- texts.append(
- page.extract_text(
- orientations=self.plain_mode_orientations,
- extraction_mode=self.extraction_mode.value,
- space_width=self.plain_mode_space_width,
- layout_mode_space_vertically=self.layout_mode_space_vertically,
- layout_mode_scale_weight=self.layout_mode_scale_weight,
- layout_mode_strip_rotated=self.layout_mode_strip_rotated,
- layout_mode_font_height_weight=self.layout_mode_font_height_weight,
- )
+ extracted_text = page.extract_text(
+ orientations=self.plain_mode_orientations,
+ extraction_mode=self.extraction_mode.value,
+ space_width=self.plain_mode_space_width,
+ layout_mode_space_vertically=self.layout_mode_space_vertically,
+ layout_mode_scale_weight=self.layout_mode_scale_weight,
+ layout_mode_strip_rotated=self.layout_mode_strip_rotated,
+ layout_mode_font_height_weight=self.layout_mode_font_height_weight,
)
+ texts.append(extracted_text)
text = "\f".join(texts)
- return Document(content=text)
+ return text

@component.output_types(documents=List[Document])
def run(
@@ -205,14 +204,14 @@ def run(
continue
try:
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
- document = self._default_convert(pdf_reader)
+ text = self._default_convert(pdf_reader)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
)
continue

- if document.content is None or document.content.strip() == "":
+ if text is None or text.strip() == "":
logger.warning(
"PyPDFToDocument could not extract text from the file {source}. Returning an empty document.",
source=source,
@@ -222,7 +221,7 @@ def run(

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
- document.meta = merged_metadata
+ document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
6 changes: 6 additions & 0 deletions haystack/components/embedders/azure_document_embedder.py
@@ -51,6 +51,8 @@ def __init__( # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-p
embedding_separator: str = "\n",
timeout: Optional[float] = None,
max_retries: Optional[int] = None,
+ *,
+ default_headers: Optional[Dict[str, str]] = None,
):
"""
Creates an AzureOpenAIDocumentEmbedder component.
@@ -95,6 +97,7 @@ def __init__( # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-p
`OPENAI_TIMEOUT` environment variable, or 30 seconds.
:param max_retries: Maximum number of retries to contact AzureOpenAI after an internal error.
If not set, defaults to either the `OPENAI_MAX_RETRIES` environment variable or to 5 retries.
+ :param default_headers: Default headers to send to the AzureOpenAI client.
"""
# if not provided as a parameter, azure_endpoint is read from the env var AZURE_OPENAI_ENDPOINT
azure_endpoint = azure_endpoint or os.environ.get("AZURE_OPENAI_ENDPOINT")
@@ -119,6 +122,7 @@ def __init__( # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-p
self.embedding_separator = embedding_separator
self.timeout = timeout or float(os.environ.get("OPENAI_TIMEOUT", 30.0))
self.max_retries = max_retries or int(os.environ.get("OPENAI_MAX_RETRIES", 5))
+ self.default_headers = default_headers or {}

self._client = AzureOpenAI(
api_version=api_version,
@@ -129,6 +133,7 @@ def __init__( # noqa: PLR0913 (too-many-arguments) # pylint: disable=too-many-p
organization=organization,
timeout=self.timeout,
max_retries=self.max_retries,
+ default_headers=self.default_headers,
)

def _get_telemetry_data(self) -> Dict[str, Any]:
@@ -161,6 +166,7 @@ def to_dict(self) -> Dict[str, Any]:
azure_ad_token=self.azure_ad_token.to_dict() if self.azure_ad_token is not None else None,
timeout=self.timeout,
max_retries=self.max_retries,
+ default_headers=self.default_headers,
)

@classmethod
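A construction sketch for the new keyword-only parameter (assumes AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY are set in the environment; the header name and value are illustrative):

from haystack.components.embedders import AzureOpenAIDocumentEmbedder

embedder = AzureOpenAIDocumentEmbedder(
    azure_deployment="text-embedding-ada-002",
    default_headers={"x-ms-useragent": "my-app/1.0"},  # forwarded to the AzureOpenAI client
)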
6 changes: 6 additions & 0 deletions haystack/components/embedders/azure_text_embedder.py
@@ -46,6 +46,8 @@ def __init__( # pylint: disable=too-many-positional-arguments
max_retries: Optional[int] = None,
prefix: str = "",
suffix: str = "",
+ *,
+ default_headers: Optional[Dict[str, str]] = None,
):
"""
Creates an AzureOpenAITextEmbedder component.
@@ -82,6 +84,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
A string to add at the beginning of each text.
:param suffix:
A string to add at the end of each text.
+ :param default_headers: Default headers to send to the AzureOpenAI client.
"""
# Why is this here?
# AzureOpenAI init is forcing us to use an init method that takes either base_url or azure_endpoint as not
@@ -105,6 +108,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
self.max_retries = max_retries or int(os.environ.get("OPENAI_MAX_RETRIES", 5))
self.prefix = prefix
self.suffix = suffix
+ self.default_headers = default_headers or {}

self._client = AzureOpenAI(
api_version=api_version,
@@ -115,6 +119,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
organization=organization,
timeout=self.timeout,
max_retries=self.max_retries,
+ default_headers=self.default_headers,
)

def _get_telemetry_data(self) -> Dict[str, Any]:
@@ -143,6 +148,7 @@ def to_dict(self) -> Dict[str, Any]:
azure_ad_token=self.azure_ad_token.to_dict() if self.azure_ad_token is not None else None,
timeout=self.timeout,
max_retries=self.max_retries,
+ default_headers=self.default_headers,
)

@classmethod
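Because to_dict now serializes the field, default_headers should survive a save/load round-trip. A sketch under the same environment assumptions as above:

from haystack.components.embedders import AzureOpenAITextEmbedder

embedder = AzureOpenAITextEmbedder(default_headers={"x-request-source": "pipeline-a"})
config = embedder.to_dict()
assert config["init_parameters"]["default_headers"] == {"x-request-source": "pipeline-a"}

restored = AzureOpenAITextEmbedder.from_dict(config)  # headers reapplied to the new client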
36 changes: 26 additions & 10 deletions haystack/components/embedders/hugging_face_api_document_embedder.py
@@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

- import json
+ import warnings
from typing import Any, Dict, List, Optional, Union

from tqdm import tqdm
@@ -96,8 +96,8 @@ def __init__(
token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
prefix: str = "",
suffix: str = "",
- truncate: bool = True,
- normalize: bool = False,
+ truncate: Optional[bool] = True,
+ normalize: Optional[bool] = False,
batch_size: int = 32,
progress_bar: bool = True,
meta_fields_to_embed: Optional[List[str]] = None,
@@ -124,13 +124,11 @@ def __init__(
Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
if the backend uses Text Embeddings Inference.
If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
- It is always set to `True` and cannot be changed.
:param normalize:
Normalizes the embeddings to unit length.
Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
if the backend uses Text Embeddings Inference.
If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
- It is always set to `False` and cannot be changed.
:param batch_size:
Number of documents to process at once.
:param progress_bar:
@@ -239,18 +237,36 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> List[List[
"""
Embed a list of texts in batches.
"""
+ truncate = self.truncate
+ normalize = self.normalize
+
+ if self.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
+ if truncate is not None:
+ msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored."
+ warnings.warn(msg)
+ truncate = None
+ if normalize is not None:
+ msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored."
+ warnings.warn(msg)
+ normalize = None

all_embeddings = []
for i in tqdm(
range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
):
batch = texts_to_embed[i : i + batch_size]
- response = self._client.post(
- json={"inputs": batch, "truncate": self.truncate, "normalize": self.normalize},
- task="feature-extraction",
+
+ np_embeddings = self._client.feature_extraction(
+ # this method does not officially support list of strings, but works as expected
+ text=batch, # type: ignore[arg-type]
+ truncate=truncate,
+ normalize=normalize,
)
- embeddings = json.loads(response.decode())
- all_embeddings.extend(embeddings)
+
+ if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch):
+ raise ValueError(f"Expected embedding shape ({batch_size}, embedding_dim), got {np_embeddings.shape}")
+
+ all_embeddings.extend(np_embeddings.tolist())

return all_embeddings

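A sketch of the new return-value handling: huggingface_hub's feature_extraction returns a numpy array rather than raw JSON bytes, so the embedder validates its shape and converts it back to plain Python lists. The array below stands in for a real API response, and 384 is an arbitrary embedding width:

import numpy as np

batch = ["first document", "second document"]
np_embeddings = np.random.rand(len(batch), 384)  # stand-in for feature_extraction output

# Expect one row per input text and one column per embedding dimension.
if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch):
    raise ValueError(f"Expected shape ({len(batch)}, embedding_dim), got {np_embeddings.shape}")

all_embeddings: list = []
all_embeddings.extend(np_embeddings.tolist())  # List[List[float]], one entry per text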
