Skip to content

Commit

Permalink
fix: PDFMinerToDocument convert function - adding double new lines …
Browse files Browse the repository at this point in the history
…between each `container_text` so that passages can be detected. (#8729)

* initial import

* adding double new lines between container_texts so that passages can be detected

* reducing type specification to avoid import error

* adding release notes

* renaming variable
  • Loading branch information
davidsbatista authored Jan 17, 2025
1 parent 424bce2 commit 5af2888
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 7 deletions.
18 changes: 11 additions & 7 deletions haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import io
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, Iterator, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
Expand Down Expand Up @@ -98,23 +98,27 @@ def __init__( # pylint: disable=too-many-positional-arguments
)
self.store_full_path = store_full_path

def _converter(self, extractor) -> str:
@staticmethod
def _converter(lt_page_objs: Iterator) -> str:
"""
Extracts text from PDF pages then converts the text into a single str
:param extractor:
:param lt_page_objs:
Python generator that yields PDF pages.
:returns:
PDF text converted to single str
"""
pages = []
for page in extractor:
for page in lt_page_objs:
text = ""
for container in page:
# Keep text only
if isinstance(container, LTTextContainer):
text += container.get_text()
container_text = container.get_text()
if container_text:
text += "\n\n"
text += container_text
pages.append(text)

# Add a page delimiter
Expand Down Expand Up @@ -156,8 +160,8 @@ def run(
logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
continue
try:
pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
text = self._converter(pdf_reader)
pages = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
text = self._converter(pages)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
Updated the `PDFMinerToDocument` convert function to add double new lines between each `container_text` so that passages can later be detected by `DocumentSplitter`.
30 changes: 30 additions & 0 deletions test/components/converters/test_pdfminer_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest

from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
from haystack.dataclasses import ByteStream
from haystack.components.converters.pdfminer import PDFMinerToDocument

Expand Down Expand Up @@ -155,3 +156,32 @@ def test_run_empty_document(self, caplog, test_files_path):
# Check that not only content is used when the returned document is initialized and doc id is generated
assert results["documents"][0].meta["file_path"] == "non_text_searchable.pdf"
assert results["documents"][0].id != Document(content="").id

def test_run_detect_pages_and_split_by_passage(self, test_files_path):
    """
    Converts `sample_pdf_2.pdf` and checks that splitting the result by page yields 4 documents.

    Note that, despite the test name, the splitter used here is configured with `split_by="page"`.
    """
    pdf_path = test_files_path / "pdf" / "sample_pdf_2.pdf"
    converted = PDFMinerToDocument().run(sources=[pdf_path])

    # One output document per page.
    page_splitter = DocumentSplitter(split_by="page", split_length=1)
    split_result = page_splitter.run(converted["documents"])

    assert len(split_result["documents"]) == 4

def test_run_detect_paragraphs_to_be_used_in_split_passage(self, test_files_path):
    """
    Converts `sample_pdf_2.pdf` and checks that the result can be split into passages.

    The converted documents are split with `split_by="passage"` and a split length of 1; the test
    checks both the total number of passages and the exact content of the passage at index 6.
    """
    # Exact text expected for the passage at index 6, including its leading and trailing newlines.
    expected = (
        "\nA wiki (/ˈwɪki/ (About this soundlisten) WIK-ee) is a hypertext publication collaboratively"
        " \nedited and managed by its own audience directly using a web browser. A typical wiki \ncontains "
        "multiple pages for the subjects or scope of the project and may be either open \nto the public or "
        "limited to use within an organization for maintaining its internal knowledge \nbase. Wikis are "
        "enabled by wiki software, otherwise known as wiki engines. A wiki engine, \nbeing a form of a "
        "content management system, differs from other web-based systems \nsuch as blog software, in that "
        "the content is created without any defined owner or leader, \nand wikis have little inherent "
        "structure, allowing structure to emerge according to the \nneeds of the users.[1] \n\n"
    )

    pdf_path = test_files_path / "pdf" / "sample_pdf_2.pdf"
    converted = PDFMinerToDocument().run(sources=[pdf_path])

    passage_splitter = DocumentSplitter(split_by="passage", split_length=1)
    passages = passage_splitter.run(converted["documents"])["documents"]

    assert len(passages) == 29
    assert passages[6].content == expected

0 comments on commit 5af2888

Please sign in to comment.