From 2828d9e4aeeef1f6d95928a8cfc9e47fa29a5866 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Wed, 5 Feb 2025 14:43:19 +0100 Subject: [PATCH] refactor!: `DOCXToDocument` converter - store DOCX metadata as a dict (#8804) * DOCXToDocument - store DOCX metadata as a dict * do not export DOCXMetadata to converters package --- haystack/components/converters/__init__.py | 3 +- haystack/components/converters/docx.py | 4 +- ...docxmetadata-as-dict-20cf2ef0abf7af8a.yaml | 6 + .../converters/test_docx_file_to_document.py | 136 +++++++++--------- 4 files changed, 77 insertions(+), 72 deletions(-) create mode 100644 releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py index 2c7ed33505..d0057ea33d 100644 --- a/haystack/components/converters/__init__.py +++ b/haystack/components/converters/__init__.py @@ -4,7 +4,7 @@ from haystack.components.converters.azure import AzureOCRDocumentConverter from haystack.components.converters.csv import CSVToDocument -from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument +from haystack.components.converters.docx import DOCXToDocument from haystack.components.converters.html import HTMLToDocument from haystack.components.converters.json import JSONConverter from haystack.components.converters.markdown import MarkdownToDocument @@ -28,7 +28,6 @@ "OpenAPIServiceToFunctions", "OutputAdapter", "DOCXToDocument", - "DOCXMetadata", "PPTXToDocument", "CSVToDocument", "JSONConverter", diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py index 8f9a58004d..3607ae4544 100644 --- a/haystack/components/converters/docx.py +++ b/haystack/components/converters/docx.py @@ -5,7 +5,7 @@ import csv import io import os -from dataclasses import dataclass +from dataclasses import asdict, dataclass from enum import Enum from io import StringIO from pathlib import Path @@ -189,7 +189,7 @@ def run( ) continue - docx_metadata = self._get_docx_metadata(document=docx_document) + docx_metadata = asdict(self._get_docx_metadata(document=docx_document)) merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata} if not self.store_full_path and "file_path" in bytestream.meta: diff --git a/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml b/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml new file mode 100644 index 0000000000..60f4c17ce3 --- /dev/null +++ b/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml @@ -0,0 +1,6 @@ +--- +upgrade: + - | + The `DOCXToDocument` converter now returns a `Document` object with DOCX metadata stored in the `meta` field as a + dictionary under the key `docx`. Previously, the metadata was represented as a `DOCXMetadata` dataclass. + This change does not impact reading from or writing to a Document Store. diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py index c013759938..2de6bfefbe 100644 --- a/test/components/converters/test_docx_file_to_document.py +++ b/test/components/converters/test_docx_file_to_document.py @@ -121,23 +121,23 @@ def test_run(self, test_files_path, docx_converter): assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { "file_path": os.path.basename(paths[0]), - "docx": DOCXMetadata( - author="Microsoft Office User", - category="", - comments="", - content_status="", - created="2024-06-09T21:17:00+00:00", - identifier="", - keywords="", - language="", - last_modified_by="Carlos Fernández Lorán", - last_printed=None, - modified="2024-06-09T21:27:00+00:00", - revision=2, - subject="", - title="", - version="", - ), + "docx": { + "author": "Microsoft Office User", + "category": "", + "comments": "", + "content_status": "", + "created": "2024-06-09T21:17:00+00:00", + "identifier": "", + "keywords": "", + "language": "", + "last_modified_by": "Carlos Fernández Lorán", + "last_printed": None, + "modified": "2024-06-09T21:27:00+00:00", + "revision": 2, + "subject": "", + "title": "", + "version": "", + }, } def test_run_with_table(self, test_files_path): @@ -153,23 +153,23 @@ def test_run_with_table(self, test_files_path): assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { "file_path": os.path.basename(paths[0]), - "docx": DOCXMetadata( - author="Saha, Anirban", - category="", - comments="", - content_status="", - created="2020-07-14T08:14:00+00:00", - identifier="", - keywords="", - language="", - last_modified_by="Saha, Anirban", - last_printed=None, - modified="2020-07-14T08:16:00+00:00", - revision=1, - subject="", - title="", - version="", - ), + "docx": { + "author": "Saha, Anirban", + "category": "", + "comments": "", + "content_status": "", + "created": "2020-07-14T08:14:00+00:00", + "identifier": "", + "keywords": "", + "language": "", + "last_modified_by": "Saha, Anirban", + "last_printed": None, + "modified": "2020-07-14T08:16:00+00:00", + "revision": 1, + "subject": "", + "title": "", + "version": "", + }, } # let's now detect that the table markdown is correctly added and that order of elements is correct content_parts = docs[0].content.split("\n\n") @@ -193,23 +193,23 @@ def test_run_with_store_full_path_false(self, test_files_path): assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { "file_path": "sample_docx_1.docx", - "docx": DOCXMetadata( - author="Microsoft Office User", - category="", - comments="", - content_status="", - created="2024-06-09T21:17:00+00:00", - identifier="", - keywords="", - language="", - last_modified_by="Carlos Fernández Lorán", - last_printed=None, - modified="2024-06-09T21:27:00+00:00", - revision=2, - subject="", - title="", - version="", - ), + "docx": { + "author": "Microsoft Office User", + "category": "", + "comments": "", + "content_status": "", + "created": "2024-06-09T21:17:00+00:00", + "identifier": "", + "keywords": "", + "language": "", + "last_modified_by": "Carlos Fernández Lorán", + "last_printed": None, + "modified": "2024-06-09T21:27:00+00:00", + "revision": 2, + "subject": "", + "title": "", + "version": "", + }, } @pytest.mark.parametrize("table_format", ["markdown", "csv"]) @@ -285,23 +285,23 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter): doc = output["documents"][0] assert doc.meta == { "file_path": os.path.basename(paths[0]), - "docx": DOCXMetadata( - author="Microsoft Office User", - category="", - comments="", - content_status="", - created="2024-06-09T21:17:00+00:00", - identifier="", - keywords="", - language="", - last_modified_by="Carlos Fernández Lorán", - last_printed=None, - modified="2024-06-09T21:27:00+00:00", - revision=2, - subject="", - title="", - version="", - ), + "docx": { + "author": "Microsoft Office User", + "category": "", + "comments": "", + "content_status": "", + "created": "2024-06-09T21:17:00+00:00", + "identifier": "", + "keywords": "", + "language": "", + "last_modified_by": "Carlos Fernández Lorán", + "last_printed": None, + "modified": "2024-06-09T21:27:00+00:00", + "revision": 2, + "subject": "", + "title": "", + "version": "", + }, "language": "it", "author": "test_author", }