From 2828d9e4aeeef1f6d95928a8cfc9e47fa29a5866 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci <stefanofiorucci@gmail.com>
Date: Wed, 5 Feb 2025 14:43:19 +0100
Subject: [PATCH] refactor!: `DOCXToDocument` converter - store DOCX metadata
 as a dict (#8804)

* DOCXToDocument - store DOCX metadata as a dict

* do not export DOCXMetadata to converters package
---
 haystack/components/converters/__init__.py    |   3 +-
 haystack/components/converters/docx.py        |   4 +-
 ...docxmetadata-as-dict-20cf2ef0abf7af8a.yaml |   6 +
 .../converters/test_docx_file_to_document.py  | 136 +++++++++---------
 4 files changed, 77 insertions(+), 72 deletions(-)
 create mode 100644 releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml

diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py
index 2c7ed33505..d0057ea33d 100644
--- a/haystack/components/converters/__init__.py
+++ b/haystack/components/converters/__init__.py
@@ -4,7 +4,7 @@
 
 from haystack.components.converters.azure import AzureOCRDocumentConverter
 from haystack.components.converters.csv import CSVToDocument
-from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
+from haystack.components.converters.docx import DOCXToDocument
 from haystack.components.converters.html import HTMLToDocument
 from haystack.components.converters.json import JSONConverter
 from haystack.components.converters.markdown import MarkdownToDocument
@@ -28,7 +28,6 @@
     "OpenAPIServiceToFunctions",
     "OutputAdapter",
     "DOCXToDocument",
-    "DOCXMetadata",
     "PPTXToDocument",
     "CSVToDocument",
     "JSONConverter",
diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py
index 8f9a58004d..3607ae4544 100644
--- a/haystack/components/converters/docx.py
+++ b/haystack/components/converters/docx.py
@@ -5,7 +5,7 @@
 import csv
 import io
 import os
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
 from enum import Enum
 from io import StringIO
 from pathlib import Path
@@ -189,7 +189,7 @@ def run(
                 )
                 continue
 
-            docx_metadata = self._get_docx_metadata(document=docx_document)
+            docx_metadata = asdict(self._get_docx_metadata(document=docx_document))
             merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
 
             if not self.store_full_path and "file_path" in bytestream.meta:
diff --git a/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml b/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml
new file mode 100644
index 0000000000..60f4c17ce3
--- /dev/null
+++ b/releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml
@@ -0,0 +1,6 @@
+---
+upgrade:
+  - |
+    The `DOCXToDocument` converter now returns a `Document` object with DOCX metadata stored in the `meta` field as a
+    dictionary under the key `docx`. Previously, the metadata was represented as a `DOCXMetadata` dataclass.
+    This change does not impact reading from or writing to a Document Store.
diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py
index c013759938..2de6bfefbe 100644
--- a/test/components/converters/test_docx_file_to_document.py
+++ b/test/components/converters/test_docx_file_to_document.py
@@ -121,23 +121,23 @@ def test_run(self, test_files_path, docx_converter):
         assert docs[0].meta.keys() == {"file_path", "docx"}
         assert docs[0].meta == {
             "file_path": os.path.basename(paths[0]),
-            "docx": DOCXMetadata(
-                author="Microsoft Office User",
-                category="",
-                comments="",
-                content_status="",
-                created="2024-06-09T21:17:00+00:00",
-                identifier="",
-                keywords="",
-                language="",
-                last_modified_by="Carlos Fernández Lorán",
-                last_printed=None,
-                modified="2024-06-09T21:27:00+00:00",
-                revision=2,
-                subject="",
-                title="",
-                version="",
-            ),
+            "docx": {
+                "author": "Microsoft Office User",
+                "category": "",
+                "comments": "",
+                "content_status": "",
+                "created": "2024-06-09T21:17:00+00:00",
+                "identifier": "",
+                "keywords": "",
+                "language": "",
+                "last_modified_by": "Carlos Fernández Lorán",
+                "last_printed": None,
+                "modified": "2024-06-09T21:27:00+00:00",
+                "revision": 2,
+                "subject": "",
+                "title": "",
+                "version": "",
+            },
         }
 
     def test_run_with_table(self, test_files_path):
@@ -153,23 +153,23 @@ def test_run_with_table(self, test_files_path):
         assert docs[0].meta.keys() == {"file_path", "docx"}
         assert docs[0].meta == {
             "file_path": os.path.basename(paths[0]),
-            "docx": DOCXMetadata(
-                author="Saha, Anirban",
-                category="",
-                comments="",
-                content_status="",
-                created="2020-07-14T08:14:00+00:00",
-                identifier="",
-                keywords="",
-                language="",
-                last_modified_by="Saha, Anirban",
-                last_printed=None,
-                modified="2020-07-14T08:16:00+00:00",
-                revision=1,
-                subject="",
-                title="",
-                version="",
-            ),
+            "docx": {
+                "author": "Saha, Anirban",
+                "category": "",
+                "comments": "",
+                "content_status": "",
+                "created": "2020-07-14T08:14:00+00:00",
+                "identifier": "",
+                "keywords": "",
+                "language": "",
+                "last_modified_by": "Saha, Anirban",
+                "last_printed": None,
+                "modified": "2020-07-14T08:16:00+00:00",
+                "revision": 1,
+                "subject": "",
+                "title": "",
+                "version": "",
+            },
         }
         # let's now detect that the table markdown is correctly added and that order of elements is correct
         content_parts = docs[0].content.split("\n\n")
@@ -193,23 +193,23 @@ def test_run_with_store_full_path_false(self, test_files_path):
         assert docs[0].meta.keys() == {"file_path", "docx"}
         assert docs[0].meta == {
             "file_path": "sample_docx_1.docx",
-            "docx": DOCXMetadata(
-                author="Microsoft Office User",
-                category="",
-                comments="",
-                content_status="",
-                created="2024-06-09T21:17:00+00:00",
-                identifier="",
-                keywords="",
-                language="",
-                last_modified_by="Carlos Fernández Lorán",
-                last_printed=None,
-                modified="2024-06-09T21:27:00+00:00",
-                revision=2,
-                subject="",
-                title="",
-                version="",
-            ),
+            "docx": {
+                "author": "Microsoft Office User",
+                "category": "",
+                "comments": "",
+                "content_status": "",
+                "created": "2024-06-09T21:17:00+00:00",
+                "identifier": "",
+                "keywords": "",
+                "language": "",
+                "last_modified_by": "Carlos Fernández Lorán",
+                "last_printed": None,
+                "modified": "2024-06-09T21:27:00+00:00",
+                "revision": 2,
+                "subject": "",
+                "title": "",
+                "version": "",
+            },
         }
 
     @pytest.mark.parametrize("table_format", ["markdown", "csv"])
@@ -285,23 +285,23 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter):
         doc = output["documents"][0]
         assert doc.meta == {
             "file_path": os.path.basename(paths[0]),
-            "docx": DOCXMetadata(
-                author="Microsoft Office User",
-                category="",
-                comments="",
-                content_status="",
-                created="2024-06-09T21:17:00+00:00",
-                identifier="",
-                keywords="",
-                language="",
-                last_modified_by="Carlos Fernández Lorán",
-                last_printed=None,
-                modified="2024-06-09T21:27:00+00:00",
-                revision=2,
-                subject="",
-                title="",
-                version="",
-            ),
+            "docx": {
+                "author": "Microsoft Office User",
+                "category": "",
+                "comments": "",
+                "content_status": "",
+                "created": "2024-06-09T21:17:00+00:00",
+                "identifier": "",
+                "keywords": "",
+                "language": "",
+                "last_modified_by": "Carlos Fernández Lorán",
+                "last_printed": None,
+                "modified": "2024-06-09T21:27:00+00:00",
+                "revision": 2,
+                "subject": "",
+                "title": "",
+                "version": "",
+            },
             "language": "it",
             "author": "test_author",
         }