Skip to content

Commit

Permalink
refactor!: DOCXToDocument converter - store DOCX metadata as a dict (#8804)
Browse files Browse the repository at this point in the history

* DOCXToDocument - store DOCX metadata as a dict

* do not export DOCXMetadata to converters package
  • Loading branch information
anakin87 authored Feb 5, 2025
1 parent 5ae9488 commit 2828d9e
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 72 deletions.
3 changes: 1 addition & 2 deletions haystack/components/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.csv import CSVToDocument
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
from haystack.components.converters.docx import DOCXToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.json import JSONConverter
from haystack.components.converters.markdown import MarkdownToDocument
Expand All @@ -28,7 +28,6 @@
"OpenAPIServiceToFunctions",
"OutputAdapter",
"DOCXToDocument",
"DOCXMetadata",
"PPTXToDocument",
"CSVToDocument",
"JSONConverter",
Expand Down
4 changes: 2 additions & 2 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import csv
import io
import os
from dataclasses import dataclass
from dataclasses import asdict, dataclass
from enum import Enum
from io import StringIO
from pathlib import Path
Expand Down Expand Up @@ -189,7 +189,7 @@ def run(
)
continue

docx_metadata = self._get_docx_metadata(document=docx_document)
docx_metadata = asdict(self._get_docx_metadata(document=docx_document))
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}

if not self.store_full_path and "file_path" in bytestream.meta:
Expand Down
6 changes: 6 additions & 0 deletions releasenotes/notes/docxmetadata-as-dict-20cf2ef0abf7af8a.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
upgrade:
- |
The `DOCXToDocument` converter now returns a `Document` object with DOCX metadata stored in the `meta` field as a
dictionary under the key `docx`. Previously, the metadata was represented as a `DOCXMetadata` dataclass.
This change does not impact reading from or writing to a Document Store.
136 changes: 68 additions & 68 deletions test/components/converters/test_docx_file_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,23 +121,23 @@ def test_run(self, test_files_path, docx_converter):
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": os.path.basename(paths[0]),
"docx": DOCXMetadata(
author="Microsoft Office User",
category="",
comments="",
content_status="",
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
version="",
),
"docx": {
"author": "Microsoft Office User",
"category": "",
"comments": "",
"content_status": "",
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
"version": "",
},
}

def test_run_with_table(self, test_files_path):
Expand All @@ -153,23 +153,23 @@ def test_run_with_table(self, test_files_path):
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": os.path.basename(paths[0]),
"docx": DOCXMetadata(
author="Saha, Anirban",
category="",
comments="",
content_status="",
created="2020-07-14T08:14:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Saha, Anirban",
last_printed=None,
modified="2020-07-14T08:16:00+00:00",
revision=1,
subject="",
title="",
version="",
),
"docx": {
"author": "Saha, Anirban",
"category": "",
"comments": "",
"content_status": "",
"created": "2020-07-14T08:14:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Saha, Anirban",
"last_printed": None,
"modified": "2020-07-14T08:16:00+00:00",
"revision": 1,
"subject": "",
"title": "",
"version": "",
},
}
# let's now detect that the table markdown is correctly added and that order of elements is correct
content_parts = docs[0].content.split("\n\n")
Expand All @@ -193,23 +193,23 @@ def test_run_with_store_full_path_false(self, test_files_path):
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": "sample_docx_1.docx",
"docx": DOCXMetadata(
author="Microsoft Office User",
category="",
comments="",
content_status="",
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
version="",
),
"docx": {
"author": "Microsoft Office User",
"category": "",
"comments": "",
"content_status": "",
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
"version": "",
},
}

@pytest.mark.parametrize("table_format", ["markdown", "csv"])
Expand Down Expand Up @@ -285,23 +285,23 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter):
doc = output["documents"][0]
assert doc.meta == {
"file_path": os.path.basename(paths[0]),
"docx": DOCXMetadata(
author="Microsoft Office User",
category="",
comments="",
content_status="",
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
version="",
),
"docx": {
"author": "Microsoft Office User",
"category": "",
"comments": "",
"content_status": "",
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
"version": "",
},
"language": "it",
"author": "test_author",
}
Expand Down

0 comments on commit 2828d9e

Please sign in to comment.