Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions backend/aci/server/app_connectors/microsoft_onedrive.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
import io
import tempfile
from typing import override

import requests
Expand Down Expand Up @@ -161,3 +162,189 @@ def create_excel_from_csv(
except Exception as e:
logger.error(f"Failed to create CSV file from CSV data: {e}")
raise Exception(f"Failed to create CSV file: {e}") from e

def create_docx_from_markdown(
    self, markdown_data: str, parent_folder_id: str, filename: str | None = None
) -> dict[str, str | int]:
    """
    Convert Markdown text to a DOCX document and upload it to OneDrive.

    The Markdown is written to a temporary ``.md`` file, converted to a temporary
    ``.docx`` file with the md2docx-python library, and the resulting bytes are
    uploaded with a PUT request to the parent folder's path-addressed ``content``
    endpoint. Both temporary files are removed afterwards, whether or not the
    conversion or upload succeeded.

    Args:
        markdown_data: The Markdown text as a string to convert
        parent_folder_id: The identifier of the parent folder where the DOCX file will be created
        filename: Optional custom name for the DOCX file. A ``.docx`` extension is
            appended unless the name already ends with one (compared
            case-insensitively). Defaults to ``converted_document.docx``.

    Returns:
        dict: Metadata of the created file taken from the upload response (id, name,
        path, size, MIME type, timestamps, download URL) plus ``lines_converted`` and
        ``word_count`` computed from ``markdown_data``.

    Raises:
        Exception: If the conversion or the upload fails; the original error is chained.
    """
    import os
    from urllib.parse import quote

    logger.info(f"Creating DOCX file from Markdown on OneDrive, folder: {parent_folder_id}")

    # Each temporary file is registered here as soon as it exists, so the
    # ``finally`` clause removes it no matter where a failure happens.
    temp_paths: list[str] = []

    try:
        from md2docx_python.src.md2docx_python import markdown_to_word

        # Determine filename
        if not filename:
            filename = "converted_document"

        # Ensure .docx extension. The comparison is case-insensitive so that a
        # name such as "Report.DOCX" is not turned into "Report.DOCX.docx".
        if not filename.lower().endswith(".docx"):
            filename += ".docx"

        # Write the Markdown explicitly as UTF-8 instead of the locale default.
        # NOTE(review): presumably the converter reads its input as UTF-8 (the
        # DOCX-to-Markdown method in this file reads converter output that way)
        # - confirm against the installed md2docx-python version.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".md", delete=False, encoding="utf-8"
        ) as md_file:
            md_file_path = md_file.name
            temp_paths.append(md_file_path)
            md_file.write(markdown_data)

        # Only reserves a path for the converter's output; nothing is written here.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as docx_file:
            docx_file_path = docx_file.name
            temp_paths.append(docx_file_path)

        # Convert markdown to DOCX using the library
        markdown_to_word(md_file_path, docx_file_path)

        # Read the generated DOCX file
        with open(docx_file_path, "rb") as converted_file:
            docx_bytes = converted_file.read()

        # Upload DOCX file to OneDrive. The filename is percent-encoded because it
        # is embedded in the URL path: an unescaped "#", "?" or "%" would otherwise
        # be parsed as a fragment, query string or escape sequence.
        upload_url = (
            f"{self.base_url}/me/drive/items/{parent_folder_id}:/{quote(filename)}:/content"
        )

        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        }

        upload_response = requests.put(
            upload_url, headers=headers, data=docx_bytes, timeout=60
        )
        upload_response.raise_for_status()

        result = upload_response.json()

        logger.info(
            f"Successfully created DOCX file: {filename}, ID: {result.get('id', '')}"
        )

        return {
            "id": result.get("id", ""),
            "name": result.get("name", ""),
            "path": result.get("parentReference", {}).get("path", "")
            + "/"
            + result.get("name", ""),
            "size": result.get("size", 0),
            "mime_type": result.get("file", {}).get("mimeType", ""),
            "created_datetime": result.get("createdDateTime", ""),
            "modified_datetime": result.get("lastModifiedDateTime", ""),
            "download_url": result.get("@microsoft.graph.downloadUrl", ""),
            # Basic stats about the input, reported back to the caller
            "lines_converted": len(markdown_data.split("\n")),
            "word_count": len(markdown_data.split()),
            "note": "DOCX file created successfully from Markdown using md2docx-python library.",
        }

    except Exception as e:
        logger.error(f"Failed to create DOCX file from Markdown data: {e}")
        raise Exception(f"Failed to create DOCX file: {e}") from e

    finally:
        # Remove each temporary file independently: a failure on one must not
        # leave the other behind.
        for temp_path in temp_paths:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                pass  # Already gone; nothing to clean up
            except OSError as cleanup_error:
                logger.warning(f"Failed to remove temporary file {temp_path}: {cleanup_error}")
Comment on lines +166 to +263
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Move imports to module level and verify import paths

Several issues with imports and code organization:

  1. Import statements should be at the module level, not inside methods
  2. The import path md2docx_python.src.md2docx_python seems unusual - typically src directories are not part of the import path
  3. The os module is imported inside the finally block but should be at module level

Apply this refactor to move imports to module level:

import csv
import io
+import os
import tempfile
from typing import override

import requests
+from md2docx_python import markdown_to_word
+from md2docx_python import word_to_markdown

And update the method to remove internal imports:

-            from md2docx_python.src.md2docx_python import markdown_to_word
             
             # Determine filename
             finally:
                 # Clean up temporary files
-                import os
-
                 try:

🏁 Script executed:

#!/bin/bash
# Verify the correct import path for md2docx-python library
pip show md2docx-python | grep Location || echo "Package not found"

# Try to find the correct import structure
python -c "import md2docx_python; print(dir(md2docx_python))" 2>/dev/null || echo "Failed to import md2docx_python"

# Check if the src path is part of the package
find . -name "*.py" -type f -exec grep -l "from md2docx_python" {} \; 2>/dev/null | head -5

Length of output: 412


Ensure module-level imports and correct md2docx-python dependency

The inline import and unusual path for the md2docx-python library will break at runtime (the package isn’t found in your environment). Please:

  • Add md2docx-python to your project’s dependencies (e.g. requirements.txt or pyproject.toml) so it can be installed.
  • Move all imports out of the method to the top of backend/aci/server/app_connectors/microsoft_onedrive.py:
    import os
    import tempfile
    import requests
    
    # Adjust the import to match the installed package’s API:
    # e.g. from md2docx_python import markdown_to_word
    # or from md2docx_python.src.md2docx_python import markdown_to_word
    from md2docx_python import markdown_to_word
  • Remove the from md2docx_python.src.md2docx_python import markdown_to_word inside create_docx_from_markdown and the import os in the finally block.
  • Verify the correct top-level import path by consulting the library’s documentation and your installed package.

These changes will prevent import errors, follow best practices, and make the code easier to maintain.

🤖 Prompt for AI Agents
In backend/aci/server/app_connectors/microsoft_onedrive.py around lines 166 to
263, the import of md2docx-python is done inside the create_docx_from_markdown
method using an unusual path that will cause runtime errors. To fix this, add
md2docx-python to your project dependencies (requirements.txt or
pyproject.toml), move all imports including os, tempfile, requests, and the
correct import of markdown_to_word from md2docx_python to the top of the file,
remove the inline import inside the method and the os import in the finally
block, and verify the correct import path for markdown_to_word by checking the
installed package or its documentation.


def read_markdown_from_docx(self, item_id: str) -> dict[str, str | int]:
    """
    Convert a DOCX file from OneDrive to Markdown text.

    The item's metadata is fetched first and its name is checked for a Word
    extension, so a non-Word file is rejected before its content is downloaded.
    The content is then saved to a temporary ``.docx`` file, converted to a
    temporary ``.md`` file with the md2docx-python library, and read back as
    UTF-8. Both temporary files are removed afterwards, whether or not the
    conversion succeeded.

    Args:
        item_id: The identifier of the DOCX file in OneDrive to convert

    Returns:
        dict: The Markdown text under ``content`` together with the item's metadata
        (id, name, path, size, MIME type, timestamps) plus ``lines_extracted`` and
        ``word_count`` computed from the converted text.

    Raises:
        Exception: If the item is not a Word document, or if the download or the
            conversion fails; the original error is chained.
    """
    import os

    logger.info(f"Converting DOCX file to Markdown from OneDrive: {item_id}")

    # Each temporary file is registered here as soon as it exists, so the
    # ``finally`` clause removes it no matter where a failure happens.
    temp_paths: list[str] = []

    try:
        from md2docx_python.src.docx2md_python import word_to_markdown

        headers = {"Authorization": f"Bearer {self.access_token}"}

        # Get file metadata first: it is needed for the response and lets a
        # non-Word item be rejected without downloading its content.
        metadata_url = f"{self.base_url}/me/drive/items/{item_id}"
        metadata_response = requests.get(metadata_url, headers=headers, timeout=30)
        metadata_response.raise_for_status()
        metadata = metadata_response.json()

        # Verify it's a Word document by file extension.
        # NOTE(review): ".doc" passes this check, but the converter presumably
        # only understands the .docx format - confirm whether legacy .doc files
        # should be rejected here instead of failing later in the conversion.
        item_name = metadata.get("name", "")
        if not item_name.lower().endswith((".docx", ".doc")):
            raise Exception(f"File '{item_name}' is not a Word document")

        # Download the DOCX file from OneDrive
        download_url = f"{self.base_url}/me/drive/items/{item_id}/content"
        download_response = requests.get(download_url, headers=headers, timeout=30)
        download_response.raise_for_status()

        # Create temporary files for conversion
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as docx_file:
            docx_file_path = docx_file.name
            temp_paths.append(docx_file_path)
            docx_file.write(download_response.content)

        # Only reserves a path for the converter's output; nothing is written here.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as md_file:
            md_file_path = md_file.name
            temp_paths.append(md_file_path)

        # Convert DOCX to Markdown using the library
        word_to_markdown(docx_file_path, md_file_path)

        # Read the generated Markdown file
        with open(md_file_path, encoding="utf-8") as converted_file:
            markdown_content = converted_file.read()

        logger.info(
            f"Successfully converted DOCX to Markdown: {item_id}, {len(markdown_content)} characters"
        )

        return {
            "content": markdown_content,
            "id": metadata.get("id", ""),
            "name": metadata.get("name", ""),
            "path": metadata.get("parentReference", {}).get("path", "")
            + "/"
            + metadata.get("name", ""),
            "size": metadata.get("size", 0),
            "mime_type": metadata.get("file", {}).get("mimeType", ""),
            "created_datetime": metadata.get("createdDateTime", ""),
            "modified_datetime": metadata.get("lastModifiedDateTime", ""),
            # Basic stats about the converted text, reported back to the caller
            "lines_extracted": len(markdown_content.split("\n")),
            "word_count": len(markdown_content.split()),
            "note": "DOCX file successfully converted to Markdown using md2docx-python library.",
        }

    except Exception as e:
        logger.error(f"Failed to convert DOCX file to Markdown: {item_id}, error: {e}")
        raise Exception(f"Failed to convert DOCX file: {e}") from e

    finally:
        # Remove each temporary file independently: a failure on one must not
        # leave the other behind.
        for temp_path in temp_paths:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                pass  # Already gone; nothing to clean up
            except OSError as cleanup_error:
                logger.warning(f"Failed to remove temporary file {temp_path}: {cleanup_error}")
Comment on lines +265 to +350
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Refactor duplicate code patterns

This method has the same issues as create_docx_from_markdown:

  1. Import statements inside the method
  2. Silent exception handling in cleanup
  3. Duplicate cleanup logic

Consider extracting the temporary file cleanup logic into a helper method:

def _cleanup_temp_files(self, *file_paths: str) -> None:
    """Remove the given temporary files on a best-effort basis.

    Args:
        *file_paths: Paths of the temporary files to delete.

    A path that cannot be removed is reported at debug level and does not
    prevent the remaining paths from being processed.
    """
    for temp_path in file_paths:
        try:
            os.unlink(temp_path)
        except OSError as cleanup_error:
            # Deliberately not re-raised: cleanup must never mask the caller's result.
            logger.debug(f"Failed to clean up temporary file {temp_path}: {cleanup_error}")

Then use it in both methods:

             finally:
-                # Clean up temporary files
-                import os
-
-                try:
-                    os.unlink(docx_file_path)
-                    os.unlink(md_file_path)
-                except OSError:
-                    pass  # Files already cleaned up
+                self._cleanup_temp_files(docx_file_path, md_file_path)
🤖 Prompt for AI Agents
In backend/aci/server/app_connectors/microsoft_onedrive.py around lines 265 to
350, the method read_markdown_from_docx has import statements inside the method,
silent exception handling during temporary file cleanup, and duplicate cleanup
logic similar to create_docx_from_markdown. To fix this, move the import
statements to the top of the file, extract the temporary file cleanup code into
a new helper method _cleanup_temp_files that accepts file paths and logs any
cleanup errors, then replace the existing cleanup code in
read_markdown_from_docx and create_docx_from_markdown with calls to this helper
method.

50 changes: 50 additions & 0 deletions backend/apps/microsoft_onedrive/functions.json
Original file line number Diff line number Diff line change
Expand Up @@ -1127,5 +1127,55 @@
"visible": ["csv_data", "parent_folder_id", "filename"],
"additionalProperties": false
}
},
{
"name": "MICROSOFT_ONEDRIVE__CREATE_DOCX_FROM_MARKDOWN",
"description": "Convert Markdown text to a formatted DOCX document and save it to OneDrive. Takes Markdown data as a string and creates a properly formatted Word document in the specified parent folder on OneDrive. Supports common Markdown elements like headers, paragraphs, lists, bold, italic, and code blocks.",
"tags": ["convert", "markdown", "docx", "word", "create", "onedrive"],
"visibility": "public",
"active": true,
"protocol": "connector",
"protocol_data": {},
"parameters": {
"type": "object",
"properties": {
"markdown_data": {
"type": "string",
"description": "The Markdown text as a string to convert to DOCX. Should be properly formatted Markdown with headers, paragraphs, lists, etc."
},
"parent_folder_id": {
"type": "string",
"description": "The identifier of the parent folder where the new DOCX file will be created. Use 'root' for the root directory or a specific folder item ID."
},
"filename": {
"type": "string",
"description": "The name for the DOCX file (without .docx extension). If not provided, defaults to 'converted_document'."
}
},
"required": ["markdown_data", "parent_folder_id"],
"visible": ["markdown_data", "parent_folder_id", "filename"],
"additionalProperties": false
}
},
{
"name": "MICROSOFT_ONEDRIVE__READ_MARKDOWN_FROM_DOCX",
"description": "Read and extract Markdown content from a DOCX file stored in OneDrive. Downloads the DOCX file and extracts its content as properly formatted Markdown text, preserving headings, lists, formatting, and other document structure.",
"tags": ["convert", "docx", "markdown", "read", "extract", "onedrive"],
"visibility": "public",
"active": true,
"protocol": "connector",
"protocol_data": {},
"parameters": {
"type": "object",
"properties": {
"item_id": {
"type": "string",
"description": "The identifier of the DOCX file in OneDrive to convert. Use the full item ID as returned by other OneDrive functions (e.g., '7006ADAF2D3C1355!s5625f3cefe4c4fe28f99a4b6f05cb944')."
}
},
"required": ["item_id"],
"visible": ["item_id"],
"additionalProperties": false
}
}
]
5 changes: 5 additions & 0 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"stripe>=12.1.0",
"e2b-code-interpreter>=1.5.0",
"browser-use>=0.5.0",
"md2docx-python>=0.1.8",
]

[dependency-groups]
Expand Down Expand Up @@ -141,6 +142,10 @@ ignore_missing_imports = true
module = "datasets.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "md2docx_python.*"
ignore_missing_imports = true

[tool.pytest.ini_options]
log_cli = true
log_cli_level = "INFO"
Expand Down
Loading