aipotheosis-labs · jiwei-aipolabs · Jul 25, 2025 · coderabbitai · Jul 25, 2025 · coderabbitai
diff --git a/backend/aci/server/app_connectors/microsoft_onedrive.py b/backend/aci/server/app_connectors/microsoft_onedrive.py
@@ -1,5 +1,6 @@
 import csv
 import io
+import tempfile
 from typing import override
 
 import requests
@@ -161,3 +162,189 @@ def create_excel_from_csv(
         except Exception as e:
             logger.error(f"Failed to create CSV file from CSV data: {e}")
             raise Exception(f"Failed to create CSV file: {e}") from e
+
+    def create_docx_from_markdown(
+        self, markdown_data: str, parent_folder_id: str, filename: str | None = None
+    ) -> dict[str, str | int]:
+        """
+        Convert Markdown text to a DOCX document and upload it to OneDrive.
+
+        The Markdown is written to a scratch file, converted with md2docx-python's
+        ``markdown_to_word``, and the resulting bytes are uploaded with a PUT to
+        ``/me/drive/items/{parent_folder_id}:/{filename}:/content``.
+
+        Args:
+            markdown_data: The Markdown text as a string to convert
+            parent_folder_id: The identifier of the parent folder where the DOCX file will be created
+            filename: Optional name for the DOCX file; ".docx" is appended unless the
+                name already ends with it (any letter case). Defaults to
+                "converted_document.docx".
+
+        Returns:
+            dict: Metadata of the created file read from the upload response, plus
+            "lines_converted" and "word_count" computed from ``markdown_data``.
+
+        Raises:
+            Exception: If conversion or upload fails; the original error is chained.
+        """
+        logger.info(f"Creating DOCX file from Markdown on OneDrive, folder: {parent_folder_id}")
+
+        try:
+            import os
+            from urllib.parse import quote
+
+            from md2docx_python.src.md2docx_python import markdown_to_word
+
+            # Determine filename
+            if not filename:
+                filename = "converted_document"
+
+            # Ensure .docx extension; case-insensitive so "Report.DOCX" is kept as-is
+            if not filename.lower().endswith(".docx"):
+                filename += ".docx"
+
+            # The scratch directory (and both files in it) is removed on every exit
+            # path, including a failure while writing the Markdown input.
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                md_file_path = os.path.join(tmp_dir, "input.md")
+                docx_file_path = os.path.join(tmp_dir, "output.docx")
+
+                # Explicit UTF-8 so non-ASCII Markdown does not depend on the locale
+                with open(md_file_path, "w", encoding="utf-8") as md_file:
+                    md_file.write(markdown_data)
+
+                # Convert markdown to DOCX
+                markdown_to_word(md_file_path, docx_file_path)
+
+                # Read the generated DOCX file
+                with open(docx_file_path, "rb") as docx_file:
+                    docx_bytes = docx_file.read()
+
+            # Upload DOCX file to OneDrive. The name is percent-encoded so characters
+            # such as "#", "?" or "%" are not interpreted as URL syntax.
+            upload_url = (
+                f"{self.base_url}/me/drive/items/{parent_folder_id}:/{quote(filename)}:/content"
+            )
+
+            headers = {
+                "Authorization": f"Bearer {self.access_token}",
+                "Content-Type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            }
+
+            upload_response = requests.put(
+                upload_url, headers=headers, data=docx_bytes, timeout=60
+            )
+            upload_response.raise_for_status()
+
+            result = upload_response.json()
+
+            logger.info(
+                f"Successfully created DOCX file: {filename}, ID: {result.get('id', '')}"
+            )
+
+            return {
+                "id": result.get("id", ""),
+                "name": result.get("name", ""),
+                "path": result.get("parentReference", {}).get("path", "")
+                + "/"
+                + result.get("name", ""),
+                "size": result.get("size", 0),
+                "mime_type": result.get("file", {}).get("mimeType", ""),
+                "created_datetime": result.get("createdDateTime", ""),
+                "modified_datetime": result.get("lastModifiedDateTime", ""),
+                "download_url": result.get("@microsoft.graph.downloadUrl", ""),
+                "lines_converted": len(markdown_data.split("\n")),
+                "word_count": len(markdown_data.split()),
+                "note": "DOCX file created successfully from Markdown using md2docx-python library.",
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to create DOCX file from Markdown data: {e}")
+            raise Exception(f"Failed to create DOCX file: {e}") from e
+
+    def read_markdown_from_docx(self, item_id: str) -> dict[str, str | int]:
+        """
+        Download a DOCX file from OneDrive and convert it to Markdown text.
+
+        The item's metadata is fetched and checked first, so an item whose name
+        does not end in ".docx" is rejected without downloading its content.
+        Conversion uses md2docx-python's ``word_to_markdown``.
+
+        Args:
+            item_id: The identifier of the DOCX file in OneDrive to convert
+
+        Returns:
+            dict: The Markdown text under "content", file metadata from the item
+            lookup, plus "lines_extracted" and "word_count" computed from the text.
+
+        Raises:
+            Exception: If the item is not a .docx file, or if download or conversion
+                fails; the original error is chained.
+        """
+        logger.info(f"Converting DOCX file to Markdown from OneDrive: {item_id}")
+
+        try:
+            import os
+
+            from md2docx_python.src.docx2md_python import word_to_markdown
+
+            headers = {"Authorization": f"Bearer {self.access_token}"}
+
+            # Get file metadata first: it is needed for the response and lets the
+            # item be validated before its content is downloaded.
+            metadata_url = f"{self.base_url}/me/drive/items/{item_id}"
+            metadata_response = requests.get(metadata_url, headers=headers, timeout=30)
+            metadata_response.raise_for_status()
+            metadata = metadata_response.json()
+
+            # Verify it's a DOCX file. Legacy binary ".doc" is rejected here: the
+            # converter below is given the bytes as a .docx package.
+            # NOTE(review): assumes word_to_markdown cannot parse ".doc" - confirm.
+            if not metadata.get("name", "").lower().endswith(".docx"):
+                raise Exception(f"File '{metadata.get('name', '')}' is not a DOCX document")
+
+            # Download the DOCX file from OneDrive
+            download_url = f"{self.base_url}/me/drive/items/{item_id}/content"
+            download_response = requests.get(download_url, headers=headers, timeout=30)
+            download_response.raise_for_status()
+
+            # The scratch directory (and both files in it) is removed on every exit
+            # path, so no temporary file is left behind if a step fails.
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                docx_file_path = os.path.join(tmp_dir, "input.docx")
+                md_file_path = os.path.join(tmp_dir, "output.md")
+
+                # Write the downloaded bytes to disk for the converter
+                with open(docx_file_path, "wb") as docx_file:
+                    docx_file.write(download_response.content)
+
+                # Convert DOCX to Markdown
+                word_to_markdown(docx_file_path, md_file_path)
+
+                # Read the generated Markdown file
+                with open(md_file_path, encoding="utf-8") as md_file:
+                    markdown_content = md_file.read()
+
+            logger.info(
+                f"Successfully converted DOCX to Markdown: {item_id}, {len(markdown_content)} characters"
+            )
+
+            return {
+                "content": markdown_content,
+                "id": metadata.get("id", ""),
+                "name": metadata.get("name", ""),
+                "path": metadata.get("parentReference", {}).get("path", "")
+                + "/"
+                + metadata.get("name", ""),
+                "size": metadata.get("size", 0),
+                "mime_type": metadata.get("file", {}).get("mimeType", ""),
+                "created_datetime": metadata.get("createdDateTime", ""),
+                "modified_datetime": metadata.get("lastModifiedDateTime", ""),
+                "lines_extracted": len(markdown_content.split("\n")),
+                "word_count": len(markdown_content.split()),
+                "note": "DOCX file successfully converted to Markdown using md2docx-python library.",
+            }
+
+        except Exception as e:
+            logger.error(f"Failed to convert DOCX file to Markdown: {item_id}, error: {e}")
+            raise Exception(f"Failed to convert DOCX file: {e}") from e
diff --git a/backend/apps/microsoft_onedrive/functions.json b/backend/apps/microsoft_onedrive/functions.json
@@ -1127,5 +1127,55 @@
             "visible": ["csv_data", "parent_folder_id", "filename"],
             "additionalProperties": false
         }
+    },
+    {
+        "name": "MICROSOFT_ONEDRIVE__CREATE_DOCX_FROM_MARKDOWN",
+        "description": "Convert Markdown text to a formatted DOCX document and save it to OneDrive. Takes Markdown data as a string and creates a properly formatted Word document in the specified parent folder on OneDrive. Supports common Markdown elements like headers, paragraphs, lists, bold, italic, and code blocks.",
+        "tags": ["convert", "markdown", "docx", "word", "create", "onedrive"],
+        "visibility": "public",
+        "active": true,
+        "protocol": "connector",
+        "protocol_data": {},
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "markdown_data": {
+                    "type": "string",
+                    "description": "The Markdown text as a string to convert to DOCX. Should be properly formatted Markdown with headers, paragraphs, lists, etc."
+                },
+                "parent_folder_id": {
+                    "type": "string",
+                    "description": "The identifier of the parent folder where the new DOCX file will be created. Use 'root' for the root directory or a specific folder item ID."
+                },
+                "filename": {
+                    "type": "string",
+                    "description": "The name for the DOCX file (without .docx extension). If not provided, defaults to 'converted_document'."
+                }
+            },
+            "required": ["markdown_data", "parent_folder_id"],
+            "visible": ["markdown_data", "parent_folder_id", "filename"],
+            "additionalProperties": false
+        }
+    },
+    {
+        "name": "MICROSOFT_ONEDRIVE__READ_MARKDOWN_FROM_DOCX",
+        "description": "Read and extract Markdown content from a DOCX file stored in OneDrive. Downloads the DOCX file and extracts its content as properly formatted Markdown text, preserving headings, lists, formatting, and other document structure.",
+        "tags": ["convert", "docx", "markdown", "read", "extract", "onedrive"],
+        "visibility": "public",
+        "active": true,
+        "protocol": "connector",
+        "protocol_data": {},
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "item_id": {
+                    "type": "string",
+                    "description": "The identifier of the DOCX file in OneDrive to convert. Use the full item ID as returned by other OneDrive functions (e.g., '7006ADAF2D3C1355!s5625f3cefe4c4fe28f99a4b6f05cb944')."
+                }
+            },
+            "required": ["item_id"],
+            "visible": ["item_id"],
+            "additionalProperties": false
+        }
     }
 ]
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
     "stripe>=12.1.0",
     "e2b-code-interpreter>=1.5.0",
     "browser-use>=0.5.0",
+    "md2docx-python>=0.1.8",
 ]
 
 [dependency-groups]
@@ -141,6 +142,10 @@ ignore_missing_imports = true
 module = "datasets.*"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "md2docx_python.*"
+ignore_missing_imports = true
+
 [tool.pytest.ini_options]
 log_cli = true
 log_cli_level = "INFO"