From 52e971744f80ddc4f034e73f1d567f2c8b7623fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=89=AF?= <841369634@qq.com>
Date: Thu, 21 May 2026 02:16:11 +0800
Subject: [PATCH] feat: `Document Extract`: 1) support parsing images in PDF
 files. 2) Return the document image list

---
 .../impl/base_document_extract_node.py        | 21 +++++++--
 .../handle/impl/text/pdf_split_handle.py      | 46 +++++++++++++++++--
 ui/src/locales/lang/en-US/workflow.ts         |  3 +-
 ui/src/locales/lang/zh-CN/workflow.ts         |  1 +
 ui/src/locales/lang/zh-Hant/workflow.ts       |  1 +
 ui/src/utils/common.ts                        |  5 +-
 ui/src/workflow/common/data.ts                |  4 ++
 7 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
index 68eeadfe055..d7157422850 100644
--- a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
+++ b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
@@ -1,5 +1,4 @@
 # coding=utf-8
-import ast
 import io
 
 import uuid_utils.compat as uuid
@@ -23,7 +22,6 @@ def execute(self, document, chat_id=None, **kwargs):
         get_buffer = FileBufferHandle().get_buffer
 
         self.context['document_list'] = document
-        content = []
         if document is None or not isinstance(document, list):
             return NodeResult({'content': '', 'document_list': []}, {})
 
@@ -39,7 +37,10 @@ def execute(self, document, chat_id=None, **kwargs):
         elif [WorkflowMode.TOOL, WorkflowMode.TOOL_LOOP].__contains__(self.workflow_manage.flow.workflow_mode):
             tool_id = self.workflow_params.get('tool_id')
 
-        # doc文件中的图片保存
+        # 提取并保存成功的图片列表，用于节点输出
+        extracted_image_list = []
+
+        # 文档文件中提取到的图片保存
         def save_image(image_list):
             for image in image_list:
                 meta = {
@@ -62,6 +63,14 @@ def save_image(image_list):
                 if not QuerySet(File).filter(id=new_file.id).exists():
                     new_file.save(file_bytes)
 
+                extracted_image_list.append({
+                    "name": image.file_name,
+                    "size": new_file.file_size,
+                    "url": f"./oss/file/{new_file.id}",
+                    "file_id": new_file.id,
+                })
+
+        content = []
         document_list = []
         for doc in document:
             file = QuerySet(File).filter(id=doc['file_id']).first()
@@ -77,7 +86,11 @@ def save_image(image_list):
                     document_list.append({'id': str(file.id), 'name': doc['name'], 'content': file_content})
                     break
 
-        return NodeResult({'content': splitter.join(content), 'document_list': document_list}, {})
+        return NodeResult({
+            'content': splitter.join(content),
+            "image_list": extracted_image_list,
+            'document_list': document_list
+        }, {})
 
     def get_details(self, index: int, **kwargs):
         content = self.context.get('content', '').split(splitter)
diff --git a/apps/common/handle/impl/text/pdf_split_handle.py b/apps/common/handle/impl/text/pdf_split_handle.py
index 725830403b9..cefcd244700 100644
--- a/apps/common/handle/impl/text/pdf_split_handle.py
+++ b/apps/common/handle/impl/text/pdf_split_handle.py
@@ -13,6 +13,7 @@
 import time
 import traceback
 from typing import List
+import uuid_utils.compat as uuid
 
 from pypdf import PdfReader
 from pypdf.generic import Destination
@@ -21,6 +22,7 @@
 from common.handle.base_split_handle import BaseSplitHandle
 from common.utils.logger import maxkb_logger
 from common.utils.split_model import SplitModel, smart_split_paragraph
+from knowledge.models import File
 
 default_pattern_list = [
     re.compile("(?<=^)# .*|(?<=\\n)# .*"),
@@ -103,7 +105,7 @@ def handle(
         return {"name": file.name, "content": split_model.parse(content)}
 
     @staticmethod
-    def handle_pdf_content(file, pdf_document):
+    def handle_pdf_content(file, pdf_document, save_image):
         # 第一步:收集所有字体大小
         font_sizes = []
         page_lines = []
@@ -124,6 +126,7 @@ def handle_pdf_content(file, pdf_document):
 
         # 第二步:提取内容
         content = ""
+        image_list = []
         for page_num, page in enumerate(pdf_document.pages):
             start_time = time.time()
 
@@ -141,8 +144,41 @@ def handle_pdf_content(file, pdf_document):
                 else:  # 正文
                     content += f"{text}\n"
 
+            # 处理页面中的图片
             for image_index in range(PdfSplitHandle.get_page_image_count(page)):
-                content += f"![image](image_{page_num}_{image_index})\n\n"
+                try:
+                    image_obj = page.images[image_index]
+                    image_uuid = uuid.uuid7()
+
+                    # 读取图片文件名
+                    original_name = getattr(image_obj, 'name', None) or f"image_{page_num}_{image_index}"
+                    # 移除文件扩展名（如果存在），因为后面会单独添加
+                    image_file_name = original_name.rsplit('.', 1)[0] if '.' in original_name else original_name
+                    # 清理文件名中的特殊字符
+                    image_file_name = re.sub(r'[^\w\-_\.]+', '_', image_file_name)
+                    if not image_file_name:
+                        image_file_name = f"image_{page_num}_{image_index}"
+
+                    # 获取图片扩展名
+                    file_extension = 'png'
+                    if hasattr(image_obj, 'name') and image_obj.name:
+                        ext = image_obj.name.rsplit('.', 1)[-1].lower() if '.' in image_obj.name else 'png'
+                        if ext in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'tif', 'webp']:
+                            file_extension = ext
+
+                    # 保存图片到列表
+                    image_file = File(
+                        id=image_uuid,
+                        file_name=f"{image_file_name}.{file_extension}",
+                        meta={'debug': False, 'content': image_obj.data}
+                    )
+                    image_list.append(image_file)
+
+                    # 添加Markdown图片引用
+                    content += f"![{image_file_name}](./oss/file/{image_uuid})\n\n"
+                except Exception as e:
+                    maxkb_logger.error(f"Error extracting image from PDF page {page_num}, index {image_index}: {e}")
+                    content += f"![image](image_{page_num}_{image_index})\n\n"
 
             content = content.replace("\0", "")
 
@@ -151,6 +187,10 @@ def handle_pdf_content(file, pdf_document):
                 f"File: {file.name}, Page: {page_num + 1}, Time: {elapsed_time:.3f}s"
             )
 
+        # 保存所有图片
+        if len(image_list) > 0:
+            save_image(image_list)
+
         return content
 
     @staticmethod
@@ -551,7 +591,7 @@ def get_content(self, file, save_image):
         try:
             with open(temp_file_path, "rb") as pdf_file:
                 pdf_document = PdfReader(pdf_file)
-                return self.handle_pdf_content(file, pdf_document)
+                return self.handle_pdf_content(file, pdf_document, save_image)
         except BaseException as e:
             traceback.print_exception(e)
             return f"{e}"
diff --git a/ui/src/locales/lang/en-US/workflow.ts b/ui/src/locales/lang/en-US/workflow.ts
index 61111fb3ba9..3f7cc2dae62 100644
--- a/ui/src/locales/lang/en-US/workflow.ts
+++ b/ui/src/locales/lang/en-US/workflow.ts
@@ -98,7 +98,7 @@ export default {
     dataSourceWebNode: {
       label: 'Web Site',
       text: 'Input the root URL to automatically crawl web data (single link corresponds to a single document), output a list of documents with content',
-      field_label: 'Document list',
+      field_label: 'Document List',
     },
     dataSourceLocalNode: {
       label: 'Local File',
@@ -277,6 +277,7 @@ You are a master of problem optimization, adept at accurately inferring user int
       label: 'Document Content Extraction',
       text: 'Parse input documents to output structured document content',
       content: 'Document Content',
+      image_list: 'Document Image List',
     },
     documentSplitNode: {
       label: 'Document Splitting',
diff --git a/ui/src/locales/lang/zh-CN/workflow.ts b/ui/src/locales/lang/zh-CN/workflow.ts
index 15b68955bf7..e34635370c0 100644
--- a/ui/src/locales/lang/zh-CN/workflow.ts
+++ b/ui/src/locales/lang/zh-CN/workflow.ts
@@ -276,6 +276,7 @@ export default {
       label: '文档内容提取',
       text: '解析输入文档，输出结构化文档内容',
       content: '文档内容',
+      image_list: '文档图片',
     },
     documentSplitNode: {
       label: '文档分段',
diff --git a/ui/src/locales/lang/zh-Hant/workflow.ts b/ui/src/locales/lang/zh-Hant/workflow.ts
index a9a33338734..00f89341aa6 100644
--- a/ui/src/locales/lang/zh-Hant/workflow.ts
+++ b/ui/src/locales/lang/zh-Hant/workflow.ts
@@ -276,6 +276,7 @@ export default {
       label: '文檔內容提取',
       text: '解析輸入文檔，輸出結構化文檔內容',
       content: '文檔內容',
+      image_list: '文檔圖片',
     },
     documentSplitNode: {
       label: '文檔拆分',
diff --git a/ui/src/utils/common.ts b/ui/src/utils/common.ts
index b2148247ff1..ac10083a520 100644
--- a/ui/src/utils/common.ts
+++ b/ui/src/utils/common.ts
@@ -64,9 +64,8 @@ const typeList: any = {
 export function getImgUrl(name: string) {
   const list = Object.values(typeList).flat()
 
-  const type = list.includes(fileType(name).toLowerCase())
-    ? fileType(name).toLowerCase()
-    : 'unknown'
+  const typeStr = fileType(name).toLowerCase()
+  const type = list.includes(typeStr) ? typeStr : 'unknown'
   return new URL(`../assets/fileType/${type}-icon.svg`, import.meta.url).href
 }
 
diff --git a/ui/src/workflow/common/data.ts b/ui/src/workflow/common/data.ts
index a726ae9ed5a..98c3c48cf2d 100644
--- a/ui/src/workflow/common/data.ts
+++ b/ui/src/workflow/common/data.ts
@@ -414,6 +414,10 @@ export const documentExtractNode = {
           label: t('workflow.nodes.documentExtractNode.content'),
           value: 'content',
         },
+        {
+          label: t('workflow.nodes.documentExtractNode.image_list'),
+          value: 'image_list',
+        },
         {
           label: t('workflow.nodes.dataSourceWebNode.field_label'),
           value: 'document_list',