From 52e971744f80ddc4f034e73f1d567f2c8b7623fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=89=AF?= <841369634@qq.com> Date: Thu, 21 May 2026 02:16:11 +0800 Subject: [PATCH] feat: `Document Extract`: 1) support parsing images in PDF files. 2) Return the document image list --- .../impl/base_document_extract_node.py | 21 +++++++-- .../handle/impl/text/pdf_split_handle.py | 46 +++++++++++++++++-- ui/src/locales/lang/en-US/workflow.ts | 3 +- ui/src/locales/lang/zh-CN/workflow.ts | 1 + ui/src/locales/lang/zh-Hant/workflow.ts | 1 + ui/src/utils/common.ts | 5 +- ui/src/workflow/common/data.ts | 4 ++ 7 files changed, 70 insertions(+), 11 deletions(-) diff --git a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py index 68eeadfe055..d7157422850 100644 --- a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py +++ b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py @@ -1,5 +1,4 @@ # coding=utf-8 -import ast import io import uuid_utils.compat as uuid @@ -23,7 +22,6 @@ def execute(self, document, chat_id=None, **kwargs): get_buffer = FileBufferHandle().get_buffer self.context['document_list'] = document - content = [] if document is None or not isinstance(document, list): return NodeResult({'content': '', 'document_list': []}, {}) @@ -39,7 +37,10 @@ def execute(self, document, chat_id=None, **kwargs): elif [WorkflowMode.TOOL, WorkflowMode.TOOL_LOOP].__contains__(self.workflow_manage.flow.workflow_mode): tool_id = self.workflow_params.get('tool_id') - # doc文件中的图片保存 + # 提取并保存成功的图片列表,用于节点输出 + extracted_image_list = [] + + # 文档文件中提取到的图片保存 def save_image(image_list): for image in image_list: meta = { @@ -62,6 +63,14 @@ def save_image(image_list): if not QuerySet(File).filter(id=new_file.id).exists(): new_file.save(file_bytes) + extracted_image_list.append({ + "name": 
image.file_name, + "size": new_file.file_size, + "url": f"./oss/file/{new_file.id}", + "file_id": new_file.id, + }) + + content = [] document_list = [] for doc in document: file = QuerySet(File).filter(id=doc['file_id']).first() @@ -77,7 +86,11 @@ def save_image(image_list): document_list.append({'id': str(file.id), 'name': doc['name'], 'content': file_content}) break - return NodeResult({'content': splitter.join(content), 'document_list': document_list}, {}) + return NodeResult({ + 'content': splitter.join(content), + "image_list": extracted_image_list, + 'document_list': document_list + }, {}) def get_details(self, index: int, **kwargs): content = self.context.get('content', '').split(splitter) diff --git a/apps/common/handle/impl/text/pdf_split_handle.py b/apps/common/handle/impl/text/pdf_split_handle.py index 725830403b9..cefcd244700 100644 --- a/apps/common/handle/impl/text/pdf_split_handle.py +++ b/apps/common/handle/impl/text/pdf_split_handle.py @@ -13,6 +13,7 @@ import time import traceback from typing import List +import uuid_utils.compat as uuid from pypdf import PdfReader from pypdf.generic import Destination @@ -21,6 +22,7 @@ from common.handle.base_split_handle import BaseSplitHandle from common.utils.logger import maxkb_logger from common.utils.split_model import SplitModel, smart_split_paragraph +from knowledge.models import File default_pattern_list = [ re.compile("(?<=^)# .*|(?<=\\n)# .*"), @@ -103,7 +105,7 @@ def handle( return {"name": file.name, "content": split_model.parse(content)} @staticmethod - def handle_pdf_content(file, pdf_document): + def handle_pdf_content(file, pdf_document, save_image): # 第一步:收集所有字体大小 font_sizes = [] page_lines = [] @@ -124,6 +126,7 @@ def handle_pdf_content(file, pdf_document): # 第二步:提取内容 content = "" + image_list = [] for page_num, page in enumerate(pdf_document.pages): start_time = time.time() @@ -141,8 +144,41 @@ def handle_pdf_content(file, pdf_document): else: # 正文 content += f"{text}\n" + # 处理页面中的图片 for 
image_index in range(PdfSplitHandle.get_page_image_count(page)): - content += f"![image](image_{page_num}_{image_index})\n\n" + try: + image_obj = page.images[image_index] + image_uuid = uuid.uuid7() + + # 读取图片文件名 + original_name = getattr(image_obj, 'name', None) or f"image_{page_num}_{image_index}" + # 移除文件扩展名(如果存在),因为后面会单独添加 + image_file_name = original_name.rsplit('.', 1)[0] if '.' in original_name else original_name + # 清理文件名中的特殊字符 + image_file_name = re.sub(r'[^\w\-_\.]+', '_', image_file_name) + if not image_file_name: + image_file_name = f"image_{page_num}_{image_index}" + + # 获取图片扩展名 + file_extension = 'png' + if hasattr(image_obj, 'name') and image_obj.name: + ext = image_obj.name.rsplit('.', 1)[-1].lower() if '.' in image_obj.name else 'png' + if ext in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'tif', 'webp']: + file_extension = ext + + # 保存图片到列表 + image_file = File( + id=image_uuid, + file_name=f"{image_file_name}.{file_extension}", + meta={'debug': False, 'content': image_obj.data} + ) + image_list.append(image_file) + + # 添加Markdown图片引用 + content += f"![{image_file_name}](./oss/file/{image_uuid})\n\n" + except Exception as e: + maxkb_logger.error(f"Error extracting image from PDF page {page_num}, index {image_index}: {e}") + content += f"![image](image_{page_num}_{image_index})\n\n" content = content.replace("\0", "") @@ -151,6 +187,10 @@ def handle_pdf_content(file, pdf_document): f"File: {file.name}, Page: {page_num + 1}, Time: {elapsed_time:.3f}s" ) + # 保存所有图片 + if len(image_list) > 0: + save_image(image_list) + return content @staticmethod @@ -551,7 +591,7 @@ def get_content(self, file, save_image): try: with open(temp_file_path, "rb") as pdf_file: pdf_document = PdfReader(pdf_file) - return self.handle_pdf_content(file, pdf_document) + return self.handle_pdf_content(file, pdf_document, save_image) except BaseException as e: traceback.print_exception(e) return f"{e}" diff --git a/ui/src/locales/lang/en-US/workflow.ts 
b/ui/src/locales/lang/en-US/workflow.ts index 61111fb3ba9..3f7cc2dae62 100644 --- a/ui/src/locales/lang/en-US/workflow.ts +++ b/ui/src/locales/lang/en-US/workflow.ts @@ -98,7 +98,7 @@ export default { dataSourceWebNode: { label: 'Web Site', text: 'Input the root URL to automatically crawl web data (single link corresponds to a single document), output a list of documents with content', - field_label: 'Document list', + field_label: 'Document List', }, dataSourceLocalNode: { label: 'Local File', @@ -277,6 +277,7 @@ You are a master of problem optimization, adept at accurately inferring user int label: 'Document Content Extraction', text: 'Parse input documents to output structured document content', content: 'Document Content', + image_list: 'Document Image List', }, documentSplitNode: { label: 'Document Splitting', diff --git a/ui/src/locales/lang/zh-CN/workflow.ts b/ui/src/locales/lang/zh-CN/workflow.ts index 15b68955bf7..e34635370c0 100644 --- a/ui/src/locales/lang/zh-CN/workflow.ts +++ b/ui/src/locales/lang/zh-CN/workflow.ts @@ -276,6 +276,7 @@ export default { label: '文档内容提取', text: '解析输入文档,输出结构化文档内容', content: '文档内容', + image_list: '文档图片', }, documentSplitNode: { label: '文档分段', diff --git a/ui/src/locales/lang/zh-Hant/workflow.ts b/ui/src/locales/lang/zh-Hant/workflow.ts index a9a33338734..00f89341aa6 100644 --- a/ui/src/locales/lang/zh-Hant/workflow.ts +++ b/ui/src/locales/lang/zh-Hant/workflow.ts @@ -276,6 +276,7 @@ export default { label: '文檔內容提取', text: '解析輸入文檔,輸出結構化文檔內容', content: '文檔內容', + image_list: '文檔圖片', }, documentSplitNode: { label: '文檔拆分', diff --git a/ui/src/utils/common.ts b/ui/src/utils/common.ts index b2148247ff1..ac10083a520 100644 --- a/ui/src/utils/common.ts +++ b/ui/src/utils/common.ts @@ -64,9 +64,8 @@ const typeList: any = { export function getImgUrl(name: string) { const list = Object.values(typeList).flat() - const type = list.includes(fileType(name).toLowerCase()) - ? 
fileType(name).toLowerCase() - : 'unknown' + const typeStr = fileType(name).toLowerCase() + const type = list.includes(typeStr) ? typeStr : 'unknown' return new URL(`../assets/fileType/${type}-icon.svg`, import.meta.url).href } diff --git a/ui/src/workflow/common/data.ts b/ui/src/workflow/common/data.ts index a726ae9ed5a..98c3c48cf2d 100644 --- a/ui/src/workflow/common/data.ts +++ b/ui/src/workflow/common/data.ts @@ -414,6 +414,10 @@ export const documentExtractNode = { label: t('workflow.nodes.documentExtractNode.content'), value: 'content', }, + { + label: t('workflow.nodes.documentExtractNode.image_list'), + value: 'image_list', + }, { label: t('workflow.nodes.dataSourceWebNode.field_label'), value: 'document_list',