Skip to content

Commit d99889d

Browse files
feat: Document Extract supports inputting document URLs (for API calling)
1 parent 20f27c3 commit d99889d

3 files changed

Lines changed: 128 additions & 6 deletions

File tree

apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py

Lines changed: 102 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
# coding=utf-8
2-
import ast
32
import io
3+
import requests
44

55
import uuid_utils.compat as uuid
66
from django.db.models import QuerySet
77

88
from application.flow.common import WorkflowMode
99
from application.flow.i_step_node import NodeResult
1010
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
11+
from common.utils.common import get_file_name_from_content_disposition, get_file_name_from_url
12+
from common.utils.logger import maxkb_logger
1113
from knowledge.models import File, FileSourceType
1214
from knowledge.serializers.document import split_handles, parse_table_handle_list, FileBufferHandle
15+
from oss.serializers.file import validate_url, SafeHTTPAdapter
1316

1417
splitter = '\n`-----------------------------------`\n'
1518

@@ -23,7 +26,6 @@ def execute(self, document, chat_id=None, **kwargs):
2326
get_buffer = FileBufferHandle().get_buffer
2427

2528
self.context['document_list'] = document
26-
content = []
2729
if document is None or not isinstance(document, list):
2830
return NodeResult({'content': '', 'document_list': []}, {})
2931

@@ -62,9 +64,106 @@ def save_image(image_list):
6264
if not QuerySet(File).filter(id=new_file.id).exists():
6365
new_file.save(file_bytes)
6466

67+
# 从URL下载文件并保存为File对象
68+
def download_and_save_file(url, file_name=None):
    """Download the document at ``url`` and persist it as a ``File`` record.

    The URL is passed through ``validate_url`` first and fetched through a
    session whose ``http://`` and ``https://`` transports are both mounted on
    ``SafeHTTPAdapter``.

    NOTE(review): ``application_id``, ``knowledge_id``, ``tool_id`` and
    ``chat_id`` are read from the enclosing scope, which is not visible in
    this hunk -- confirm they are always bound before this helper is called.

    :param url: http(s) address of the document to download.
    :param file_name: optional name for the stored file. When empty, the name
        is taken from the ``Content-Disposition`` response header, then from
        the URL path, and finally falls back to ``'downloaded_document'``.
    :return: the saved ``File`` instance.
    :raises Exception: on any validation, network or persistence failure; the
        original error is attached as ``__cause__``.
    """
    try:
        # Validate the URL before any network access.
        validated_url = validate_url(url)

        # The context manager closes the session even when the request fails.
        with requests.Session() as session:
            safe_adapter = SafeHTTPAdapter()
            session.mount('http://', safe_adapter)
            session.mount('https://', safe_adapter)

            response = session.get(
                validated_url,
                timeout=30,
                allow_redirects=True
            )
            response.raise_for_status()

            if not file_name:
                # Prefer the server-declared name, then the URL path.
                file_name = get_file_name_from_content_disposition(
                    response.headers.get('Content-Disposition', ''))
                if file_name is None:
                    file_name = get_file_name_from_url(validated_url, 'downloaded_document')

            # The body is fully read here, so the session can be released
            # before the record is written.
            file_bytes = response.content

        file_id = uuid.uuid7()

        # The owner of the file is the first id that is set, in this order:
        # application, knowledge, tool.
        if application_id:
            source_type = FileSourceType.APPLICATION.value
        elif knowledge_id:
            source_type = FileSourceType.KNOWLEDGE.value
        else:
            source_type = FileSourceType.TOOL.value
        source_id = application_id or knowledge_id or tool_id

        meta = {
            # With no owning id at all the file is flagged as a debug upload.
            'debug': not (application_id or knowledge_id or tool_id),
            'chat_id': chat_id,
            'application_id': str(application_id) if application_id else None,
            'knowledge_id': str(knowledge_id) if knowledge_id else None,
            'tool_id': str(tool_id) if tool_id else None,
            'file_id': str(file_id),
            'source_url': url
        }

        new_file = File(
            id=file_id,
            file_name=file_name,
            file_size=len(file_bytes),
            source_type=source_type,
            source_id=source_id,
            meta=meta
        )

        # Persist the record together with the downloaded bytes.
        new_file.save(file_bytes)

        maxkb_logger.info(f'Successfully downloaded and saved file from URL: {url}, file_id: {file_id}')

        return new_file

    except Exception as e:
        maxkb_logger.error(f'Failed to download document file from URL: {url}, error: {str(e)}')
        # Chain the cause so the original traceback is not lost.
        raise Exception(f'Failed to download document file: {str(e)}') from e
139+
140+
content = []
65141
document_list = []
66142
for doc in document:
67-
file = QuerySet(File).filter(id=doc['file_id']).first()
143+
# 考虑API调用时,用户传错了格式,抛出异常提示
144+
if isinstance(doc, str):
145+
raise ValueError('The "document_list" parameters must be in the format of `[{ "url": "http......" }, ......]`')
146+
147+
# 如果是文档的 http(s) URL地址,则先下载并保存到file表中
148+
if not doc.get("file_id" ) and doc.get("url") and doc.get("url").startswith("http"):
149+
try:
150+
# 下载并保存文件
151+
file = download_and_save_file(doc["url"], doc.get('name', None))
152+
153+
# 更新doc字典,添加file_id
154+
doc['file_id'] = str(file.id)
155+
if not doc.get('name'):
156+
doc['name'] = file.file_name
157+
158+
maxkb_logger.info(f'Downloaded file from URL and assigned file_id: {doc["file_id"]}')
159+
except Exception as e:
160+
maxkb_logger.error(f'Error processing document URL: {doc.get("url")}, error: {str(e)}')
161+
raise e
162+
elif doc.get("file_id"):
163+
file = QuerySet(File).filter(id=doc['file_id']).first()
164+
else:
165+
raise ValueError('Please provide a valid document file ID or URL')
166+
68167
buffer = io.BytesIO(file.get_bytes())
69168
buffer.name = doc['name'] # this is the important line
70169

apps/common/utils/common.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from django.db.models import QuerySet
2424
from django.utils.translation import gettext as _
2525
from pydub import AudioSegment
26+
from urllib.parse import urlparse
2627

2728
from ..database_model_manage.database_model_manage import DatabaseModelManage
2829
from ..exception.app_exception import AppApiException
@@ -409,6 +410,7 @@ def is_valid_uuid(uuid_string):
409410
except ValueError:
410411
return False
411412

413+
412414
def common_convert_value(_type, value):
413415
if value is None:
414416
return None
@@ -436,3 +438,25 @@ def common_convert_value(_type, value):
436438
return v
437439
raise Exception(_('type error'))
438440
return value
441+
442+
443+
def get_file_name_from_content_disposition(content_disposition, default=None):
    """Extract the file name from a ``Content-Disposition`` header value.

    Both parameter forms are understood, matched case-insensitively:

    * ``filename*=charset'lang'percent-encoded`` (extended form, preferred
      when it decodes cleanly);
    * ``filename="quoted name"`` or ``filename=token``.

    A quoted name may contain ``;``. Because the header comes from a remote
    server, any directory components are dropped from the result.

    :param content_disposition: raw header value; may be ``None`` or empty.
    :param default: value returned when no usable file name is found.
    :return: the file name, or ``default``.
    """
    if not content_disposition:
        return default

    # Function-scope imports keep this helper self-contained.
    import re
    from urllib.parse import unquote

    def _base_name(raw):
        # Keep only the last path component; '.' and '..' are not file names.
        name = raw.replace('\\', '/').rsplit('/', 1)[-1].strip()
        return '' if name in ('.', '..') else name

    # Extended form: filename*=UTF-8''na%C3%AFve.txt
    extended = re.search(r"(?<![\w-])filename\*\s*=\s*([^;]+)",
                         content_disposition, re.IGNORECASE)
    if extended:
        charset, _, remainder = extended.group(1).strip().strip('"').partition("'")
        _language, delimiter, encoded = remainder.partition("'")
        if delimiter and encoded:
            try:
                decoded = _base_name(
                    unquote(encoded, encoding=charset or 'utf-8', errors='strict'))
            except (LookupError, UnicodeDecodeError):
                # Unknown charset or undecodable bytes: fall back to filename=.
                decoded = ''
            if decoded:
                return decoded

    # Plain form: a quoted string (which may contain ';') or a bare token.
    plain = re.search(r'(?<![\w-])filename\s*=\s*(?:"([^"]*)"|([^;]*))',
                      content_disposition, re.IGNORECASE)
    if plain:
        quoted, token = plain.groups()
        raw_name = quoted if quoted is not None else token.strip().strip('"\'')
        candidate = _base_name(raw_name)
        if candidate:
            return candidate

    return default
454+
455+
456+
def get_file_name_from_url(url, default=None):
    """Return the file name found in the last path segment of ``url``.

    The segment is percent-decoded (``my%20file.pdf`` becomes
    ``my file.pdf``). Any directory components exposed by decoding, such as
    ``%2F``, are dropped so the result is always a bare name. The query
    string and fragment are ignored.

    :param url: the URL to inspect; may be ``None`` or empty.
    :param default: value returned when the path has no usable last segment
        (for example when it is empty or ends with ``/``).
    :return: the decoded file name, or ``default``.
    """
    if not url:
        return default

    # Function-scope import keeps this helper self-contained.
    from urllib.parse import unquote, urlparse

    last_segment = urlparse(url).path.rsplit('/', 1)[-1]
    file_name = unquote(last_segment).replace('\\', '/').rsplit('/', 1)[-1].strip()
    if file_name in ('.', '..'):
        # Relative path markers are not file names.
        return default
    return file_name or default

ui/src/utils/common.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,8 @@ const typeList: any = {
6464
/**
 * Build the URL of the file-type icon for `name`.
 *
 * The lower-cased result of `fileType(name)` is looked up in the flattened
 * values of `typeList`; when it is not listed, the `unknown` icon is used.
 */
export function getImgUrl(name: string) {
  const knownTypes = Object.values(typeList).flat()
  const detected = fileType(name).toLowerCase()

  let type = 'unknown'
  if (knownTypes.includes(detected)) {
    type = detected
  }
  return new URL(`../assets/fileType/${type}-icon.svg`, import.meta.url).href
}
7271

0 commit comments

Comments
 (0)