11# coding=utf-8
2- import ast
32import io
3+ import requests
44
55import uuid_utils .compat as uuid
66from django .db .models import QuerySet
77
88from application .flow .common import WorkflowMode
99from application .flow .i_step_node import NodeResult
1010from application .flow .step_node .document_extract_node .i_document_extract_node import IDocumentExtractNode
11+ from common .utils .common import get_file_name_from_content_disposition , get_file_name_from_url
12+ from common .utils .logger import maxkb_logger
1113from knowledge .models import File , FileSourceType
1214from knowledge .serializers .document import split_handles , parse_table_handle_list , FileBufferHandle
15+ from oss .serializers .file import validate_url , SafeHTTPAdapter
1316
1417splitter = '\n `-----------------------------------`\n '
1518
@@ -23,7 +26,6 @@ def execute(self, document, chat_id=None, **kwargs):
2326 get_buffer = FileBufferHandle ().get_buffer
2427
2528 self .context ['document_list' ] = document
26- content = []
2729 if document is None or not isinstance (document , list ):
2830 return NodeResult ({'content' : '' , 'document_list' : []}, {})
2931
@@ -62,9 +64,106 @@ def save_image(image_list):
6264 if not QuerySet (File ).filter (id = new_file .id ).exists ():
6365 new_file .save (file_bytes )
6466
67+ # 从URL下载文件并保存为File对象
def download_and_save_file(url, file_name=None):
    """Download a document from ``url`` and persist it as a ``File`` record.

    The helper is a closure: ``application_id``, ``knowledge_id``,
    ``tool_id`` and ``chat_id`` are read from the enclosing scope and decide
    which owner the stored file is attributed to.

    Args:
        url: The http(s) address of the document to fetch.
        file_name: Optional name to store the file under. When empty, the
            name is taken from the ``Content-Disposition`` response header,
            then from the URL, with ``'downloaded_document'`` passed to the
            URL helper as the default.

    Returns:
        The newly saved ``File`` instance.

    Raises:
        Exception: On any validation, network or persistence failure. The
            original error is logged and attached as ``__cause__``.
    """
    try:
        # Validate the URL before any request is made.
        validated_url = validate_url(url)

        # The context manager closes the session even if mounting the
        # adapter or the request itself fails.
        with requests.Session() as session:
            # NOTE(review): SafeHTTPAdapter presumably restricts which hosts
            # may be reached (including after redirects) - confirm in
            # oss.serializers.file.
            safe_adapter = SafeHTTPAdapter()
            session.mount('http://', safe_adapter)
            session.mount('https://', safe_adapter)

            response = session.get(
                validated_url,
                timeout=30,
                allow_redirects=True,
            )
            response.raise_for_status()

            if not file_name:
                # Prefer the name advertised by the server.
                file_name = get_file_name_from_content_disposition(
                    response.headers.get('Content-Disposition', '')
                )
                # Fall back on any empty result (None or ''), not only None,
                # so a blank header value never becomes the stored name.
                if not file_name:
                    file_name = get_file_name_from_url(validated_url, 'downloaded_document')

            # NOTE(review): the whole body is buffered in memory here; no
            # size limit is enforced by this helper.
            file_bytes = response.content

        file_id = uuid.uuid7()

        # Owner precedence: application, then knowledge base, then tool.
        if application_id:
            source_type = FileSourceType.APPLICATION.value
        elif knowledge_id:
            source_type = FileSourceType.KNOWLEDGE.value
        else:
            source_type = FileSourceType.TOOL.value
        source_id = application_id or knowledge_id or tool_id

        meta = {
            # Files without any owner are flagged as debug uploads.
            'debug': not source_id,
            'chat_id': chat_id,
            'application_id': str(application_id) if application_id else None,
            'knowledge_id': str(knowledge_id) if knowledge_id else None,
            'tool_id': str(tool_id) if tool_id else None,
            'file_id': str(file_id),
            'source_url': url,
        }

        new_file = File(
            id=file_id,
            file_name=file_name,
            file_size=len(file_bytes),
            source_type=source_type,
            source_id=source_id,
            meta=meta,
        )

        # Persist the record together with the downloaded bytes.
        new_file.save(file_bytes)

        maxkb_logger.info(f'Successfully downloaded and saved file from URL: {url} , file_id: {file_id} ')

        return new_file

    except Exception as e:
        maxkb_logger.error(f'Failed to download document file from URL: {url} , error: {e} ')
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise Exception(f'Failed to download document file: {e} ') from e
139+
140+ content = []
65141 document_list = []
66142 for doc in document :
67- file = QuerySet (File ).filter (id = doc ['file_id' ]).first ()
143+ # 考虑API调用时,用户传错了格式,抛出异常提示
144+ if isinstance (doc , str ):
145+ raise ValueError ('The "document_list" parameters must be in the format of `[{ "url": "http......" }, ......]`' )
146+
147+ # 如果是文档的 http(s) URL地址,则先下载并保存到file表中
148+ if not doc .get ("file_id" ) and doc .get ("url" ) and doc .get ("url" ).startswith ("http" ):
149+ try :
150+ # 下载并保存文件
151+ file = download_and_save_file (doc ["url" ], doc .get ('name' , None ))
152+
153+ # 更新doc字典,添加file_id
154+ doc ['file_id' ] = str (file .id )
155+ if not doc .get ('name' ):
156+ doc ['name' ] = file .file_name
157+
158+ maxkb_logger .info (f'Downloaded file from URL and assigned file_id: { doc ["file_id" ]} ' )
159+ except Exception as e :
160+ maxkb_logger .error (f'Error processing document URL: { doc .get ("url" )} , error: { str (e )} ' )
161+ raise e
162+ elif doc .get ("file_id" ):
163+ file = QuerySet (File ).filter (id = doc ['file_id' ]).first ()
164+ else :
165+ raise ValueError ('Please provide a valid document file ID or URL' )
166+
68167 buffer = io .BytesIO (file .get_bytes ())
69168 buffer .name = doc ['name' ] # this is the important line
70169
0 commit comments