diff --git a/app/config.py b/app/config.py
index 0c23bd63..34234a07 100644
--- a/app/config.py
+++ b/app/config.py
@@ -24,6 +24,7 @@
     METADATA_SERVER_URL = getattr(seahub_settings, 'METADATA_SERVER_URL', '')
     ENABLE_METADATA_MANAGEMENT = getattr(seahub_settings, 'ENABLE_METADATA_MANAGEMENT', False)
     METADATA_FILE_TYPES = getattr(seahub_settings, 'METADATA_FILE_TYPES', {})
+    FILE_SERVER = getattr(seahub_settings, 'FILE_SERVER_ROOT', '')
 except ImportError:
     logger.critical("Can not import seahub settings.")
     raise RuntimeError("Can not import seahub settings.")
diff --git a/repo_metadata/ai/ai_server.py b/repo_metadata/ai/ai_server.py
new file mode 100644
index 00000000..b80344f3
--- /dev/null
+++ b/repo_metadata/ai/ai_server.py
@@ -0,0 +1,43 @@
+import os
+import logging
+
+from seafevents.app.config import get_config
+from seafevents.utils import get_opt_from_conf_or_env
+
+
+logger = logging.getLogger(__name__)
+
+
+class RepoMetadataAIServer:
+    def __init__(self):
+        self.llm_url = None
+        # Which LLM backend to use, e.g. 'open-ai-proxy'
+        self.llm_type = None
+        # API key for the LLM backend
+        self.llm_key = None
+
+    def init(self, config):
+        self._parse_config(config)
+
+    def _parse_config(self, config):
+        section_name = 'AI'
+        config_dir = os.environ.get('SEAFILE_CENTRAL_CONF_DIR')
+        if config_dir:
+            config_file = os.path.join(config_dir, 'seafevents.conf')
+        else:
+            config_file = os.environ.get('EVENTS_CONFIG_FILE')
+
+        if not config_file or not os.path.exists(config_file):
+            return
+
+        config = get_config(config_file)
+        if not config.has_section(section_name):
+            return
+
+        self.llm_type = get_opt_from_conf_or_env(config, section_name, 'llm_type', 'LLM_TYPE')
+        if self.llm_type == 'open-ai-proxy':
+            self.llm_url = get_opt_from_conf_or_env(config, section_name, 'llm_url', 'LLM_URL')
+            if not self.llm_url:
+                logger.warning("llm_url not found in the configuration file or environment variables.")
+
+
+metadata_ai_server = RepoMetadataAIServer()
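+
+# A matching seafevents.conf section might look like the following
+# (values are illustrative, not shipped defaults):
+#
+#   [AI]
+#   llm_type = open-ai-proxy
+#   llm_url = http://127.0.0.1:8888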
diff --git a/repo_metadata/ai/constants.py b/repo_metadata/ai/constants.py
new file mode 100644
index 00000000..c309b1dc
--- /dev/null
+++ b/repo_metadata/ai/constants.py
@@ -0,0 +1 @@
+LLM_INPUT_CHARACTERS_LIMIT = 8000
diff --git a/repo_metadata/ai/gen_summary.py b/repo_metadata/ai/gen_summary.py
new file mode 100644
index 00000000..7b0775a4
--- /dev/null
+++ b/repo_metadata/ai/gen_summary.py
@@ -0,0 +1,111 @@
+import os
+import logging
+
+from gevent.pool import Pool
+
+from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI
+from seafevents.repo_metadata.ai.utils.sdoc2md import sdoc2md
+from seafevents.repo_metadata.ai.ai_server import metadata_ai_server
+from seafevents.repo_metadata.ai.constants import LLM_INPUT_CHARACTERS_LIMIT
+from seafevents.repo_metadata.metadata_server_api import MetadataServerAPI
+from seafevents.repo_metadata.repo_metadata import EXCLUDED_PATHS
+from seafevents.repo_metadata.utils import get_file_by_path
+from seafevents.repo_metadata.utils import METADATA_TABLE
+
+
+logger = logging.getLogger(__name__)
+
+
+def gen_doc_summary(content):
+    llm_type = metadata_ai_server.llm_type
+    llm_url = metadata_ai_server.llm_url
+
+    if llm_type == 'open-ai-proxy':
+        openai_api = OpenAIAPI(llm_url)
+        system_content = 'You are a document summarization expert. I need you to generate a concise summary of a document that is no longer than 40 words. The summary should capture the main points and themes of the document clearly and effectively. The output language is the same as the input language. If it seems there is no content provided for summarization, just output the word: None'
+        system_prompt = {"role": "system", "content": system_content}
+        user_prompt = {"role": "user", "content": content}
+        messages = [system_prompt, user_prompt]
+        summary = openai_api.chat_completions(messages)
+        return summary
+    else:
+        logger.error('llm_type is not set correctly in seafevents.conf')
+        return None
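+
+# Usage sketch (hypothetical input; assumes llm_type is configured as
+# 'open-ai-proxy' and the proxy is reachable):
+#
+#   md_text = '# Project notes\n\nWe moved the search index to seasearch...'
+#   summary = gen_doc_summary(md_text)
+#   # -> a summary of at most 40 words, or the literal string 'None' when
+#   #    the model decides there is nothing to summarize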
+
+
+def create_summary_of_doc_in_repo(repo_id):
+    metadata_server_api = MetadataServerAPI('seafevents')
+    sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` FROM `{METADATA_TABLE.name}`'
+    query_result = metadata_server_api.query_rows(repo_id, sql).get('results', [])
+    updated_summary_rows = []
+
+    def process_row(row):
+        parent_dir = row[METADATA_TABLE.columns.parent_dir.name]
+        file_name = row[METADATA_TABLE.columns.file_name.name]
+        path = os.path.join(parent_dir, file_name)
+        if _is_excluded_path(path):
+            return
+
+        row_id = row[METADATA_TABLE.columns.id.name]
+        _, ext = os.path.splitext(file_name)
+        if ext == '.sdoc':
+            sdoc_content = get_file_by_path(repo_id, path)
+            md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
+            summary_text = gen_doc_summary(md_content)
+            if summary_text in ['None', 'none']:
+                return
+
+            updated_row = {
+                METADATA_TABLE.columns.id.name: row_id,
+                METADATA_TABLE.columns.summary.name: summary_text,
+            }
+            updated_summary_rows.append(updated_row)
+
+    pool = Pool(10)
+    logger.info(f'Start summarizing sdoc in repo {repo_id}')
+    for row in query_result:
+        pool.spawn(process_row, row)
+
+    pool.join()
+
+    if updated_summary_rows:
+        metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_rows)
+    logger.info(f'Finish summarizing sdoc in repo {repo_id}')
+    return {'success': True}
+
+
+def update_single_doc_summary(repo_id, file_path):
+    metadata_server_api = MetadataServerAPI('seafevents')
+    parent_dir = os.path.dirname(file_path)
+    file_name = os.path.basename(file_path)
+    _, file_ext = os.path.splitext(file_name)
+    sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` FROM `{METADATA_TABLE.name}` WHERE (`{METADATA_TABLE.columns.parent_dir.name}` = ? AND `{METADATA_TABLE.columns.file_name.name}` = ?)'
+    parameters = []
+    updated_summary_row = []
+    if file_ext == '.sdoc':
+        sdoc_content = get_file_by_path(repo_id, file_path)
+        md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
+        summary_text = gen_doc_summary(md_content)
+        if summary_text in ['None', 'none']:
+            summary_text = ''
+
+        parameters.append(parent_dir)
+        parameters.append(file_name)
+        query_result = metadata_server_api.query_rows(repo_id, sql, parameters).get('results', [])
+        if not query_result:
+            # the file is not indexed in metadata yet
+            return {'success': True}
+        row_id = query_result[0][METADATA_TABLE.columns.id.name]
+
+        updated_row = {
+            METADATA_TABLE.columns.id.name: row_id,
+            METADATA_TABLE.columns.summary.name: summary_text,
+        }
+        updated_summary_row.append(updated_row)
+    if updated_summary_row:
+        metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_row)
+    return {'success': True}
+
+
+def _is_excluded_path(path):
+    if not path or path == '/':
+        return True
+    for ex_path in EXCLUDED_PATHS:
+        if path.startswith(ex_path):
+            return True
+    return False
diff --git a/repo_metadata/ai/utils/openai_api.py b/repo_metadata/ai/utils/openai_api.py
new file mode 100644
index 00000000..75c1643f
--- /dev/null
+++ b/repo_metadata/ai/utils/openai_api.py
@@ -0,0 +1,39 @@
+import requests
+import logging
+import json
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_response(response):
+    if response.status_code >= 400:
+        raise ConnectionError(response.status_code, response.text)
+    else:
+        try:
+            data = json.loads(response.text)
+            return data
+        except json.JSONDecodeError:
+            logger.warning('Invalid JSON in LLM proxy response: %s', response.text)
+            return None
+
+
+class OpenAIAPI:
+    def __init__(self, openai_proxy_url, timeout=180):
+        self.openai_proxy_url = openai_proxy_url.rstrip('/') + '/api/v1/chat-completions/create'
+        self.timeout = timeout
+
+    def chat_completions(self, messages, temperature=0):
+        json_data = {
+            'model': 'gpt-4o-mini',
+            'messages': messages,
+            'temperature': temperature
+        }
+        response = requests.post(self.openai_proxy_url, json=json_data, timeout=self.timeout)
+        data = parse_response(response)
+        try:
+            result = data['choices'][0]['message']['content']
+        except (TypeError, KeyError, IndexError) as e:
+            logger.exception(e)
+            result = None
+
+        return result
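+
+# Usage sketch (proxy URL is hypothetical; the proxy is assumed to answer
+# with the standard OpenAI chat-completions JSON shape):
+#
+#   api = OpenAIAPI('http://127.0.0.1:8888')
+#   reply = api.chat_completions([{'role': 'user', 'content': 'Say hi'}])
+#   # parse_response() is expected to yield something like:
+#   # {'choices': [{'message': {'role': 'assistant', 'content': 'Hi!'}}]}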
diff --git a/repo_metadata/ai/utils/sdoc2md.py b/repo_metadata/ai/utils/sdoc2md.py
new file mode 100644
index 00000000..e3aed510
--- /dev/null
+++ b/repo_metadata/ai/utils/sdoc2md.py
@@ -0,0 +1,272 @@
+from html2text import HTML2Text
+
+
+HEADER_LABEL = [
+    'header1',
+    'header2',
+    'header3',
+    'header4',
+    'header5',
+    'header6',
+]
+
+
+def _handle_text_style(json_data_text, return_null=False):
+    text = json_data_text.get('text', '')
+    pure_text = text
+    bold = json_data_text.get('bold')
+    italic = json_data_text.get('italic')
+
+    if italic:
+        text = "_%s_" % text
+    if bold:
+        text = "**%s**" % text
+
+    if (not text) and return_null:
+        text = '.'
+    return text, pure_text
+
+
+# sdoc to HTML DOM
+# 1. header
+def _handle_header_dom(header_json, header_type):
+    output = ''
+    for child in header_json['children']:
+        if 'text' in child:
+            output += child.get('text')
+        else:
+            child_type = child.get('type')
+            if child_type == 'link':
+                output += _handle_link_dom(child)
+
+    tag = {
+        "header1": "<h1>%s</h1>",
+        "header2": "<h2>%s</h2>",
+        "header3": "<h3>%s</h3>",
+        "header4": "<h4>%s</h4>",
+        "header5": "<h5>%s</h5>",
+        "header6": "<h6>%s</h6>",
+    }.get(header_type)
+    return tag % output
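+
+# Example (sdoc node shape assumed): a node such as
+#   {'type': 'header2', 'children': [{'text': 'Overview'}]}
+# becomes '<h2>Overview</h2>' here, and html2text later turns that into
+# '## Overview'.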
+
+
+# 3. list, including ordered / unordered list
+def _handle_list_dom(list_json, tag='', ordered=False):
+    for list_item in list_json['children']:
+        item_eles = list_item['children']
+        text = ''
+        for lic in item_eles:
+            if lic.get('type') == 'unordered_list':
+                tag += _handle_list_dom(lic, '')
+            if lic.get('type') == 'ordered_list':
+                tag += _handle_list_dom(lic, '', True)
+
+            if lic.get('type') == 'paragraph':
+                for item in lic['children']:
+                    if 'text' in item:
+                        text += _handle_text_style(item)[0]
+                    else:
+                        item_type = item.get('type')
+                        if item_type == 'link':
+                            text_name = item['children'][0]['text']
+                            text_url = item.get('href')
+                            text += '<a href="%s">%s</a>' % (text_url, text_name)
+                tag += "<li>%s</li>" % text
+    if ordered:
+        res = "<ol>%s</ol>" % tag
+    else:
+        res = "<ul>%s</ul>" % tag
+    return res
+
+
+# 4. checkbox
+def _handle_check_list_dom(check_list_json):
+    output = ""
+    checked = check_list_json.get('checked')
+    for child in check_list_json['children']:
+        if 'text' in child:
+            output += _handle_text_style(child)[0]
+        else:
+            child_type = child.get('type')
+            if child_type == 'link':
+                output += _handle_link_dom(child)
+
+    if checked:
+        output = "<p>* [x] %s</p>" % output
+    else:
+        output = "<p>* [ ] %s</p>" % output
+
+    return output
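+
+# Example (node shape assumed):
+#   {'type': 'check_list_item', 'checked': True, 'children': [{'text': 'ship it'}]}
+# becomes '<p>* [x] ship it</p>', which html2text then emits roughly as
+# '* [x] ship it'.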
+
+
+# 5. blockquote
+def _handle_blockquote_dom(blockquote_json):
+    output = ""
+    for child in blockquote_json['children']:
+        child_type = child.get('type')
+        if child_type in ['ordered_list', 'unordered_list']:
+            output += _handle_list_dom(child, '', child_type == 'ordered_list')
+
+        if child_type == 'link':
+            text_name = child['children'][0]['text']
+            text_url = child.get('href')
+            output += '<a href="%s">%s</a>' % (text_url, text_name)
+
+        if child_type == 'paragraph':
+            output += '%s' % _handle_paragraph_dom(child)
+
+        if child_type == 'check_list_item':
+            output += '%s' % _handle_check_list_dom(child)
+
+        if 'text' in child:
+            text = child.get('text')
+            text_list = text.split("\n")
+            output += ''.join(['<p>%s</p>' % t for t in text_list if t.strip()])
+
+    tag = "<blockquote>%s</blockquote>" % output
+    return tag
+
+
+# 6. url link
+def _handle_link_dom(link_json):
+    href = link_json.get('href')
+    link_child = link_json['children'][0]
+
+    res = '<a href="%s">%s</a>' % (href, link_child.get('text'))
+    return res
+
+
+# 7. paragraph
+def _handle_paragraph_dom(paragraph_json):
+    output = ''
+    for child in paragraph_json['children']:
+        if 'text' in child:
+            output += _handle_text_style(child)[0]
+        else:
+            child_type = child.get('type')
+            if child_type == 'link':
+                output += _handle_link_dom(child)
+
+    result = "<p>%s</p>" % output
+    return result.replace("\n", "")
+
+
+def _handle_table_cell_dom(table_cell_json):
+    output = ''
+    for child in table_cell_json['children']:
+        if 'text' in child:
+            output += _handle_text_style(child)[0]
+        else:
+            child_type = child.get('type')
+            if child_type == 'link':
+                output += _handle_link_dom(child)
+
+    return output
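+
+# The handlers below run the HTML fragments built above through html2text
+# (output shown is approximate):
+#
+#   HTML2Text(bodywidth=0).handle('<blockquote><p>hi</p></blockquote>')
+#   # -> '> hi\n\n'  (bodywidth=0 disables line wrapping)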
+
+
+# HTML to markdown
+def handle_header(header_json, header_type):
+    md_handler = HTML2Text(bodywidth=0)  # no wrapping length
+    dom = _handle_header_dom(header_json, header_type)
+    return md_handler.handle(dom)
+
+
+def handle_check_list(check_list_json):
+    md_handler = HTML2Text(bodywidth=0)  # no wrapping length
+    return md_handler.handle(_handle_check_list_dom(check_list_json))
+
+
+def handle_paragraph(paragraph_json):
+    md_handler = HTML2Text(bodywidth=0)  # no wrapping length
+    dom = _handle_paragraph_dom(paragraph_json)
+    return md_handler.handle(dom)
+
+
+def handle_list(json_data, ordered=False):
+    md_handler = HTML2Text(bodywidth=0)  # no wrapping length
+    html = _handle_list_dom(json_data, '', ordered)
+    md = md_handler.handle(html)
+    return md
+
+
+def handle_codeblock(code_block_json):
+    lang = code_block_json.get('language', '')
+    output = ""
+    for child in code_block_json.get('children'):
+        if 'children' in child:
+            output += "%s\n" % child.get('children', '')[0].get('text')
+    return "```%s\n%s```" % (lang, output)
+
+
+def handle_blockquote(json_data):
+    md_handler = HTML2Text(bodywidth=0)  # no wrapping length
+    html = _handle_blockquote_dom(json_data)
+    md = md_handler.handle(html)
+    return md
+
+
+def handle_table(table_json):
+    md_handler = HTML2Text(bodywidth=0)  # no wrapping length
+    th_headers = ''
+    th_body = ''
+    first_table_row = table_json['children'][0]
+    other_table_rows = table_json['children'][1:]
+
+    for first_table_cell in first_table_row['children']:
+        th_headers += "<th>%s</th>" % _handle_table_cell_dom(first_table_cell)
+
+    for table_row in other_table_rows:
+        td = ''
+        for table_cell in table_row['children']:
+            td += "<td>%s</td>" % _handle_table_cell_dom(table_cell)
+        th_body += "<tr>%s</tr>" % td
+
+    html = "<table><tr>%s</tr>%s</table>" % (th_headers, th_body)
+    return md_handler.handle(html)
+
+
+# dispatch a top-level sdoc node by its type
+def json2md(json_data):
+    doc_type = json_data.get('type')
+    markdown_output = ''
+    if doc_type == 'title':
+        output = handle_header(json_data, 'header1')
+        markdown_output += output
+
+    if doc_type in HEADER_LABEL:
+        output = handle_header(json_data, doc_type)
+        markdown_output += output
+
+    if doc_type == 'check_list_item':
+        output = handle_check_list(json_data)
+        markdown_output += output
+
+    if doc_type == 'paragraph':
+        output = handle_paragraph(json_data)
+        markdown_output += output
+
+    if doc_type == 'code_block':
+        output = handle_codeblock(json_data)
+        markdown_output += output
+
+    if doc_type == 'table':
+        output = handle_table(json_data)
+        markdown_output += output
+
+    if doc_type == 'unordered_list':
+        output = handle_list(json_data)
+        markdown_output += output
+
+    if doc_type == 'ordered_list':
+        output = handle_list(json_data, ordered=True)
+        markdown_output += output
+
+    if doc_type == 'blockquote':
+        output = handle_blockquote(json_data)
+        markdown_output += output
+    return markdown_output
+
+
+def sdoc2md(json_tree):
+    results = []
+    for sub in json_tree.get('children'):
+        results.append(json2md(sub))
+    markdown_text = "\n".join(results)
+    return markdown_text
diff --git a/repo_metadata/utils.py b/repo_metadata/utils.py
index 1bedcee1..3abd08bc 100644
--- a/repo_metadata/utils.py
+++ b/repo_metadata/utils.py
@@ -2,12 +2,15 @@
 import random
 import math
 import exifread
+import requests
+import json
 
 from io import BytesIO
+from urllib.parse import quote as urlquote
 
 from seafobj import commit_mgr, fs_mgr
-
-from seafevents.app.config import METADATA_FILE_TYPES
+from seaserv import seafile_api
+from seafevents.app.config import METADATA_FILE_TYPES, FILE_SERVER
 
 
 def gen_fileext_type_map():
@@ -22,6 +25,24 @@
     return ext_to_type
 
 
+def gen_file_get_url(token, filename):
+    return '%s/files/%s/%s' % (FILE_SERVER, token, urlquote(filename))
+
+
+def get_file_by_path(repo_id, path):
+    file_id = seafile_api.get_file_id_by_path(repo_id, path)
+    filename = os.path.basename(path)
+    token = seafile_api.get_fileserver_access_token(
+        repo_id, file_id, 'download', username='sys_summary_sdoc', use_onetime=True
+    )
+    url = gen_file_get_url(token, filename)
+    content = requests.get(url, timeout=10).content.decode()
+
+    if content:
+        content = json.loads(content)
+    return content
+
+
 FILEEXT_TYPE_MAP = gen_fileext_type_map()
@@ -99,6 +120,7 @@ def __init__(self):
         self.file_type = MetadataColumn('_file_type', '_file_type', 'single-select', {'options': gen_select_options(list(METADATA_FILE_TYPES.keys()))})
         self.location = MetadataColumn('_location', '_location', 'geolocation', {'geo_format': 'lng_lat'})
+        self.summary = MetadataColumn('_summary', '_summary', 'long-text')
 
 
 class MetadataColumn(object):
diff --git a/seafevent_server/request_handler.py b/seafevent_server/request_handler.py
index 83de310f..5fb5a80f 100644
--- a/seafevent_server/request_handler.py
+++ b/seafevent_server/request_handler.py
@@ -7,7 +7,8 @@
 from seafevents.seafevent_server.task_manager import task_manager
 from seafevents.seafevent_server.export_task_manager import event_export_task_manager
 from seafevents.seasearch.index_task.index_task_manager import index_task_manager
-
+from seafevents.repo_metadata.ai.gen_summary import create_summary_of_doc_in_repo, \
+    update_single_doc_summary
 
 app = Flask(__name__)
 logger = logging.getLogger(__name__)
@@ -132,3 +133,49 @@
     results = index_task_manager.keyword_search(query, repos, count, suffixes)
 
     return {'results': results}, 200
+
+
+@app.route('/create-summary-of-doc-in-repo', methods=['POST'])
+def create_doc_summary():
+    is_valid, error = check_auth_token(request)
+    if not is_valid:
+        return make_response((error, 403))
+
+    try:
+        data = json.loads(request.data)
+    except Exception as e:
+        logger.exception(e)
+        return {'error_msg': 'Bad request.'}, 400
+
+    repo_id = data.get('repo_id')
+
+    if not repo_id:
+        return {'error_msg': 'repo_id invalid.'}, 400
+
+    create_status = create_summary_of_doc_in_repo(repo_id)
+
+    return create_status
+
+
+@app.route('/update-single-doc-summary', methods=['POST'])
+def update_doc_summary():
+    is_valid, error = check_auth_token(request)
+    if not is_valid:
+        return make_response((error, 403))
+
+    try:
+        data = json.loads(request.data)
+    except Exception as e:
+        logger.exception(e)
+        return {'error_msg': 'Bad request.'}, 400
+
+    repo_id = data.get('repo_id')
+    file_path = data.get('file_path')
+
+    if not repo_id:
+        return {'error_msg': 'repo_id invalid.'}, 400
+    if not file_path:
+        return {'error_msg': 'file_path invalid.'}, 400
+
+    update_status = update_single_doc_summary(repo_id, file_path)
+
+    return update_status
diff --git a/seafevent_server/seafevent_server.py b/seafevent_server/seafevent_server.py
index c1a6bdfd..c61d4768 100644
--- a/seafevent_server/seafevent_server.py
+++ b/seafevent_server/seafevent_server.py
@@ -5,6 +5,8 @@
 from seafevents.seafevent_server.task_manager import task_manager
 from seafevents.seafevent_server.export_task_manager import event_export_task_manager
 from seafevents.seasearch.index_task.index_task_manager import index_task_manager
+from seafevents.repo_metadata.ai.ai_server import metadata_ai_server
+from seafevents.app.config import ENABLE_METADATA_MANAGEMENT
 
 
 class SeafEventServer(Thread):
@@ -22,6 +24,9 @@ def __init__(self, app, config):
 
         index_task_manager.init(config)
 
+        if ENABLE_METADATA_MANAGEMENT:
+            metadata_ai_server.init(config)
+
     def _parse_config(self, config):
         if config.has_option('SEAF-EVENT-SERVER', 'host'):
             self._host = config.get('SEAF-EVENT-SERVER', 'host')
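
---
Manual test sketch for the two new endpoints (host, port and token are
placeholders; both endpoints expect the usual seafevents JWT token in the
Authorization header):

  curl -X POST http://127.0.0.1:8889/create-summary-of-doc-in-repo \
       -H 'Authorization: Token <jwt>' \
       -d '{"repo_id": "<repo-uuid>"}'

  curl -X POST http://127.0.0.1:8889/update-single-doc-summary \
       -H 'Authorization: Token <jwt>' \
       -d '{"repo_id": "<repo-uuid>", "file_path": "/notes.sdoc"}'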