diff --git a/app/config.py b/app/config.py
index 0c23bd63..34234a07 100644
--- a/app/config.py
+++ b/app/config.py
@@ -24,6 +24,7 @@
     METADATA_SERVER_URL = getattr(seahub_settings, 'METADATA_SERVER_URL', '')
     ENABLE_METADATA_MANAGEMENT = getattr(seahub_settings, 'ENABLE_METADATA_MANAGEMENT', False)
     METADATA_FILE_TYPES = getattr(seahub_settings, 'METADATA_FILE_TYPES', {})
+    FILE_SERVER = getattr(seahub_settings, 'FILE_SERVER_ROOT', '')
 except ImportError:
     logger.critical("Can not import seahub settings.")
     raise RuntimeError("Can not import seahub settings.")
diff --git a/repo_metadata/ai/ai_server.py b/repo_metadata/ai/ai_server.py
new file mode 100644
index 00000000..b80344f3
--- /dev/null
+++ b/repo_metadata/ai/ai_server.py
@@ -0,0 +1,43 @@
+import os
+import logging
+
+from seafevents.app.config import get_config
+from seafevents.utils import get_opt_from_conf_or_env
+
+
+logger = logging.getLogger(__name__)
+
+
+class RepoMetadataAIServer:
+    def __init__(self):
+        self.llm_url = None
+        # The LLM model type (e.g. 'open-ai-proxy')
+        self.llm_type = None
+        # The LLM API key
+        self.llm_key = None
+
+    def init(self, config):
+        self._parse_config(config)
+
+    def _parse_config(self, config):
+        section_name = 'AI'
+        config_dir = os.environ.get('SEAFILE_CENTRAL_CONF_DIR')
+        if config_dir:
+            config_file = os.path.join(config_dir, 'seafevents.conf')
+        else:
+            config_file = os.environ.get('EVENTS_CONFIG_FILE')
+
+        if not config_file or not os.path.exists(config_file):
+            return
+
+        config = get_config(config_file)
+        if not config.has_section(section_name):
+            return
+
+        self.llm_type = get_opt_from_conf_or_env(config, section_name, 'llm_type', 'LLM_TYPE')
+        if self.llm_type == 'open-ai-proxy':
+            self.llm_url = get_opt_from_conf_or_env(config, section_name, 'llm_url', 'LLM_URL')
+            if not self.llm_url:
+                logger.warning("llm_url not found in the configuration file or environment variables.")
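+
+# A sample [AI] section of seafevents.conf that _parse_config above reads.
+# The key names match the parser; the URL value is only a placeholder:
+#
+#   [AI]
+#   llm_type = open-ai-proxy
+#   llm_url = https://llm-proxy.example.com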
+
+
+metadata_ai_server = RepoMetadataAIServer()
diff --git a/repo_metadata/ai/constants.py b/repo_metadata/ai/constants.py
new file mode 100644
index 00000000..c309b1dc
--- /dev/null
+++ b/repo_metadata/ai/constants.py
@@ -0,0 +1 @@
+LLM_INPUT_CHARACTERS_LIMIT = 8000
diff --git a/repo_metadata/ai/gen_summary.py b/repo_metadata/ai/gen_summary.py
new file mode 100644
index 00000000..7b0775a4
--- /dev/null
+++ b/repo_metadata/ai/gen_summary.py
@@ -0,0 +1,111 @@
+import os
+import logging
+from gevent.pool import Pool
+
+from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI
+from seafevents.repo_metadata.ai.utils.sdoc2md import sdoc2md
+from seafevents.repo_metadata.ai.ai_server import metadata_ai_server
+from seafevents.repo_metadata.ai.constants import LLM_INPUT_CHARACTERS_LIMIT
+from seafevents.repo_metadata.metadata_server_api import MetadataServerAPI
+from seafevents.repo_metadata.repo_metadata import EXCLUDED_PATHS
+from seafevents.repo_metadata.utils import get_file_by_path
+from seafevents.repo_metadata.utils import METADATA_TABLE
+
+
+logger = logging.getLogger(__name__)
+
+
+def gen_doc_summary(content):
+    llm_type = metadata_ai_server.llm_type
+    llm_url = metadata_ai_server.llm_url
+
+    if llm_type == 'open-ai-proxy':
+        openai_api = OpenAIAPI(llm_url)
+        system_content = (
+            'You are a document summarization expert. I need you to generate a concise '
+            'summary of a document that is no longer than 40 words. The summary should '
+            'capture the main points and themes of the document clearly and effectively. '
+            'The output language is the same as the input language. If it seems there is '
+            'no content provided for summarization, just output the word: None'
+        )
+        system_prompt = {"role": "system", "content": system_content}
+        user_prompt = {"role": "user", "content": content}
+        messages = [system_prompt, user_prompt]
+        summary = openai_api.chat_completions(messages)
+        return summary
+    else:
+        logger.error('llm_type is not set correctly in seafevents.conf')
+        return None
+
+
+def create_summary_of_doc_in_repo(repo_id):
+    metadata_server_api = MetadataServerAPI('seafevents')
+    sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` FROM `{METADATA_TABLE.name}`'
+    query_result = metadata_server_api.query_rows(repo_id, sql).get('results', [])
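+    # Each result row is a dict keyed by metadata column name. Assuming the
+    # usual column names, a row looks roughly like (values are made up):
+    #   {'_id': 'row-id', '_parent_dir': '/docs', '_name': 'spec.sdoc'}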
+    updated_summary_rows = []
+
+    def process_row(row):
+        parent_dir = row[METADATA_TABLE.columns.parent_dir.name]
+        file_name = row[METADATA_TABLE.columns.file_name.name]
+        path = os.path.join(parent_dir, file_name)
+        if _is_excluded_path(path):
+            return
+
+        row_id = row[METADATA_TABLE.columns.id.name]
+        _, ext = os.path.splitext(file_name)
+        if ext == '.sdoc':
+            sdoc_content = get_file_by_path(repo_id, path)
+            md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
+            summary_text = gen_doc_summary(md_content)
+            # Skip rows where the LLM failed or reported that there was no content.
+            if not summary_text or summary_text in ('None', 'none'):
+                return
+
+            updated_row = {
+                METADATA_TABLE.columns.id.name: row_id,
+                METADATA_TABLE.columns.summary.name: summary_text,
+            }
+            updated_summary_rows.append(updated_row)
+
+    # Summarize up to 10 documents concurrently on gevent greenlets.
+    pool = Pool(10)
+    logger.info(f'Start summarizing sdoc in repo {repo_id}')
+    for row in query_result:
+        pool.spawn(process_row, row)
+
+    pool.join()
+
+    if updated_summary_rows:
+        metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_rows)
+    logger.info(f'Finish summarizing sdoc in repo {repo_id}')
+    return {'success': True}
+
+
+def update_single_doc_summary(repo_id, file_path):
+    metadata_server_api = MetadataServerAPI('seafevents')
+    parent_dir = os.path.dirname(file_path)
+    file_name = os.path.basename(file_path)
+    _, file_ext = os.path.splitext(file_name)
+    sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` ' \
+          f'FROM `{METADATA_TABLE.name}` WHERE (`{METADATA_TABLE.columns.parent_dir.name}` = ? AND `{METADATA_TABLE.columns.file_name.name}` = ?)'
+    parameters = []
+    updated_summary_row = []
+    if file_ext == '.sdoc':
+        sdoc_content = get_file_by_path(repo_id, file_path)
+        md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
+        summary_text = gen_doc_summary(md_content)
+        if summary_text is None:
+            # The LLM call failed; leave the existing summary untouched.
+            return {'success': False}
+        if summary_text in ('None', 'none'):
+            summary_text = ''
+
+        parameters.append(parent_dir)
+        parameters.append(file_name)
+        query_result = metadata_server_api.query_rows(repo_id, sql, parameters).get('results', [])
+        if not query_result:
+            return {'success': True}
+        row_id = query_result[0][METADATA_TABLE.columns.id.name]
+
+        updated_row = {
+            METADATA_TABLE.columns.id.name: row_id,
+            METADATA_TABLE.columns.summary.name: summary_text,
+        }
+        updated_summary_row.append(updated_row)
+    if updated_summary_row:
+        metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_row)
+    return {'success': True}
+
+
+def _is_excluded_path(path):
+    if not path or path == '/':
+        return True
+    for ex_path in EXCLUDED_PATHS:
+        if path.startswith(ex_path):
+            return True
+    return False
diff --git a/repo_metadata/ai/utils/openai_api.py b/repo_metadata/ai/utils/openai_api.py
new file mode 100644
index 00000000..75c1643f
--- /dev/null
+++ b/repo_metadata/ai/utils/openai_api.py
@@ -0,0 +1,39 @@
+import requests
+import logging
+import json
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_response(response):
+    if response.status_code >= 400:
+        raise ConnectionError(response.status_code, response.text)
+    try:
+        return json.loads(response.text)
+    except json.JSONDecodeError:
+        logger.error('Failed to parse LLM proxy response as JSON: %s', response.text)
+        return None
+
+
+class OpenAIAPI:
+    def __init__(self, openai_proxy_url, timeout=180):
+        self.openai_proxy_url = openai_proxy_url.rstrip('/') + '/api/v1/chat-completions/create'
+        self.timeout = timeout
+
+    def chat_completions(self, messages, temperature=0):
+        json_data = {
+            'model': 'gpt-4o-mini',
+            'messages': messages,
+            'temperature': temperature
+        }
+        response = requests.post(self.openai_proxy_url, json=json_data, timeout=self.timeout)
+        data = parse_response(response)
+        try:
+            result = data['choices'][0]['message']['content']
+        except (KeyError, TypeError) as e:
+            # TypeError also covers data being None when the response was not valid JSON.
+            logger.exception(e)
+            result = None
+
+        return result
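+
+# Usage sketch (the proxy URL is hypothetical):
+#   api = OpenAIAPI('http://127.0.0.1:8888')
+#   text = api.chat_completions([
+#       {'role': 'system', 'content': 'You are a helpful assistant.'},
+#       {'role': 'user', 'content': 'Say hi.'},
+#   ])  # -> the assistant's reply text, or None on failure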
diff --git a/repo_metadata/ai/utils/sdoc2md.py b/repo_metadata/ai/utils/sdoc2md.py
new file mode 100644
index 00000000..e3aed510
--- /dev/null
+++ b/repo_metadata/ai/utils/sdoc2md.py
@@ -0,0 +1,272 @@
+from html2text import HTML2Text
+
+
+HEADER_LABEL = [
+    'header1',
+    'header2',
+    'header3',
+    'header4',
+    'header5',
+    'header6',
+]
+
+def _handle_text_style(json_data_text, return_null=False):
+    text = json_data_text.get('text', '')
+    pure_text = text
+    bold = json_data_text.get('bold')
+    italic = json_data_text.get('italic')
+
+    if italic:
+        text = "_%s_" % text
+    if bold:
+        text = "**%s**" % text
+
+    if (not text) and return_null:
+        text = '.'
+    return text, pure_text
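+
+# Sketch of the sdoc JSON nodes the handlers below consume. Field names are
+# taken from the code; the values are made-up examples:
+#   {'type': 'header1', 'children': [{'text': 'Title', 'bold': True}]}
+#   {'type': 'link', 'href': 'https://example.com', 'children': [{'text': 'a link'}]}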
+
+# sdoc 2 html dom
+# 1. header
+def _handle_header_dom(header_json, header_type):
+    output = ''
+    for child in header_json['children']:
+        if 'text' in child:
+            output += child.get('text')
+        else:
+            child_type = child.get('type')
+            if child_type == 'link':
+                output += _handle_link_dom(child)
+
+    tag = {
+        "header1": "<h1>%s</h1>",
+        "header2": "<h2>%s</h2>",
+        "header3": "<h3>%s</h3>",
+        "header4": "<h4>%s</h4>",
+        "header5": "<h5>%s</h5>",
+        "header6": "<h6>%s</h6>",
+    }.get(header_type)
+    return tag % output
+
+# 2/3 ordered and unordered list
+# Minimal handler: items are rendered to <ol>/<ul> HTML, which html2text later
+# turns into '1. ' or '* ' markdown; `indent` is carried along for nesting.
+def _handle_list_dom(list_json, indent, ordered=False):
+    items = ''
+    for child in list_json.get('children', []):
+        item_output = ''
+        for item_child in child.get('children', []):
+            if 'text' in item_child:
+                item_output += _handle_text_style(item_child)[0]
+            elif item_child.get('type') == 'link':
+                item_output += _handle_link_dom(item_child)
+            elif item_child.get('type') in ('ordered_list', 'unordered_list'):
+                item_output += _handle_list_dom(item_child, indent + '    ', item_child.get('type') == 'ordered_list')
+        items += "<li>%s</li>" % item_output
+
+    return ("<ol>%s</ol>" if ordered else "<ul>%s</ul>") % items
+
+# 4 check list
+def _handle_check_list_dom(check_list_json):
+    output = ''
+    for child in check_list_json['children']:
+        if 'text' in child:
+            output += _handle_text_style(child)[0]
+        else:
+            child_type = child.get('type')
+            if child_type == 'link':
+                output += _handle_link_dom(child)
+
+    if check_list_json.get('checked'):
+        output = "* [x] %s<br/>" % output
+    else:
+        output = "* [ ] %s<br/>" % output
+
+    return output
+
+# 5 blockquote
+def _handle_blockquote_dom(blockquote_json):
+    output = ""
+    for child in blockquote_json['children']:
+        child_type = child.get('type')
+        if child_type in ['ordered_list', 'unordered_list']:
+            output += _handle_list_dom(child, '', child_type == 'ordered_list')
+
+        if child_type == 'link':
+            text_name = child['children'][0]['text']
+            text_url = child.get('href')
+            output += "<a href='%s'>%s</a>" % (text_url, text_name)
+
+        if child_type == 'paragraph':
+            output += '%s' % _handle_paragraph_dom(child)
+
+        if child_type == 'check_list_item':
+            output += '%s' % _handle_check_list_dom(child)
+
+        if 'text' in child:
+            text = child.get('text')
+            text_list = text.split("\n")
+            output += ''.join(['%s<br/>' % t for t in text_list if t.strip()])
+
+    tag = "<blockquote>%s</blockquote>" % output
+    return tag
+
+# 6 url link
+def _handle_link_dom(link_json):
+    href = link_json.get('href')
+    link_child = link_json['children'][0]
+
+    res = "<a href='%s'>%s</a>" % (href, link_child.get('text'))
+    return res
+
+# 7 paragraph
+def _handle_paragraph_dom(paragraph_json):
+    output = ''
+    for child in paragraph_json['children']:
+        if 'text' in child:
+            output += _handle_text_style(child)[0]
+        else:
+            child_type = child.get('type')
+            if child_type == 'link':
+                output += _handle_link_dom(child)
+
+    result = "<p>%s</p>" % output
+    return result.replace("\n", "")
+
+
+def _handle_table_cell_dom(table_cell_json):
+    output = ''
+    for child in table_cell_json['children']:
+        if 'text' in child:
+            output += _handle_text_style(child)[0]
+        else:
+            child_type = child.get('type')
+            if child_type == 'link':
+                output += _handle_link_dom(child)
+
+    return output
+
+
+# html2markdown
+def handle_header(header_json, header_type):
+    md_handler = HTML2Text(bodywidth=0)  # no wrapping length
+    dom = _handle_header_dom(header_json, header_type)
+    return md_handler.handle(dom)
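+
+# Usage sketch (the node is a made-up example):
+#   handle_header({'children': [{'text': 'Intro'}]}, 'header2')
+#   # -> roughly '## Intro\n\n'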
" % output + return result.replace("\n", "") + + +def _handle_table_cell_dom(table_cell_json): + output = '' + for child in table_cell_json['children']: + if 'text' in child: + output += _handle_text_style(child)[0] + else: + child_type = child.get('type') + if child_type == 'link': + output += _handle_link_dom(child) + + return output + + +# html2markdown +def handle_header(header_json, header_type): + md_hander = HTML2Text(bodywidth=0) # no wrapping length + dom = _handle_header_dom(header_json, header_type) + return md_hander.handle(dom) + + +def handle_check_list(check_list_json): + md_hander = HTML2Text(bodywidth=0) # no wrapping length + return md_hander.handle(_handle_check_list_dom(check_list_json)) + + +def handle_paragraph(paragraph_json): + md_hander = HTML2Text(bodywidth=0) # no wrapping length + dom = _handle_pagragh_dom(paragraph_json) + return md_hander.handle(dom) + + +def handle_list(json_data, ordered=False): + md_hander = HTML2Text(bodywidth=0) # no wrapping length + html = _handle_list_dom(json_data, '', ordered) + md = md_hander.handle(html) + return md + + +def handle_codeblock(code_bloc_json): + lang = code_bloc_json.get('language', '') + output = "" + for child in code_bloc_json.get('children'): + if 'children' in child: + output += "%s\n" % child.get('children', '')[0].get('text') + return "```%s\n%s```" % (lang, output) + + +def handle_blockquote(json_data): + md_hander = HTML2Text(bodywidth=0) # no wrapping length + html = _handle_blockquote_dom(json_data) + md = md_hander.handle(html) + return md + + +def handle_table(table_json): + md_hander = HTML2Text(bodywidth=0) # no wrapping length + th_headers = '' + th_body = '' + first_table_row = table_json['children'][0] + other_table_rows = table_json['children'][1:] + + for first_table_cell in first_table_row['children']: + th_headers += "