From 024872908c66cc304a85ac701424658a3564ef42 Mon Sep 17 00:00:00 2001
From: cir9no <44470218+cir9no@users.noreply.github.com>
Date: Wed, 31 Jul 2024 19:03:12 +0800
Subject: [PATCH 1/5] feat: init sdoc summary when add summary column
---
app/config.py | 1 +
repo_metadata/ai/gen_summary.py | 77 ++++++++
repo_metadata/ai/utils/openai_api.py | 65 +++++++
repo_metadata/ai/utils/sdoc2md.py | 272 +++++++++++++++++++++++++++
repo_metadata/utils.py | 26 ++-
seafevent_server/request_handler.py | 23 ++-
6 files changed, 461 insertions(+), 3 deletions(-)
create mode 100644 repo_metadata/ai/gen_summary.py
create mode 100644 repo_metadata/ai/utils/openai_api.py
create mode 100644 repo_metadata/ai/utils/sdoc2md.py
diff --git a/app/config.py b/app/config.py
index 0c23bd63..34234a07 100644
--- a/app/config.py
+++ b/app/config.py
@@ -24,6 +24,7 @@
METADATA_SERVER_URL = getattr(seahub_settings, 'METADATA_SERVER_URL', '')
ENABLE_METADATA_MANAGEMENT = getattr(seahub_settings, 'ENABLE_METADATA_MANAGEMENT', False)
METADATA_FILE_TYPES = getattr(seahub_settings, 'METADATA_FILE_TYPES', {})
+ FILE_SERVER = getattr(seahub_settings, 'FILE_SERVER_ROOT', '')
except ImportError:
logger.critical("Can not import seahub settings.")
raise RuntimeError("Can not import seahub settings.")
diff --git a/repo_metadata/ai/gen_summary.py b/repo_metadata/ai/gen_summary.py
new file mode 100644
index 00000000..967c04e0
--- /dev/null
+++ b/repo_metadata/ai/gen_summary.py
@@ -0,0 +1,77 @@
+import os
+import logging
+from gevent.pool import Pool
+
+from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI, get_openai_proxy_url
+from seafevents.repo_metadata.ai.utils.sdoc2md import sdoc2md
+from seafevents.repo_metadata.metadata_server_api import MetadataServerAPI
+from seafevents.repo_metadata.repo_metadata import EXCLUDED_PATHS
+from seafevents.repo_metadata.utils import get_file_by_path
+from seafevents.repo_metadata.utils import METADATA_TABLE
+
+
+logger = logging.getLogger(__name__)
+
+
+
+def gen_doc_summary(content):
+ openai_proxy_url = get_openai_proxy_url()
+ openai = OpenAIAPI(openai_proxy_url)
+ system_content = 'You are a document summarization expert. I need you to generate a concise summary of a document that is no longer than 40 words. The summary should capture the main points and themes of the document clearly and effectively.The output language is the same as the input language. If it seems there is no content provided for summarization, just output word: None'
+ system_prompt = {"role": "system", "content": system_content}
+ user_prompt = {"role": "user", "content": content}
+ messages = [system_prompt, user_prompt]
+ summary = openai.chat_completions(messages)
+ return summary
+
+
+def create_summary_of_sdoc_in_repo(repo_id):
+ metadata_server_api = MetadataServerAPI('seafevents')
+ sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` FROM `{METADATA_TABLE.name}`'
+ query_result = metadata_server_api.query_rows(repo_id, sql).get('results', [])
+ updated_summary_rows = []
+
+ def process_row(row):
+ parent_dir = row[METADATA_TABLE.columns.parent_dir.name]
+ file_name = row[METADATA_TABLE.columns.file_name.name]
+ if parent_dir == '/':
+ path = parent_dir + file_name
+ else:
+ path = parent_dir + '/' + file_name
+ if _is_excluded_path(path):
+ return
+
+ row_id = row[METADATA_TABLE.columns.id.name]
+ _, ext = os.path.splitext(file_name)
+ if ext == '.sdoc':
+ sdoc_content = get_file_by_path(repo_id, path)
+ md_content = sdoc2md(sdoc_content)
+ summary_text = gen_doc_summary(md_content)
+ if summary_text in ['None', 'none']:
+ return
+
+ updated_row = {
+ METADATA_TABLE.columns.id.name: row_id,
+ METADATA_TABLE.columns.summary.name: summary_text,
+ }
+ updated_summary_rows.append(updated_row)
+
+ pool = Pool(50)
+ logger.info(f'Start summarizing sdoc in repo {repo_id}')
+ for row in query_result:
+ pool.spawn(process_row, row)
+
+ pool.join()
+
+ if updated_summary_rows:
+ metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_rows)
+ logger.info(f'Finish summarizing sdoc in repo {repo_id}')
+ return {'success': True}
+
+
+def _is_excluded_path(path):
+ if not path or path == '/':
+ return True
+ for ex_path in EXCLUDED_PATHS:
+ if path.startswith(ex_path):
+ return True
diff --git a/repo_metadata/ai/utils/openai_api.py b/repo_metadata/ai/utils/openai_api.py
new file mode 100644
index 00000000..15b0d03b
--- /dev/null
+++ b/repo_metadata/ai/utils/openai_api.py
@@ -0,0 +1,65 @@
+import requests
+import logging
+import json
+import os
+
+from seafevents.app.config import get_config
+from seafevents.utils import get_opt_from_conf_or_env
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_response(response):
+ if response.status_code >= 400:
+ raise ConnectionError(response.status_code, response.text)
+ else:
+ try:
+ data = json.loads(response.text)
+ return data
+ except:
+ pass
+
+
+def get_openai_proxy_url():
+ section_name = 'AI'
+ config_dir = os.environ.get('SEAFILE_CENTRAL_CONF_DIR')
+ if config_dir:
+ config_file = os.path.join(config_dir, 'seafevents.conf')
+ else:
+ config_file = os.environ.get('EVENTS_CONFIG_FILE')
+
+ if not config_file or not os.path.exists(config_file):
+ return
+
+ config = get_config(config_file)
+
+ if not config.has_section(section_name):
+ return
+
+ openai_proxy_url = get_opt_from_conf_or_env(config, section_name, 'openai_proxy_url', 'OPENAI_PROXY_URL')
+ if not openai_proxy_url:
+ raise ValueError("OPENAI_PROXY_URL not found in the configuration file or environment variables.")
+ return openai_proxy_url
+
+
+class OpenAIAPI:
+ def __init__(self, openai_url, timeout=180):
+ self.openai_url = openai_url.rstrip('/') + '/api/v1/chat-completions/create'
+ self.timeout = timeout
+
+ def chat_completions(self, messages, temperature=0):
+ json_data = {
+ 'model': 'gpt-4o-mini',
+ 'messages': messages,
+ 'temperature': temperature
+ }
+ response = requests.post(self.openai_url, json=json_data, timeout=self.timeout)
+ data = parse_response(response)
+ try:
+ result = data['choices'][0]['message']['content']
+ except KeyError as e:
+ logger.exception(e)
+ result = None
+
+ return result
diff --git a/repo_metadata/ai/utils/sdoc2md.py b/repo_metadata/ai/utils/sdoc2md.py
new file mode 100644
index 00000000..e3aed510
--- /dev/null
+++ b/repo_metadata/ai/utils/sdoc2md.py
@@ -0,0 +1,272 @@
+from html2text import HTML2Text
+
+
+HEADER_LABEL = [
+ 'header1',
+ 'header2',
+ 'header3',
+ 'header4',
+ 'header5',
+ 'header6',
+]
+
+def _handle_text_style(json_data_text, return_null=False):
+ text = json_data_text.get('text', '')
+ pure_text = text
+ bold = json_data_text.get('bold')
+ italic = json_data_text.get('italic')
+
+ if italic:
+ text = "_%s_" % text
+ if bold:
+ text = "**%s**" % text
+
+ if (not text) and return_null:
+ text = '.'
+ return text, pure_text
+
+# sdoc 2 html dom
+# 1. header
+def _handle_header_dom(header_json, header_type):
+ output = ''
+ for child in header_json['children']:
+ if 'text' in child:
+ output += child.get('text')
+ else:
+ child_type = child.get('type')
+ if child_type == 'link':
+ output += _handle_link_dom(child)
+
+ tag = {
+ "header1": "
%s
",
+ "header2": "%s
",
+ "header3": "%s
",
+ "header4": "%s
",
+ "header5": "%s
",
+ "header6": "%s
",
+
+ }.get(header_type)
+ return tag % output
+
+# 3 list including ordered / unordered list
+def _handle_list_dom(list_json, tag='', ordered=False):
+ for list_item in list_json['children']:
+ item_eles = list_item['children']
+ text = ''
+ for lic in item_eles:
+
+ if lic.get('type') == 'unordered_list':
+ tag += _handle_list_dom(lic, '')
+ if lic.get('type') == 'ordered_list':
+ tag += _handle_list_dom(lic, '', True)
+
+ if lic.get('type') == 'paragraph':
+ for item in lic['children']:
+ if 'text' in item:
+ text += _handle_text_style(item)[0]
+ else:
+ item_type = item.get('type')
+ if item_type == 'link':
+ text_name = item['children'][0]['text']
+ text_url = item.get('href')
+ text += "%s" % (text_url, text_name)
+ tag += "%s
" % text
+ if ordered:
+ res = "" % tag
+ else:
+ res = "" % tag
+ return res
+
+# 4 checkbox
+def _handle_check_list_dom(check_list_json):
+ output = ""
+ checked = check_list_json.get('checked')
+ for child in check_list_json['children']:
+ if 'text' in child:
+ output += _handle_text_style(child)[0]
+ else:
+ child_type = child.get('type')
+ if child_type == 'link':
+ output += _handle_link_dom(child)
+
+ if checked:
+ output = "* [x] %s
" % output
+ else:
+ output = "* [ ] %s
" % output
+
+ return output
+
+# 5 blockquote
+def _handle_blockquote_dom(blockquote_json):
+ output = ""
+ for child in blockquote_json['children']:
+ child_type = child.get('type')
+ if child_type in ['ordered_list', 'unordered_list']:
+ output += _handle_list_dom(child, '', child_type == 'ordered_list')
+
+ if child_type == 'link':
+ text_name = child['children'][0]['text']
+ text_url = child.get('href')
+ output += "%s" % (text_url, text_name)
+
+ if child_type == 'paragraph':
+ output += '%s' % _handle_pagragh_dom(child)
+
+ if child_type == 'check_list_item':
+ output += '%s' % _handle_check_list_dom(child)
+
+
+ if 'text' in child:
+ text = child.get('text')
+ text_list = text.split("\n")
+            output += ''.join(['<p>%s</p>' % t for t in text_list if t.strip()])
+
+ tag = "%s
" % output
+ return tag
+
+# 6 url link
+def _handle_link_dom(link_json):
+ href = link_json.get('href')
+ link_child = link_json['children'][0]
+
+ res = "%s" % (href, link_child.get('text'))
+ return res
+
+
+# 7 pagragh
+def _handle_pagragh_dom(pagragh_json):
+ output = ''
+ for child in pagragh_json['children']:
+ if 'text' in child:
+ output += _handle_text_style(child)[0]
+ else:
+ child_type = child.get('type')
+ if child_type == 'link':
+ output += _handle_link_dom(child)
+
+
+ result = "%s
" % output
+ return result.replace("\n", "")
+
+
+def _handle_table_cell_dom(table_cell_json):
+ output = ''
+ for child in table_cell_json['children']:
+ if 'text' in child:
+ output += _handle_text_style(child)[0]
+ else:
+ child_type = child.get('type')
+ if child_type == 'link':
+ output += _handle_link_dom(child)
+
+ return output
+
+
+# html2markdown
+def handle_header(header_json, header_type):
+ md_hander = HTML2Text(bodywidth=0) # no wrapping length
+ dom = _handle_header_dom(header_json, header_type)
+ return md_hander.handle(dom)
+
+
+def handle_check_list(check_list_json):
+ md_hander = HTML2Text(bodywidth=0) # no wrapping length
+ return md_hander.handle(_handle_check_list_dom(check_list_json))
+
+
+def handle_paragraph(paragraph_json):
+ md_hander = HTML2Text(bodywidth=0) # no wrapping length
+ dom = _handle_pagragh_dom(paragraph_json)
+ return md_hander.handle(dom)
+
+
+def handle_list(json_data, ordered=False):
+ md_hander = HTML2Text(bodywidth=0) # no wrapping length
+ html = _handle_list_dom(json_data, '', ordered)
+ md = md_hander.handle(html)
+ return md
+
+
+def handle_codeblock(code_bloc_json):
+ lang = code_bloc_json.get('language', '')
+ output = ""
+ for child in code_bloc_json.get('children'):
+ if 'children' in child:
+ output += "%s\n" % child.get('children', '')[0].get('text')
+ return "```%s\n%s```" % (lang, output)
+
+
+def handle_blockquote(json_data):
+ md_hander = HTML2Text(bodywidth=0) # no wrapping length
+ html = _handle_blockquote_dom(json_data)
+ md = md_hander.handle(html)
+ return md
+
+
+def handle_table(table_json):
+ md_hander = HTML2Text(bodywidth=0) # no wrapping length
+ th_headers = ''
+ th_body = ''
+ first_table_row = table_json['children'][0]
+ other_table_rows = table_json['children'][1:]
+
+ for first_table_cell in first_table_row['children']:
+ th_headers += "%s | " % _handle_table_cell_dom(first_table_cell)
+
+ for table_row in other_table_rows:
+ td = ''
+ for table_cell in table_row['children']:
+ td += "%s | " % _handle_table_cell_dom(table_cell)
+ th_body += "%s
" % td
+
+ html = "" % (th_headers, th_body)
+ return md_hander.handle(html)
+
+
+#
+def json2md(json_data):
+ doc_type = json_data.get('type')
+ markdown_output = ''
+ if doc_type == 'title':
+ output = handle_header(json_data, 'header1')
+ markdown_output += output
+
+ if doc_type in HEADER_LABEL:
+ output = handle_header(json_data, doc_type)
+ markdown_output += output
+
+ if doc_type == 'check_list_item':
+ output = handle_check_list(json_data)
+ markdown_output += output
+
+ if doc_type == 'paragraph':
+ output = handle_paragraph(json_data)
+ markdown_output += output
+
+ if doc_type == 'code_block':
+ output = handle_codeblock(json_data)
+ markdown_output += output
+
+ if doc_type == 'table':
+ output = handle_table(json_data)
+ markdown_output += output
+
+ if doc_type == 'unordered_list':
+ output = handle_list(json_data)
+ markdown_output += output
+
+ if doc_type == 'ordered_list':
+ output = handle_list(json_data, ordered=True)
+ markdown_output += output
+
+ if doc_type == 'blockquote':
+ output = handle_blockquote(json_data)
+ markdown_output += output
+ return markdown_output
+
+def sdoc2md(json_tree):
+ results = []
+ for sub in json_tree.get('children'):
+ results.append(json2md(sub))
+ markdown_text = "\n".join(results)
+ return markdown_text
diff --git a/repo_metadata/utils.py b/repo_metadata/utils.py
index 1bedcee1..3abd08bc 100644
--- a/repo_metadata/utils.py
+++ b/repo_metadata/utils.py
@@ -2,12 +2,15 @@
import random
import math
import exifread
+import requests
+import json
from io import BytesIO
+from urllib.parse import quote as urlquote
from seafobj import commit_mgr, fs_mgr
-
-from seafevents.app.config import METADATA_FILE_TYPES
+from seaserv import seafile_api
+from seafevents.app.config import METADATA_FILE_TYPES, FILE_SERVER
def gen_fileext_type_map():
@@ -22,6 +25,24 @@ def gen_fileext_type_map():
return ext_to_type
+def gen_file_get_url(token, filename):
+ return '%s/files/%s/%s' % (FILE_SERVER, token, urlquote(filename))
+
+
+def get_file_by_path(repo_id, path):
+ file_id = seafile_api.get_file_id_by_path(repo_id, path)
+ filename = os.path.basename(path)
+ token = seafile_api.get_fileserver_access_token(
+ repo_id, file_id, 'download', username='sys_summary_sdoc', use_onetime=True
+ )
+ url = gen_file_get_url(token, filename)
+    content = requests.get(url, timeout=10).content.decode()
+
+ if content:
+ content = json.loads(content)
+ return content
+
+
FILEEXT_TYPE_MAP = gen_fileext_type_map()
@@ -99,6 +120,7 @@ def __init__(self):
self.file_type = MetadataColumn('_file_type', '_file_type', 'single-select',
{'options': gen_select_options(list(METADATA_FILE_TYPES.keys()))})
self.location = MetadataColumn('_location', '_location', 'geolocation', {'geo_format': 'lng_lat'})
+ self.summary = MetadataColumn('_summary', '_summary', 'long-text')
class MetadataColumn(object):
diff --git a/seafevent_server/request_handler.py b/seafevent_server/request_handler.py
index 83de310f..fb1de159 100644
--- a/seafevent_server/request_handler.py
+++ b/seafevent_server/request_handler.py
@@ -7,7 +7,7 @@
from seafevents.seafevent_server.task_manager import task_manager
from seafevents.seafevent_server.export_task_manager import event_export_task_manager
from seafevents.seasearch.index_task.index_task_manager import index_task_manager
-
+from seafevents.repo_metadata.ai.gen_summary import create_summary_of_sdoc_in_repo
app = Flask(__name__)
logger = logging.getLogger(__name__)
@@ -132,3 +132,24 @@ def search():
results = index_task_manager.keyword_search(query, repos, count, suffixes)
return {'results': results}, 200
+
+
+@app.route('/create-summary-of-sdoc-in-repo', methods=['POST'])
+def create_sdoc_summary():
+ is_valid, error = check_auth_token(request)
+ if not is_valid:
+ return make_response((error, 403))
+
+ try:
+ data = json.loads(request.data)
+ except Exception as e:
+ logger.exception(e)
+ return {'error_msg': 'Bad request.'}, 400
+
+ repo_id = data.get('repo_id')
+
+ if not repo_id:
+ return {'error_msg': 'repo_id invalid.'}, 400
+ create_status = create_summary_of_sdoc_in_repo(repo_id)
+
+ return create_status
From ff3f8aee8c2169bf2b7794e56d6d82058836e68e Mon Sep 17 00:00:00 2001
From: cir9no <44470218+cir9no@users.noreply.github.com>
Date: Thu, 1 Aug 2024 09:57:18 +0800
Subject: [PATCH 2/5] adjust seafevents ai conf
---
repo_metadata/ai/gen_summary.py | 9 +++++----
repo_metadata/ai/utils/openai_api.py | 16 +++++++++++-----
2 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/repo_metadata/ai/gen_summary.py b/repo_metadata/ai/gen_summary.py
index 967c04e0..e745db44 100644
--- a/repo_metadata/ai/gen_summary.py
+++ b/repo_metadata/ai/gen_summary.py
@@ -2,7 +2,7 @@
import logging
from gevent.pool import Pool
-from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI, get_openai_proxy_url
+from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI, get_llm_url
from seafevents.repo_metadata.ai.utils.sdoc2md import sdoc2md
from seafevents.repo_metadata.metadata_server_api import MetadataServerAPI
from seafevents.repo_metadata.repo_metadata import EXCLUDED_PATHS
@@ -15,13 +15,14 @@
def gen_doc_summary(content):
- openai_proxy_url = get_openai_proxy_url()
- openai = OpenAIAPI(openai_proxy_url)
+ llm_url, url_type = get_llm_url()
+ if url_type == 'proxy':
+ openai_api = OpenAIAPI(llm_url)
system_content = 'You are a document summarization expert. I need you to generate a concise summary of a document that is no longer than 40 words. The summary should capture the main points and themes of the document clearly and effectively.The output language is the same as the input language. If it seems there is no content provided for summarization, just output word: None'
system_prompt = {"role": "system", "content": system_content}
user_prompt = {"role": "user", "content": content}
messages = [system_prompt, user_prompt]
- summary = openai.chat_completions(messages)
+ summary = openai_api.chat_completions(messages)
return summary
diff --git a/repo_metadata/ai/utils/openai_api.py b/repo_metadata/ai/utils/openai_api.py
index 15b0d03b..4d079ec3 100644
--- a/repo_metadata/ai/utils/openai_api.py
+++ b/repo_metadata/ai/utils/openai_api.py
@@ -21,8 +21,10 @@ def parse_response(response):
pass
-def get_openai_proxy_url():
+def get_llm_url():
section_name = 'AI'
+ llm_url, url_type = None
+
config_dir = os.environ.get('SEAFILE_CENTRAL_CONF_DIR')
if config_dir:
config_file = os.path.join(config_dir, 'seafevents.conf')
@@ -37,10 +39,14 @@ def get_openai_proxy_url():
if not config.has_section(section_name):
return
- openai_proxy_url = get_opt_from_conf_or_env(config, section_name, 'openai_proxy_url', 'OPENAI_PROXY_URL')
- if not openai_proxy_url:
- raise ValueError("OPENAI_PROXY_URL not found in the configuration file or environment variables.")
- return openai_proxy_url
+ llm_type = get_opt_from_conf_or_env(config, section_name, 'llm_type', 'LLM_TYPE')
+ if llm_type == 'open-ai-proxy':
+ llm_url = get_opt_from_conf_or_env(config, section_name, 'openai_proxy_url', 'OPENAI_PROXY_URL')
+ url_type = 'proxy'
+
+ if not llm_url:
+ raise ValueError("llm_url not found in the configuration file or environment variables.")
+ return llm_url, url_type
class OpenAIAPI:
From 462b8b33729be8dab7a2c4617f84d3e4d2eb9b98 Mon Sep 17 00:00:00 2001
From: cir9no <44470218+cir9no@users.noreply.github.com>
Date: Thu, 1 Aug 2024 11:50:41 +0800
Subject: [PATCH 3/5] feat/meta: add summary single sdoc
---
repo_metadata/ai/gen_summary.py | 34 ++++++++++++++++++++++++----
repo_metadata/ai/utils/openai_api.py | 4 ++--
seafevent_server/request_handler.py | 28 ++++++++++++++++++++++-
3 files changed, 58 insertions(+), 8 deletions(-)
diff --git a/repo_metadata/ai/gen_summary.py b/repo_metadata/ai/gen_summary.py
index e745db44..4407a421 100644
--- a/repo_metadata/ai/gen_summary.py
+++ b/repo_metadata/ai/gen_summary.py
@@ -13,7 +13,6 @@
logger = logging.getLogger(__name__)
-
def gen_doc_summary(content):
llm_url, url_type = get_llm_url()
if url_type == 'proxy':
@@ -35,10 +34,7 @@ def create_summary_of_sdoc_in_repo(repo_id):
def process_row(row):
parent_dir = row[METADATA_TABLE.columns.parent_dir.name]
file_name = row[METADATA_TABLE.columns.file_name.name]
- if parent_dir == '/':
- path = parent_dir + file_name
- else:
- path = parent_dir + '/' + file_name
+ path = os.path.join(parent_dir, file_name)
if _is_excluded_path(path):
return
@@ -70,6 +66,34 @@ def process_row(row):
return {'success': True}
+def update_single_sdoc_summary(repo_id, file_path):
+ metadata_server_api = MetadataServerAPI('seafevents')
+ parent_dir = os.path.dirname(file_path)
+ file_name = os.path.basename(file_path)
+ _, file_ext = os.path.splitext(file_name)
+ sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` FROM `{METADATA_TABLE.name}` WHERE (`{METADATA_TABLE.columns.parent_dir.name}` = ? AND `{METADATA_TABLE.columns.file_name.name}` = ?)'
+ parameters = []
+ updated_summary_row = []
+ if file_ext == '.sdoc':
+ sdoc_content = get_file_by_path(repo_id, file_path)
+ md_content = sdoc2md(sdoc_content)
+ summary_text = gen_doc_summary(md_content)
+
+ parameters.append(parent_dir)
+ parameters.append(file_name)
+ query_result = metadata_server_api.query_rows(repo_id, sql, parameters).get('results', [])
+ row_id = query_result[0][METADATA_TABLE.columns.id.name]
+
+ updated_row = {
+ METADATA_TABLE.columns.id.name: row_id,
+ METADATA_TABLE.columns.summary.name: summary_text,
+ }
+ updated_summary_row.append(updated_row)
+ if updated_summary_row:
+ metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_row)
+ return {'success': True}
+
+
def _is_excluded_path(path):
if not path or path == '/':
return True
diff --git a/repo_metadata/ai/utils/openai_api.py b/repo_metadata/ai/utils/openai_api.py
index 4d079ec3..393569df 100644
--- a/repo_metadata/ai/utils/openai_api.py
+++ b/repo_metadata/ai/utils/openai_api.py
@@ -23,7 +23,7 @@ def parse_response(response):
def get_llm_url():
section_name = 'AI'
- llm_url, url_type = None
+ llm_url, url_type = None, None
config_dir = os.environ.get('SEAFILE_CENTRAL_CONF_DIR')
if config_dir:
@@ -41,7 +41,7 @@ def get_llm_url():
llm_type = get_opt_from_conf_or_env(config, section_name, 'llm_type', 'LLM_TYPE')
if llm_type == 'open-ai-proxy':
- llm_url = get_opt_from_conf_or_env(config, section_name, 'openai_proxy_url', 'OPENAI_PROXY_URL')
+ llm_url = get_opt_from_conf_or_env(config, section_name, 'llm_url', 'LLM_URL')
url_type = 'proxy'
if not llm_url:
diff --git a/seafevent_server/request_handler.py b/seafevent_server/request_handler.py
index fb1de159..44b1cb44 100644
--- a/seafevent_server/request_handler.py
+++ b/seafevent_server/request_handler.py
@@ -7,7 +7,8 @@
from seafevents.seafevent_server.task_manager import task_manager
from seafevents.seafevent_server.export_task_manager import event_export_task_manager
from seafevents.seasearch.index_task.index_task_manager import index_task_manager
-from seafevents.repo_metadata.ai.gen_summary import create_summary_of_sdoc_in_repo
+from seafevents.repo_metadata.ai.gen_summary import create_summary_of_sdoc_in_repo, \
+ update_single_sdoc_summary
app = Flask(__name__)
logger = logging.getLogger(__name__)
@@ -153,3 +154,28 @@ def create_sdoc_summary():
create_status = create_summary_of_sdoc_in_repo(repo_id)
return create_status
+
+
+@app.route('/update-single-sdoc-summary', methods=['POST'])
+def update_sdoc_summary():
+ is_valid, error = check_auth_token(request)
+ if not is_valid:
+ return make_response((error, 403))
+
+ try:
+ data = json.loads(request.data)
+ except Exception as e:
+ logger.exception(e)
+ return {'error_msg': 'Bad request.'}, 400
+
+ repo_id = data.get('repo_id')
+ file_path = data.get('file_path')
+
+ if not repo_id:
+ return {'error_msg': 'repo_id invalid.'}, 400
+ if not file_path:
+ return {'error_msg': 'file_path invalid.'}, 400
+
+ update_status = update_single_sdoc_summary(repo_id, file_path)
+
+ return update_status
From 574103a6d6b2a7795a276fc8cd789b717a2ea505 Mon Sep 17 00:00:00 2001
From: cir9no <44470218+cir9no@users.noreply.github.com>
Date: Thu, 1 Aug 2024 13:58:14 +0800
Subject: [PATCH 4/5] feat/meta: add input llm character limit
---
repo_metadata/ai/constants.py | 1 +
repo_metadata/ai/gen_summary.py | 5 +++--
2 files changed, 4 insertions(+), 2 deletions(-)
create mode 100644 repo_metadata/ai/constants.py
diff --git a/repo_metadata/ai/constants.py b/repo_metadata/ai/constants.py
new file mode 100644
index 00000000..c309b1dc
--- /dev/null
+++ b/repo_metadata/ai/constants.py
@@ -0,0 +1 @@
+LLM_INPUT_CHARACTERS_LIMIT = 8000
diff --git a/repo_metadata/ai/gen_summary.py b/repo_metadata/ai/gen_summary.py
index 4407a421..511107eb 100644
--- a/repo_metadata/ai/gen_summary.py
+++ b/repo_metadata/ai/gen_summary.py
@@ -4,6 +4,7 @@
from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI, get_llm_url
from seafevents.repo_metadata.ai.utils.sdoc2md import sdoc2md
+from seafevents.repo_metadata.ai.constants import LLM_INPUT_CHARACTERS_LIMIT
from seafevents.repo_metadata.metadata_server_api import MetadataServerAPI
from seafevents.repo_metadata.repo_metadata import EXCLUDED_PATHS
from seafevents.repo_metadata.utils import get_file_by_path
@@ -42,7 +43,7 @@ def process_row(row):
_, ext = os.path.splitext(file_name)
if ext == '.sdoc':
sdoc_content = get_file_by_path(repo_id, path)
- md_content = sdoc2md(sdoc_content)
+ md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
summary_text = gen_doc_summary(md_content)
if summary_text in ['None', 'none']:
return
@@ -76,7 +77,7 @@ def update_single_sdoc_summary(repo_id, file_path):
updated_summary_row = []
if file_ext == '.sdoc':
sdoc_content = get_file_by_path(repo_id, file_path)
- md_content = sdoc2md(sdoc_content)
+ md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
summary_text = gen_doc_summary(md_content)
parameters.append(parent_dir)
From 1e2f58b21cad95ebbcfd246f0184ca7d9823e4dc Mon Sep 17 00:00:00 2001
From: cir9no <44470218+cir9no@users.noreply.github.com>
Date: Thu, 1 Aug 2024 18:16:01 +0800
Subject: [PATCH 5/5] add metadata ai server
---
repo_metadata/ai/ai_server.py | 43 ++++++++++++++++++++++++++++
repo_metadata/ai/gen_summary.py | 40 +++++++++++++++-----------
repo_metadata/ai/utils/openai_api.py | 38 ++----------------------
seafevent_server/request_handler.py | 16 +++++------
seafevent_server/seafevent_server.py | 5 ++++
5 files changed, 83 insertions(+), 59 deletions(-)
create mode 100644 repo_metadata/ai/ai_server.py
diff --git a/repo_metadata/ai/ai_server.py b/repo_metadata/ai/ai_server.py
new file mode 100644
index 00000000..b80344f3
--- /dev/null
+++ b/repo_metadata/ai/ai_server.py
@@ -0,0 +1,43 @@
+import os
+import logging
+
+from seafevents.app.config import get_config
+from seafevents.utils import get_opt_from_conf_or_env
+
+
+logger = logging.getLogger(__name__)
+
+
+class RepoMetadataAIserver:
+ def __init__(self):
+ self.llm_url = None
+ # Refer to diff llm model
+ self.llm_type = None
+ # Refer to llm api key
+ self.llm_key = None
+
+ def init(self, config):
+ self._parse_config(config)
+
+ def _parse_config(self, config):
+ section_name = 'AI'
+ config_dir = os.environ.get('SEAFILE_CENTRAL_CONF_DIR')
+ if config_dir:
+ config_file = os.path.join(config_dir, 'seafevents.conf')
+ else:
+ config_file = os.environ.get('EVENTS_CONFIG_FILE')
+
+ if not config_file or not os.path.exists(config_file):
+ return
+
+ config = get_config(config_file)
+ if not config.has_section(section_name):
+ return
+
+ self.llm_type = get_opt_from_conf_or_env(config, section_name, 'llm_type', 'LLM_TYPE')
+ if self.llm_type == 'open-ai-proxy':
+ self.llm_url = get_opt_from_conf_or_env(config, section_name, 'llm_url', 'LLM_URL')
+ if not self.llm_url:
+ logger.info("llm_url not found in the configuration file or environment variables.")
+
+metadata_ai_server = RepoMetadataAIserver()
diff --git a/repo_metadata/ai/gen_summary.py b/repo_metadata/ai/gen_summary.py
index 511107eb..7b0775a4 100644
--- a/repo_metadata/ai/gen_summary.py
+++ b/repo_metadata/ai/gen_summary.py
@@ -2,8 +2,9 @@
import logging
from gevent.pool import Pool
-from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI, get_llm_url
+from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI
from seafevents.repo_metadata.ai.utils.sdoc2md import sdoc2md
+from seafevents.repo_metadata.ai.ai_server import metadata_ai_server
from seafevents.repo_metadata.ai.constants import LLM_INPUT_CHARACTERS_LIMIT
from seafevents.repo_metadata.metadata_server_api import MetadataServerAPI
from seafevents.repo_metadata.repo_metadata import EXCLUDED_PATHS
@@ -15,18 +16,23 @@
def gen_doc_summary(content):
- llm_url, url_type = get_llm_url()
- if url_type == 'proxy':
- openai_api = OpenAIAPI(llm_url)
- system_content = 'You are a document summarization expert. I need you to generate a concise summary of a document that is no longer than 40 words. The summary should capture the main points and themes of the document clearly and effectively.The output language is the same as the input language. If it seems there is no content provided for summarization, just output word: None'
- system_prompt = {"role": "system", "content": system_content}
- user_prompt = {"role": "user", "content": content}
- messages = [system_prompt, user_prompt]
- summary = openai_api.chat_completions(messages)
- return summary
-
+ llm_type = metadata_ai_server.llm_type
+ llm_url = metadata_ai_server.llm_url
-def create_summary_of_sdoc_in_repo(repo_id):
+ if llm_type == 'open-ai-proxy':
+ openai_api = OpenAIAPI(llm_url)
+ system_content = 'You are a document summarization expert. I need you to generate a concise summary of a document that is no longer than 40 words. The summary should capture the main points and themes of the document clearly and effectively.The output language is the same as the input language. If it seems there is no content provided for summarization, just output word: None'
+ system_prompt = {"role": "system", "content": system_content}
+ user_prompt = {"role": "user", "content": content}
+ messages = [system_prompt, user_prompt]
+ summary = openai_api.chat_completions(messages)
+ return summary
+ else:
+ logger.error('llm_type is not set correctly in seafevents.conf')
+ return None
+
+
+def create_summary_of_doc_in_repo(repo_id):
metadata_server_api = MetadataServerAPI('seafevents')
sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` FROM `{METADATA_TABLE.name}`'
query_result = metadata_server_api.query_rows(repo_id, sql).get('results', [])
@@ -54,7 +60,7 @@ def process_row(row):
}
updated_summary_rows.append(updated_row)
- pool = Pool(50)
+ pool = Pool(10)
logger.info(f'Start summarizing sdoc in repo {repo_id}')
for row in query_result:
pool.spawn(process_row, row)
@@ -67,7 +73,7 @@ def process_row(row):
return {'success': True}
-def update_single_sdoc_summary(repo_id, file_path):
+def update_single_doc_summary(repo_id, file_path):
metadata_server_api = MetadataServerAPI('seafevents')
parent_dir = os.path.dirname(file_path)
file_name = os.path.basename(file_path)
@@ -79,6 +85,8 @@ def update_single_sdoc_summary(repo_id, file_path):
sdoc_content = get_file_by_path(repo_id, file_path)
md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
summary_text = gen_doc_summary(md_content)
+ if summary_text in ['None', 'none']:
+ summary_text = ''
parameters.append(parent_dir)
parameters.append(file_name)
@@ -90,8 +98,8 @@ def update_single_sdoc_summary(repo_id, file_path):
METADATA_TABLE.columns.summary.name: summary_text,
}
updated_summary_row.append(updated_row)
- if updated_summary_row:
- metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_row)
+ if updated_summary_row:
+ metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_row)
return {'success': True}
diff --git a/repo_metadata/ai/utils/openai_api.py b/repo_metadata/ai/utils/openai_api.py
index 393569df..75c1643f 100644
--- a/repo_metadata/ai/utils/openai_api.py
+++ b/repo_metadata/ai/utils/openai_api.py
@@ -1,10 +1,6 @@
import requests
import logging
import json
-import os
-
-from seafevents.app.config import get_config
-from seafevents.utils import get_opt_from_conf_or_env
logger = logging.getLogger(__name__)
@@ -21,37 +17,9 @@ def parse_response(response):
pass
-def get_llm_url():
- section_name = 'AI'
- llm_url, url_type = None, None
-
- config_dir = os.environ.get('SEAFILE_CENTRAL_CONF_DIR')
- if config_dir:
- config_file = os.path.join(config_dir, 'seafevents.conf')
- else:
- config_file = os.environ.get('EVENTS_CONFIG_FILE')
-
- if not config_file or not os.path.exists(config_file):
- return
-
- config = get_config(config_file)
-
- if not config.has_section(section_name):
- return
-
- llm_type = get_opt_from_conf_or_env(config, section_name, 'llm_type', 'LLM_TYPE')
- if llm_type == 'open-ai-proxy':
- llm_url = get_opt_from_conf_or_env(config, section_name, 'llm_url', 'LLM_URL')
- url_type = 'proxy'
-
- if not llm_url:
- raise ValueError("llm_url not found in the configuration file or environment variables.")
- return llm_url, url_type
-
-
class OpenAIAPI:
- def __init__(self, openai_url, timeout=180):
- self.openai_url = openai_url.rstrip('/') + '/api/v1/chat-completions/create'
+ def __init__(self, openai_proxy_url, timeout=180):
+ self.openai_proxy_url = openai_proxy_url.rstrip('/') + '/api/v1/chat-completions/create'
self.timeout = timeout
def chat_completions(self, messages, temperature=0):
@@ -60,7 +28,7 @@ def chat_completions(self, messages, temperature=0):
'messages': messages,
'temperature': temperature
}
- response = requests.post(self.openai_url, json=json_data, timeout=self.timeout)
+ response = requests.post(self.openai_proxy_url, json=json_data, timeout=self.timeout)
data = parse_response(response)
try:
result = data['choices'][0]['message']['content']
diff --git a/seafevent_server/request_handler.py b/seafevent_server/request_handler.py
index 44b1cb44..5fb5a80f 100644
--- a/seafevent_server/request_handler.py
+++ b/seafevent_server/request_handler.py
@@ -7,8 +7,8 @@
from seafevents.seafevent_server.task_manager import task_manager
from seafevents.seafevent_server.export_task_manager import event_export_task_manager
from seafevents.seasearch.index_task.index_task_manager import index_task_manager
-from seafevents.repo_metadata.ai.gen_summary import create_summary_of_sdoc_in_repo, \
- update_single_sdoc_summary
+from seafevents.repo_metadata.ai.gen_summary import create_summary_of_doc_in_repo, \
+ update_single_doc_summary
app = Flask(__name__)
logger = logging.getLogger(__name__)
@@ -135,8 +135,8 @@ def search():
return {'results': results}, 200
-@app.route('/create-summary-of-sdoc-in-repo', methods=['POST'])
-def create_sdoc_summary():
+@app.route('/create-summary-of-doc-in-repo', methods=['POST'])
+def create_doc_summary():
is_valid, error = check_auth_token(request)
if not is_valid:
return make_response((error, 403))
@@ -151,13 +151,13 @@ def create_sdoc_summary():
if not repo_id:
return {'error_msg': 'repo_id invalid.'}, 400
- create_status = create_summary_of_sdoc_in_repo(repo_id)
+ create_status = create_summary_of_doc_in_repo(repo_id)
return create_status
-@app.route('/update-single-sdoc-summary', methods=['POST'])
-def update_sdoc_summary():
+@app.route('/update-single-doc-summary', methods=['POST'])
+def update_doc_summary():
is_valid, error = check_auth_token(request)
if not is_valid:
return make_response((error, 403))
@@ -176,6 +176,6 @@ def update_sdoc_summary():
if not file_path:
return {'error_msg': 'file_path invalid.'}, 400
- update_status = update_single_sdoc_summary(repo_id, file_path)
+ update_status = update_single_doc_summary(repo_id, file_path)
return update_status
diff --git a/seafevent_server/seafevent_server.py b/seafevent_server/seafevent_server.py
index c1a6bdfd..c61d4768 100644
--- a/seafevent_server/seafevent_server.py
+++ b/seafevent_server/seafevent_server.py
@@ -5,6 +5,8 @@
from seafevents.seafevent_server.task_manager import task_manager
from seafevents.seafevent_server.export_task_manager import event_export_task_manager
from seafevents.seasearch.index_task.index_task_manager import index_task_manager
+from seafevents.repo_metadata.ai.ai_server import metadata_ai_server
+from seafevents.app.config import ENABLE_METADATA_MANAGEMENT
class SeafEventServer(Thread):
@@ -22,6 +24,9 @@ def __init__(self, app, config):
index_task_manager.init(config)
+ if ENABLE_METADATA_MANAGEMENT:
+ metadata_ai_server.init(config)
+
def _parse_config(self, config):
if config.has_option('SEAF-EVENT-SERVER', 'host'):
self._host = config.get('SEAF-EVENT-SERVER', 'host')