Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: init sdoc summary when add summary column #354

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
METADATA_SERVER_URL = getattr(seahub_settings, 'METADATA_SERVER_URL', '')
ENABLE_METADATA_MANAGEMENT = getattr(seahub_settings, 'ENABLE_METADATA_MANAGEMENT', False)
METADATA_FILE_TYPES = getattr(seahub_settings, 'METADATA_FILE_TYPES', {})
FILE_SERVER = getattr(seahub_settings, 'FILE_SERVER_ROOT', '')
except ImportError:
logger.critical("Can not import seahub settings.")
raise RuntimeError("Can not import seahub settings.")
Expand Down
43 changes: 43 additions & 0 deletions repo_metadata/ai/ai_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os
import logging

from seafevents.app.config import get_config
from seafevents.utils import get_opt_from_conf_or_env


logger = logging.getLogger(__name__)


class RepoMetadataAIserver:
    """Holds LLM connection settings parsed from the seafevents configuration."""

    def __init__(self):
        # Endpoint URL of the LLM proxy (only set for the open-ai-proxy type)
        self.llm_url = None
        # Which LLM backend type is configured
        self.llm_type = None
        # API key for the LLM backend
        self.llm_key = None

    def init(self, config):
        self._parse_config(config)

    def _parse_config(self, config):
        """Populate llm_* attributes from the [AI] section of seafevents.conf.

        NOTE(review): the *config* argument is ignored; settings are re-read
        from the seafevents.conf file located via environment variables —
        confirm this is intentional.
        """
        section = 'AI'
        conf_dir = os.environ.get('SEAFILE_CENTRAL_CONF_DIR')
        if conf_dir:
            conf_path = os.path.join(conf_dir, 'seafevents.conf')
        else:
            conf_path = os.environ.get('EVENTS_CONFIG_FILE')

        # Bail out quietly when no readable config file can be located.
        if not conf_path or not os.path.exists(conf_path):
            return

        parsed = get_config(conf_path)
        if not parsed.has_section(section):
            return

        self.llm_type = get_opt_from_conf_or_env(parsed, section, 'llm_type', 'LLM_TYPE')
        if self.llm_type != 'open-ai-proxy':
            return

        self.llm_url = get_opt_from_conf_or_env(parsed, section, 'llm_url', 'LLM_URL')
        if not self.llm_url:
            logger.info("llm_url not found in the configuration file or environment variables.")


metadata_ai_server = RepoMetadataAIserver()
1 change: 1 addition & 0 deletions repo_metadata/ai/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Maximum number of characters of document text sent to the LLM as
# summarization input; longer documents are truncated to this length.
LLM_INPUT_CHARACTERS_LIMIT = 8000
111 changes: 111 additions & 0 deletions repo_metadata/ai/gen_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import os
import logging
from gevent.pool import Pool

from seafevents.repo_metadata.ai.utils.openai_api import OpenAIAPI
from seafevents.repo_metadata.ai.utils.sdoc2md import sdoc2md
from seafevents.repo_metadata.ai.ai_server import metadata_ai_server
from seafevents.repo_metadata.ai.constants import LLM_INPUT_CHARACTERS_LIMIT
from seafevents.repo_metadata.metadata_server_api import MetadataServerAPI
from seafevents.repo_metadata.repo_metadata import EXCLUDED_PATHS
from seafevents.repo_metadata.utils import get_file_by_path
from seafevents.repo_metadata.utils import METADATA_TABLE


logger = logging.getLogger(__name__)


def gen_doc_summary(content):
    """Ask the configured LLM to summarize *content*.

    Returns the summary text on success, or None when llm_type is not
    configured as 'open-ai-proxy'.
    """
    ai_type = metadata_ai_server.llm_type

    # Only the open-ai-proxy backend is supported; anything else is a
    # configuration error.
    if ai_type != 'open-ai-proxy':
        logger.error('llm_type is not set correctly in seafevents.conf')
        return None

    api = OpenAIAPI(metadata_ai_server.llm_url)
    system_content = 'You are a document summarization expert. I need you to generate a concise summary of a document that is no longer than 40 words. The summary should capture the main points and themes of the document clearly and effectively.The output language is the same as the input language. If it seems there is no content provided for summarization, just output word: None'
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": content},
    ]
    return api.chat_completions(messages)


def create_summary_of_doc_in_repo(repo_id):
    """Generate and store an LLM summary for every .sdoc file in *repo_id*.

    Queries all rows from the metadata table, summarizes each .sdoc file
    concurrently (gevent pool of 10 greenlets), then writes all summaries
    back in a single batch update.

    Returns:
        {'success': True} when the run completes.
    """
    metadata_server_api = MetadataServerAPI('seafevents')
    sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` FROM `{METADATA_TABLE.name}`'
    query_result = metadata_server_api.query_rows(repo_id, sql).get('results', [])
    updated_summary_rows = []

    def process_row(row):
        # Summarize one metadata row; appends the result to updated_summary_rows.
        parent_dir = row[METADATA_TABLE.columns.parent_dir.name]
        file_name = row[METADATA_TABLE.columns.file_name.name]
        path = os.path.join(parent_dir, file_name)
        if _is_excluded_path(path):
            return

        # Bug fix: only .sdoc files have a summary generated; previously a
        # non-.sdoc row could reach the update step with summary_text unbound.
        _, ext = os.path.splitext(file_name)
        if ext != '.sdoc':
            return

        sdoc_content = get_file_by_path(repo_id, path)
        md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
        summary_text = gen_doc_summary(md_content)
        # Skip failed generations (None/empty) as well as the model's
        # explicit "no content" answer ('None'/'none').
        if not summary_text or summary_text in ('None', 'none'):
            return

        updated_summary_rows.append({
            METADATA_TABLE.columns.id.name: row[METADATA_TABLE.columns.id.name],
            METADATA_TABLE.columns.summary.name: summary_text,
        })

    pool = Pool(10)
    logger.info(f'Start summarizing sdoc in repo {repo_id}')
    for row in query_result:
        pool.spawn(process_row, row)

    pool.join()

    if updated_summary_rows:
        metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, updated_summary_rows)
    logger.info(f'Finish summarizing sdoc in repo {repo_id}')
    return {'success': True}


def update_single_doc_summary(repo_id, file_path):
    """Regenerate the stored summary for the single file at *file_path*.

    Non-.sdoc paths are ignored.  A failed generation (None) or the model's
    explicit "no content" answer ('None'/'none') clears the stored summary.

    Returns:
        {'success': True} when the run completes (including no-op cases).
    """
    file_name = os.path.basename(file_path)
    _, file_ext = os.path.splitext(file_name)
    # Bug fix: previously a non-.sdoc path fell through with summary_text
    # unbound and ran the placeholder query with an empty parameter list.
    if file_ext != '.sdoc':
        return {'success': True}

    metadata_server_api = MetadataServerAPI('seafevents')
    parent_dir = os.path.dirname(file_path)

    sdoc_content = get_file_by_path(repo_id, file_path)
    md_content = sdoc2md(sdoc_content)[0:LLM_INPUT_CHARACTERS_LIMIT]
    summary_text = gen_doc_summary(md_content)
    # Normalize "no summary" results (including a failed generation) to ''.
    if not summary_text or summary_text in ('None', 'none'):
        summary_text = ''

    sql = f'SELECT `{METADATA_TABLE.columns.id.name}`, `{METADATA_TABLE.columns.parent_dir.name}`, `{METADATA_TABLE.columns.file_name.name}` FROM `{METADATA_TABLE.name}` WHERE (`{METADATA_TABLE.columns.parent_dir.name}` = ? AND `{METADATA_TABLE.columns.file_name.name}` = ?)'
    parameters = [parent_dir, file_name]
    query_result = metadata_server_api.query_rows(repo_id, sql, parameters).get('results', [])
    # Bug fix: guard against a missing metadata row; the original indexed
    # query_result[0] unconditionally and raised IndexError.
    if not query_result:
        return {'success': True}

    updated_row = {
        METADATA_TABLE.columns.id.name: query_result[0][METADATA_TABLE.columns.id.name],
        METADATA_TABLE.columns.summary.name: summary_text,
    }
    metadata_server_api.update_rows(repo_id, METADATA_TABLE.id, [updated_row])
    return {'success': True}


def _is_excluded_path(path):
if not path or path == '/':
return True
for ex_path in EXCLUDED_PATHS:
if path.startswith(ex_path):
return True
39 changes: 39 additions & 0 deletions repo_metadata/ai/utils/openai_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import requests
import logging
import json


logger = logging.getLogger(__name__)


def parse_response(response):
    """Decode an HTTP response body as JSON.

    Args:
        response: a requests-style response with .status_code and .text.

    Returns:
        The decoded JSON object, or None when the body is not valid JSON.

    Raises:
        ConnectionError: for any HTTP status code >= 400.
    """
    if response.status_code >= 400:
        raise ConnectionError(response.status_code, response.text)
    try:
        return json.loads(response.text)
    except ValueError:
        # Bug fix: the original bare `except: pass` swallowed every
        # exception (even KeyboardInterrupt) and returned None silently.
        # json.JSONDecodeError subclasses ValueError.
        logger.warning('Failed to decode response body as JSON: %s', response.text[:200])
        return None


class OpenAIAPI:
    """Minimal client for an OpenAI-compatible chat-completions proxy."""

    def __init__(self, openai_proxy_url, timeout=180):
        # Normalize the base URL and append the proxy's completion endpoint.
        self.openai_proxy_url = openai_proxy_url.rstrip('/') + '/api/v1/chat-completions/create'
        self.timeout = timeout

    def chat_completions(self, messages, temperature=0):
        """POST *messages* to the proxy and return the assistant's reply text.

        Args:
            messages: list of {'role': ..., 'content': ...} chat messages.
            temperature: sampling temperature forwarded to the model.

        Returns:
            The reply content string, or None when the response is missing
            the expected structure.
        """
        json_data = {
            'model': 'gpt-4o-mini',
            'messages': messages,
            'temperature': temperature
        }
        response = requests.post(self.openai_proxy_url, json=json_data, timeout=self.timeout)
        data = parse_response(response)
        try:
            result = data['choices'][0]['message']['content']
        except (TypeError, KeyError, IndexError) as e:
            # Bug fix: parse_response may return None (subscripting raises
            # TypeError) and 'choices' may be an empty list (IndexError);
            # the original caught only KeyError and crashed on those.
            logger.exception(e)
            result = None

        return result
Loading
Loading