From 34e22e1b23647abddbd43f85661efce8769bfc08 Mon Sep 17 00:00:00 2001
From: Michael Wan
Date: Fri, 10 Mar 2023 00:31:55 +0800
Subject: [PATCH] refactor

---
 src/FrontendService.py | 26 ++++++++++++++++----------
 src/Util.py            |  4 +++-
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/FrontendService.py b/src/FrontendService.py
index d5b320c..4f6e890 100644
--- a/src/FrontendService.py
+++ b/src/FrontendService.py
@@ -1,12 +1,13 @@
 import re
 from urllib.parse import urlparse
 
-from SemanticSearchService import BatchOpenAISemanticSearchService
 from Util import setup_logger
 
 logger = setup_logger('FootnoteService')
+
+
 class FrontendService:
     def __init__(self, config, response_text, gpt_input_text_df):
         self.config = config
@@ -14,6 +15,7 @@ def __init__(self, config, response_text, gpt_input_text_df):
         used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope']  # TODO: add url_id
         self.gpt_input_text_df = gpt_input_text_df[used_columns]
 
+
     def get_data_json(self, response_text, gpt_input_text_df):
         def create_response_json_object(text, type):
             return {"text": text, "type": type}
@@ -21,14 +23,20 @@ def create_response_json_object(text, type):
         def create_source_json_object(footnote, domain, url, title, text):
             return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text}
 
-        def get_response_json(response_text):
-            # find reference in text & re-order
+        def reorder_url_id(response_text, gpt_input_text_df):
+            # response_text: find reference in text & re-order
             url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
             url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
             for url_id, new_url_id in url_id_map.items():
                 response_text = response_text.replace(f'[{url_id}]', f'[{new_url_id}]')
 
+            # gpt_input_text_df: find reference in text & re-order
+            in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
+            in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
+            return response_text, in_scope_source_df
+
+        def get_response_json(response_text):
             response_json = []
             split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text)
 
@@ -37,12 +45,9 @@ def get_response_json(response_text):
                     response_json.append(create_response_json_object(sentence, "footnote"))
                 else:
                     response_json.append(create_response_json_object(sentence, "response"))
-            return response_json, url_id_map
+            return response_json
 
-        def get_source_json(gpt_input_text_df, url_id_map):
-            # include only sources used in response_text & remap url_id
-            in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
-            in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
+        def get_source_json(in_scope_source_df):
             in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int)
             in_scope_source_df.sort_values('docno', inplace=True)
             source_text_list = []
@@ -64,8 +69,9 @@ def get_source_json(gpt_input_text_df, url_id_map):
             source_json = sorted(source_json, key=lambda x: x['footnote'])
             return source_json, source_text
 
-        response_json, url_id_map = get_response_json(response_text)
-        source_json, source_text = get_source_json(gpt_input_text_df, url_id_map)
+        response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df)
+        response_json = get_response_json(response_text)
+        source_json, source_text = get_source_json(in_scope_source_df)
         return source_text, {'response_json': response_json, 'source_json': source_json}

diff --git a/src/Util.py b/src/Util.py
index e930b32..ee8c259 100644
--- a/src/Util.py
+++ b/src/Util.py
@@ -23,12 +23,14 @@ def setup_logger(tag):
 
 
 def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit):
-    # clean out of prompt texts
+    # clean out of prompt texts for existing [1], [2], [3]... in the source_text
     gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))
+    gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
     gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
     max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1
     gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank  # In order to get also the row slightly larger than prompt_length_limit
+    # reorder url_id with url that in scope.
     url_id_list = gpt_input_text_df['url_id'].unique()
     url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
     gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)
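
Review note: a minimal standalone sketch of what the extracted reorder_url_id helper does, reusing the regex and the DataFrame columns from the patch above. The sample response_text, URLs, and source rows are hypothetical, and the citation ids are chosen so that first-appearance order matches the new numbering.

    import re
    import pandas as pd

    # Hypothetical answer that cites source 2 first, then source 4.
    response_text = "Paris is the capital of France [2]. It hosts the Louvre [4]."

    # Hypothetical source table in the same shape as gpt_input_text_df.
    gpt_input_text_df = pd.DataFrame({
        'url_id':   [2, 3, 4],
        'url':      ['https://a.example', 'https://b.example', 'https://c.example'],
        'in_scope': [True, True, True],
    })

    # Collect citation ids in order of first appearance, then renumber them 1..N.
    url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
    url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))  # {2: 1, 4: 2}
    for url_id, new_url_id in url_id_map.items():
        response_text = response_text.replace(f'[{url_id}]', f'[{new_url_id}]')

    # Keep only cited, in-scope sources and remap their url_id to the new numbering.
    in_scope_source_df = gpt_input_text_df[
        gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']
    ].copy()
    in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)

    print(response_text)                          # "... France [1]. ... Louvre [2]."
    print(in_scope_source_df['url_id'].tolist())  # [1, 2]

With this split, get_response_json only has to cut the renumbered text on the [n] markers, and get_source_json receives the already-filtered in_scope_source_df instead of recomputing it from gpt_input_text_df and url_id_map.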
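
Similarly, a small hypothetical walk-through of post_process_gpt_input_text_df after this patch. The 100-character prompt_length_limit and the row contents are made up for illustration.

    import re
    import pandas as pd

    # Hypothetical ranked snippets; lengths chosen so the limit is crossed at rank 3.
    gpt_input_text_df = pd.DataFrame({
        'rank':   [1, 2, 3, 4],
        'url_id': [7, 7, 9, 12],
        'text':   ['a' * 40, 'b' * 30 + ' see [2]', 'c' * 50, 'd' * 60],
    })
    prompt_length_limit = 100

    # Strip pre-existing [1], [2], ... markers, then recompute len_text from the
    # cleaned text (the line added in this patch).
    gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))
    gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
    gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()

    # Keep every row up to, and including, the first one that overshoots the limit.
    max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1
    gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank
    # Ranks 1-3 end up in_scope (rank 3 is the first row past the 100-char limit); rank 4 is dropped.

    # Renumber url_id to a dense 1..N range in order of appearance: 7/9/12 become 1/2/3.
    url_id_list = gpt_input_text_df['url_id'].unique()
    url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
    gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)

Recomputing len_text after the re.sub keeps cumsum_len_text consistent with the cleaned text, so the in_scope cutoff is measured against what is actually placed in the prompt.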