diff --git a/playground/test_OpenAI_Embedding.py b/playground/test_OpenAI_Embedding.py
new file mode 100644
index 0000000..789d6dc
--- /dev/null
+++ b/playground/test_OpenAI_Embedding.py
@@ -0,0 +1,103 @@
+import os
+
+import openai
+import pandas as pd
+import yaml
+from openai.embeddings_utils import get_embedding, cosine_similarity
+
+from Util import get_project_root
+
+BASE_MODEL = "text-embedding-ada-002"  # default embedding of faiss-openai
+
+
+def search_using_cosine_similarity(df, query):
+    # NOTE: expects 'embedding' and 'page' columns (the csv variant of text_df)
+    query_embedding = get_embedding(query, engine=BASE_MODEL)
+    df["similarity"] = df['embedding'].apply(lambda x: cosine_similarity(x, query_embedding))
+
+    results = df.sort_values("similarity", ascending=False, ignore_index=True)
+
+    k = 5
+    results = results.head(k)
+    sources = []
+    for i in range(k):
+        sources.append({'Page ' + str(results.iloc[i]['page']): results.iloc[i]['text'][:150] + '...'})
+    print(sources)
+    return results
+
+
+def compute_embeddings(text, model=BASE_MODEL):
+    print(f'compute_embeddings() text: {text}')
+    text = text.replace("\n", " ")
+    return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
+
+
+def search_similar(df: pd.DataFrame, target_text, n=3):
+    print(f'search_similar() text: {target_text}')
+    embedding = compute_embeddings(target_text, model=BASE_MODEL)
+    df['similarities'] = df['embedding'].apply(lambda x: cosine_similarity(x, embedding))
+    res = df.sort_values('similarities', ascending=False).head(n)
+    return res, df
+
+
+def compute_embeddings_2(text_df, model=BASE_MODEL, chunk_size=1000):
+    print(f'compute_embeddings_2() len(text_df): {len(text_df)}')
+    text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " "))
+    texts = text_df['text'].tolist()
+    embeddings = []
+    for i in range(0, len(texts), chunk_size):
+        response = openai.Embedding.create(
+            input=texts[i: i + chunk_size], engine=model
+        )
+        embeddings += [r["embedding"] for r in response["data"]]
+    text_df['embedding'] = embeddings
+    return text_df
+
+
+if __name__ == '__main__':
+    # text_df = pd.read_csv(os.path.join(get_project_root(), 'src/text_df.csv'))
+    texts = [
+        "Discover the world of delicious beans with our premium selection.",
+        "Try our savory bean soup recipe for a delicious and nutritious meal.",
+        "Our roasted coffee beans are carefully selected for their rich and delicious flavor.",
+        "Beans are not only delicious, but also a great source of protein and dietary fiber.",
+        "Looking for a delicious vegan meal? Try our spicy black bean burger recipe.",
+
+        "The sky is blue and the sun is shining today.",
+        "I need to go grocery shopping after work to pick up some milk and bread.",
+        "Did you hear about the new movie that just came out? It's supposed to be really good.",
+        "I'm planning a trip to Europe next summer and I'm so excited.",
+        "My cat keeps meowing at me for no reason and it's driving me crazy.",
+    ]
+    text_df = pd.DataFrame({'text': texts, 'docno': range(len(texts))})
+    print(text_df.shape)
+
+    with open(os.path.join(get_project_root(), 'src/config/config.yaml')) as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+    openai.api_key = config.get('openai_api').get('api_key')
+
+    # text_df = compute_embeddings_2(text_df)
+    # result_df = search_using_cosine_similarity(text_df, 'what is chatgpt?')
+    # print(result_df)
+
+    search_text = 'delicious beans'
+    # deliberately long query (one sentence repeated ~90 times in the original) to stress the embedding call
+    search_text = 'Discover the world of delicious beans with our premium selection ' * 90
+
+    from pyinstrument import Profiler
+
+    profiler = Profiler()
+    profiler.start()
+    print("Sequential call mode:")
+    text_df['embedding'] = text_df['text'].apply(lambda x: compute_embeddings(x, model=BASE_MODEL))
+    res, text_df = search_similar(text_df, search_text, n=3)
+    print(res)
+    profiler.stop()
+    profiler.print()
+
+    profiler = Profiler()
+    profiler.start()
+    print("Batch call mode:")
+    text_df = compute_embeddings_2(text_df)
+    res, text_df = search_similar(text_df, search_text, n=3)
+    print(res)
+    profiler.stop()
+    profiler.print()
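Note on the two profiled modes: the sequential path issues one API round-trip per row, while compute_embeddings_2 packs up to chunk_size texts into a single openai.Embedding.create call, so N texts cost ceil(N / chunk_size) requests. For reference, a numpy-only sketch of what the cosine_similarity helper computes when ranking (cosine_sim is a stand-in for illustration, not the library function):

```python
import numpy as np

def cosine_sim(a, b):
    # cosine similarity of two embedding vectors: dot(a, b) / (|a| * |b|)
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_sim([1.0, 0.0], [1.0, 1.0]))  # ~0.7071 (vectors 45 degrees apart)
```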
diff --git a/requirements.txt b/requirements.txt
index 0dda05b..5473ef8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,11 @@ Flask==2.2.3
 requests==2.28.2
 gunicorn==20.1.0
 
-# nlp
+# embedding
+matplotlib==3.7.1
+plotly==5.13.1
+scipy==1.10.1
+scikit-learn==1.2.1
 
 # doc extraction
 python-docx==0.8.11
@@ -18,11 +22,3 @@ python-pptx==0.6.21
 # html extraction
 beautifulsoup4==4.11.2
 trafilatura==1.4.1
-
-# doc indexer1
-# python-terrier==0.9.2
-
-# doc indexer2
-faiss-cpu==1.7.3
-langchain==0.0.95
-
diff --git a/src/FootnoteService.py b/src/FootnoteService.py
deleted file mode 100644
index f6f1e98..0000000
--- a/src/FootnoteService.py
+++ /dev/null
@@ -1,154 +0,0 @@
-from urllib.parse import urlparse
-
-import pandas as pd
-
-from SemanticSearchService import SemanticSearchService
-from Util import setup_logger, split_sentences_from_paragraph
-
-logger = setup_logger('FootnoteService')
-
-
-class FootnoteService:
-    def __init__(self, config, response_text, gpt_input_text_df, semantic_search_service: SemanticSearchService):
-        self.config = config
-        self.response_text = response_text
-        used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope']  # TODO: add url_id
-        self.gpt_input_text_df = gpt_input_text_df[used_columns]
-        self.semantic_search_service = semantic_search_service
-
-        if self.config.get('semantic_search').get('provider') == 'pyterrier':
-            import pyterrier as pt
-            if not pt.started():
-                pt.init()
-
-    def extract_sentences_from_paragraph(self):
-        # TODO: currently only support English
-        sentences = split_sentences_from_paragraph(self.response_text)
-        response_df = pd.DataFrame(sentences, columns=['response_text_sentence'])
-        return response_df
-
-    def get_footnote_from_sentences(self):
-        def get_footnote_result_sentence_dict(sentence, docno, rank, score, url_unique_ids, url, url_ids, source_sentence):
-            return {
-                'sentence': sentence,
-                'docno': docno,
-                'rank': rank,
-                'score': score,
-                'url_unique_ids': url_unique_ids,
-                'url': url,
-                'url_ids': url_ids,
-                'source_sentence': source_sentence
-            }
-
-        logger.info(f'FootnoteService.get_footnote_from_sentences()')
-
-        response_sentences_df = self.extract_sentences_from_paragraph()
-        if not self.config.get('search_option').get('is_use_source'):
-            footnote_result_list = []
-            for index, row in response_sentences_df.iterrows():
-                footnote_result_sentence_dict = get_footnote_result_sentence_dict(row["response_text_sentence"], [], [], [], [], [], [], [])
-                footnote_result_list.append(footnote_result_sentence_dict)
-            return footnote_result_list, pd.DataFrame()
-
-        in_scope_source_df = self.gpt_input_text_df[self.gpt_input_text_df['in_scope']]
-        source_index = self.semantic_search_service.index_text_df(in_scope_source_df, 'source_index')
-
-        footnote_result_list = []
-        for index, row in response_sentences_df.iterrows():
-            response_text_sentence = row["response_text_sentence"]
-            logger.info(f'  [S{index + 1}] {response_text_sentence}')
-            # print(f'[S{index + 1}] {response_text_sentence}')
-
-            cleaned_response_text_sentence = self.semantic_search_service.clean_sentence_to_avoid_lexical_error(response_text_sentence)
-            result_df = self.semantic_search_service.use_index_to_search(source_index, cleaned_response_text_sentence)
-            result_df = result_df.merge(in_scope_source_df, on="docno", how="left")[['docno', 'rank', 'score', 'url', 'url_id', 'text']]
-
-            if self.semantic_search_service.provider == 'pyterrier':
-
SCORE_THRESHOLD = 5 - result_within_scope_df = result_df[result_df['score'] >= SCORE_THRESHOLD] - elif self.semantic_search_service.provider in ['faiss-openai', 'faiss-huggingface']: - # with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also - # print(f'sentence {index}') - # print(result_df[['text', 'url_id', 'score']]) - SCORE_THRESHOLD = 0.6 - top_k = 1 - # # distance for faiss (lower is closer) - # result_within_scope_df = result_df[result_df['score'] <= SCORE_THRESHOLD].head(top_k) - result_within_scope_df = result_df.head(top_k) - else: - NotImplementedError(f'Unsupported semantic search provider: {self.semantic_search_service.provider}') - - footnote_result_sentence_dict = get_footnote_result_sentence_dict(response_text_sentence, - result_within_scope_df['docno'].tolist(), - result_within_scope_df['rank'].tolist(), - result_within_scope_df['score'].tolist(), - sorted(result_within_scope_df['url_id'].unique().tolist()), - result_within_scope_df['url'].tolist(), - result_within_scope_df['url_id'].tolist(), - result_within_scope_df['text'].tolist() - ) - footnote_result_list.append(footnote_result_sentence_dict) - return footnote_result_list, in_scope_source_df - - def pretty_print_footnote_result_list(self, footnote_result_list, gpt_input_text_df): - def create_response_json_object(text, type): - return {"text": text, "type": type} - - def create_source_json_object(footnote, domain, url, title, text): - return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text} - - url_id_map = {} # to reassign url_id as per appearance order - - # footnote text and json processing - response_text_with_footnote = '' - response_json = [] - - for footnote_result in footnote_result_list: - footnote_print = [] - response_json.append(create_response_json_object(footnote_result["sentence"], "response")) - - for url_id in footnote_result['url_unique_ids']: - - if url_id not in url_id_map: - url_id_map[url_id] = len(url_id_map) + 1 - - footnote_print += [f'[{url_id_map[url_id]}]'] - response_json.append(create_response_json_object(f'[{url_id_map[url_id]}]', "footnote")) - - response_text_with_footnote += f'{footnote_result["sentence"]}{" " + "".join(sorted(footnote_print)) if len(footnote_print) > 0 else ""} ' - - # source text and json processing - in_scope_source_df = gpt_input_text_df[gpt_input_text_df['in_scope']].copy() - in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int) - in_scope_source_df.sort_values('docno', inplace=True) - - source_text_list = [] - source_json = [] - - source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True) - for index, row in source_url_df.iterrows(): - if row['url_id'] not in url_id_map: - continue - - url_text = '' - url_text += f"[{url_id_map[row['url_id']]}] {row['url']}\n" - - for index, row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows(): - url_text += f" {row['text']}\n" - - source_text_list.append(url_text) - - domain_name = urlparse(row['url']).netloc.replace('www.', '') - source_json.append(create_source_json_object(f"[{url_id_map[row['url_id']]}]", domain_name, row['url'], row['name'], row['snippet'])) - - source_text = ''.join(sorted(source_text_list)) - source_json = sorted(source_json, key=lambda x: x['footnote']) - - print('===========Response text (ref):============') - print(response_text_with_footnote) - print() - print('===========Source 
text:============')
-        print(source_text)
-        print()
-
-        return response_text_with_footnote, source_text, {'response_json': response_json, 'source_json': source_json}
diff --git a/src/FrontendService.py b/src/FrontendService.py
new file mode 100644
index 0000000..4f6e890
--- /dev/null
+++ b/src/FrontendService.py
@@ -0,0 +1,82 @@
+import re
+from urllib.parse import urlparse
+
+from Util import setup_logger
+
+logger = setup_logger('FrontendService')
+
+
+class FrontendService:
+    def __init__(self, config, response_text, gpt_input_text_df):
+        self.config = config
+        self.response_text = response_text
+        used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope']  # TODO: add url_id
+        self.gpt_input_text_df = gpt_input_text_df[used_columns]
+
+    def get_data_json(self, response_text, gpt_input_text_df):
+        def create_response_json_object(text, type):
+            return {"text": text, "type": type}
+
+        def create_source_json_object(footnote, domain, url, title, text):
+            return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text}
+
+        def reorder_url_id(response_text, gpt_input_text_df):
+            # response_text: find citations in the text & re-number them by first appearance
+            url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
+            url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
+
+            # single-pass substitution: sequential str.replace() would collide
+            # (e.g. {3: 1, 1: 2} rewrites the freshly produced [1] again)
+            response_text = re.sub(r'\[([0-9]+)\]', lambda m: f'[{url_id_map[int(m.group(1))]}]', response_text)
+
+            # gpt_input_text_df: keep only referenced, in-scope rows & re-number them the same way
+            in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
+            in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
+            return response_text, in_scope_source_df
+
+        def get_response_json(response_text):
+            response_json = []
+            split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text)
+
+            for sentence in split_sentence:
+                if sentence.startswith('[') and sentence.endswith(']'):
+                    response_json.append(create_response_json_object(sentence, "footnote"))
+                else:
+                    response_json.append(create_response_json_object(sentence, "response"))
+            return response_json
+
+        def get_source_json(in_scope_source_df):
+            in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int)
+            in_scope_source_df.sort_values('docno', inplace=True)
+            source_text_list = []
+            source_json = []
+            source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True)
+            for index, row in source_url_df.iterrows():
+                url_text = ''
+                url_text += f"[{row['url_id']}] {row['url']}\n"
+
+                # inner loop variables renamed so they do not shadow the outer `row`
+                for _, text_row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows():
+                    url_text += f"  {text_row['text']}\n"
+
+                source_text_list.append(url_text)
+
+                domain_name = urlparse(row['url']).netloc.replace('www.', '')
+                source_json.append(create_source_json_object(f"[{row['url_id']}]", domain_name, row['url'], row['name'], row['snippet']))
+            source_text = ''.join(sorted(source_text_list))
+
+            source_json = sorted(source_json, key=lambda x: x['footnote'])
+            return source_json, source_text
+
+        response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df)
+        response_json = get_response_json(response_text)
+        source_json, source_text = get_source_json(in_scope_source_df)
+
+        return source_text, {'response_json': response_json, 'source_json': source_json}
+
+
+if __name__ == '__main__':
+    sentence = "According to the sources [1] [2], it is predicted that the world's natural gas reserves will last about 52.8 years with the current rate of production. [13] TestTest."
+    split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', sentence)
+    print(split_sentence)
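A standalone sketch of what the two regexes above do (sample text invented; the single-pass re.sub mirrors the collision-safe renumbering):

```python
import re

response_text = "Gas reserves may last ~52.8 years [3]. Demand is rising [1] [3]."

# renumber citations by first appearance: findall yields ['3', '1', '3'] -> map {3: 1, 1: 2}
url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
renumbered = re.sub(r'\[([0-9]+)\]', lambda m: f'[{url_id_map[int(m.group(1))]}]', response_text)
print(renumbered)
# Gas reserves may last ~52.8 years [1]. Demand is rising [2] [1].

# split into alternating response/footnote tokens for the frontend
print(re.findall(r'\[[0-9]+\]|[^\[\]]+', renumbered))
# ['Gas reserves may last ~52.8 years ', '[1]', '. Demand is rising ', '[2]', ' ', '[1]', '.']
```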
diff --git a/src/LLMService.py b/src/LLMService.py
index beba34e..2f03230 100644
--- a/src/LLMService.py
+++ b/src/LLMService.py
@@ -53,6 +53,39 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame):
 {context_str}
 Question: {search_text}
 Answer:
+"""
+        return prompt
+
+    def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame):
+        if not self.config.get('search_option').get('is_use_source'):
+            prompt = \
+                f"""
+Instructions: Write a comprehensive reply to the given query.
+If the context is insufficient, reply "I cannot answer".
+Query: {search_text}
+"""
+            return prompt
+
+        logger.info(f"OpenAIService.get_prompt_v3. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}")
+        context_str = ""
+        url_id_list = gpt_input_text_df['url_id'].unique()
+        for url_id in url_id_list:
+            context_str += f"Source ({url_id})\n"
+            for index, row in gpt_input_text_df[(gpt_input_text_df['url_id'] == url_id) & gpt_input_text_df['in_scope']].iterrows():
+                context_str += f"{row['text']}\n"
+            context_str += "\n"
+        prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit')
+        context_str = context_str[:prompt_length_limit]
+        prompt = \
+            f"""
+Web search result:
+{context_str}
+
+Instructions: Using the provided web search results, write a comprehensive reply to the given query.
+Make sure to cite results using [number] notation after the reference.
+If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.
+If the context is insufficient, reply "I cannot answer because my reference sources don't have related info".
+Query: {search_text} """ return prompt diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py index 9c74f3a..867a3cc 100644 --- a/src/SearchGPTService.py +++ b/src/SearchGPTService.py @@ -6,9 +6,9 @@ import yaml from BingService import BingService -from FootnoteService import FootnoteService +from FrontendService import FrontendService from LLMService import LLMServiceFactory -from SemanticSearchService import SemanticSearchServiceFactory +from SemanticSearchService import BatchOpenAISemanticSearchService from Util import setup_logger, post_process_gpt_input_text_df, check_result_cache_exists, load_result_from_cache, save_result_cache, check_max_number_of_cache, get_project_root from text_extract.doc import support_doc_type, doc_extract_svc_map from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc @@ -45,8 +45,6 @@ def overide_config_by_query_string(self, ui_overriden_config): self.config['goose_ai_api']['model'] = value else: raise Exception(f"llm_model is not supported for llm_service_provider: {self.config['llm_service']['provider']}") - elif key == 'semantic_search_provider': - self.config['semantic_search']['provider'] = value else: # invalid query_string but not throwing exception first pass @@ -58,9 +56,8 @@ def validate_config(self): assert self.config['openai_api']['api_key'], 'openai_api_key is required' def _prompt(self, search_text, text_df, cache_path=None): - semantic_search_service_factory = SemanticSearchServiceFactory() - semantic_search_service = semantic_search_service_factory.create_semantic_search_service(self.config) - gpt_input_text_df = semantic_search_service.retrieve_result_by_search_text_from_text_df(search_text, text_df) + semantic_search_service = BatchOpenAISemanticSearchService(self.config) + gpt_input_text_df = semantic_search_service.search_related_source(text_df, search_text) gpt_input_text_df = post_process_gpt_input_text_df(gpt_input_text_df, self.config.get('openai_api').get('prompt').get('prompt_length_limit')) llm_service_provider = self.config.get('llm_service').get('provider') @@ -71,7 +68,7 @@ def _prompt(self, search_text, text_df, cache_path=None): prompt, response_text = cache['prompt'], cache['response_text'] else: llm_service = LLMServiceFactory.create_llm_service(self.config) - prompt = llm_service.get_prompt(search_text, gpt_input_text_df) + prompt = llm_service.get_prompt_v3(search_text, gpt_input_text_df) response_text = llm_service.call_api(prompt) llm_config = self.config.get(f'{llm_service_provider}_api').copy() @@ -81,42 +78,45 @@ def _prompt(self, search_text, text_df, cache_path=None): # check whether the number of cache exceeds the limit check_max_number_of_cache(cache_path, self.config.get('cache').get('max_number_of_cache')) + frontend_service = FrontendService(self.config, response_text, gpt_input_text_df) + source_text, data_json = frontend_service.get_data_json(response_text, gpt_input_text_df) + print('===========Prompt:============') print(prompt) print('===========Search:============') print(search_text) - print('===========Response text (raw):============') + print('===========Response text:============') print(response_text) + print('===========Source text:============') + print(source_text) - footnote_service = FootnoteService(self.config, response_text, gpt_input_text_df, semantic_search_service) - footnote_result_list, in_scope_source_df = footnote_service.get_footnote_from_sentences() - response_text_with_footnote, source_text, data_json = 
footnote_service.pretty_print_footnote_result_list(footnote_result_list, gpt_input_text_df) - - return response_text, response_text_with_footnote, source_text, data_json + return response_text, source_text, data_json def _extract_bing_text_df(self, search_text, cache_path): # BingSearch using search_text # check if bing search result is cached and load if exists bing_text_df = None - if self.config['search_option']['is_enable_bing_search']: - if self.config.get('cache').get('is_enable_cache') and check_result_cache_exists(cache_path, search_text, 'bing_search'): - logger.info(f"BingService.load_result_from_cache. search_text: {search_text}, cache_path: {cache_path}") - cache = load_result_from_cache(cache_path, search_text, 'bing_search') - bing_text_df = cache['bing_text_df'] - else: - bing_service = BingService(self.config) - website_df = bing_service.call_bing_search_api(search_text) - bing_text_df = bing_service.call_urls_and_extract_sentences_concurrent(website_df) - - bing_search_config = self.config.get('bing_search').copy() - bing_search_config.pop('subscription_key') # delete api_key from config to avoid saving it to .cache - save_result_cache(cache_path, search_text, 'bing_search', bing_text_df=bing_text_df, config=bing_search_config) + if not self.config['search_option']['is_use_source'] or not self.config['search_option']['is_enable_bing_search']: + return bing_text_df + + if self.config.get('cache').get('is_enable_cache') and check_result_cache_exists(cache_path, search_text, 'bing_search'): + logger.info(f"BingService.load_result_from_cache. search_text: {search_text}, cache_path: {cache_path}") + cache = load_result_from_cache(cache_path, search_text, 'bing_search') + bing_text_df = cache['bing_text_df'] + else: + bing_service = BingService(self.config) + website_df = bing_service.call_bing_search_api(search_text) + bing_text_df = bing_service.call_urls_and_extract_sentences_concurrent(website_df) + + bing_search_config = self.config.get('bing_search').copy() + bing_search_config.pop('subscription_key') # delete api_key from config to avoid saving it to .cache + save_result_cache(cache_path, search_text, 'bing_search', bing_text_df=bing_text_df, config=bing_search_config) return bing_text_df def _extract_doc_text_df(self, bing_text_df): # DocSearch using doc_search_path # bing_text_df is used for doc_id arrangement - if not self.config['search_option']['is_enable_doc_search']: + if not self.config['search_option']['is_use_source'] or not self.config['search_option']['is_enable_doc_search']: return pd.DataFrame([]) files_grabbed = list() for doc_type in support_doc_type: diff --git a/src/SemanticSearchService.py b/src/SemanticSearchService.py index db68221..6812a1e 100644 --- a/src/SemanticSearchService.py +++ b/src/SemanticSearchService.py @@ -1,166 +1,204 @@ -import os -from abc import ABC, abstractmethod -from datetime import datetime - +import openai import pandas as pd -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.embeddings import HuggingFaceEmbeddings -from langchain.vectorstores import FAISS +from openai.embeddings_utils import cosine_similarity from Util import setup_logger +# from abc import ABC, abstractmethod +# from langchain.embeddings import HuggingFaceEmbeddings +# from langchain.vectorstores import FAISS +BASE_MODEL = "text-embedding-ada-002" # default embedding of faiss-openai logger = setup_logger('SemanticSearchService') -class SemanticSearchService(ABC): +# class SemanticSearchService(ABC): +# def __init__(self, config): +# 
self.cwd = os.getcwd() +# self.config = config +# self.index = None +# self.provider = '' +# +# @abstractmethod +# def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): +# pass +# +# @abstractmethod +# def retrieve_result_by_search_text_from_text_df(self, search_text, text_df) -> pd.DataFrame: +# pass +# +# @staticmethod +# def use_index_to_search(index, search_text): +# pass +# +# def clean_sentence_to_avoid_lexical_error(self, text): +# """ +# Clean sentence. Pyterrier will throw error if the search query contains some special characters shown below +# jnius.JavaException: JVM exception occurred: Failed to process qid 1 ' +# ' -- Lexical error at line 3, column 90. Encountered: "\'" (39), after : "" org.terrier.querying.parser.QueryParserException +# python-BaseException +# :return: +# """ +# # TODO: good way to clean +# return text.replace("'", "").replace("?", "").replace("!", "").replace(":", "").replace(";", "") +# +# +# class PyTerrierService(SemanticSearchService): +# def __init__(self, config): +# super().__init__(config) +# self.provider = 'pyterrier' +# +# def create_index_column_in_df(self, text_df: pd.DataFrame) -> pd.DataFrame: +# """ +# add a docno column (primary key / index column) to the dataframe +# :param text_df: +# :return: text_df with docno column +# """ +# text_df["docno"] = text_df.index + 1 +# text_df["docno"] = text_df["docno"].astype(str) +# return text_df +# +# def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): +# """ +# index the text_df to get a indexref +# :param text_df: +# required columns: +# docno: as primary key for later process to retrieve back the row +# text: the text to be indexed +# :return: +# indexref: +# """ +# import pyterrier as pt +# if not pt.started(): +# pt.init() +# datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") +# df_indexer_path = os.path.join(self.cwd, f".index/{indexref_folder_name}_" + datetime_str) +# if not os.path.exists(df_indexer_path): +# os.makedirs(df_indexer_path) +# +# # TODO: using overwrite? +# # Currently I cannot use overwrite=True to directly overwrite the existing index folder +# # when I index for the second time, it will throw error. Therefore need to create a new folder +# # I also cannot delete it in the last step, because the process is still running and consuming the index files inside. +# +# # TODO: using a better wmodel than Tf? +# pd_indexer = pt.DFIndexer(df_indexer_path, wmodel="Tf") +# indexref = pd_indexer.index(text_df["text"], text_df["docno"]) +# return indexref +# +# @staticmethod +# def use_index_to_search(index, search_text): +# result_df: pd.DataFrame = pt.BatchRetrieve(index).search(search_text) +# return result_df +# +# def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): +# logger.info(f"PyTerrierService.retrieve_result_by_search_text_from_text_df. 
search_text: {search_text}, text_df.shape: {text_df.shape}") +# text_df = self.create_index_column_in_df(text_df) +# index = self.index_text_df(text_df, 'df_index') +# result_df: pd.DataFrame = self.use_index_to_search(index, search_text) +# return result_df.merge(text_df, on="docno", how="left") +# +# +# class LangChainFAISSService(SemanticSearchService): +# def __init__(self, config): +# super().__init__(config) +# self.provider = self.config.get('semantic_search').get('provider') +# self.embeddings = None +# if self.provider == 'faiss-openai': +# self.embeddings = OpenAIEmbeddings(openai_api_key=self.config.get('openai_api').get('api_key')) +# elif self.provider == 'faiss-huggingface': +# self.embeddings = HuggingFaceEmbeddings() +# else: +# raise Exception(f"provider {self.provider} is not supported") +# +# def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): +# logger.info(f"LangChainFAISSService.index_text_df. text_df.shape: {text_df.shape}") +# text_df['docno'] = text_df.index.tolist() +# texts, docno_list = text_df['text'].tolist(), text_df['docno'].tolist() +# docno_dict = [{'docno': docno} for docno in docno_list] +# faiss_index = FAISS.from_texts(texts, self.embeddings, metadatas=docno_dict) +# return faiss_index +# +# @staticmethod +# def use_index_to_search(index, search_text): +# index: FAISS +# # k: Number of Documents to return. Defaults to 4. +# # fetch_k: Number of Documents to fetch to pass to MMR algorithm. +# +# # k = 15 +# # # Cons: you can only pick k, but you cannot filter by score +# # tuples = index.similarity_search_with_score(search_text, k=k) +# # docno_list = [t[0].metadata['docno'] for t in tuples] +# # score_list = [t[1] for t in tuples] +# # result_df = pd.DataFrame({'docno': docno_list, 'score': score_list}) +# # result_df['rank'] = result_df.index +# +# k = 30 +# docs = index.max_marginal_relevance_search(search_text, k=k, fetch_k=999) +# docno_list = [doc.metadata['docno'] for doc in docs] +# result_df = pd.DataFrame({'docno': docno_list}) +# result_df['rank'] = result_df.index +# result_df['score'] = 999 +# +# return result_df +# +# def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): +# logger.info(f"LangChainFAISSService.retrieve_result_by_search_text_from_text_df. 
search_text: {search_text}, text_df.shape: {text_df.shape}") +# faiss_index = self.index_text_df(text_df, '') +# result_df = self.use_index_to_search(faiss_index, search_text) +# return result_df.merge(text_df, on="docno", how="left") +# +# +# class SemanticSearchServiceFactory: +# @staticmethod +# def create_semantic_search_service(config) -> SemanticSearchService: +# provider = config.get('semantic_search').get('provider') +# if provider == 'pyterrier': +# return PyTerrierService(config) +# elif provider in ['faiss-openai', 'faiss-huggingface']: +# return LangChainFAISSService(config) +# else: +# logger.error(f'SemanticSearchService for {provider} is not yet implemented.') +# raise NotImplementedError(f'SemanticSearchService - {provider} - is not supported') + + +class BatchOpenAISemanticSearchService: def __init__(self, config): - self.cwd = os.getcwd() self.config = config - self.index = None - self.provider = '' - - @abstractmethod - def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): - pass - - @abstractmethod - def retrieve_result_by_search_text_from_text_df(self, search_text, text_df) -> pd.DataFrame: - pass + openai.api_key = config.get('openai_api').get('api_key') @staticmethod - def use_index_to_search(index, search_text): - pass - - def clean_sentence_to_avoid_lexical_error(self, text): - """ - Clean sentence. Pyterrier will throw error if the search query contains some special characters shown below - jnius.JavaException: JVM exception occurred: Failed to process qid 1 ' - ' -- Lexical error at line 3, column 90. Encountered: "\'" (39), after : "" org.terrier.querying.parser.QueryParserException - python-BaseException - :return: - """ - # TODO: good way to clean - return text.replace("'", "").replace("?", "").replace("!", "").replace(":", "").replace(";", "") - - -class PyTerrierService(SemanticSearchService): - def __init__(self, config): - super().__init__(config) - self.provider = 'pyterrier' - - def create_index_column_in_df(self, text_df: pd.DataFrame) -> pd.DataFrame: - """ - add a docno column (primary key / index column) to the dataframe - :param text_df: - :return: text_df with docno column - """ - text_df["docno"] = text_df.index + 1 - text_df["docno"] = text_df["docno"].astype(str) - return text_df - - def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): - """ - index the text_df to get a indexref - :param text_df: - required columns: - docno: as primary key for later process to retrieve back the row - text: the text to be indexed - :return: - indexref: - """ - import pyterrier as pt - if not pt.started(): - pt.init() - datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") - df_indexer_path = os.path.join(self.cwd, f".index/{indexref_folder_name}_" + datetime_str) - if not os.path.exists(df_indexer_path): - os.makedirs(df_indexer_path) - - # TODO: using overwrite? - # Currently I cannot use overwrite=True to directly overwrite the existing index folder - # when I index for the second time, it will throw error. Therefore need to create a new folder - # I also cannot delete it in the last step, because the process is still running and consuming the index files inside. - - # TODO: using a better wmodel than Tf? 
- pd_indexer = pt.DFIndexer(df_indexer_path, wmodel="Tf") - indexref = pd_indexer.index(text_df["text"], text_df["docno"]) - return indexref + def batch_call_embeddings(texts, chunk_size=1000): + texts = [text.replace("\n", " ") for text in texts] + embeddings = [] + for i in range(0, len(texts), chunk_size): + response = openai.Embedding.create( + input=texts[i: i + chunk_size], engine=BASE_MODEL + ) + embeddings += [r["embedding"] for r in response["data"]] + return embeddings @staticmethod - def use_index_to_search(index, search_text): - result_df: pd.DataFrame = pt.BatchRetrieve(index).search(search_text) - return result_df - - def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): - logger.info(f"PyTerrierService.retrieve_result_by_search_text_from_text_df. search_text: {search_text}, text_df.shape: {text_df.shape}") - text_df = self.create_index_column_in_df(text_df) - index = self.index_text_df(text_df, 'df_index') - result_df: pd.DataFrame = self.use_index_to_search(index, search_text) - return result_df.merge(text_df, on="docno", how="left") - - -class LangChainFAISSService(SemanticSearchService): - def __init__(self, config): - super().__init__(config) - self.provider = self.config.get('semantic_search').get('provider') - self.embeddings = None - if self.provider == 'faiss-openai': - self.embeddings = OpenAIEmbeddings(openai_api_key=self.config.get('openai_api').get('api_key')) - elif self.provider == 'faiss-huggingface': - self.embeddings = HuggingFaceEmbeddings() - else: - raise Exception(f"provider {self.provider} is not supported") - - def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): - logger.info(f"LangChainFAISSService.index_text_df. text_df.shape: {text_df.shape}") - text_df['docno'] = text_df.index.tolist() - texts, docno_list = text_df['text'].tolist(), text_df['docno'].tolist() - docno_dict = [{'docno': docno} for docno in docno_list] - faiss_index = FAISS.from_texts(texts, self.embeddings, metadatas=docno_dict) - return faiss_index - - @staticmethod - def use_index_to_search(index, search_text): - index: FAISS - # k: Number of Documents to return. Defaults to 4. - # fetch_k: Number of Documents to fetch to pass to MMR algorithm. 
-
-        # k = 15
-        # # Cons: you can only pick k, but you cannot filter by score
-        # tuples = index.similarity_search_with_score(search_text, k=k)
-        # docno_list = [t[0].metadata['docno'] for t in tuples]
-        # score_list = [t[1] for t in tuples]
-        # result_df = pd.DataFrame({'docno': docno_list, 'score': score_list})
-        # result_df['rank'] = result_df.index
-
-        k = 30
-        docs = index.max_marginal_relevance_search(search_text, k=k, fetch_k=999)
-        docno_list = [doc.metadata['docno'] for doc in docs]
-        result_df = pd.DataFrame({'docno': docno_list})
-        result_df['rank'] = result_df.index
-        result_df['score'] = 999
+    def compute_embeddings_for_text_df(text_df: pd.DataFrame):
+        """Compute embeddings for text_df and return it with an 'embedding' column added."""
+        logger.info(f'compute_embeddings_for_text_df() len(text_df): {len(text_df)}')
+        text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " "))
+        text_df['embedding'] = BatchOpenAISemanticSearchService.batch_call_embeddings(text_df['text'].tolist())
+        return text_df
+
+    def search_related_source(self, text_df: pd.DataFrame, target_text, n=30):
+        if not self.config.get('search_option').get('is_use_source'):
+            col = ['name', 'url', 'url_id', 'snippet', 'text', 'similarities', 'rank', 'docno']
+            return pd.DataFrame(columns=col)
+
+        logger.info(f'search_related_source() target_text: {target_text}')
+        embedding = BatchOpenAISemanticSearchService.batch_call_embeddings([target_text])[0]
+        text_df = BatchOpenAISemanticSearchService.compute_embeddings_for_text_df(text_df)
+        text_df['similarities'] = text_df['embedding'].apply(lambda x: cosine_similarity(x, embedding))
+        result_df = text_df.sort_values('similarities', ascending=False).head(n)
+        result_df['rank'] = range(1, len(result_df) + 1)
+        result_df['docno'] = range(1, len(result_df) + 1)
         return result_df
 
-    def retrieve_result_by_search_text_from_text_df(self, search_text, text_df):
-        logger.info(f"LangChainFAISSService.retrieve_result_by_search_text_from_text_df. search_text: {search_text}, text_df.shape: {text_df.shape}")
-        faiss_index = self.index_text_df(text_df, '')
-        result_df = self.use_index_to_search(faiss_index, search_text)
-        return result_df.merge(text_df, on="docno", how="left")
-
-
-class SemanticSearchServiceFactory:
-    @staticmethod
-    def create_semantic_search_service(config) -> SemanticSearchService:
-        provider = config.get('semantic_search').get('provider')
-        if provider == 'pyterrier':
-            return PyTerrierService(config)
-        elif provider in ['faiss-openai', 'faiss-huggingface']:
-            return LangChainFAISSService(config)
-        else:
-            logger.error(f'SemanticSearchService for {provider} is not yet implemented.')
-            raise NotImplementedError(f'SemanticSearchService - {provider} - is not supported')
-
 
 if __name__ == '__main__':
     pass
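The replacement service is one cosine-similarity pass over batched ada-002 embeddings; no FAISS index is built. A minimal usage sketch (not from the repo: the config dict shape follows config.yaml, the two-row corpus is invented, and a real OpenAI key is needed to actually run it):

```python
import pandas as pd
from SemanticSearchService import BatchOpenAISemanticSearchService

config = {'openai_api': {'api_key': 'sk-...'},          # placeholder key
          'search_option': {'is_use_source': True}}
service = BatchOpenAISemanticSearchService(config)

corpus = pd.DataFrame({
    'text': ['Beans are a great source of protein.', 'The sky is blue today.'],
    'name': ['doc-a', 'doc-b'], 'url': ['http://a', 'http://b'],
    'url_id': [1, 2], 'snippet': ['', ''],
})

# one embedding call for the query + one chunked batch call for the corpus,
# then rows ranked by cosine similarity (descending)
result_df = service.search_related_source(corpus, 'delicious beans', n=2)
print(result_df[['text', 'similarities', 'rank', 'docno']])
```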
diff --git a/src/Util.py b/src/Util.py
index 735f64f..ee8c259 100644
--- a/src/Util.py
+++ b/src/Util.py
@@ -23,22 +23,17 @@ def setup_logger(tag):
 
 def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit):
-    # clean out of prompt texts
+    # strip pre-existing [1], [2], [3]... citation marks from the source texts, then budget the prompt
+    gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))
+    gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
     gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
     max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1
     gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank  # In order to get also the row slightly larger than prompt_length_limit
-
-    # display_df = gpt_input_text_df[gpt_input_text_df['in_scope']]
-    # # after cleaning, display text
-    # display_df.sort_values(by=['docno'], inplace=True)
-    # distinct_urls = list(display_df['url'].unique())
-    # # for list with index
-    # for index, url in enumerate(distinct_urls):
-    #     print('---------------------')
-    #     print(f'[{index+1}] {url}')
-    #     for index, row in display_df[display_df['url'] == url].iterrows():
-    #         print(f'  {row["text"]}')
+    # re-number url_id by order of first appearance
+    url_id_list = gpt_input_text_df['url_id'].unique()
+    url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
+    gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)
     return gpt_input_text_df
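A toy run of the new trimming + renumbering logic (all values invented; the limit is chosen so row 3 straddles the budget and row 4 falls out of scope):

```python
import re
import pandas as pd

df = pd.DataFrame({'text': ['alpha [1] ' * 4, 'beta ' * 8, 'gamma ' * 7, 'delta ' * 6],
                   'rank': [1, 2, 3, 4], 'url_id': [7, 3, 7, 9]})

df['text'] = df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))  # strip pre-existing [n]
df['len_text'] = df['text'].apply(len)
df['cumsum_len_text'] = df['len_text'].cumsum()   # 28, 68, 110, 146

prompt_length_limit = 100
max_rank = df[df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1   # 2 + 1
df['in_scope'] = df['rank'] <= max_rank   # rows 1-3 kept; row 3 slightly exceeds the limit

url_id_list = df['url_id'].unique()                                   # [7, 3, 9]
url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))   # {7: 1, 3: 2, 9: 3}
df['url_id'] = df['url_id'].map(url_id_map)
print(df[['rank', 'url_id', 'cumsum_len_text', 'in_scope']])
```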
diff --git a/src/config/config.yaml b/src/config/config.yaml
index 031a164..e1ac482 100644
--- a/src/config/config.yaml
+++ b/src/config/config.yaml
@@ -32,10 +32,3 @@ cache: # .cache result for efficiency and consistency
   is_enable_cache: false
   path: .cache
   max_number_of_cache: 0
-semantic_search:
-  # provider list:
-  # faiss-openai (default): Use OpenAIEmbedding. fast, good accuracy but need openai key (cost)
-  # faiss-huggingface: Use HuggingFaceEmbedding. slow, good accuracy but free (need to download)
-  # pyterrier: Use PyTerrier Tf (term frequency). fast, fair accuracy and free (need to install pyterrier and java)
-  provider: faiss-openai
-
diff --git a/src/gradio_app.py b/src/gradio_app.py
index 4a8f0ee..f83000d 100644
--- a/src/gradio_app.py
+++ b/src/gradio_app.py
@@ -5,8 +5,8 @@
 def query_and_get_answer(search_text):
     search_gpt_service = SearchGPTService()
-    response_text, response_text_with_footnote, source_text, data_json = search_gpt_service.query_and_get_answer(search_text)
-    return response_text, response_text_with_footnote, source_text
+    response_text, source_text, data_json = search_gpt_service.query_and_get_answer(search_text)
+    return response_text, source_text
 
 
 demo = gr.Interface(fn=query_and_get_answer,
diff --git a/src/main.py b/src/main.py
index 7e685a3..33d409a 100644
--- a/src/main.py
+++ b/src/main.py
@@ -4,6 +4,6 @@
     search_text = 'the source of dark energy'
 
     search_gpt_service = SearchGPTService()
-    response_text, response_text_with_footnote, source_text, data_json = search_gpt_service.query_and_get_answer(search_text)
+    response_text, source_text, data_json = search_gpt_service.query_and_get_answer(search_text)
     print()
diff --git a/src/website/static/index.js b/src/website/static/index.js
index 282b0d6..d3c5e9c 100644
--- a/src/website/static/index.js
+++ b/src/website/static/index.js
@@ -14,8 +14,7 @@ $(document).ready(function () {
                 openai_api_key: $('#openai_api_key').val(),
                 is_use_source: $('input[name="is_use_source"]')[0].checked,
                 llm_service_provider: $('#llm_service_provider').val(),
-                llm_model: $('#llm_model').val(),
-                semantic_search_provider: $('#semantic_search_provider').val()
+                llm_model: $('#llm_model').val()
             },
             success: function (response) {
                 $('#' + response.id).html(response.html)
diff --git a/src/website/templates/base.html b/src/website/templates/base.html
index 43778fe..3f1b899 100644
--- a/src/website/templates/base.html
+++ b/src/website/templates/base.html
@@ -30,7 +30,7 @@
-            SearchGPT 20230306 Version ( Github ). Your feedback will help us to improve
+            SearchGPT 20230309 Version ( Github ). Your feedback will help us to improve
diff --git a/src/website/templates/index.html b/src/website/templates/index.html
index 38edf22..6b764b0 100644
--- a/src/website/templates/index.html
+++ b/src/website/templates/index.html
@@ -63,17 +63,6 @@
                 >text-curie-001
-            [removed: the semantic_search_provider dropdown markup (11 lines); HTML tags lost in extraction]
diff --git a/src/website/views.py b/src/website/views.py index 71e4d57..7108002 100644 --- a/src/website/views.py +++ b/src/website/views.py @@ -31,14 +31,12 @@ def index_page(): 'is_use_source': request.values.get('is_use_source'), 'llm_service_provider': request.values.get('llm_service_provider'), 'llm_model': request.values.get('llm_model'), - 'semantic_search_provider': request.values.get('semantic_search_provider'), } logger.info(f"GET ui_overriden_config: {ui_overriden_config}") if search_text is not None: search_gpt_service = SearchGPTService(ui_overriden_config) - response_text, response_text_with_footnote, source_text, data_json = search_gpt_service.query_and_get_answer(search_text) - # response_text, response_text_with_footnote, source_text, data_json = "test", "test", "test", {'response_json': [], 'source_json': []} + _, _, data_json = search_gpt_service.query_and_get_answer(search_text) except Exception as e: error = str(e)
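Taken together, the refactored call path now returns a 3-tuple, as in src/main.py. A usage sketch (config and API keys must be set up as before; the printed slices just illustrate the shape of the new return values):

```python
from SearchGPTService import SearchGPTService

search_gpt_service = SearchGPTService()
response_text, source_text, data_json = search_gpt_service.query_and_get_answer('the source of dark energy')

print(response_text)                    # answer text with renumbered [n] citations
print(source_text)                      # "[n] url" blocks followed by supporting sentences
print(data_json['response_json'][:3])   # alternating response/footnote tokens for the UI
print(data_json['source_json'][:1])     # footnote/domain/url/title/text records
```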