From 226c9d6acba001eed09f4f751254dd17cd459930 Mon Sep 17 00:00:00 2001 From: Michael Wan Date: Wed, 8 Mar 2023 20:44:49 +0800 Subject: [PATCH 01/14] POC batch call --- playground/test_OpenAI_Embedding.py | 103 ++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 playground/test_OpenAI_Embedding.py diff --git a/playground/test_OpenAI_Embedding.py b/playground/test_OpenAI_Embedding.py new file mode 100644 index 0000000..a074d1e --- /dev/null +++ b/playground/test_OpenAI_Embedding.py @@ -0,0 +1,103 @@ +import os + +import openai +import pandas as pd +import yaml +from openai.embeddings_utils import get_embedding, cosine_similarity + +from Util import get_project_root + +BASE_MODEL = "text-embedding-ada-002" # default embedding of faiss-openai + + +def search_using_cosine_similarity(df, query): + query_embedding = get_embedding(query, engine=BASE_MODEL) + df["similarity"] = df['embeddings'].apply(lambda x: cosine_similarity(x, query_embedding)) + + results = df.sort_values("similarity", ascending=False, ignore_index=True) + + k = 5 + results = results.head(k) + global sources + sources = [] + for i in range(k): + sources.append({'Page ' + str(results.iloc[i]['page']): results.iloc[i]['text'][:150] + '...'}) + print(sources) + return results.head(k) + + +def compute_embeddings(text, model="text-embedding-ada-002"): + print(f'compute_embeddings() text: {text}') + text = text.replace("\n", " ") + return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding'] + + +def search_similar(df: pd.DataFrame, target_text, n=3, pprint=True): + print(f'search_similar() text: {target_text}') + embedding = compute_embeddings(target_text, model=BASE_MODEL) + df['similarities'] = df['embedding'].apply(lambda x: cosine_similarity(x, embedding)) + res = df.sort_values('similarities', ascending=False).head(n) + return res, df + + +def compute_embeddings_2(df, model=BASE_MODEL, chunk_size=1000): + print(f'compute_embeddings_2() len(texts): {len(df)}') + text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " ")) + embeddings = [] + for i in range(0, len(texts), chunk_size): + response = openai.Embedding.create( + input=texts[i: i + chunk_size], engine=model + ) + embeddings += [r["embedding"] for r in response["data"]] + text_df['embedding'] = embeddings + return text_df + + +if __name__ == '__main__': + # text_df = pd.read_csv(os.path.join(get_project_root(), 'src/text_df.csv')) + texts = [ + "Discover the world of delicious beans with our premium selection.", + "Try our savory bean soup recipe for a delicious and nutritious meal.", + "Our roasted coffee beans are carefully selected for their rich and delicious flavor.", + "Beans are not only delicious, but also a great source of protein and dietary fiber.", + "Looking for a delicious vegan meal? Try our spicy black bean burger recipe.", + + "The sky is blue and the sun is shining today.", + "I need to go grocery shopping after work to pick up some milk and bread.", + "Did you hear about the new movie that just came out? 
It's supposed to be really good.", + "I'm planning a trip to Europe next summer and I'm so excited.", + "My cat keeps meowing at me for no reason and it's driving me crazy.", + ] + text_df = pd.DataFrame({'text': texts, 'docno': range(len(texts))}) + print(text_df.shape) + + with open(os.path.join(get_project_root(), 'src/config/config.yaml')) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + openai.api_key = config.get('openai_api').get('api_key') + + # text_df = compute_embeddings(text_df) + # result_df = search_using_cosine_similarity(text_df, 'what is chatgpt?') + # print(result_df) + + search_text = 'delicious beans' + search_text = 'Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans 
with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection Discover the world of delicious beans with our premium selection ' + + from pyinstrument import Profiler + + profiler = Profiler() + profiler.start() + print("Sequential call mode:") + text_df['embedding'] = text_df['text'].apply(lambda x: compute_embeddings(x, model=BASE_MODEL)) + res, text_df = search_similar(text_df, search_text, n=3) + print(res) + profiler.stop() + profiler.print() + + profiler = Profiler() + profiler.start() + print("Batch call mode:") + text_df = compute_embeddings_2(text_df) + res, text_df = 
search_similar(text_df, search_text, n=3) + print(res) + profiler.stop() + profiler.print() From e4372c920016442f8113a4129e038af70e967ed7 Mon Sep 17 00:00:00 2001 From: Michael Wan Date: Wed, 8 Mar 2023 21:11:15 +0800 Subject: [PATCH 02/14] run pass for search source. (TODO: footnote) --- playground/test_OpenAI_Embedding.py | 2 +- requirements.txt | 6 +- src/FootnoteService.py | 4 +- src/SearchGPTService.py | 7 +- src/SemanticSearchService.py | 327 +++++++++++++++------------- src/Util.py | 11 - 6 files changed, 191 insertions(+), 166 deletions(-) diff --git a/playground/test_OpenAI_Embedding.py b/playground/test_OpenAI_Embedding.py index a074d1e..789d6dc 100644 --- a/playground/test_OpenAI_Embedding.py +++ b/playground/test_OpenAI_Embedding.py @@ -40,7 +40,7 @@ def search_similar(df: pd.DataFrame, target_text, n=3, pprint=True): return res, df -def compute_embeddings_2(df, model=BASE_MODEL, chunk_size=1000): +def compute_embeddings_2(text_df, model=BASE_MODEL, chunk_size=1000): print(f'compute_embeddings_2() len(texts): {len(df)}') text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " ")) embeddings = [] diff --git a/requirements.txt b/requirements.txt index 0dda05b..ac91aed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,11 @@ Flask==2.2.3 requests==2.28.2 gunicorn==20.1.0 -# nlp +# embedding +matplotlib==3.7.1 +plotly==5.13.1 +scipy==1.10.1 +scikit-learn==1.2.1 # doc extraction python-docx==0.8.11 diff --git a/src/FootnoteService.py b/src/FootnoteService.py index f6f1e98..be5010e 100644 --- a/src/FootnoteService.py +++ b/src/FootnoteService.py @@ -2,14 +2,14 @@ import pandas as pd -from SemanticSearchService import SemanticSearchService +from SemanticSearchService import BatchOpenAISemanticSearchService from Util import setup_logger, split_sentences_from_paragraph logger = setup_logger('FootnoteService') class FootnoteService: - def __init__(self, config, response_text, gpt_input_text_df, semantic_search_service: SemanticSearchService): + def __init__(self, config, response_text, gpt_input_text_df, semantic_search_service: BatchOpenAISemanticSearchService): self.config = config self.response_text = response_text used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope'] # TODO: add url_id diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py index 9c74f3a..e318483 100644 --- a/src/SearchGPTService.py +++ b/src/SearchGPTService.py @@ -8,7 +8,7 @@ from BingService import BingService from FootnoteService import FootnoteService from LLMService import LLMServiceFactory -from SemanticSearchService import SemanticSearchServiceFactory +from SemanticSearchService import BatchOpenAISemanticSearchService from Util import setup_logger, post_process_gpt_input_text_df, check_result_cache_exists, load_result_from_cache, save_result_cache, check_max_number_of_cache, get_project_root from text_extract.doc import support_doc_type, doc_extract_svc_map from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc @@ -58,9 +58,8 @@ def validate_config(self): assert self.config['openai_api']['api_key'], 'openai_api_key is required' def _prompt(self, search_text, text_df, cache_path=None): - semantic_search_service_factory = SemanticSearchServiceFactory() - semantic_search_service = semantic_search_service_factory.create_semantic_search_service(self.config) - gpt_input_text_df = semantic_search_service.retrieve_result_by_search_text_from_text_df(search_text, text_df) + semantic_search_service = BatchOpenAISemanticSearchService(self.config) + 
gpt_input_text_df = semantic_search_service.search_related_source(text_df, search_text) gpt_input_text_df = post_process_gpt_input_text_df(gpt_input_text_df, self.config.get('openai_api').get('prompt').get('prompt_length_limit')) llm_service_provider = self.config.get('llm_service').get('provider') diff --git a/src/SemanticSearchService.py b/src/SemanticSearchService.py index db68221..af92b65 100644 --- a/src/SemanticSearchService.py +++ b/src/SemanticSearchService.py @@ -1,166 +1,199 @@ -import os -from abc import ABC, abstractmethod -from datetime import datetime - +import openai import pandas as pd -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.embeddings import HuggingFaceEmbeddings -from langchain.vectorstores import FAISS +from openai.embeddings_utils import cosine_similarity from Util import setup_logger +# from abc import ABC, abstractmethod +# from langchain.embeddings import HuggingFaceEmbeddings +# from langchain.vectorstores import FAISS +BASE_MODEL = "text-embedding-ada-002" # default embedding of faiss-openai logger = setup_logger('SemanticSearchService') -class SemanticSearchService(ABC): +# class SemanticSearchService(ABC): +# def __init__(self, config): +# self.cwd = os.getcwd() +# self.config = config +# self.index = None +# self.provider = '' +# +# @abstractmethod +# def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): +# pass +# +# @abstractmethod +# def retrieve_result_by_search_text_from_text_df(self, search_text, text_df) -> pd.DataFrame: +# pass +# +# @staticmethod +# def use_index_to_search(index, search_text): +# pass +# +# def clean_sentence_to_avoid_lexical_error(self, text): +# """ +# Clean sentence. Pyterrier will throw error if the search query contains some special characters shown below +# jnius.JavaException: JVM exception occurred: Failed to process qid 1 ' +# ' -- Lexical error at line 3, column 90. Encountered: "\'" (39), after : "" org.terrier.querying.parser.QueryParserException +# python-BaseException +# :return: +# """ +# # TODO: good way to clean +# return text.replace("'", "").replace("?", "").replace("!", "").replace(":", "").replace(";", "") +# +# +# class PyTerrierService(SemanticSearchService): +# def __init__(self, config): +# super().__init__(config) +# self.provider = 'pyterrier' +# +# def create_index_column_in_df(self, text_df: pd.DataFrame) -> pd.DataFrame: +# """ +# add a docno column (primary key / index column) to the dataframe +# :param text_df: +# :return: text_df with docno column +# """ +# text_df["docno"] = text_df.index + 1 +# text_df["docno"] = text_df["docno"].astype(str) +# return text_df +# +# def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): +# """ +# index the text_df to get a indexref +# :param text_df: +# required columns: +# docno: as primary key for later process to retrieve back the row +# text: the text to be indexed +# :return: +# indexref: +# """ +# import pyterrier as pt +# if not pt.started(): +# pt.init() +# datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") +# df_indexer_path = os.path.join(self.cwd, f".index/{indexref_folder_name}_" + datetime_str) +# if not os.path.exists(df_indexer_path): +# os.makedirs(df_indexer_path) +# +# # TODO: using overwrite? +# # Currently I cannot use overwrite=True to directly overwrite the existing index folder +# # when I index for the second time, it will throw error. 
Therefore need to create a new folder +# # I also cannot delete it in the last step, because the process is still running and consuming the index files inside. +# +# # TODO: using a better wmodel than Tf? +# pd_indexer = pt.DFIndexer(df_indexer_path, wmodel="Tf") +# indexref = pd_indexer.index(text_df["text"], text_df["docno"]) +# return indexref +# +# @staticmethod +# def use_index_to_search(index, search_text): +# result_df: pd.DataFrame = pt.BatchRetrieve(index).search(search_text) +# return result_df +# +# def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): +# logger.info(f"PyTerrierService.retrieve_result_by_search_text_from_text_df. search_text: {search_text}, text_df.shape: {text_df.shape}") +# text_df = self.create_index_column_in_df(text_df) +# index = self.index_text_df(text_df, 'df_index') +# result_df: pd.DataFrame = self.use_index_to_search(index, search_text) +# return result_df.merge(text_df, on="docno", how="left") +# +# +# class LangChainFAISSService(SemanticSearchService): +# def __init__(self, config): +# super().__init__(config) +# self.provider = self.config.get('semantic_search').get('provider') +# self.embeddings = None +# if self.provider == 'faiss-openai': +# self.embeddings = OpenAIEmbeddings(openai_api_key=self.config.get('openai_api').get('api_key')) +# elif self.provider == 'faiss-huggingface': +# self.embeddings = HuggingFaceEmbeddings() +# else: +# raise Exception(f"provider {self.provider} is not supported") +# +# def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): +# logger.info(f"LangChainFAISSService.index_text_df. text_df.shape: {text_df.shape}") +# text_df['docno'] = text_df.index.tolist() +# texts, docno_list = text_df['text'].tolist(), text_df['docno'].tolist() +# docno_dict = [{'docno': docno} for docno in docno_list] +# faiss_index = FAISS.from_texts(texts, self.embeddings, metadatas=docno_dict) +# return faiss_index +# +# @staticmethod +# def use_index_to_search(index, search_text): +# index: FAISS +# # k: Number of Documents to return. Defaults to 4. +# # fetch_k: Number of Documents to fetch to pass to MMR algorithm. +# +# # k = 15 +# # # Cons: you can only pick k, but you cannot filter by score +# # tuples = index.similarity_search_with_score(search_text, k=k) +# # docno_list = [t[0].metadata['docno'] for t in tuples] +# # score_list = [t[1] for t in tuples] +# # result_df = pd.DataFrame({'docno': docno_list, 'score': score_list}) +# # result_df['rank'] = result_df.index +# +# k = 30 +# docs = index.max_marginal_relevance_search(search_text, k=k, fetch_k=999) +# docno_list = [doc.metadata['docno'] for doc in docs] +# result_df = pd.DataFrame({'docno': docno_list}) +# result_df['rank'] = result_df.index +# result_df['score'] = 999 +# +# return result_df +# +# def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): +# logger.info(f"LangChainFAISSService.retrieve_result_by_search_text_from_text_df. 
search_text: {search_text}, text_df.shape: {text_df.shape}") +# faiss_index = self.index_text_df(text_df, '') +# result_df = self.use_index_to_search(faiss_index, search_text) +# return result_df.merge(text_df, on="docno", how="left") +# +# +# class SemanticSearchServiceFactory: +# @staticmethod +# def create_semantic_search_service(config) -> SemanticSearchService: +# provider = config.get('semantic_search').get('provider') +# if provider == 'pyterrier': +# return PyTerrierService(config) +# elif provider in ['faiss-openai', 'faiss-huggingface']: +# return LangChainFAISSService(config) +# else: +# logger.error(f'SemanticSearchService for {provider} is not yet implemented.') +# raise NotImplementedError(f'SemanticSearchService - {provider} - is not supported') + + +class BatchOpenAISemanticSearchService: def __init__(self, config): - self.cwd = os.getcwd() - self.config = config - self.index = None - self.provider = '' - - @abstractmethod - def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): - pass - - @abstractmethod - def retrieve_result_by_search_text_from_text_df(self, search_text, text_df) -> pd.DataFrame: - pass + openai.api_key = config.get('openai_api').get('api_key') @staticmethod - def use_index_to_search(index, search_text): - pass - - def clean_sentence_to_avoid_lexical_error(self, text): - """ - Clean sentence. Pyterrier will throw error if the search query contains some special characters shown below - jnius.JavaException: JVM exception occurred: Failed to process qid 1 ' - ' -- Lexical error at line 3, column 90. Encountered: "\'" (39), after : "" org.terrier.querying.parser.QueryParserException - python-BaseException - :return: - """ - # TODO: good way to clean - return text.replace("'", "").replace("?", "").replace("!", "").replace(":", "").replace(";", "") - - -class PyTerrierService(SemanticSearchService): - def __init__(self, config): - super().__init__(config) - self.provider = 'pyterrier' - - def create_index_column_in_df(self, text_df: pd.DataFrame) -> pd.DataFrame: - """ - add a docno column (primary key / index column) to the dataframe - :param text_df: - :return: text_df with docno column - """ - text_df["docno"] = text_df.index + 1 - text_df["docno"] = text_df["docno"].astype(str) - return text_df - - def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): - """ - index the text_df to get a indexref - :param text_df: - required columns: - docno: as primary key for later process to retrieve back the row - text: the text to be indexed - :return: - indexref: - """ - import pyterrier as pt - if not pt.started(): - pt.init() - datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S") - df_indexer_path = os.path.join(self.cwd, f".index/{indexref_folder_name}_" + datetime_str) - if not os.path.exists(df_indexer_path): - os.makedirs(df_indexer_path) - - # TODO: using overwrite? - # Currently I cannot use overwrite=True to directly overwrite the existing index folder - # when I index for the second time, it will throw error. Therefore need to create a new folder - # I also cannot delete it in the last step, because the process is still running and consuming the index files inside. - - # TODO: using a better wmodel than Tf? 
- pd_indexer = pt.DFIndexer(df_indexer_path, wmodel="Tf") - indexref = pd_indexer.index(text_df["text"], text_df["docno"]) - return indexref + def batch_call_embeddings(texts, chunk_size=1000): + texts = [text.replace("\n", " ") for text in texts] + embeddings = [] + for i in range(0, len(texts), chunk_size): + response = openai.Embedding.create( + input=texts[i: i + chunk_size], engine=BASE_MODEL + ) + embeddings += [r["embedding"] for r in response["data"]] + return embeddings @staticmethod - def use_index_to_search(index, search_text): - result_df: pd.DataFrame = pt.BatchRetrieve(index).search(search_text) - return result_df - - def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): - logger.info(f"PyTerrierService.retrieve_result_by_search_text_from_text_df. search_text: {search_text}, text_df.shape: {text_df.shape}") - text_df = self.create_index_column_in_df(text_df) - index = self.index_text_df(text_df, 'df_index') - result_df: pd.DataFrame = self.use_index_to_search(index, search_text) - return result_df.merge(text_df, on="docno", how="left") - - -class LangChainFAISSService(SemanticSearchService): - def __init__(self, config): - super().__init__(config) - self.provider = self.config.get('semantic_search').get('provider') - self.embeddings = None - if self.provider == 'faiss-openai': - self.embeddings = OpenAIEmbeddings(openai_api_key=self.config.get('openai_api').get('api_key')) - elif self.provider == 'faiss-huggingface': - self.embeddings = HuggingFaceEmbeddings() - else: - raise Exception(f"provider {self.provider} is not supported") - - def index_text_df(self, text_df: pd.DataFrame, indexref_folder_name: str): - logger.info(f"LangChainFAISSService.index_text_df. text_df.shape: {text_df.shape}") - text_df['docno'] = text_df.index.tolist() - texts, docno_list = text_df['text'].tolist(), text_df['docno'].tolist() - docno_dict = [{'docno': docno} for docno in docno_list] - faiss_index = FAISS.from_texts(texts, self.embeddings, metadatas=docno_dict) - return faiss_index + def compute_embeddings_for_text_df(text_df: pd.DataFrame): + """Compute embeddings for a text_df and return the text_df with the embeddings column added.""" + print(f'compute_embeddings_2() len(texts): {len(text_df)}') + text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " ")) + text_df['embedding'] = BatchOpenAISemanticSearchService.batch_call_embeddings(text_df['text'].tolist()) + return text_df @staticmethod - def use_index_to_search(index, search_text): - index: FAISS - # k: Number of Documents to return. Defaults to 4. - # fetch_k: Number of Documents to fetch to pass to MMR algorithm. 
- - # k = 15 - # # Cons: you can only pick k, but you cannot filter by score - # tuples = index.similarity_search_with_score(search_text, k=k) - # docno_list = [t[0].metadata['docno'] for t in tuples] - # score_list = [t[1] for t in tuples] - # result_df = pd.DataFrame({'docno': docno_list, 'score': score_list}) - # result_df['rank'] = result_df.index - - k = 30 - docs = index.max_marginal_relevance_search(search_text, k=k, fetch_k=999) - docno_list = [doc.metadata['docno'] for doc in docs] - result_df = pd.DataFrame({'docno': docno_list}) - result_df['rank'] = result_df.index - result_df['score'] = 999 - + def search_related_source(text_df: pd.DataFrame, target_text, n=30): + print(f'search_similar() text: {target_text}') + embedding = BatchOpenAISemanticSearchService.batch_call_embeddings([target_text])[0] + text_df = BatchOpenAISemanticSearchService.compute_embeddings_for_text_df(text_df) + text_df['similarities'] = text_df['embedding'].apply(lambda x: cosine_similarity(x, embedding)) + result_df = text_df.sort_values('similarities', ascending=False).head(n) + result_df['rank'] = range(1, len(result_df) + 1) return result_df - def retrieve_result_by_search_text_from_text_df(self, search_text, text_df): - logger.info(f"LangChainFAISSService.retrieve_result_by_search_text_from_text_df. search_text: {search_text}, text_df.shape: {text_df.shape}") - faiss_index = self.index_text_df(text_df, '') - result_df = self.use_index_to_search(faiss_index, search_text) - return result_df.merge(text_df, on="docno", how="left") - - -class SemanticSearchServiceFactory: - @staticmethod - def create_semantic_search_service(config) -> SemanticSearchService: - provider = config.get('semantic_search').get('provider') - if provider == 'pyterrier': - return PyTerrierService(config) - elif provider in ['faiss-openai', 'faiss-huggingface']: - return LangChainFAISSService(config) - else: - logger.error(f'SemanticSearchService for {provider} is not yet implemented.') - raise NotImplementedError(f'SemanticSearchService - {provider} - is not supported') - if __name__ == '__main__': pass diff --git a/src/Util.py b/src/Util.py index 735f64f..cf02adf 100644 --- a/src/Util.py +++ b/src/Util.py @@ -28,17 +28,6 @@ def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit): gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum() max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1 gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank # In order to get also the row slightly larger than prompt_length_limit - - # display_df = gpt_input_text_df[gpt_input_text_df['in_scope']] - # # after cleaning, display text - # display_df.sort_values(by=['docno'], inplace=True) - # distinct_urls = list(display_df['url'].unique()) - # # for list with index - # for index, url in enumerate(distinct_urls): - # print('---------------------') - # print(f'[{index+1}] {url}') - # for index, row in display_df[display_df['url'] == url].iterrows(): - # print(f' {row["text"]}') return gpt_input_text_df From d7c0ac3658111eee8e9853849f5875e877469401 Mon Sep 17 00:00:00 2001 From: Michael Wan Date: Wed, 8 Mar 2023 22:04:09 +0800 Subject: [PATCH 03/14] prompt_v3 (auto footnote) --- src/LLMService.py | 25 +++++++++++++++++++++++++ src/SearchGPTService.py | 2 +- src/SemanticSearchService.py | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/LLMService.py b/src/LLMService.py index beba34e..a63f31b 100644 --- a/src/LLMService.py +++ 
b/src/LLMService.py @@ -53,6 +53,31 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame): {context_str} Question: {search_text} Answer: +""" + return prompt + + def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame): + logger.info(f"OpenAIService.get_prompt_v3. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}") + context_str = "" + gpt_input_text_df = gpt_input_text_df[gpt_input_text_df['in_scope']].sort_values('url_id') + url_id_list = gpt_input_text_df['url_id'].unique() + for url_id in url_id_list: + context_str += f"Source ({url_id})\n" + for index, row in gpt_input_text_df[gpt_input_text_df['url_id'] == url_id].iterrows(): + context_str += f"{row['text']}\n" + context_str += "\n" + prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit') + context_str = context_str[:prompt_length_limit] + prompt = \ + f""" +Web search result: +{context_str} + +Instructions: Using the provided web search results, write a comprehensive reply to the given query. +Make sure to cite results using [number] notation after the reference. +If the provided search results refer to multiple subjects with the same name, write separate answers for each subject. +If the context is insufficient, reply "I cannot answer". +Query: {search_text} """ return prompt diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py index e318483..1ad8399 100644 --- a/src/SearchGPTService.py +++ b/src/SearchGPTService.py @@ -70,7 +70,7 @@ def _prompt(self, search_text, text_df, cache_path=None): prompt, response_text = cache['prompt'], cache['response_text'] else: llm_service = LLMServiceFactory.create_llm_service(self.config) - prompt = llm_service.get_prompt(search_text, gpt_input_text_df) + prompt = llm_service.get_prompt_v3(search_text, gpt_input_text_df) response_text = llm_service.call_api(prompt) llm_config = self.config.get(f'{llm_service_provider}_api').copy() diff --git a/src/SemanticSearchService.py b/src/SemanticSearchService.py index af92b65..49c677e 100644 --- a/src/SemanticSearchService.py +++ b/src/SemanticSearchService.py @@ -179,7 +179,7 @@ def batch_call_embeddings(texts, chunk_size=1000): @staticmethod def compute_embeddings_for_text_df(text_df: pd.DataFrame): """Compute embeddings for a text_df and return the text_df with the embeddings column added.""" - print(f'compute_embeddings_2() len(texts): {len(text_df)}') + print(f'compute_embeddings_for_text_df() len(texts): {len(text_df)}') text_df['text'] = text_df['text'].apply(lambda x: x.replace("\n", " ")) text_df['embedding'] = BatchOpenAISemanticSearchService.batch_call_embeddings(text_df['text'].tolist()) return text_df From 6b6ceb97dc7747c27b65d7dcf4e1d4106c6e0add Mon Sep 17 00:00:00 2001 From: Michael Wan Date: Wed, 8 Mar 2023 22:11:41 +0800 Subject: [PATCH 04/14] minor --- src/SemanticSearchService.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/SemanticSearchService.py b/src/SemanticSearchService.py index 49c677e..456ec56 100644 --- a/src/SemanticSearchService.py +++ b/src/SemanticSearchService.py @@ -192,6 +192,7 @@ def search_related_source(text_df: pd.DataFrame, target_text, n=30): text_df['similarities'] = text_df['embedding'].apply(lambda x: cosine_similarity(x, embedding)) result_df = text_df.sort_values('similarities', ascending=False).head(n) result_df['rank'] = range(1, len(result_df) + 1) + result_df['docno'] = range(1, len(result_df) + 1) return result_df From 4c9fbb1c7f822d9fe3e4f7c4c981b1a2c5485a41 Mon Sep 17 00:00:00 
2001 From: Michael Wan Date: Wed, 8 Mar 2023 23:28:15 +0800 Subject: [PATCH 05/14] is_use_source true run pass --- src/FootnoteService.py | 154 ---------------------------------------- src/FrontendService.py | 66 +++++++++++++++++ src/SearchGPTService.py | 7 +- 3 files changed, 69 insertions(+), 158 deletions(-) delete mode 100644 src/FootnoteService.py create mode 100644 src/FrontendService.py diff --git a/src/FootnoteService.py b/src/FootnoteService.py deleted file mode 100644 index be5010e..0000000 --- a/src/FootnoteService.py +++ /dev/null @@ -1,154 +0,0 @@ -from urllib.parse import urlparse - -import pandas as pd - -from SemanticSearchService import BatchOpenAISemanticSearchService -from Util import setup_logger, split_sentences_from_paragraph - -logger = setup_logger('FootnoteService') - - -class FootnoteService: - def __init__(self, config, response_text, gpt_input_text_df, semantic_search_service: BatchOpenAISemanticSearchService): - self.config = config - self.response_text = response_text - used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope'] # TODO: add url_id - self.gpt_input_text_df = gpt_input_text_df[used_columns] - self.semantic_search_service = semantic_search_service - - if self.config.get('semantic_search').get('provider') == 'pyterrier': - import pyterrier as pt - if not pt.started(): - pt.init() - - def extract_sentences_from_paragraph(self): - # TODO: currently only support English - sentences = split_sentences_from_paragraph(self.response_text) - response_df = pd.DataFrame(sentences, columns=['response_text_sentence']) - return response_df - - def get_footnote_from_sentences(self): - def get_footnote_result_sentence_dict(sentence, docno, rank, score, url_unique_ids, url, url_ids, source_sentence): - return { - 'sentence': sentence, - 'docno': docno, - 'rank': rank, - 'score': score, - 'url_unique_ids': url_unique_ids, - 'url': url, - 'url_ids': url_ids, - 'source_sentence': source_sentence - } - - logger.info(f'FootnoteService.get_footnote_from_sentences()') - - response_sentences_df = self.extract_sentences_from_paragraph() - if not self.config.get('search_option').get('is_use_source'): - footnote_result_list = [] - for index, row in response_sentences_df.iterrows(): - footnote_result_sentence_dict = get_footnote_result_sentence_dict(row["response_text_sentence"], [], [], [], [], [], [], []) - footnote_result_list.append(footnote_result_sentence_dict) - return footnote_result_list, pd.DataFrame() - - in_scope_source_df = self.gpt_input_text_df[self.gpt_input_text_df['in_scope']] - source_index = self.semantic_search_service.index_text_df(in_scope_source_df, 'source_index') - - footnote_result_list = [] - for index, row in response_sentences_df.iterrows(): - response_text_sentence = row["response_text_sentence"] - logger.info(f' [S{index + 1}] {response_text_sentence}') - # print(f'[S{index + 1}] {response_text_sentence}') - - cleaned_response_text_sentence = self.semantic_search_service.clean_sentence_to_avoid_lexical_error(response_text_sentence) - result_df = self.semantic_search_service.use_index_to_search(source_index, cleaned_response_text_sentence) - result_df = result_df.merge(in_scope_source_df, on="docno", how="left")[['docno', 'rank', 'score', 'url', 'url_id', 'text']] - - if self.semantic_search_service.provider == 'pyterrier': - SCORE_THRESHOLD = 5 - result_within_scope_df = result_df[result_df['score'] >= SCORE_THRESHOLD] - elif self.semantic_search_service.provider in ['faiss-openai', 'faiss-huggingface']: - # with 
pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also - # print(f'sentence {index}') - # print(result_df[['text', 'url_id', 'score']]) - SCORE_THRESHOLD = 0.6 - top_k = 1 - # # distance for faiss (lower is closer) - # result_within_scope_df = result_df[result_df['score'] <= SCORE_THRESHOLD].head(top_k) - result_within_scope_df = result_df.head(top_k) - else: - NotImplementedError(f'Unsupported semantic search provider: {self.semantic_search_service.provider}') - - footnote_result_sentence_dict = get_footnote_result_sentence_dict(response_text_sentence, - result_within_scope_df['docno'].tolist(), - result_within_scope_df['rank'].tolist(), - result_within_scope_df['score'].tolist(), - sorted(result_within_scope_df['url_id'].unique().tolist()), - result_within_scope_df['url'].tolist(), - result_within_scope_df['url_id'].tolist(), - result_within_scope_df['text'].tolist() - ) - footnote_result_list.append(footnote_result_sentence_dict) - return footnote_result_list, in_scope_source_df - - def pretty_print_footnote_result_list(self, footnote_result_list, gpt_input_text_df): - def create_response_json_object(text, type): - return {"text": text, "type": type} - - def create_source_json_object(footnote, domain, url, title, text): - return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text} - - url_id_map = {} # to reassign url_id as per appearance order - - # footnote text and json processing - response_text_with_footnote = '' - response_json = [] - - for footnote_result in footnote_result_list: - footnote_print = [] - response_json.append(create_response_json_object(footnote_result["sentence"], "response")) - - for url_id in footnote_result['url_unique_ids']: - - if url_id not in url_id_map: - url_id_map[url_id] = len(url_id_map) + 1 - - footnote_print += [f'[{url_id_map[url_id]}]'] - response_json.append(create_response_json_object(f'[{url_id_map[url_id]}]', "footnote")) - - response_text_with_footnote += f'{footnote_result["sentence"]}{" " + "".join(sorted(footnote_print)) if len(footnote_print) > 0 else ""} ' - - # source text and json processing - in_scope_source_df = gpt_input_text_df[gpt_input_text_df['in_scope']].copy() - in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int) - in_scope_source_df.sort_values('docno', inplace=True) - - source_text_list = [] - source_json = [] - - source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True) - for index, row in source_url_df.iterrows(): - if row['url_id'] not in url_id_map: - continue - - url_text = '' - url_text += f"[{url_id_map[row['url_id']]}] {row['url']}\n" - - for index, row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows(): - url_text += f" {row['text']}\n" - - source_text_list.append(url_text) - - domain_name = urlparse(row['url']).netloc.replace('www.', '') - source_json.append(create_source_json_object(f"[{url_id_map[row['url_id']]}]", domain_name, row['url'], row['name'], row['snippet'])) - - source_text = ''.join(sorted(source_text_list)) - source_json = sorted(source_json, key=lambda x: x['footnote']) - - print('===========Response text (ref):============') - print(response_text_with_footnote) - print() - print('===========Source text:============') - print(source_text) - print() - - return response_text_with_footnote, source_text, {'response_json': response_json, 'source_json': source_json} diff --git 
a/src/FrontendService.py b/src/FrontendService.py new file mode 100644 index 0000000..5079653 --- /dev/null +++ b/src/FrontendService.py @@ -0,0 +1,66 @@ +import re +from urllib.parse import urlparse + +from SemanticSearchService import BatchOpenAISemanticSearchService +from Util import setup_logger + +logger = setup_logger('FootnoteService') + + +class FrontendService: + def __init__(self, config, response_text, gpt_input_text_df): + self.config = config + self.response_text = response_text + used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope'] # TODO: add url_id + self.gpt_input_text_df = gpt_input_text_df[used_columns] + + def get_data_json(self, response_text, gpt_input_text_df): + def create_response_json_object(text, type): + return {"text": text, "type": type} + + def create_source_json_object(footnote, domain, url, title, text): + return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text} + + def get_response_json(create_response_json_object, response_text): + response_json = [] + split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text) + for sentence in split_sentence: + if sentence.startswith('[') and sentence.endswith(']'): + response_json.append(create_response_json_object(sentence, "footnote")) + else: + response_json.append(create_response_json_object(sentence, "response")) + response_text_with_footnote = response_text + return response_json, response_text_with_footnote + + def get_source_json(create_source_json_object, gpt_input_text_df): + in_scope_source_df = gpt_input_text_df[gpt_input_text_df['in_scope']].copy() + in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int) + in_scope_source_df.sort_values('docno', inplace=True) + source_text_list = [] + source_json = [] + source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True) + for index, row in source_url_df.iterrows(): + url_text = '' + url_text += f"[{row['url_id']}] {row['url']}\n" + + for index, row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows(): + url_text += f" {row['text']}\n" + + source_text_list.append(url_text) + + domain_name = urlparse(row['url']).netloc.replace('www.', '') + source_json.append(create_source_json_object(f"[{row['url_id']}]", domain_name, row['url'], row['name'], row['snippet'])) + source_text = ''.join(sorted(source_text_list)) + source_json = sorted(source_json, key=lambda x: x['footnote']) + return source_json, source_text + + response_json, response_text_with_footnote = get_response_json(create_response_json_object, response_text) + source_json, source_text = get_source_json(create_source_json_object, gpt_input_text_df) + + return response_text_with_footnote, source_text, {'response_json': response_json, 'source_json': source_json} + + +if __name__ == '__main__': + sentence = "According to the sources [1] [2], it is predicted that the world's natural gas reserves will last about 52.8 years with the current rate of production. [13] TestTest." 
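+    # The pattern below matches either one bracketed citation such as "[1]" or a
+    # maximal run of bracket-free text, so the reply splits into alternating
+    # footnote and response segments, mirroring get_response_json() above.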
+ split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', sentence) + print(split_sentence) diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py index 1ad8399..53a7ae4 100644 --- a/src/SearchGPTService.py +++ b/src/SearchGPTService.py @@ -6,7 +6,7 @@ import yaml from BingService import BingService -from FootnoteService import FootnoteService +from FrontendService import FrontendService from LLMService import LLMServiceFactory from SemanticSearchService import BatchOpenAISemanticSearchService from Util import setup_logger, post_process_gpt_input_text_df, check_result_cache_exists, load_result_from_cache, save_result_cache, check_max_number_of_cache, get_project_root @@ -87,9 +87,8 @@ def _prompt(self, search_text, text_df, cache_path=None): print('===========Response text (raw):============') print(response_text) - footnote_service = FootnoteService(self.config, response_text, gpt_input_text_df, semantic_search_service) - footnote_result_list, in_scope_source_df = footnote_service.get_footnote_from_sentences() - response_text_with_footnote, source_text, data_json = footnote_service.pretty_print_footnote_result_list(footnote_result_list, gpt_input_text_df) + frontend_service = FrontendService(self.config, response_text, gpt_input_text_df) + response_text_with_footnote, source_text, data_json = frontend_service.get_data_json(response_text, gpt_input_text_df) return response_text, response_text_with_footnote, source_text, data_json From 842058328d3259e4706e2259f8136c453feed061 Mon Sep 17 00:00:00 2001 From: Michael Wan Date: Wed, 8 Mar 2023 23:46:14 +0800 Subject: [PATCH 06/14] run pass both use/not use source --- src/LLMService.py | 9 +++++++++ src/SearchGPTService.py | 30 ++++++++++++++++-------------- src/SemanticSearchService.py | 8 ++++++-- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/LLMService.py b/src/LLMService.py index a63f31b..f721a13 100644 --- a/src/LLMService.py +++ b/src/LLMService.py @@ -57,6 +57,15 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame): return prompt def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame): + if not self.config.get('search_option').get('is_use_source'): + prompt = \ + f""" +Instructions: Write a comprehensive reply to the given query. +If the context is insufficient, reply "I cannot answer". +Query: {search_text} +""" + return prompt + logger.info(f"OpenAIService.get_prompt_v3. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}") context_str = "" gpt_input_text_df = gpt_input_text_df[gpt_input_text_df['in_scope']].sort_values('url_id') diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py index 53a7ae4..fa78282 100644 --- a/src/SearchGPTService.py +++ b/src/SearchGPTService.py @@ -96,25 +96,27 @@ def _extract_bing_text_df(self, search_text, cache_path): # BingSearch using search_text # check if bing search result is cached and load if exists bing_text_df = None - if self.config['search_option']['is_enable_bing_search']: - if self.config.get('cache').get('is_enable_cache') and check_result_cache_exists(cache_path, search_text, 'bing_search'): - logger.info(f"BingService.load_result_from_cache. 
search_text: {search_text}, cache_path: {cache_path}") - cache = load_result_from_cache(cache_path, search_text, 'bing_search') - bing_text_df = cache['bing_text_df'] - else: - bing_service = BingService(self.config) - website_df = bing_service.call_bing_search_api(search_text) - bing_text_df = bing_service.call_urls_and_extract_sentences_concurrent(website_df) - - bing_search_config = self.config.get('bing_search').copy() - bing_search_config.pop('subscription_key') # delete api_key from config to avoid saving it to .cache - save_result_cache(cache_path, search_text, 'bing_search', bing_text_df=bing_text_df, config=bing_search_config) + if not self.config['search_option']['is_use_source'] or not self.config['search_option']['is_enable_bing_search']: + return bing_text_df + + if self.config.get('cache').get('is_enable_cache') and check_result_cache_exists(cache_path, search_text, 'bing_search'): + logger.info(f"BingService.load_result_from_cache. search_text: {search_text}, cache_path: {cache_path}") + cache = load_result_from_cache(cache_path, search_text, 'bing_search') + bing_text_df = cache['bing_text_df'] + else: + bing_service = BingService(self.config) + website_df = bing_service.call_bing_search_api(search_text) + bing_text_df = bing_service.call_urls_and_extract_sentences_concurrent(website_df) + + bing_search_config = self.config.get('bing_search').copy() + bing_search_config.pop('subscription_key') # delete api_key from config to avoid saving it to .cache + save_result_cache(cache_path, search_text, 'bing_search', bing_text_df=bing_text_df, config=bing_search_config) return bing_text_df def _extract_doc_text_df(self, bing_text_df): # DocSearch using doc_search_path # bing_text_df is used for doc_id arrangement - if not self.config['search_option']['is_enable_doc_search']: + if not self.config['search_option']['is_use_source'] or not self.config['search_option']['is_enable_doc_search']: return pd.DataFrame([]) files_grabbed = list() for doc_type in support_doc_type: diff --git a/src/SemanticSearchService.py b/src/SemanticSearchService.py index 456ec56..6812a1e 100644 --- a/src/SemanticSearchService.py +++ b/src/SemanticSearchService.py @@ -163,6 +163,7 @@ class BatchOpenAISemanticSearchService: def __init__(self, config): + self.config = config openai.api_key = config.get('openai_api').get('api_key') @staticmethod @@ -184,8 +185,11 @@ def compute_embeddings_for_text_df(text_df: pd.DataFrame): text_df['embedding'] = BatchOpenAISemanticSearchService.batch_call_embeddings(text_df['text'].tolist()) return text_df - @staticmethod - def search_related_source(text_df: pd.DataFrame, target_text, n=30): + def search_related_source(self, text_df: pd.DataFrame, target_text, n=30): + if not self.config.get('search_option').get('is_use_source'): + col = ['name', 'url', 'url_id', 'snippet', 'text', 'similarities', 'rank', 'docno'] + return pd.DataFrame(columns=col) + print(f'search_similar() text: {target_text}') embedding = BatchOpenAISemanticSearchService.batch_call_embeddings([target_text])[0] text_df = BatchOpenAISemanticSearchService.compute_embeddings_for_text_df(text_df) From ff84c0bde3935e820f150be8fcf0c034e2882952 Mon Sep 17 00:00:00 2001 From: Michael Wan Date: Wed, 8 Mar 2023 23:55:57 +0800 Subject: [PATCH 07/14] de semantic search provider --- requirements.txt | 8 -------- src/LLMService.py | 2 +- src/SearchGPTService.py | 2 -- src/config/config.yaml | 7 ------- src/website/static/index.js | 3 +-- src/website/templates/index.html | 11 ----------- src/website/views.py | 1 - 7 files 
changed, 2 insertions(+), 32 deletions(-) diff --git a/requirements.txt b/requirements.txt index ac91aed..5473ef8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,11 +22,3 @@ python-pptx==0.6.21 # html extraction beautifulsoup4==4.11.2 trafilatura==1.4.1 - -# doc indexer1 -# python-terrier==0.9.2 - -# doc indexer2 -faiss-cpu==1.7.3 -langchain==0.0.95 - diff --git a/src/LLMService.py b/src/LLMService.py index f721a13..12eaf72 100644 --- a/src/LLMService.py +++ b/src/LLMService.py @@ -85,7 +85,7 @@ def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame): Instructions: Using the provided web search results, write a comprehensive reply to the given query. Make sure to cite results using [number] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject. -If the context is insufficient, reply "I cannot answer". +If the context is insufficient, reply "I cannot answer because my reference sources don't have related info". Query: {search_text} """ return prompt diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py index fa78282..1b1e38d 100644 --- a/src/SearchGPTService.py +++ b/src/SearchGPTService.py @@ -45,8 +45,6 @@ def overide_config_by_query_string(self, ui_overriden_config): self.config['goose_ai_api']['model'] = value else: raise Exception(f"llm_model is not supported for llm_service_provider: {self.config['llm_service']['provider']}") - elif key == 'semantic_search_provider': - self.config['semantic_search']['provider'] = value else: # invalid query_string but not throwing exception first pass diff --git a/src/config/config.yaml b/src/config/config.yaml index 031a164..e1ac482 100644 --- a/src/config/config.yaml +++ b/src/config/config.yaml @@ -32,10 +32,3 @@ cache: # .cache result for efficiency and consistency is_enable_cache: false path: .cache max_number_of_cache: 0 -semantic_search: - # provider list: - # faiss-openai (default): Use OpenAIEmbedding. fast, good accuracy but need openai key (cost) - # faiss-huggingface: Use HuggingFaceEmbedding. slow, good accuracy but free (need to download) - # pyterrier: Use PyTerrier Tf (term frequency). fast, fair accuracy and free (need to install pyterrier and java) - provider: faiss-openai - diff --git a/src/website/static/index.js b/src/website/static/index.js index 282b0d6..d3c5e9c 100644 --- a/src/website/static/index.js +++ b/src/website/static/index.js @@ -14,8 +14,7 @@ $(document).ready(function () { openai_api_key: $('#openai_api_key').val(), is_use_source: $('input[name="is_use_source"]')[0].checked, llm_service_provider: $('#llm_service_provider').val(), - llm_model: $('#llm_model').val(), - semantic_search_provider: $('#semantic_search_provider').val() + llm_model: $('#llm_model').val() }, success: function (response) { $('#' + response.id).html(response.html) diff --git a/src/website/templates/index.html b/src/website/templates/index.html index 38edf22..6b764b0 100644 --- a/src/website/templates/index.html +++ b/src/website/templates/index.html @@ -63,17 +63,6 @@ >text-curie-001 -
-    <!-- 11 deleted lines: the semantic_search_provider <select> dropdown and its
-         <option> entries (presumably faiss-openai / faiss-huggingface / pyterrier,
-         per the providers removed from config.yaml); the markup was lost in extraction -->
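With the pyterrier and langchain/FAISS providers removed, the only retrieval path left in this series is the batched-embedding flow built up in patches 01-06 (BatchOpenAISemanticSearchService). A minimal self-contained sketch of that flow, for reference: it assumes the pre-1.0 openai SDK used throughout these patches (openai.Embedding.create plus openai.embeddings_utils.cosine_similarity), and the API key, demo rows and query below are illustrative placeholders, not part of any patch.

import openai
import pandas as pd
from openai.embeddings_utils import cosine_similarity

BASE_MODEL = "text-embedding-ada-002"
openai.api_key = "sk-placeholder"  # illustrative; the real key is read from config.yaml

def batch_call_embeddings(texts, chunk_size=1000):
    # One request per chunk of up to chunk_size texts, instead of one request per text.
    texts = [t.replace("\n", " ") for t in texts]
    embeddings = []
    for i in range(0, len(texts), chunk_size):
        response = openai.Embedding.create(input=texts[i:i + chunk_size], engine=BASE_MODEL)
        embeddings += [r["embedding"] for r in response["data"]]
    return embeddings

def search_related_source(text_df, target_text, n=30):
    # Embed the corpus in batches, embed the query once, then rank rows by cosine similarity.
    query_embedding = batch_call_embeddings([target_text])[0]
    text_df["embedding"] = batch_call_embeddings(text_df["text"].tolist())
    text_df["similarities"] = text_df["embedding"].apply(lambda e: cosine_similarity(e, query_embedding))
    result_df = text_df.sort_values("similarities", ascending=False).head(n)
    result_df["rank"] = range(1, len(result_df) + 1)
    result_df["docno"] = range(1, len(result_df) + 1)
    return result_df

demo_df = pd.DataFrame({"text": ["Try our savory bean soup recipe.", "The sky is blue today."]})
print(search_related_source(demo_df, "delicious beans", n=1))

Chunking is the whole point here: N corpus rows cost ceil(N / chunk_size) API calls instead of N, which is the gap the pyinstrument runs in patch 01 measure between "Sequential call mode" and "Batch call mode".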
diff --git a/src/website/views.py b/src/website/views.py index 71e4d57..e3c4aa3 100644 --- a/src/website/views.py +++ b/src/website/views.py @@ -31,7 +31,6 @@ def index_page(): 'is_use_source': request.values.get('is_use_source'), 'llm_service_provider': request.values.get('llm_service_provider'), 'llm_model': request.values.get('llm_model'), - 'semantic_search_provider': request.values.get('semantic_search_provider'), } logger.info(f"GET ui_overriden_config: {ui_overriden_config}") From 24cb5bcb2d1af38510fefb6b634127619190a8bb Mon Sep 17 00:00:00 2001 From: Michael Wan Date: Thu, 9 Mar 2023 00:50:32 +0800 Subject: [PATCH 08/14] minor --- src/website/templates/base.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/website/templates/base.html b/src/website/templates/base.html index 43778fe..3f1b899 100644 --- a/src/website/templates/base.html +++ b/src/website/templates/base.html @@ -30,7 +30,7 @@
-        SearchGPT 20230306 Version (
+        SearchGPT 20230309 Version (
         Github ). Your feedback will help us to improve
From bd5d5835f6dc88cd38b1322d0bc88b8a71c1331c Mon Sep 17 00:00:00 2001 From: Lydia Pang Date: Thu, 9 Mar 2023 22:21:14 +0800 Subject: [PATCH 09/14] Removed redundancy --- src/FrontendService.py | 14 +++++++------- src/SearchGPTService.py | 3 +-- src/gradio_app.py | 4 ++-- src/main.py | 2 +- src/website/views.py | 3 +-- 5 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/FrontendService.py b/src/FrontendService.py index 5079653..6e73b7b 100644 --- a/src/FrontendService.py +++ b/src/FrontendService.py @@ -21,7 +21,7 @@ def create_response_json_object(text, type): def create_source_json_object(footnote, domain, url, title, text): return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text} - def get_response_json(create_response_json_object, response_text): + def get_response_json(response_text): response_json = [] split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text) for sentence in split_sentence: @@ -29,10 +29,9 @@ def get_response_json(create_response_json_object, response_text): response_json.append(create_response_json_object(sentence, "footnote")) else: response_json.append(create_response_json_object(sentence, "response")) - response_text_with_footnote = response_text - return response_json, response_text_with_footnote + return response_json - def get_source_json(create_source_json_object, gpt_input_text_df): + def get_source_json(gpt_input_text_df): in_scope_source_df = gpt_input_text_df[gpt_input_text_df['in_scope']].copy() in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int) in_scope_source_df.sort_values('docno', inplace=True) @@ -51,13 +50,14 @@ def get_source_json(create_source_json_object, gpt_input_text_df): domain_name = urlparse(row['url']).netloc.replace('www.', '') source_json.append(create_source_json_object(f"[{row['url_id']}]", domain_name, row['url'], row['name'], row['snippet'])) source_text = ''.join(sorted(source_text_list)) + source_json = sorted(source_json, key=lambda x: x['footnote']) return source_json, source_text - response_json, response_text_with_footnote = get_response_json(create_response_json_object, response_text) - source_json, source_text = get_source_json(create_source_json_object, gpt_input_text_df) + response_json = get_response_json(response_text) + source_json, source_text = get_source_json(gpt_input_text_df) - return response_text_with_footnote, source_text, {'response_json': response_json, 'source_json': source_json} + return source_text, {'response_json': response_json, 'source_json': source_json} if __name__ == '__main__': diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py index 1b1e38d..0b53c0f 100644 --- a/src/SearchGPTService.py +++ b/src/SearchGPTService.py @@ -87,8 +87,7 @@ def _prompt(self, search_text, text_df, cache_path=None): frontend_service = FrontendService(self.config, response_text, gpt_input_text_df) response_text_with_footnote, source_text, data_json = frontend_service.get_data_json(response_text, gpt_input_text_df) - - return response_text, response_text_with_footnote, source_text, data_json + return response_text, source_text, data_json def _extract_bing_text_df(self, search_text, cache_path): # BingSearch using search_text diff --git a/src/gradio_app.py b/src/gradio_app.py index 4a8f0ee..f83000d 100644 --- a/src/gradio_app.py +++ b/src/gradio_app.py @@ -5,8 +5,8 @@ def query_and_get_answer(search_text): search_gpt_service = SearchGPTService() - response_text, response_text_with_footnote, source_text, data_json = 

From 793cfaf8eaac4af2f901710029943888c6db52c9 Mon Sep 17 00:00:00 2001
From: Lydia Pang
Date: Thu, 9 Mar 2023 22:22:08 +0800
Subject: [PATCH 10/14] Cleaned up cli print

---
 src/SearchGPTService.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py
index 0b53c0f..867a3cc 100644
--- a/src/SearchGPTService.py
+++ b/src/SearchGPTService.py
@@ -78,15 +78,18 @@ def _prompt(self, search_text, text_df, cache_path=None):
         # check whether the number of cache exceeds the limit
         check_max_number_of_cache(cache_path, self.config.get('cache').get('max_number_of_cache'))

+        frontend_service = FrontendService(self.config, response_text, gpt_input_text_df)
+        source_text, data_json = frontend_service.get_data_json(response_text, gpt_input_text_df)
+
         print('===========Prompt:============')
         print(prompt)
         print('===========Search:============')
         print(search_text)
-        print('===========Response text (raw):============')
+        print('===========Response text:============')
         print(response_text)
+        print('===========Source text:============')
+        print(source_text)

-        frontend_service = FrontendService(self.config, response_text, gpt_input_text_df)
-        response_text_with_footnote, source_text, data_json = frontend_service.get_data_json(response_text, gpt_input_text_df)
         return response_text, source_text, data_json

     def _extract_bing_text_df(self, search_text, cache_path):

From aeed9c86177286ff5aa00dd6ed9953c45b92df64 Mon Sep 17 00:00:00 2001
From: Lydia Pang
Date: Thu, 9 Mar 2023 22:43:10 +0800
Subject: [PATCH 11/14] Removed number reference from prompt input

---
 src/Util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Util.py b/src/Util.py
index cf02adf..cbc98d7 100644
--- a/src/Util.py
+++ b/src/Util.py
@@ -24,6 +24,7 @@ def setup_logger(tag):

 def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit):
     # clean out of prompt texts
+    gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))
     gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
     gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
     max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1
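
Note: the single inserted line in patch 11 strips any [n] markers that already exist in the scraped source text, so they cannot be confused with the footnote ids the prompt assigns later. A standalone sketch of the same scrub (the sample frame is illustrative):

    import re
    import pandas as pd

    gpt_input_text_df = pd.DataFrame({'text': ['Dark energy [3] drives expansion.', 'No markers here.']})
    # Drop pre-existing bracketed numeric references from the source text.
    gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))
    print(gpt_input_text_df['text'].tolist())
    # ['Dark energy  drives expansion.', 'No markers here.']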

From d2748321be1a9da27c93fea084e48ba0f33d44aa Mon Sep 17 00:00:00 2001
From: Lydia Pang
Date: Thu, 9 Mar 2023 22:58:27 +0800
Subject: [PATCH 12/14] Fixed source sequence in prompt & response

---
 src/FrontendService.py | 20 +++++++++++++++-----
 src/LLMService.py      |  1 -
 src/Util.py            |  3 +++
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/FrontendService.py b/src/FrontendService.py
index 6e73b7b..d5b320c 100644
--- a/src/FrontendService.py
+++ b/src/FrontendService.py
@@ -22,17 +22,27 @@ def create_source_json_object(footnote, domain, url, title, text):
             return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text}

         def get_response_json(response_text):
+            # find reference in text & re-order
+            url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
+            url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
+
+            for url_id, new_url_id in url_id_map.items():
+                response_text = response_text.replace(f'[{url_id}]', f'[{new_url_id}]')
+
             response_json = []
             split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text)
+
             for sentence in split_sentence:
                 if sentence.startswith('[') and sentence.endswith(']'):
                     response_json.append(create_response_json_object(sentence, "footnote"))
                 else:
                     response_json.append(create_response_json_object(sentence, "response"))
-            return response_json
+            return response_json, url_id_map

-        def get_source_json(gpt_input_text_df):
-            in_scope_source_df = gpt_input_text_df[gpt_input_text_df['in_scope']].copy()
+        def get_source_json(gpt_input_text_df, url_id_map):
+            # include only sources used in response_text & remap url_id
+            in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
+            in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
             in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int)
             in_scope_source_df.sort_values('docno', inplace=True)
             source_text_list = []
@@ -54,8 +64,8 @@ def get_source_json(gpt_input_text_df, url_id_map):
             source_json = sorted(source_json, key=lambda x: x['footnote'])
             return source_json, source_text

-        response_json = get_response_json(response_text)
-        source_json, source_text = get_source_json(gpt_input_text_df)
+        response_json, url_id_map = get_response_json(response_text)
+        source_json, source_text = get_source_json(gpt_input_text_df, url_id_map)

         return source_text, {'response_json': response_json, 'source_json': source_json}
diff --git a/src/LLMService.py b/src/LLMService.py
index 12eaf72..8ab34cf 100644
--- a/src/LLMService.py
+++ b/src/LLMService.py
@@ -68,7 +68,6 @@ def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame):
         logger.info(f"OpenAIService.get_prompt_v3. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}")
         context_str = ""
-        gpt_input_text_df = gpt_input_text_df[gpt_input_text_df['in_scope']].sort_values('url_id')
         url_id_list = gpt_input_text_df['url_id'].unique()
         for url_id in url_id_list:
             context_str += f"Source ({url_id})\n"
diff --git a/src/Util.py b/src/Util.py
index cbc98d7..e930b32 100644
--- a/src/Util.py
+++ b/src/Util.py
@@ -29,6 +29,9 @@ def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit):
     gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
     max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1
     gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank  # In order to get also the row slightly larger than prompt_length_limit
+    url_id_list = gpt_input_text_df['url_id'].unique()
+    url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
+    gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)
     return gpt_input_text_df
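
Note: the heart of patch 12 is the url_id remap in get_response_json: footnote ids are renumbered 1..n in order of first appearance in the response, and get_source_json filters and renumbers the source list to match. A minimal sketch of the renumbering, independent of the DataFrame plumbing (the sample text is illustrative):

    import re

    response_text = 'Beans store protein [4]. They are cheap [2] and tasty [4].'
    # De-duplicated ids in order of first appearance (dict preserves insertion order).
    url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
    url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))  # {4: 1, 2: 2}
    for url_id, new_url_id in url_id_map.items():
        response_text = response_text.replace(f'[{url_id}]', f'[{new_url_id}]')
    print(response_text)  # Beans store protein [1]. They are cheap [2] and tasty [1].

One caveat: sequential str.replace can collide when the old and new id ranges overlap. A response citing [2] then [1] yields the map {2: 1, 1: 2}; the first pass rewrites [2] to [1], and the second pass then turns both occurrences into [2]. A collision-free variant would do a single re.sub with a callback that looks each id up in url_id_map.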

From 8a74cfb261015223952b083b316bb0137406063a Mon Sep 17 00:00:00 2001
From: Lydia Pang
Date: Thu, 9 Mar 2023 23:55:50 +0800
Subject: [PATCH 13/14] Fixed source scope in prompt

---
 src/LLMService.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/LLMService.py b/src/LLMService.py
index 8ab34cf..2f03230 100644
--- a/src/LLMService.py
+++ b/src/LLMService.py
@@ -71,7 +71,7 @@ def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame):
         url_id_list = gpt_input_text_df['url_id'].unique()
         for url_id in url_id_list:
             context_str += f"Source ({url_id})\n"
-            for index, row in gpt_input_text_df[gpt_input_text_df['url_id'] == url_id].iterrows():
+            for index, row in gpt_input_text_df[(gpt_input_text_df['url_id'] == url_id) & gpt_input_text_df['in_scope']].iterrows():
                 context_str += f"{row['text']}\n"
             context_str += "\n"
         prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit')
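
Note: patch 13 closes the gap opened when patch 12 removed the blanket in_scope filter from get_prompt_v3: the prompt builder now applies the same in_scope flag that post_process_gpt_input_text_df computes, so out-of-budget rows no longer leak into the context. A compact sketch of the grouped "Source (n)" layout (the sample frame is illustrative):

    import pandas as pd

    gpt_input_text_df = pd.DataFrame({'url_id': [1, 1, 2],
                                      'text': ['chunk a', 'chunk b', 'chunk c'],
                                      'in_scope': [True, False, True]})
    context_str = ''
    for url_id in gpt_input_text_df['url_id'].unique():
        context_str += f'Source ({url_id})\n'
        # Keep only rows for this url that survived the prompt-length budget.
        in_scope_rows = gpt_input_text_df[(gpt_input_text_df['url_id'] == url_id) & gpt_input_text_df['in_scope']]
        for _, row in in_scope_rows.iterrows():
            context_str += f"{row['text']}\n"
        context_str += '\n'
    print(context_str)  # Source (1) / chunk a, then Source (2) / chunk c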

From 34e22e1b23647abddbd43f85661efce8769bfc08 Mon Sep 17 00:00:00 2001
From: Michael Wan
Date: Fri, 10 Mar 2023 00:31:55 +0800
Subject: [PATCH 14/14] refactor

---
 src/FrontendService.py | 26 ++++++++++++++++----------
 src/Util.py            |  4 +++-
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/FrontendService.py b/src/FrontendService.py
index d5b320c..4f6e890 100644
--- a/src/FrontendService.py
+++ b/src/FrontendService.py
@@ -1,12 +1,13 @@
 import re
 from urllib.parse import urlparse

-from SemanticSearchService import BatchOpenAISemanticSearchService
 from Util import setup_logger

 logger = setup_logger('FootnoteService')

+
 class FrontendService:
     def __init__(self, config, response_text, gpt_input_text_df):
         self.config = config
@@ -14,6 +15,7 @@ class FrontendService:
         used_columns = ['docno', 'name', 'url', 'url_id', 'text', 'len_text', 'in_scope']  # TODO: add url_id
         self.gpt_input_text_df = gpt_input_text_df[used_columns]

+
     def get_data_json(self, response_text, gpt_input_text_df):
         def create_response_json_object(text, type):
             return {"text": text, "type": type}
@@ -21,14 +23,20 @@ def create_response_json_object(text, type):
         def create_source_json_object(footnote, domain, url, title, text):
             return {"footnote": footnote, "domain": domain, "url": url, "title": title, "text": text}

-        def get_response_json(response_text):
-            # find reference in text & re-order
+        def reorder_url_id(response_text, gpt_input_text_df):
+            # response_text: find reference in text & re-order
             url_id_list = [int(x) for x in dict.fromkeys(re.findall(r'\[([0-9]+)\]', response_text))]
             url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))

             for url_id, new_url_id in url_id_map.items():
                 response_text = response_text.replace(f'[{url_id}]', f'[{new_url_id}]')

+            # gpt_input_text_df: find reference in text & re-order
+            in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
+            in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
+            return response_text, in_scope_source_df
+
+        def get_response_json(response_text):
             response_json = []
             split_sentence = re.findall(r'\[[0-9]+\]|[^\[\]]+', response_text)

@@ -37,12 +45,9 @@ def get_response_json(response_text):
                     response_json.append(create_response_json_object(sentence, "footnote"))
                 else:
                     response_json.append(create_response_json_object(sentence, "response"))
-            return response_json, url_id_map
+            return response_json

-        def get_source_json(gpt_input_text_df, url_id_map):
-            # include only sources used in response_text & remap url_id
-            in_scope_source_df = gpt_input_text_df[gpt_input_text_df['url_id'].isin(url_id_map.keys()) & gpt_input_text_df['in_scope']].copy()
-            in_scope_source_df['url_id'] = in_scope_source_df['url_id'].map(url_id_map)
+        def get_source_json(in_scope_source_df):
             in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int)
             in_scope_source_df.sort_values('docno', inplace=True)
             source_text_list = []
@@ -64,8 +69,9 @@ def get_source_json(in_scope_source_df):
             source_json = sorted(source_json, key=lambda x: x['footnote'])
             return source_json, source_text

-        response_json, url_id_map = get_response_json(response_text)
-        source_json, source_text = get_source_json(gpt_input_text_df, url_id_map)
+        response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df)
+        response_json = get_response_json(response_text)
+        source_json, source_text = get_source_json(in_scope_source_df)

         return source_text, {'response_json': response_json, 'source_json': source_json}
diff --git a/src/Util.py b/src/Util.py
index e930b32..ee8c259 100644
--- a/src/Util.py
+++ b/src/Util.py
@@ -23,12 +23,14 @@ def setup_logger(tag):

 def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit):
-    # clean out of prompt texts
+    # clean pre-existing [1], [2], [3]... references out of the prompt source text
     gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x))
+
     gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
     gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
     max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1
     gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank  # In order to get also the row slightly larger than prompt_length_limit
+    # reorder url_id so that only in-scope urls are numbered
    url_id_list = gpt_input_text_df['url_id'].unique()
     url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
     gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)
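
Note: after patch 14 the data flow is linear: post_process_gpt_input_text_df budgets and renumbers the sources, reorder_url_id aligns the response's footnotes with that numbering, and the two JSON builders each consume only their own input. The budgeting step is the subtle part: it keeps whole ranked chunks until the cumulative character count passes prompt_length_limit, deliberately admitting the first rank beyond the cutoff (max_rank + 1). A standalone sketch with toy data (column values are illustrative):

    import pandas as pd

    gpt_input_text_df = pd.DataFrame({'rank': [1, 2, 3, 4],
                                      'text': ['a' * 400, 'b' * 400, 'c' * 400, 'd' * 400],
                                      'url_id': [10, 20, 30, 40]})
    prompt_length_limit = 1000

    gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x))
    gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum()
    # Ranks fully inside the budget, plus the first rank that overshoots it.
    max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1
    gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank
    # Renumber url_ids 1..n so prompt sources and response footnotes stay aligned.
    url_id_list = gpt_input_text_df['url_id'].unique()
    url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1)))
    gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map)
    print(gpt_input_text_df[['rank', 'cumsum_len_text', 'in_scope', 'url_id']])
    # ranks 1-3 stay in scope (cumsum 400, 800, 1200); rank 4 is excluded from the prompt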