diff --git a/img/architecture_roadmap.png b/img/architecture_roadmap.png index e070e8a..cc3695b 100644 Binary files a/img/architecture_roadmap.png and b/img/architecture_roadmap.png differ diff --git a/src/BingService.py b/src/BingService.py index 22764c7..2fb87ef 100644 --- a/src/BingService.py +++ b/src/BingService.py @@ -15,19 +15,19 @@ class BingService: def __init__(self, config): self.config = config - extract_svc = self.config.get('bing_search').get('text_extract') + extract_svc = self.config.get('source_service').get('bing_search').get('text_extract') if extract_svc == 'trafilatura': self.txt_extract_svc = TrafilaturaSvc() elif extract_svc == 'beautifulsoup': self.txt_extract_svc = BeautifulSoupSvc() - @storage_cached('bing_search_website', 'query') - def call_bing_search_api(self, query: str) -> pd.DataFrame: - logger.info("BingService.call_bing_search_api. query: " + query) - subscription_key = self.config.get('bing_search').get('subscription_key') - endpoint = self.config.get('bing_search').get('end_point') + "/v7.0/search" + @storage_cached('bing_search_website', 'search_text') + def call_bing_search_api(self, search_text: str) -> pd.DataFrame: + logger.info("BingService.call_bing_search_api. query: " + search_text) + subscription_key = self.config.get('source_service').get('bing_search').get('subscription_key') + endpoint = self.config.get('source_service').get('bing_search').get('end_point') + "/v7.0/search" mkt = 'en-US' - params = {'q': query, 'mkt': mkt} + params = {'q': search_text, 'mkt': mkt} headers = {'Ocp-Apim-Subscription-Key': subscription_key} try: @@ -37,7 +37,7 @@ def call_bing_search_api(self, query: str) -> pd.DataFrame: columns = ['name', 'url', 'snippet'] website_df = pd.DataFrame(response.json()['webPages']['value'])[columns] website_df['url_id'] = website_df.index + 1 - website_df = website_df[:self.config.get('bing_search').get('result_count')] + website_df = website_df[:self.config.get('source_service').get('bing_search').get('result_count')] except Exception as ex: raise ex return website_df diff --git a/src/FrontendService.py b/src/FrontendService.py index 8649a0a..f6cb7d7 100644 --- a/src/FrontendService.py +++ b/src/FrontendService.py @@ -94,27 +94,6 @@ def get_explain_json(text, word_color_dict): source_explain_json = get_explain_json(source_text, word_color_dict) return response_explain_json, source_explain_json - in_scope_source_df.loc[:, 'docno'] = in_scope_source_df['docno'].astype(int) - in_scope_source_df.sort_values('docno', inplace=True) - source_text_list = [] - source_json = [] - source_url_df = in_scope_source_df[['url_id', 'url', 'name', 'snippet']].drop_duplicates().sort_values('url_id').reset_index(drop=True) - for index, row in source_url_df.iterrows(): - url_text = '' - url_text += f"[{row['url_id']}] {row['url']}\n" - - for index, row in in_scope_source_df[in_scope_source_df['url_id'] == row['url_id']].iterrows(): - url_text += f" {row['text']}\n" - - source_text_list.append(url_text) - - domain_name = urlparse(row['url']).netloc.replace('www.', '') - source_json.append(create_source_json_object(f"[{row['url_id']}]", domain_name, row['url'], row['name'], row['snippet'])) - source_text = ''.join(sorted(source_text_list)) - - source_json = sorted(source_json, key=lambda x: x['footnote']) - return source_json, source_text - response_text, in_scope_source_df = reorder_url_id(response_text, gpt_input_text_df) response_json = get_response_json(response_text) source_json, source_text = get_source_json(in_scope_source_df) @@ -125,47 +104,3 
@@ def get_explain_json(text, word_color_dict): 'response_explain_json': response_explain_json, 'source_explain_json': source_explain_json } - - -if __name__ == '__main__': - paragraph1 = "ChatGPT is an AI chatbot that can understand and generate human-like answers to text prompts, as well as create code from natural speech [3]. It is built on a family of large language models collectively called GPT-3, which is trained on huge amounts of data [3][1]. The model is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022 and trained on an Azure AI supercomputing infrastructure [1]. ChatGPT is also sensitive to tweaks to the input phrasing or attempting the same prompt multiple times [1]. The objective of ChatGPT is to predict the next word in a sentence based on what it has learned [3]. The research release of ChatGPT in November 2022 is among OpenAI's iterative deployment of increasingly safe and useful AI systems [1]. ChatGPT Plus also exists, which brings a few benefits over the free tier [3]." - paragraph2 = """ -Source (1) -ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response. -- ChatGPT is sensitive to tweaks to the input phrasing or attempting the same prompt multiple times. For example, given one phrasing of a question, the model can claim to not know the answer, but given a slight rephrase, can answer correctly. -ChatGPT is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022. You can learn more about the 3.5 series here. ChatGPT and GPT-3.5 were trained on an Azure AI supercomputing infrastructure. -Today’s research release of ChatGPT is the latest step in OpenAI iterative deployment of increasingly safe and useful AI systems. Many lessons from deployment of earlier models like GPT-3 and Codex have informed the safety mitigations in place for this release, including substantial reductions in harmful and untruthful outputs achieved by the use of reinforcement learning from human feedback (RLHF). - -Source (3) -ChatGPT is an AI chatbot that's built on a family of large language models (LLMs) that are collectively called GPT-3. These models can understand and generate human-like answers to text prompts, because they've been trained on huge amounts of data. -But ChatGPT is also equally talented at coding and productivity tasks. For the former, its ability to create code from natural speech makes it a powerful ally for both new and experienced coders who either aren't familiar with a particular language or want to troubleshoot existing code. Unfortunately, there is also the potential for it to be misused to create malicious emails and malware. -ChatGPT stands for "Chat Generative Pre-trained Transformer". Let's take a look at each of those words in turn. -But the short answer? ChatGPT works thanks to a combination of deep learning algorithms, a dash of natural language processing, and a generous dollop of generative pre-training, which all combine to help it produce disarmingly human-like responses to text questions. Even if all it's ultimately been trained to do is fill in the next word, based on its experience of being the world's most voracious reader. -ChatGPT has been created with one main objective to predict the next word in a sentence, based on what's typically happened in the gigabytes of text data that it's been trained on. -ChatGPT was released as a "research preview" on November 30, 2022.
A blog post (opens in new tab) casually introduced the AI chatbot to the world, with OpenAI stating that "we’ve trained a model called ChatGPT which interacts in a conversational way". -ChatGPT Plus costs $20 p/month (around £17 / AU$30) and brings a few benefits over the free tier. It promises to give you full access to ChatGPT even during peak times, which is when you'll otherwise frequently see "ChatGPT is at capacity right now messages during down times. -ChatGPT has been trained on a vast amount of text covering a huge range of subjects, so its poss - """ - - # common_stems = FrontendService.longest_common_word_sequences(paragraph1, paragraph2) - # # print(common_stems) - # for common_stem in common_stems: - # print(common_stem) - - # text_list = ["is fine-tuned from a model in the gpt-3.5 series, which finished training in early", - # "sensitive to tweaks to the input phrasing or attempting the same prompt multiple", - # "is fine-tuned from a model in the gpt-3.5 series, which finished training in", - # "is fine-tuned from a model in the gpt-3.5 series, which finished training", - # "sensitive to tweaks to the input phrasing or attempting the same prompt", - # "is fine-tuned from a model in the gpt-3.5 series, which finished", - # "sensitive to tweaks to the input phrasing or attempting the same", - # "sensitive to tweaks to the input phrasing or attempting the", - # "is fine-tuned from a model in the gpt-3.5 series, which"] - # text_list = FrontendService.remove_substrings(text_list) - # for text in text_list: - # print(text) - - response_text = "is fine-tuned from a gpt-3.5 series" - split_list = FrontendService.split_with_delimiters(response_text, ["fine-tuned", "gpt-3.5"]) - for sentence in split_list: - print(sentence) diff --git a/src/LLMService.py b/src/LLMService.py index 19787bb..d3208ae 100644 --- a/src/LLMService.py +++ b/src/LLMService.py @@ -20,8 +20,8 @@ def clean_response_text(self, response_text: str): def get_prompt(self, search_text: str, gpt_input_text_df: pd.DataFrame): logger.info(f"OpenAIService.get_prompt. search_text: {search_text}, gpt_input_text_df.shape: {gpt_input_text_df.shape}") - prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit') - is_use_source = self.config.get('search_option').get('is_use_source') + prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit') + is_use_source = self.config.get('source_service').get('is_use_source') if is_use_source: prompt_engineering = f"\n\nAnswer the question '{search_text}' using above information with about 100 words:" prompt = "" @@ -43,7 +43,7 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame): for index, row in gpt_input_text_df[gpt_input_text_df['url_id'] == url_id].iterrows(): context_str += f"{row['text']}\n" context_str += "\n" - prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit') + prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit') context_str = context_str[:prompt_length_limit] prompt = \ f""" @@ -58,7 +58,7 @@ def get_prompt_v2(self, search_text: str, gpt_input_text_df: pd.DataFrame): return prompt def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame): - if not self.config.get('search_option').get('is_use_source'): + if not self.config.get('source_service').get('is_use_source'): prompt = \ f""" Instructions: Write a comprehensive reply to the given query. 
@@ -75,7 +75,7 @@ def get_prompt_v3(self, search_text: str, gpt_input_text_df: pd.DataFrame): for index, row in gpt_input_text_df[(gpt_input_text_df['url_id'] == row_url['url_id']) & gpt_input_text_df['in_scope']].iterrows(): context_str += f"{row['text']}\n" context_str += "\n\n" - prompt_length_limit = self.config.get('openai_api').get('prompt').get('prompt_length_limit') + prompt_length_limit = self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit') context_str = context_str[:prompt_length_limit] prompt = \ f""" @@ -98,14 +98,14 @@ def call_api(self, prompt): class OpenAIService(LLMService): def __init__(self, config): super().__init__(config) - open_api_key = config.get('openai_api').get('api_key') + open_api_key = config.get('llm_service').get('openai_api').get('api_key') if open_api_key is None: raise Exception("OpenAI API key is not set.") openai.api_key = open_api_key @storage_cached('openai', 'prompt') def call_api(self, prompt: str): - openai_api_config = self.config.get('openai_api') + openai_api_config = self.config.get('llm_service').get('openai_api') model = openai_api_config.get('model') logger.info(f"OpenAIService.call_api. model: {model}, len(prompt): {len(prompt)}") diff --git a/src/NLPUtil.py b/src/NLPUtil.py index 51153b0..0f52af8 100644 --- a/src/NLPUtil.py +++ b/src/NLPUtil.py @@ -73,3 +73,47 @@ def split_with_delimiters(string, delimiter_list): if start < len(string): result.append(string[start:]) return result + + +if __name__ == '__main__': + paragraph1 = "ChatGPT is an AI chatbot that can understand and generate human-like answers to text prompts, as well as create code from natural speech [3]. It is built on a family of large language models collectively called GPT-3, which is trained on huge amounts of data [3][1]. The model is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022 and trained on an Azure AI supercomputing infrastructure [1]. ChatGPT is also sensitive to tweaks to the input phrasing or attempting the same prompt multiple times [1]. The objective of ChatGPT is to predict the next word in a sentence based on what it has learned [3]. The research release of ChatGPT in November 2022 is among OpenAI's iterative deployment of increasingly safe and useful AI systems [1]. ChatGPT Plus also exists, which brings a few benefits over the free tier [3]." + paragraph2 = """ +Source (1) +ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response. +- ChatGPT is sensitive to tweaks to the input phrasing or attempting the same prompt multiple times. For example, given one phrasing of a question, the model can claim to not know the answer, but given a slight rephrase, can answer correctly. +ChatGPT is fine-tuned from a model in the GPT-3.5 series, which finished training in early 2022. You can learn more about the 3.5 series here. ChatGPT and GPT-3.5 were trained on an Azure AI supercomputing infrastructure. +Today’s research release of ChatGPT is the latest step in OpenAI iterative deployment of increasingly safe and useful AI systems. Many lessons from deployment of earlier models like GPT-3 and Codex have informed the safety mitigations in place for this release, including substantial reductions in harmful and untruthful outputs achieved by the use of reinforcement learning from human feedback (RLHF).
+ +Source (3) +ChatGPT is an AI chatbot that's built on a family of large language models (LLMs) that are collectively called GPT-3. These models can understand and generate human-like answers to text prompts, because they've been trained on huge amounts of data. +But ChatGPT is also equally talented at coding and productivity tasks. For the former, its ability to create code from natural speech makes it a powerful ally for both new and experienced coders who either aren't familiar with a particular language or want to troubleshoot existing code. Unfortunately, there is also the potential for it to be misused to create malicious emails and malware. +ChatGPT stands for "Chat Generative Pre-trained Transformer". Let's take a look at each of those words in turn. +But the short answer? ChatGPT works thanks to a combination of deep learning algorithms, a dash of natural language processing, and a generous dollop of generative pre-training, which all combine to help it produce disarmingly human-like responses to text questions. Even if all it's ultimately been trained to do is fill in the next word, based on its experience of being the world's most voracious reader. +ChatGPT has been created with one main objective to predict the next word in a sentence, based on what's typically happened in the gigabytes of text data that it's been trained on. +ChatGPT was released as a "research preview" on November 30, 2022. A blog post (opens in new tab) casually introduced the AI chatbot to the world, with OpenAI stating that "we’ve trained a model called ChatGPT which interacts in a conversational way". +ChatGPT Plus costs $20 p/month (around £17 / AU$30) and brings a few benefits over the free tier. It promises to give you full access to ChatGPT even during peak times, which is when you'll otherwise frequently see "ChatGPT is at capacity right now messages during down times. 
+ChatGPT has been trained on a vast amount of text covering a huge range of subjects, so its poss + """ + + # common_stems = FrontendService.longest_common_word_sequences(paragraph1, paragraph2) + # # print(common_stems) + # for common_stem in common_stems: + # print(common_stem) + + # text_list = ["is fine-tuned from a model in the gpt-3.5 series, which finished training in early", + # "sensitive to tweaks to the input phrasing or attempting the same prompt multiple", + # "is fine-tuned from a model in the gpt-3.5 series, which finished training in", + # "is fine-tuned from a model in the gpt-3.5 series, which finished training", + # "sensitive to tweaks to the input phrasing or attempting the same prompt", + # "is fine-tuned from a model in the gpt-3.5 series, which finished", + # "sensitive to tweaks to the input phrasing or attempting the same", + # "sensitive to tweaks to the input phrasing or attempting the", + # "is fine-tuned from a model in the gpt-3.5 series, which"] + # text_list = FrontendService.remove_substrings(text_list) + # for text in text_list: + # print(text) + + response_text = "is fine-tuned from a gpt-3.5 series" + split_list = split_with_delimiters(response_text, ["fine-tuned", "gpt-3.5"]) + for sentence in split_list: + print(sentence) diff --git a/src/SearchGPTService.py b/src/SearchGPTService.py index 88d3e12..ffe57ff 100644 --- a/src/SearchGPTService.py +++ b/src/SearchGPTService.py @@ -1,22 +1,34 @@ -import glob import os -from pathlib import Path import pandas as pd import yaml -from BingService import BingService from FrontendService import FrontendService from LLMService import LLMServiceFactory from SemanticSearchService import BatchOpenAISemanticSearchService -from Util import setup_logger, post_process_gpt_input_text_df, get_project_root, storage_cached -from text_extract.doc import support_doc_type, doc_extract_svc_map -from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc +from SourceService import SourceService +from Util import setup_logger, get_project_root, storage_cached logger = setup_logger('SearchGPTService') class SearchGPTService: + """ + SearchGPT app->service->child-service structure + - Convention (best effort): the app imports services, and child services inherit from their parent service + + SearchGPT class + - SourceService + -- BingService + -- Doc/PPT/PDF Service + - SemanticSearchModule + - LLMService + -- OpenAIService + -- GooseAPIService + - FrontendService + + """ + def __init__(self, ui_overriden_config=None): with open(os.path.join(get_project_root(), 'src/config/config.yaml')) as f: self.config = yaml.load(f, Loader=yaml.FullLoader) @@ -31,18 +43,18 @@ def overide_config_by_query_string(self, ui_overriden_config): # query_string is flattened (one level) while config.yaml is nested (two+ levels) # Any better way to handle this?
if key == 'bing_search_subscription_key': - self.config['bing_search']['subscription_key'] = value + self.config['source_service']['bing_search']['subscription_key'] = value elif key == 'openai_api_key': - self.config['openai_api']['api_key'] = value + self.config['llm_service']['openai_api']['api_key'] = value elif key == 'is_use_source': - self.config['search_option']['is_use_source'] = False if value.lower() in ['false', '0'] else True + self.config['source_service']['is_use_source'] = False if value.lower() in ['false', '0'] else True elif key == 'llm_service_provider': self.config['llm_service']['provider'] = value elif key == 'llm_model': if self.config['llm_service']['provider'] == 'openai': - self.config['openai_api']['model'] = value + self.config['llm_service']['openai_api']['model'] = value elif self.config['llm_service']['provider'] == 'goose_ai': - self.config['goose_ai_api']['model'] = value + self.config['llm_service']['goose_ai_api']['model'] = value else: raise Exception(f"llm_model is not supported for llm_service_provider: {self.config['llm_service']['provider']}") else: @@ -50,15 +62,22 @@ def overide_config_by_query_string(self, ui_overriden_config): pass def validate_config(self): - if self.config['search_option']['is_enable_bing_search']: - assert self.config['bing_search']['subscription_key'], 'bing_search_subscription_key is required' + if self.config['source_service']['is_enable_bing_search']: + assert self.config['source_service']['bing_search']['subscription_key'], 'bing_search_subscription_key is required' if self.config['llm_service']['provider'] == 'openai': - assert self.config['openai_api']['api_key'], 'openai_api_key is required' + assert self.config['llm_service']['openai_api']['api_key'], 'openai_api_key is required' + + @storage_cached('web', 'search_text') + def query_and_get_answer(self, search_text): + source_module = SourceService(self.config) + bing_text_df = source_module.extract_bing_text_df(search_text) + doc_text_df = source_module.extract_doc_text_df(bing_text_df) + text_df = pd.concat([bing_text_df, doc_text_df], ignore_index=True) - def _prompt(self, search_text, text_df, cache_path=None): semantic_search_service = BatchOpenAISemanticSearchService(self.config) gpt_input_text_df = semantic_search_service.search_related_source(text_df, search_text) - gpt_input_text_df = post_process_gpt_input_text_df(gpt_input_text_df, self.config.get('openai_api').get('prompt').get('prompt_length_limit')) + gpt_input_text_df = BatchOpenAISemanticSearchService.post_process_gpt_input_text_df(gpt_input_text_df, + self.config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')) llm_service = LLMServiceFactory.create_llm_service(self.config) prompt = llm_service.get_prompt_v3(search_text, gpt_input_text_df) @@ -77,55 +96,3 @@ def _prompt(self, search_text, text_df, cache_path=None): print(source_text) return response_text, source_text, data_json - - def _extract_bing_text_df(self, search_text, cache_path): - # BingSearch using search_text - bing_text_df = None - if not self.config['search_option']['is_use_source'] or not self.config['search_option']['is_enable_bing_search']: - return bing_text_df - - bing_service = BingService(self.config) - website_df = bing_service.call_bing_search_api(query=search_text) - bing_text_df = bing_service.call_urls_and_extract_sentences_concurrent(website_df=website_df) - - return bing_text_df - - def _extract_doc_text_df(self, bing_text_df): - # DocSearch using doc_search_path - # bing_text_df is used for 
doc_id arrangement - if not self.config['search_option']['is_use_source'] or not self.config['search_option']['is_enable_doc_search']: - return pd.DataFrame([]) - files_grabbed = list() - for doc_type in support_doc_type: - tmp_file_list = glob.glob(self.config['search_option']['doc_search_path'] + os.sep + "*." + doc_type) - files_grabbed.extend({"file_path": file_path, "doc_type": doc_type} for file_path in tmp_file_list) - - logger.info(f"File list: {files_grabbed}") - doc_sentence_list = list() - - start_doc_id = 1 if bing_text_df is None else bing_text_df['url_id'].max() + 1 - for doc_id, file in enumerate(files_grabbed, start=start_doc_id): - extract_svc: AbstractDocExtractSvc = doc_extract_svc_map[file['doc_type']] - sentence_list = extract_svc.extract_from_doc(file['file_path']) - - file_name = file['file_path'].split(os.sep)[-1] - for sentence in sentence_list: - doc_sentence_list.append({ - 'name': file_name, - 'url': file['file_path'], - 'url_id': doc_id, - 'snippet': '', - 'text': sentence - }) - doc_text_df = pd.DataFrame(doc_sentence_list) - return doc_text_df - - @storage_cached('web', 'search_text') - def query_and_get_answer(self, search_text): - cache_path = Path(self.config.get('cache').get('path')) - # TODO: strategy pattern to support different text sources (e.g. PDF later) - bing_text_df = self._extract_bing_text_df(search_text, cache_path) - doc_text_df = self._extract_doc_text_df(bing_text_df) - text_df = pd.concat([bing_text_df, doc_text_df], ignore_index=True) - - return self._prompt(search_text, text_df, cache_path) diff --git a/src/SemanticSearchService.py b/src/SemanticSearchService.py index 6812a1e..40e4b11 100644 --- a/src/SemanticSearchService.py +++ b/src/SemanticSearchService.py @@ -1,5 +1,6 @@ import openai import pandas as pd +import re from openai.embeddings_utils import cosine_similarity from Util import setup_logger @@ -104,7 +105,7 @@ # self.provider = self.config.get('semantic_search').get('provider') # self.embeddings = None # if self.provider == 'faiss-openai': -# self.embeddings = OpenAIEmbeddings(openai_api_key=self.config.get('openai_api').get('api_key')) +# self.embeddings = OpenAIEmbeddings(openai_api_key=self.config.get('llm_service').get('openai_api').get('api_key')) # elif self.provider == 'faiss-huggingface': # self.embeddings = HuggingFaceEmbeddings() # else: @@ -164,7 +165,7 @@ class BatchOpenAISemanticSearchService: def __init__(self, config): self.config = config - openai.api_key = config.get('openai_api').get('api_key') + openai.api_key = config.get('llm_service').get('openai_api').get('api_key') @staticmethod def batch_call_embeddings(texts, chunk_size=1000): @@ -186,7 +187,7 @@ def compute_embeddings_for_text_df(text_df: pd.DataFrame): return text_df def search_related_source(self, text_df: pd.DataFrame, target_text, n=30): - if not self.config.get('search_option').get('is_use_source'): + if not self.config.get('source_service').get('is_use_source'): col = ['name', 'url', 'url_id', 'snippet', 'text', 'similarities', 'rank', 'docno'] return pd.DataFrame(columns=col) @@ -199,6 +200,21 @@ def search_related_source(self, text_df: pd.DataFrame, target_text, n=30): result_df['docno'] = range(1, len(result_df) + 1) return result_df + @staticmethod + def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit): + # strip pre-existing citation markers such as [1], [2], [3] from the source text + gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x)) + + gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x)) + gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum() + max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1 + gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank # also keep the first row that crosses prompt_length_limit + # renumber url_id over the URLs that remain in scope + url_id_list = gpt_input_text_df['url_id'].unique() + url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1))) + gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map) + return gpt_input_text_df + if __name__ == '__main__': pass diff --git a/src/SourceService.py b/src/SourceService.py new file mode 100644 index 0000000..86561a1 --- /dev/null +++ b/src/SourceService.py @@ -0,0 +1,59 @@ +import glob +import os + +import pandas as pd + +from BingService import BingService +from Util import setup_logger +from text_extract.doc import support_doc_type, doc_extract_svc_map +from text_extract.doc.abc_doc_extract import AbstractDocExtractSvc + +logger = setup_logger('SourceModule') + + +class SourceService: + def __init__(self, config): + self.config = config + + def extract_bing_text_df(self, search_text): + # BingSearch using search_text + # check if bing search result is cached and load if exists + bing_text_df = None + if not self.config['source_service']['is_use_source'] or not self.config['source_service']['is_enable_bing_search']: + return bing_text_df + + bing_service = BingService(self.config) + website_df = bing_service.call_bing_search_api(search_text=search_text) + bing_text_df = bing_service.call_urls_and_extract_sentences_concurrent(website_df=website_df) + + return bing_text_df + + def extract_doc_text_df(self, bing_text_df): + # DocSearch using doc_search_path + # bing_text_df is used for doc_id arrangement + if not self.config['source_service']['is_use_source'] or not self.config['source_service']['is_enable_doc_search']: + return pd.DataFrame([]) + files_grabbed = list() + for doc_type in support_doc_type: + tmp_file_list = glob.glob(self.config['source_service']['doc_search_path'] + os.sep + "*." + doc_type) + files_grabbed.extend({"file_path": file_path, "doc_type": doc_type} for file_path in tmp_file_list) + + logger.info(f"File list: {files_grabbed}") + doc_sentence_list = list() + + start_doc_id = 1 if bing_text_df is None else bing_text_df['url_id'].max() + 1 + for doc_id, file in enumerate(files_grabbed, start=start_doc_id): + extract_svc: AbstractDocExtractSvc = doc_extract_svc_map[file['doc_type']] + sentence_list = extract_svc.extract_from_doc(file['file_path']) + + file_name = file['file_path'].split(os.sep)[-1] + for sentence in sentence_list: + doc_sentence_list.append({ + 'name': file_name, + 'url': file['file_path'], + 'url_id': doc_id, + 'snippet': '', + 'text': sentence + }) + doc_text_df = pd.DataFrame(doc_sentence_list) + return doc_text_df diff --git a/src/Util.py b/src/Util.py index 19e9890..eb52b56 100644 --- a/src/Util.py +++ b/src/Util.py @@ -23,21 +23,6 @@ def setup_logger(tag): return logger -def post_process_gpt_input_text_df(gpt_input_text_df, prompt_length_limit): - # clean out of prompt texts for existing [1], [2], [3]...
in the source_text - gpt_input_text_df['text'] = gpt_input_text_df['text'].apply(lambda x: re.sub(r'\[[0-9]+\]', '', x)) - - gpt_input_text_df['len_text'] = gpt_input_text_df['text'].apply(lambda x: len(x)) - gpt_input_text_df['cumsum_len_text'] = gpt_input_text_df['len_text'].cumsum() - max_rank = gpt_input_text_df[gpt_input_text_df['cumsum_len_text'] <= prompt_length_limit]['rank'].max() + 1 - gpt_input_text_df['in_scope'] = gpt_input_text_df['rank'] <= max_rank # In order to get also the row slightly larger than prompt_length_limit - # reorder url_id with url that in scope. - url_id_list = gpt_input_text_df['url_id'].unique() - url_id_map = dict(zip(url_id_list, range(1, len(url_id_list) + 1))) - gpt_input_text_df['url_id'] = gpt_input_text_df['url_id'].map(url_id_map) - return gpt_input_text_df - - def save_result_cache(path: Path, hash: str, type: str, **kwargs): cache_dir = path / type os.makedirs(cache_dir, exist_ok=True) @@ -57,7 +42,7 @@ def check_result_cache_exists(path: Path, hash: str, type: str) -> bool: return True if os.path.exists(path) else False -def check_max_number_of_cache(path: Path, type: str, max_number_of_cache: int = 10): +def check_max_number_of_cache(path: Path, type, max_number_of_cache: int = 10): path = path / type if len(os.listdir(path)) > max_number_of_cache: ctime_list = [(os.path.getctime(path / file), file) for file in os.listdir(path)] diff --git a/src/config/config.yaml b/src/config/config.yaml index da2a1b4..67a0181 100644 --- a/src/config/config.yaml +++ b/src/config/config.yaml @@ -1,33 +1,33 @@ -search_option: +source_service: is_use_source: true # grounded or not grounded. If not grounded, meaning just Q&A via LLM is_enable_bing_search: true is_enable_doc_search: false doc_search_path: -bing_search: - end_point: https://api.bing.microsoft.com - subscription_key: - result_count: 3 - text_extract: beautifulsoup # beautifulsoup / trafilatura + bing_search: + end_point: https://api.bing.microsoft.com + subscription_key: + result_count: 3 + text_extract: beautifulsoup # beautifulsoup / trafilatura llm_service: provider: openai # openai/goose_ai -openai_api: - api_key: - # model: gpt-3.5-turbo is the best one. Details: https://platform.openai.com/docs/models/gpt-3-5. -# model: text-babbage-001 -# model: text-curie-001 -# model: text-davinci-003 - model: gpt-3.5-turbo # default - max_tokens: 300 - temperature: 1 - top_p: 1 - prompt: - prompt_length_limit: 3000 -goose_ai_api: - api_key: - api_base: https://api.goose.ai/v1 - # https://goose.ai/docs/models - model: gpt-neo-20b - max_tokens: 100 + openai_api: + api_key: + # model: gpt-3.5-turbo is the best one. Details: https://platform.openai.com/docs/models/gpt-3-5. 
+ # model: text-babbage-001 + # model: text-curie-001 + # model: text-davinci-003 + model: gpt-3.5-turbo # default + max_tokens: 300 + temperature: 1 + top_p: 1 + prompt: + prompt_length_limit: 3000 + goose_ai_api: + api_key: + api_base: https://api.goose.ai/v1 + # https://goose.ai/docs/models + model: gpt-neo-20b + max_tokens: 100 cache: # .cache result for efficiency and consistency is_enable: web: false diff --git a/src/main.py b/src/main.py index 33d409a..36b86aa 100644 --- a/src/main.py +++ b/src/main.py @@ -4,6 +4,6 @@ search_text = 'the source of dark energy' search_gpt_service = SearchGPTService() - response_text, source_text, data_json = search_gpt_service.query_and_get_answer(search_text) + response_text, source_text, data_json = search_gpt_service.query_and_get_answer(search_text=search_text) print() diff --git a/src/website/templates/base.html b/src/website/templates/base.html index 317d90d..983868f 100644 --- a/src/website/templates/base.html +++ b/src/website/templates/base.html @@ -30,7 +30,7 @@
- SearchGPT 20230311 Version ( Github ). Your feedback will help us to improve
+ SearchGPT 20230311_2 Version ( Github ). Your feedback will help us to improve
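Note on the config restructure that runs through this diff: every former top-level key in config.yaml now sits under either source_service or llm_service, so each call site gains exactly one extra lookup for the new parent key. Below is a minimal sketch of the resulting access pattern, assuming a dict shaped like the new src/config/config.yaml; the key values are placeholders, not working credentials.

# Sketch only: mirrors the nested layout of the new config.yaml.
config = {
    'source_service': {
        'is_use_source': True,
        'is_enable_bing_search': True,
        'is_enable_doc_search': False,
        'bing_search': {
            'end_point': 'https://api.bing.microsoft.com',
            'subscription_key': '<your-bing-key>',   # placeholder
            'result_count': 3,
            'text_extract': 'beautifulsoup',
        },
    },
    'llm_service': {
        'provider': 'openai',
        'openai_api': {
            'api_key': '<your-openai-key>',           # placeholder
            'model': 'gpt-3.5-turbo',
            'prompt': {'prompt_length_limit': 3000},
        },
    },
}

# Each lookup gains one extra .get() for the new parent key:
result_count = config.get('source_service').get('bing_search').get('result_count')  # was config.get('bing_search')...
prompt_limit = config.get('llm_service').get('openai_api').get('prompt').get('prompt_length_limit')  # was config.get('openai_api')...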
diff --git a/src/website/templates/explain_result.html b/src/website/templates/explain_result.html index 110be57..afbfdfd 100644 --- a/src/website/templates/explain_result.html +++ b/src/website/templates/explain_result.html @@ -15,7 +15,7 @@

{{search_text}}

{% for item in response_explain_json %} {% if item['type'] == 'newline' %} -
+
{% elif item['type'] == 'keyword' %} {{ item['text'] }} {% else %} @@ -28,7 +28,7 @@

{{search_text}}

{% for item in source_explain_json %} {% if item['type'] == 'newline' %} -
+
{% elif item['type'] == 'keyword' %} {{ item['text'] }} {% else %} diff --git a/src/website/templates/index.html b/src/website/templates/index.html index 7af63b8..cfacb5f 100644 --- a/src/website/templates/index.html +++ b/src/website/templates/index.html @@ -30,14 +30,14 @@

- +
{% if request.args.get('is_use_source', 'true') != 'False' %} {% else %} {% endif %} -
@@ -54,12 +54,10 @@
diff --git a/src/website/views.py b/src/website/views.py index 406b34e..9f2fa64 100644 --- a/src/website/views.py +++ b/src/website/views.py @@ -35,6 +35,7 @@ def index_page(): error = None data_json = {'response_json': [], 'source_json': []} search_text = request.values.get('q') + try: ui_overriden_config = { 'bing_search_subscription_key': request.values.get('bing_search_subscription_key'), @@ -47,7 +48,7 @@ def index_page(): if search_text is not None: search_gpt_service = SearchGPTService(ui_overriden_config) - _, _, data_json = search_gpt_service.query_and_get_answer(search_text) + _, _, data_json = search_gpt_service.query_and_get_answer(search_text=search_text) except Exception as e: error = str(e)
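End to end, the refactor keeps a single entry point: SearchGPTService.query_and_get_answer now builds the text_df via the new SourceService instead of private helper methods. Below is a minimal usage sketch of the new flow, following the updated src/main.py and the SearchGPTService class docstring; it assumes config.yaml has valid Bing and OpenAI keys filled in and that the script runs from src/ so the module import resolves.

from SearchGPTService import SearchGPTService

# Internally: SourceService gathers bing_text_df + doc_text_df,
# BatchOpenAISemanticSearchService ranks and trims them to the
# prompt length limit, and the configured LLMService provider
# (openai / goose_ai) produces the answer.
search_gpt_service = SearchGPTService()
response_text, source_text, data_json = search_gpt_service.query_and_get_answer(
    search_text='the source of dark energy')
print(response_text)  # answer text with [n] footnotes
print(source_text)    # the per-URL source sentences behind each footnote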