diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py
index 9643fbf..1a22e48 100644
--- a/demo/chat_pdf_streamlit_ui.py
+++ b/demo/chat_pdf_streamlit_ui.py
@@ -1,5 +1,5 @@
 '''
-Advanced, "Chat my PDF" demo, but using self-hosted LLM.
+Advanced, "Chat my PDF" demo
 
 Use a PDF document as a knowledge base to provide context for natural language Q&A
 
diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py
new file mode 100644
index 0000000..7be9016
--- /dev/null
+++ b/demo/chat_web_selects.py
@@ -0,0 +1,181 @@
+'''
+Advanced, "Chat my docs" demo, using docs from the web
+
+Download one or more web pages and query an LLM using them as context.
+Works especially well with the airoboros self-hosted LLM.
+
+Vector store: Qdrant - https://qdrant.tech/
+    Alternatives: pgvector, Chroma, Faiss, Weaviate, etc.
+Text to vector (embedding) model:
+    Alternatives: https://www.sbert.net/docs/pretrained_models.html / OpenAI ada002
+
+Needs access to an OpenAI-like service. Default assumption is self-hosted,
+via e.g. llama-cpp-python or text-generation-webui
+
+Assume for the following it's at host my-llm-host, port 8000
+
+Prerequisites to pip install, in addition to the cloned OgbujiPT dir:
+
+click sentence_transformers qdrant-client httpx html2text
+
+```sh
+python demo/chat_web_selects.py "www.newworldencyclopedia.org/entry/Igbo_People"
+```
+'''
+# en.wikipedia.org/wiki/Igbo_people|ahiajoku.igbonet.com/2000/|en.wikivoyage.org/wiki/Igbo_phrasebook
+import asyncio
+import os
+
+import click
+import httpx
+import html2text
+
+from ogbujipt import config
+from ogbujipt.prompting import format, ALPACA_INSTRUCT_DELIMITERS
+from ogbujipt.async_helper import schedule_openai_call, openai_api_surrogate
+from ogbujipt import oapi_choice1_text
+from ogbujipt.text_helper import text_splitter
+from ogbujipt.embedding_helper import qdrant_collection
+
+# Avoid re-entrance complaints from huggingface/tokenizers
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+# Default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+DOC_EMBEDDINGS_LLM = 'all-MiniLM-L6-v2'
+
+COLLECTION_NAME = 'chat-web-selects'
+USER_PROMPT = 'What do you want to know from these sites?:'
+
+# Hard-code for demo
+EMBED_CHUNK_SIZE = 200
+EMBED_CHUNK_OVERLAP = 20
+DOTS_SPACING = 0.5  # Number of seconds between each dot printed to console
+
+
+async def indicate_progress(pause=DOTS_SPACING):
+    while True:
+        print('.', end='', flush=True)
+        await asyncio.sleep(pause)
+
+
+async def read_site(url, collection):
+    # Crude check; good enough for demo
+    if not url.startswith('http'): url = 'https://' + url  # noqa E701
+    print('Downloading & processing', url)
+    async with httpx.AsyncClient(verify=False) as client:
+        resp = await client.get(url)
+        html = resp.content.decode(resp.encoding)
+
+    # with open('/tmp/ahiajoku.igbonet.com-2000.html') as fp:
+    #     html = fp.read()
+
+    text = html2text.html2text(html)
+
+    # Split text into chunks
+    chunks = text_splitter(text, chunk_size=EMBED_CHUNK_SIZE,
+                           chunk_overlap=EMBED_CHUNK_OVERLAP, separator='\n')
+
+    # print('\n\n'.join([ch[:100] for ch in chunks]))
+    # Crude, for demo. Set URL metadata for all chunks to doc URL
+    metas = [{'url': url}]*len(chunks)
+    # Add the text to the collection. Blocks, so no reentrancy concern
+    collection.add(texts=chunks, metas=metas)
+
+
+async def async_main(sites, api_params):
+    # Automatic download from HuggingFace
+    # There seem to be reentrancy issues with HuggingFace; defer import
+    from sentence_transformers import SentenceTransformer
+    embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM)
+    # Sites fuel in-memory Qdrant vector DB instance
+    collection = qdrant_collection(COLLECTION_NAME, embedding_model)
+
+    url_task_group = asyncio.gather(*[
+        asyncio.create_task(read_site(site, collection)) for site in sites.split('|')])
+    indicator_task = asyncio.create_task(indicate_progress())
+    tasks = [indicator_task, url_task_group]
+    done, _ = await asyncio.wait(
+        tasks, return_when=asyncio.FIRST_COMPLETED)
+
+    done = False
+    while not done:
+        user_question = input(USER_PROMPT)
+        if not user_question.strip():  # Exit on empty input
+            break
+
+        docs = collection.search(user_question, limit=4)
+
+        print(docs)
+        if docs:
+            # Gather the retrieved chunks into a single context string
+            gathered_chunks = '\n\n'.join(
+                doc.payload['_text'] for doc in docs
+                )
+
+            # Build prompt with the doc chunks as context
+            prompt = format(
+                f'Given the context, {user_question}\n\n'
+                f'Context: """\n{gathered_chunks}\n"""\n',
+                preamble='### SYSTEM:\nYou are a helpful assistant, who answers '
+                'questions directly and as briefly as possible. '
+                'If you cannot answer with the given context, just say so.\n',
+                delimiters=ALPACA_INSTRUCT_DELIMITERS)
+
+            print(prompt)
+
+            # The rest is much like in alpaca_multitask_fix_xml.py
+            model_params = dict(
+                max_tokens=1024,  # Limit number of generated tokens
+                top_p=1,  # AKA nucleus sampling; can increase generated text diversity
+                frequency_penalty=0,  # Favor more or less frequent tokens
+                presence_penalty=1,  # Prefer new, previously unused tokens
+                temperature=0.1
+                )
+
+            indicator_task = asyncio.create_task(indicate_progress())
+            llm_task = asyncio.create_task(
+                schedule_openai_call(openai_api_surrogate, prompt, **model_params))
+            tasks = [indicator_task, llm_task]
+            done, _ = await asyncio.wait(
+                tasks, return_when=asyncio.FIRST_COMPLETED)
+
+            # Instance of openai.openai_object.OpenAIObject, with lots of useful info
+            retval = next(iter(done)).result()
+            print(type(retval))
+            # Show the full response object for reference
+            print('\nFull response data from LLM:\n', retval)
+
+            # Response is a JSON-like object;
+            # just get back the text of the response
+            response_text = oapi_choice1_text(retval)
+            print('\nResponse text from LLM:\n\n', response_text)
+
+
+# Command line arguments defined in click decorators
+@click.command()
+@click.option('--host', default='http://127.0.0.1', help='OpenAI API host')
+@click.option('--port', default='8000', help='OpenAI API port')
+@click.option('--openai-key',
+              help='OpenAI API key. Leave blank to specify self-hosted model via --host & --port')
+@click.option('--model', default='', type=str,
+              help='OpenAI model to use (see https://platform.openai.com/docs/models). '
+              'Use only with --openai-key')
+@click.argument('sites')
+def main(host, port, openai_key, model, sites):
+    # Use OpenAI API if specified, otherwise emulate with supplied host, etc.
+    if openai_key:
+        assert not (host or port), 'Don\'t use --host or --port with --openai-key'
+        model = model or 'text-davinci-003'
+        openai_api = config.openai_live(
+            model=model, debug=True)
+    else:
+        # For now the model param is most useful in conjunction with --openai-key
+        model = model or config.HOST_DEFAULT
+        openai_api = config.openai_emulation(
+            host=host, port=port, model=model, debug=True)
+
+    asyncio.run(async_main(sites, openai_api.params))
+
+
+if __name__ == '__main__':
+    main()
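Reviewer note: the retrieval flow in the new demo reduces to a handful of library calls. The following is a minimal sketch using only names that appear above (text_splitter, qdrant_collection and its add/search methods, the '_text' payload key); the collection name and sample text are placeholders, so treat this as an illustration of intended usage, not a definitive reference.

```python
# Minimal sketch of the chunk -> embed -> search flow used by chat_web_selects.py
from sentence_transformers import SentenceTransformer

from ogbujipt.text_helper import text_splitter
from ogbujipt.embedding_helper import qdrant_collection

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
collection = qdrant_collection('scratch-web-docs', embedding_model)

page_text = 'Pretend this is the html2text output of a downloaded page...'
chunks = text_splitter(page_text, chunk_size=200, chunk_overlap=20, separator='\n')
# Same metadata convention as read_site(): tag every chunk with its source URL
collection.add(texts=chunks, metas=[{'url': 'https://example.com'}] * len(chunks))

for hit in collection.search('What is this page about?', limit=4):
    print(hit.payload['url'], '|', hit.payload['_text'][:80])
```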
diff --git a/pylib/async_helper.py b/pylib/async_helper.py
index a8c14e4..922436d 100644
--- a/pylib/async_helper.py
+++ b/pylib/async_helper.py
@@ -16,8 +16,7 @@ async def schedule_callable(callable, *args, **kwargs):
     '''
-    TODO: rename me? this is convenent for more than just LLM calls
-    Schedule task long-running/blocking LLM requests in a separate process,
+    Schedule long-running/blocking function call in a separate process,
     wrapped to work well in an asyncio event loop
 
     Basically hides away a bunch of the multiprocessing webbing
@@ -38,8 +37,40 @@ async def schedule_callable(callable, *args, **kwargs):
     # Need to partial execute to get in any kwargs for the target callable
     prepped_callable = partial(callable, **kwargs)
     # Spawn a separate process for the LLM call
-    response = await loop.run_in_executor(
-        executor, prepped_callable, *args)
+    response = await loop.run_in_executor(executor, prepped_callable, *args)
+    return response
+
+
+async def schedule_openai_call(callable, *args, **kwargs):
+    '''
+    Schedule long-running/blocking LLM request in a separate process,
+    wrapped to work well in an asyncio event loop
+
+    Basically hides away a bunch of the multiprocessing webbing
+
+    e.g. `llm_task = asyncio.create_task(schedule_openai_call(llm, prompt))`
+
+    Can then use asyncio.wait(), asyncio.gather(), etc. with `llm_task`
+
+    Args:
+        callable (callable): Callable to be scheduled
+
+    Returns:
+        response: Response object
+    '''
+    # Link up the current async event loop for multiprocess execution
+    loop = asyncio.get_running_loop()
+    executor = concurrent.futures.ProcessPoolExecutor()
+    # Need to partial execute to get in any kwargs for the target callable
+    if 'model' not in kwargs:
+        kwargs['model'] = ''
+    prepped_callable = partial(
+        callable,
+        api_base=openai.api_base,
+        api_key=openai.api_key,
+        **kwargs)
+    # Spawn a separate process for the LLM call
+    response = await loop.run_in_executor(executor, prepped_callable, *args)
+    return response
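Reviewer note: the demo drives the new schedule_openai_call() via asyncio.create_task()/asyncio.wait(), so other tasks (like its dot-printing progress indicator) can run while the blocking request executes in a separate process. Below is a stripped-down sketch of that pattern; the host/port are placeholders, and it assumes config.openai_emulation() has pointed the openai module at a self-hosted server, as main() in the demo does.

```python
import asyncio

from ogbujipt import config
from ogbujipt.async_helper import schedule_openai_call, openai_api_surrogate


async def ask(prompt):
    # Runs the blocking call in a separate process; other tasks could be
    # gathered/awaited alongside llm_task, as async_main() does in the demo
    llm_task = asyncio.create_task(
        schedule_openai_call(openai_api_surrogate, prompt, max_tokens=128))
    done, _ = await asyncio.wait([llm_task])
    return next(iter(done)).result()


if __name__ == '__main__':
    # Placeholder host/port: wherever your llama-cpp-python or Ooba server listens
    config.openai_emulation(host='http://my-llm-host', port='8000')
    print(asyncio.run(ask('Say hello')))
```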
diff --git a/pylib/config.py b/pylib/config.py
index e3be66c..15843a0 100644
--- a/pylib/config.py
+++ b/pylib/config.py
@@ -26,6 +26,8 @@ class attr_dict(dict):
 
 
 def openai_live(
+        rev='v1',
+        model='',
         apikey=None,
         debug=True
        ):
@@ -48,16 +50,14 @@ def openai_live(
         openai_api (openai): Prepared OpenAI API
     '''
     import openai as openai_api
-    from dotenv import load_dotenv
-    load_dotenv()
     # openai_api.api_version
     openai_api.debug = debug
     openai_api.params = attr_dict(
+        rev=rev,
         api_key=apikey,
-        api_base=openai_api.api_base,
-        debug=debug
-        )
+        model=model,
+        debug=debug)
     return openai_api
 
 
@@ -67,7 +67,9 @@ def openai_emulation(
         port='8000',  # llama-cpp-python; for Ooba, use '5001'
         rev='v1',
         model=HOST_DEFAULT,
-        apikey='BOGUS', oaitype='open_ai', debug=True):
+        apikey='BOGUS',
+        oaitype='open_ai',
+        debug=True):
     '''
     Set up emulation, to use a alternative, OpenAI API compatible service
diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py
index 2445358..8e1d5d0 100644
--- a/pylib/embedding_helper.py
+++ b/pylib/embedding_helper.py
@@ -113,10 +113,11 @@ def upsert(self, texts, metas=None):
         '''
         Update/insert a Qdrant client's collection with the some chunks of text
 
         Args:
-            chunks (List[str]): List of similar length strings to embed
+            texts (List[str]): Strings to be stored and indexed. For best results these should be
+            of similar length. They'll be converted to embeddings for efficient lookup
 
-            embedding (SentenceTransformer): SentenceTransformer object of your choice
-            SentenceTransformer](https://huggingface.co/sentence-transformers)
+            metas (List[dict]): Optional metadata per text, stored with the text and included
+            whenever the text is retrieved via search/query
         '''
         current_count = int(str(self.db.count(self.name)).partition('=')[-1])
         metas = metas or []
@@ -139,3 +140,16 @@ def upsert(self, texts, metas=None):
                 )
             ]
         )
+
+    def search(self, text, **kwargs):
+        '''
+        Perform a search on this Qdrant collection
+
+        Args:
+            text (str): string to compare against items in the collection
+
+            kwargs: other args to be passed to qdrant_client.QdrantClient.search(). Common ones:
+                limit - maximum number of results to return (useful for top-k query)
+        '''
+        embedded_text = self._embedding_model.encode(text)
+        return self.db.search(collection_name=self.name, query_vector=embedded_text, **kwargs)
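Reviewer note: the new qdrant_collection.search() method is what gives the demos their top-k retrieval. A small, hypothetical usage sketch follows; the texts are invented, and the limit kwarg plus the '_text' payload key are as used in chat_web_selects.py.

```python
from sentence_transformers import SentenceTransformer

from ogbujipt.embedding_helper import qdrant_collection

coll = qdrant_collection('tiny-demo', SentenceTransformer('all-MiniLM-L6-v2'))
coll.add(texts=['Qdrant is a vector database',
                'html2text converts HTML pages to Markdown-ish text'],
         metas=[{'topic': 'storage'}, {'topic': 'parsing'}])

# Top-1 semantic match; other kwargs pass straight through to QdrantClient.search()
hits = coll.search('Which tool stores vectors?', limit=1)
print(hits[0].payload['_text'])
```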