Merge pull request #17 from uogbuji/16-better-vector-db
Refactor qdrant vector DB interface
uogbuji committed Jul 20, 2023
2 parents 372f121 + e81ff3e commit 3f2ca21
Showing 12 changed files with 559 additions and 406 deletions.
17 changes: 14 additions & 3 deletions demo/README.md
@@ -2,7 +2,7 @@

## alpaca_simple_fix_xml.py

Quick demo, sending an Alpaca-compatible LLm some bad XML & asking it to make corrections.
Quick demo, sending an Alpaca-compatible LLM some bad XML & asking it to make corrections.

# Intermediate

@@ -30,12 +30,23 @@ Demonstrates:
consideration until more server-side LLM hosting frameworks reliably
support multiprocessing

## chat_web_selects.py

Simple, command-line "chat my web site" demo, but supporting self-hosted LLM.

Definitely a good idea for you to understand demos/alpaca_multitask_fix_xml.py
before swapping this in.

Vector store: Qdrant - https://qdrant.tech/

Supports multiple web URLs, specified on cmdline
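
For example, a typical invocation might look like the following (a sketch: assumes a self-hosted, OpenAI-compatible server at my-llm-host, port 8000; join multiple URLs with `|`):

```sh
python demo/chat_web_selects.py --host http://my-llm-host --port 8000 \
    "www.newworldencyclopedia.org/entry/Igbo_People|en.wikipedia.org/wiki/Igbo_people"
```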

## chat_pdf_streamlit_ui.py

<img width="970" alt="image" src="https://github.com/uogbuji/OgbujiPT/assets/279982/57b479a9-2dbc-4d65-ac19-e954df2a21d0">

"Chat my PDF" demo, but using self-hosted LLM. Definitely a good idea for you to understand
demos/alpaca_multitask_fix_xml.py
"Chat my PDF" demo, supporting self-hosted LLM. Definitely a good idea for you to understand
alpaca_multitask_fix_xml.py & chat_web_selects.py
before swapping this in.

UI: Streamlit - streamlit.io
7 changes: 4 additions & 3 deletions demo/alpaca_simple_fix_xml.py
@@ -14,7 +14,7 @@

from ogbujipt.config import openai_live, openai_emulation
from ogbujipt.prompting.basic import context_build
from ogbujipt.prompting.model_style import VICUNA_DELIMITERS
from ogbujipt.prompting.model_style import ALPACA_INSTRUCT_INPUT_DELIMITERS


# Command line arguments defined in click decorators
@@ -44,10 +44,11 @@ def main(host, port, llmtemp, openai, model):
</Earth>'''

prompt = context_build(
f'Correct the following XML to make it well-formed\n\n{BAD_XML_CODE}',
'Correct the given XML to make it well-formed',
contexts= BAD_XML_CODE,
preamble='You are a helpful assistant, '
'who answers questions briefly, in 1st grade language',
delimiters=VICUNA_DELIMITERS)
delimiters=ALPACA_INSTRUCT_INPUT_DELIMITERS)
print(prompt, '\n')

response = openai_api.Completion.create(
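Aside on the delimiter switch above: Alpaca-style "instruct with input" prompting keeps the instruction and the input (here, the bad XML) in separate sections. A rough illustration of the shape such a prompt takes; the exact template is defined by ALPACA_INSTRUCT_INPUT_DELIMITERS in ogbujipt.prompting.model_style, so treat this sketch as illustrative rather than the library's literal output:

```python
# Illustrative only: approximates the Alpaca instruction+input layout;
# the real template comes from ALPACA_INSTRUCT_INPUT_DELIMITERS
preamble = ('You are a helpful assistant, '
            'who answers questions briefly, in 1st grade language')
instruction = 'Correct the given XML to make it well-formed'
bad_xml = '<Earth>\n    <continent>Africa</continent>\n</Earth>'

prompt = (
    f'{preamble}\n\n'
    f'### Instruction:\n{instruction}\n\n'
    f'### Input:\n{bad_xml}\n\n'
    f'### Response:\n'
)
print(prompt)
```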
298 changes: 147 additions & 151 deletions demo/chat_pdf_streamlit_ui.py

Large diffs are not rendered by default.

181 changes: 181 additions & 0 deletions demo/chat_web_selects.py
@@ -0,0 +1,181 @@
'''
Advanced, "Chat my docs" demo, using docs from the web
Download one or more web pages and query an LLM using them as context.
Works especially well with airoboros self-hosted LLM.
Vector store: Qdrant - https://qdrant.tech/
Alternatives: pgvector, Chroma, Faiss, Weaviate, etc.
Text to vector (embedding) model:
Alternatives: https://www.sbert.net/docs/pretrained_models.html / OpenAI ada002
Needs access to an OpenAI-like service. Default assumption is self-hosted
via e.g. llama-cpp-python or text-generation-webui
Assume for the following it's at host my-llm-host, port 8000
pip install prerequisites, in addition to OgbujiPT cloned dir:
click sentence_transformers qdrant-client httpx html2text amara3.xml
```sh
python demo/chat_web_selects.py --host http://my-llm-host --port 8000 "www.newworldencyclopedia.org/entry/Igbo_People"
```
An example question might be "Who are the neighbors of the Igbo people?"
'''
# en.wikipedia.org/wiki/Igbo_people|ahiajoku.igbonet.com/2000/|en.wikivoyage.org/wiki/Igbo_phrasebook"
import asyncio
import os

import click
import httpx
import html2text

from ogbujipt import config
from ogbujipt.prompting import format, ALPACA_INSTRUCT_DELIMITERS
from ogbujipt.async_helper import schedule_openai_call, openai_api_surrogate
from ogbujipt import oapi_choice1_text
from ogbujipt.text_helper import text_splitter
from ogbujipt.embedding_helper import qdrant_collection

# Avoid re-entrance complaints from huggingface/tokenizers
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
DOC_EMBEDDINGS_LLM = 'all-MiniLM-L12-v2'

COLLECTION_NAME = 'chat-web-selects'
USER_PROMPT = 'What do you want to know from the site(s)?\n'

# Hard-code for demo
EMBED_CHUNK_SIZE = 200
EMBED_CHUNK_OVERLAP = 20
DOTS_SPACING = 0.2 # Number of seconds between each dot printed to console


async def indicate_progress(pause=DOTS_SPACING):
while True:
print('.', end='', flush=True)
await asyncio.sleep(pause)


async def read_site(url, collection):
# Crude check; good enough for demo
if not url.startswith('http'): url = 'https://' + url # noqa E701
print('Downloading & processing', url)
async with httpx.AsyncClient(verify=False) as client:
resp = await client.get(url)
html = resp.content.decode(resp.encoding or 'utf-8')

text = html2text.html2text(html)

# Split text into chunks
chunks = text_splitter(text, chunk_size=EMBED_CHUNK_SIZE,
chunk_overlap=EMBED_CHUNK_OVERLAP, separator='\n')

# print('\n\n'.join([ch[:100] for ch in chunks]))
# Crude—for demo. Set URL metadata for all chunks to doc URL
metas = [{'url': url}]*len(chunks)
# Add the text to the collection. Blocks, so no reentrancy concern
collection.update(texts=chunks, metas=metas)
print(f'{collection.count()} chunks added to collection')


async def async_main(sites, api_params):
# Automatic download from HuggingFace
# Seem to be reentrancy issues with HuggingFace; defer import
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM)
# Sites fuel in-memory Qdrant vector DB instance
collection = qdrant_collection(COLLECTION_NAME, embedding_model)

url_task_group = asyncio.gather(*[
asyncio.create_task(read_site(site, collection)) for site in sites.split('|')])
indicator_task = asyncio.create_task(indicate_progress())
tasks = [indicator_task, url_task_group]
done, _ = await asyncio.wait(
tasks, return_when=asyncio.FIRST_COMPLETED)

done = False
while not done:
print()
user_question = input(USER_PROMPT)
if user_question.strip() == 'done':
break

docs = collection.search(user_question, limit=4)

print(docs)
if docs:
# Collects "chunked_doc" into "gathered_chunks"
gathered_chunks = '\n\n'.join(
doc.payload['_text'] for doc in docs if doc.payload)

# Build prompt with the doc chunks as context
prompt = format(
f'Given the context, {user_question}\n\n'
f'Context: """\n{gathered_chunks}\n"""\n',
preamble='### SYSTEM:\nYou are a helpful assistant, who answers '
'questions directly and as briefly as possible. '
'If you cannot answer with the given context, just say so.\n',
delimiters=ALPACA_INSTRUCT_DELIMITERS)

print(prompt)

# The rest is much like in demo/alpaca_multitask_fix_xml.py
model_params = dict(
max_tokens=1024, # Limit number of generated tokens
top_p=1, # AKA nucleus sampling; can increase generated text diversity
frequency_penalty=0, # Favor more or less frequent tokens
presence_penalty=1, # Prefer new, previously unused tokens
temperature=0.1
)

indicator_task = asyncio.create_task(indicate_progress())
llm_task = asyncio.create_task(
schedule_openai_call(openai_api_surrogate, prompt, **model_params))
tasks = [indicator_task, llm_task]
done, _ = await asyncio.wait(
tasks, return_when=asyncio.FIRST_COMPLETED)

# Instance of openai.openai_object.OpenAIObject, with lots of useful info
retval = next(iter(done)).result()
print(type(retval))
# Response is a json-like object; extract the text
print('\nFull response data from LLM:\n', retval)

# response is a json-like object;
# just get back the text of the response
response_text = oapi_choice1_text(retval)
print('\nResponse text from LLM:\n\n', response_text)


# Command line arguments defined in click decorators
@click.command()
@click.option('--host', default='http://127.0.0.1', help='OpenAI API host')
@click.option('--port', default='8000', help='OpenAI API port')
@click.option('--openai-key',
help='OpenAI API key. Leave blank to specify self-hosted model via --host & --port')
@click.option('--model', default='', type=str,
help='OpenAI model to use (see https://platform.openai.com/docs/models).'
'Use only with --openai-key')
@click.argument('sites')
def main(host, port, openai_key, model, sites):
# Use OpenAI API if specified, otherwise emulate with supplied host, etc.
if openai_key:
assert not (host or port), 'Don\'t use --host or --port with --openai'
model = model or 'text-davinci-003'
openai_api = config.openai_live(
model=model, debug=True)
else:
# For now the model param is most useful in conjunction with --openai
model = model or config.HOST_DEFAULT
openai_api = config.openai_emulation(
host=host, port=port, model=model, debug=True)

asyncio.run(async_main(sites, openai_api.params))


if __name__ == '__main__':
main()
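
The file above is also the main exercise of the refactored vector DB interface named in the PR title. A condensed sketch of the usage pattern, with names taken from the demo code (signatures live in ogbujipt.embedding_helper; treat this as a summary, not reference documentation):

```python
from sentence_transformers import SentenceTransformer
from ogbujipt.embedding_helper import qdrant_collection

# Embedding model that converts text chunks to vectors
emodel = SentenceTransformer('all-MiniLM-L12-v2')

# In-memory Qdrant collection behind the new interface
coll = qdrant_collection('my-docs', emodel)

# Add chunks plus per-chunk metadata, then query by similarity
coll.update(texts=['Enugu is a city in Nigeria', 'Lagos is a port city'],
            metas=[{'url': 'example-1'}, {'url': 'example-2'}])
print(coll.count())                    # number of chunks stored
for hit in coll.search('Where is Enugu?', limit=1):
    print(hit.payload['_text'])        # original chunk text
```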
4 changes: 3 additions & 1 deletion demo/demo.env
@@ -1,6 +1,8 @@
# Copy to .env in the same dir, and update as needed
# Used by:
# demo/alpaca_simple_qa_discord.py
# demo/chat_pdf_streamlit_ui

# Used by demo/alpaca_simple_qa_discord.py
# DISCORD_TOKEN={REPLACEME}
LLM_HOST=http://{my-llm-host}
# LLM_PORT=8000
39 changes: 35 additions & 4 deletions pylib/async_helper.py
@@ -16,8 +16,7 @@

async def schedule_callable(callable, *args, **kwargs):
'''
TODO: rename me? this is convenent for more than just LLM calls
Schedule task long-running/blocking LLM requests in a separate process,
Schedule long-running/blocking function call in a separate process,
wrapped to work well in an asyncio event loop
Basically hides away a bunch of the multiprocessing webbing
@@ -38,8 +37,40 @@ async def schedule_callable(callable, *args, **kwargs):
# Need to partial execute to get in any kwargs for the target callable
prepped_callable = partial(callable, **kwargs)
# Spawn a separate process for the LLM call
response = await loop.run_in_executor(
executor, prepped_callable, *args)
response = await loop.run_in_executor(executor, prepped_callable, *args)
return response


async def schedule_openai_call(callable, *args, **kwargs):
'''
Schedule long-running/blocking LLM request in a separate process,
wrapped to work well in an asyncio event loop
Basically hides away a bunch of the multiprocessing webbing
e.g. `llm_task = asyncio.create_task(schedule_openai_call(llm, prompt))`
Can then use asyncio.wait(), asyncio.gather(), etc. with `llm_task`
Args:
callable (callable): Callable to be scheduled
Returns:
response: Response object
'''
# Link up the current async event loop for multiprocess execution
loop = asyncio.get_running_loop()
executor = concurrent.futures.ProcessPoolExecutor()
# Need to partial execute to get in any kwargs for the target callable
if 'model' not in kwargs:
kwargs['model'] = ''
prepped_callable = partial(
callable,
api_base=openai.api_base,
api_key=openai.api_key,
**kwargs)
# Spawn a separate process for the LLM call
response = await loop.run_in_executor(executor, prepped_callable, *args)
return response
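
A usage sketch for the new function, following the pattern in demo/chat_web_selects.py (assumes an OpenAI-style endpoint has already been set up via config.openai_live() or config.openai_emulation(); indicate_progress is re-created here just to keep the sketch self-contained):

```python
import asyncio
from ogbujipt.async_helper import schedule_openai_call, openai_api_surrogate

async def indicate_progress(pause=0.5):
    # Print dots while the blocking LLM call runs in its own process
    while True:
        print('.', end='', flush=True)
        await asyncio.sleep(pause)

async def ask_llm(prompt, **model_params):
    llm_task = asyncio.create_task(
        schedule_openai_call(openai_api_surrogate, prompt, **model_params))
    indicator_task = asyncio.create_task(indicate_progress())
    # The LLM task is the only one that can finish, so it's what lands in `done`
    done, _ = await asyncio.wait(
        [indicator_task, llm_task], return_when=asyncio.FIRST_COMPLETED)
    indicator_task.cancel()  # the dots task never finishes on its own
    return next(iter(done)).result()
```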


14 changes: 8 additions & 6 deletions pylib/config.py
@@ -26,6 +26,8 @@ class attr_dict(dict):


def openai_live(
rev='v1',
model='',
apikey=None,
debug=True
):
@@ -48,16 +50,14 @@ def openai_live(
openai_api (openai): Prepared OpenAI API
'''
import openai as openai_api
from dotenv import load_dotenv

load_dotenv()
# openai_api.api_version
openai_api.debug = debug
openai_api.params = attr_dict(
rev=rev,
api_key=apikey,
api_base=openai_api.api_base,
debug=debug
)
model=model,
debug=debug)

return openai_api

@@ -67,7 +67,9 @@ def openai_emulation(
port='8000',
rev='v1',
model=HOST_DEFAULT,
apikey='BOGUS', oaitype='open_ai', debug=True):
apikey='BOGUS',
oaitype='open_ai',
debug=True):
'''
Set up emulation, to use an alternative, OpenAI API-compatible service
Port 8000 for llama-cpp-python, Port 5001 for Oobabooga
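
A sketch of how the demos choose between the two setups (mirrors main() in demo/chat_web_selects.py; host, port and model values here are placeholders):

```python
from ogbujipt import config

# Self-hosted, OpenAI-compatible server, e.g. llama-cpp-python on port 8000
openai_api = config.openai_emulation(
    host='http://my-llm-host', port='8000', model=config.HOST_DEFAULT, debug=True)

# Or, with a real OpenAI API key available in the environment:
# openai_api = config.openai_live(model='text-davinci-003', debug=True)

# openai_api.params is what the demos then hand to worker processes,
# e.g. asyncio.run(async_main(sites, openai_api.params)) in chat_web_selects.py
```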
