[#16] Fixes to async_helper module. Add embedding_helper.qdrant_collection.search() method. Add demo/chat_web_selects.py demo 1st draft
Showing 5 changed files with 242 additions and 14 deletions.
@@ -0,0 +1,181 @@
'''
Advanced, "Chat my docs" demo, using docs from the web

Download one or more web pages and query an LLM using them as context.
Works especially well with airoboros self-hosted LLM.

Vector store: Qdrant - https://qdrant.tech/
    Alternatives: pgvector, Chroma, Faiss, Weaviate, etc.
Text to vector (embedding) model:
    Alternatives: https://www.sbert.net/docs/pretrained_models.html / OpenAI ada002

Needs access to an OpenAI-like service. Default assumption is self-hosted,
via e.g. llama-cpp-python or text-generation-webui.
Assume for the following it's at host my-llm-host, port 8000.

pip install prerequisites, in addition to the OgbujiPT cloned dir:

    click sentence_transformers qdrant-client httpx html2text

```sh
python demo/chat_web_selects.py "www.newworldencyclopedia.org/entry/Igbo_People"
```
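
To point the demo at the self-hosted server mentioned above (my-llm-host is just a placeholder hostname), use the --host and --port options defined in the click decorators below, for example:

```sh
python demo/chat_web_selects.py --host=http://my-llm-host --port=8000 "www.newworldencyclopedia.org/entry/Igbo_People"
```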
'''
# en.wikipedia.org/wiki/Igbo_people|ahiajoku.igbonet.com/2000/|en.wikivoyage.org/wiki/Igbo_phrasebook"
import asyncio
import os

import click
import httpx
import html2text

from ogbujipt import config
from ogbujipt.prompting import format, ALPACA_INSTRUCT_DELIMITERS
from ogbujipt.async_helper import schedule_openai_call, openai_api_surrogate
from ogbujipt import oapi_choice1_text
from ogbujipt.text_helper import text_splitter
from ogbujipt.embedding_helper import qdrant_collection

# Avoid re-entrance complaints from huggingface/tokenizers
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
DOC_EMBEDDINGS_LLM = 'all-MiniLM-L6-v2'

COLLECTION_NAME = 'chat-web-selects'
USER_PROMPT = 'What do you want to know from these sites?:'

# Hard-code for demo
EMBED_CHUNK_SIZE = 200
EMBED_CHUNK_OVERLAP = 20
DOTS_SPACING = 0.5  # Number of seconds between each dot printed to console


async def indicate_progress(pause=DOTS_SPACING):
    while True:
        print('.', end='', flush=True)
        await asyncio.sleep(pause)


async def read_site(url, collection):
    # Crude check; good enough for demo
    if not url.startswith('http'): url = 'https://' + url  # noqa E701
    print('Downloading & processing', url)
    async with httpx.AsyncClient(verify=False) as client:
        resp = await client.get(url)
        html = resp.content.decode(resp.encoding)

    # with open('/tmp/ahiajoku.igbonet.com-2000.html') as fp:
    #     html = fp.read()

    text = html2text.html2text(html)

    # Split text into chunks
    chunks = text_splitter(text, chunk_size=EMBED_CHUNK_SIZE,
                           chunk_overlap=EMBED_CHUNK_OVERLAP, separator='\n')
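    # The overlap means text near a chunk boundary lands in both neighboring chunks,
    # so a passage split at the boundary can still be retrieved intact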

    # print('\n\n'.join([ch[:100] for ch in chunks]))
    # Crude, for demo: set URL metadata for all chunks to doc URL
    metas = [{'url': url}]*len(chunks)
    # Add the text to the collection. Blocks, so no reentrancy concern
    collection.add(texts=chunks, metas=metas)


async def async_main(sites, api_params):
    # Automatic download from HuggingFace
    # Seem to be reentrancy issues with HuggingFace; defer import
    from sentence_transformers import SentenceTransformer
    embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM)
    # Sites fuel in-memory Qdrant vector DB instance
    collection = qdrant_collection(COLLECTION_NAME, embedding_model)

    url_task_group = asyncio.gather(*[
        asyncio.create_task(read_site(site, collection)) for site in sites.split('|')])
    indicator_task = asyncio.create_task(indicate_progress())
    tasks = [indicator_task, url_task_group]
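    # indicate_progress() loops forever, so FIRST_COMPLETED here effectively waits
    # for the whole download gather to finish, with dots printing in the meantime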
    done, _ = await asyncio.wait(
        tasks, return_when=asyncio.FIRST_COMPLETED)

    done = False
    while not done:
        user_question = input(USER_PROMPT)
        if user_question.strip() == 'done':
            break

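        # Pull back the 4 chunks most similar to the question; each hit's payload
        # carries the chunk text ('_text') and the URL metadata set in read_site()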
        docs = collection.search(user_question, limit=4)

        print(docs)
        if docs:
            # Collect the retrieved chunk texts into gathered_chunks
            gathered_chunks = '\n\n'.join(
                doc.payload['_text'] for doc in docs
            )

            # Build the prompt with the doc chunks as context
            prompt = format(
                f'Given the context, {user_question}\n\n'
                f'Context: """\n{gathered_chunks}\n"""\n',
                preamble='### SYSTEM:\nYou are a helpful assistant, who answers '
                'questions directly and as briefly as possible. '
                'If you cannot answer with the given context, just say so.\n',
                delimiters=ALPACA_INSTRUCT_DELIMITERS)

            print(prompt)

            # The rest is much like in alpaca_multitask_fix_xml.py
            model_params = dict(
                max_tokens=1024,  # Limit number of generated tokens
                top_p=1,  # AKA nucleus sampling; can increase generated text diversity
                frequency_penalty=0,  # Favor more or less frequent tokens
                presence_penalty=1,  # Prefer new, previously unused tokens
                temperature=0.1
                )

            indicator_task = asyncio.create_task(indicate_progress())
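            # schedule_openai_call (from ogbujipt.async_helper) is used here so the
            # OpenAI-style completion can be awaited alongside the progress indicator
            # rather than blocking it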
            llm_task = asyncio.create_task(
                schedule_openai_call(openai_api_surrogate, prompt, **model_params))
            tasks = [indicator_task, llm_task]
            done_tasks, _ = await asyncio.wait(
                tasks, return_when=asyncio.FIRST_COMPLETED)
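            # indicate_progress() never completes, so done_tasks can only hold
            # the finished LLM task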

            # Instance of openai.openai_object.OpenAIObject, with lots of useful info
            retval = next(iter(done_tasks)).result()
            print(type(retval))
            # Print the full response object for inspection
            print('\nFull response data from LLM:\n', retval)

            # response is a json-like object;
            # just get back the text of the response
            response_text = oapi_choice1_text(retval)
            print('\nResponse text from LLM:\n\n', response_text)


# Command line arguments defined in click decorators
@click.command()
@click.option('--host', default='http://127.0.0.1', help='OpenAI API host')
@click.option('--port', default='8000', help='OpenAI API port')
@click.option('--openai-key',
              help='OpenAI API key. Leave blank to specify self-hosted model via --host & --port')
@click.option('--model', default='', type=str,
              help='OpenAI model to use (see https://platform.openai.com/docs/models). '
              'Use only with --openai-key')
@click.argument('sites')
def main(host, port, openai_key, model, sites):
    # Use OpenAI API if specified, otherwise emulate with supplied host, etc.
    if openai_key:
        assert not (host or port), 'Don\'t use --host or --port with --openai-key'
        model = model or 'text-davinci-003'
        openai_api = config.openai_live(
            model=model, debug=True)
    else:
        # For now the model param is most useful in conjunction with --openai-key
        model = model or config.HOST_DEFAULT
        openai_api = config.openai_emulation(
            host=host, port=port, model=model, debug=True)

    asyncio.run(async_main(sites, openai_api.params))


if __name__ == '__main__':
    main()