From 86af70bdfcadbbca9c49ed8933dd2a5e8f001058 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Fri, 14 Jul 2023 10:36:15 -0600 Subject: [PATCH 01/39] [#16] Replace routines in embedding_helper with a class, qdrant_collection. Improvements to model_styles.py, and other tweaks. Begin porting demo/chat_pdf_streamlit_ui.py --- demo/chat_pdf_streamlit_ui.py | 147 ++++++++++----------- pylib/embedding_helper.py | 234 ++++++++++++++------------------- pylib/prompting/model_style.py | 82 ++---------- 3 files changed, 178 insertions(+), 285 deletions(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 1f20e07..9643fbf 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -48,9 +48,10 @@ from PyPDF2 import PdfReader from ogbujipt.config import openai_emulation, openai_live, HOST_DEFAULT -from ogbujipt.prompting.basic import context_build, pdelim +from ogbujipt.prompting import format, CHATGPT_DELIMITERS +from ogbujipt import oapi_choice1_text from ogbujipt.text_helper import text_splitter -from ogbujipt.embedding_helper import qdrant_init_embedding_db, qdrant_add_collection +from ogbujipt.embedding_helper import qdrant_collection # Avoid re-entrace complaints from huggingface/tokenizers os.environ['TOKENIZERS_PARALLELISM'] = 'false' @@ -82,14 +83,18 @@ throbber = 'data:image/gif;base64,R0lGODlhgACAAKEDAGwZDPPVSf///wAAACH/C05FVFNDQVBFMi4wAwEAAAAh+QQFFAADACwAAAAAgACAAAACm5yPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzCY1Kp9Sq9YrNarfcrvcLDovH5LL5jE6r1+y2+w2Py+f0uv2Oz+v3/L7/DxgoOEhYaHiImKi4yNjo+AgZKTlJWWl5iZmpucnZ6fkJGio6SlpqeoqaqrrK2ur6ChsrO0tba3uLK1YAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6EGIdahIwajj+AgR6WD5iHk5STnIyaEZF1r5STk6VVp4KrHa1MqayvaKaNQ5Cxl7dYuSG7Vr8bsSDNyLVnx3/JOMnKi7PDPs1NwVzTstVg17fZb90B34vBh+9O23DVgOdq46ThvUaV4r125VnX4zfI/zq6+8TvwPXrx34gIKHMijQz9y4RYi0eSQiaWIrnpR9BXr4pj+UhrrcerobBtILmZGUltjEluYlCcTsfQ47WVFkfQOLpgjsw3Hmja1GPQpr6e0oN542jqWk8iypCdGMU3S7mlBokMJCr1pVCpAqlOtXgXqFepPRfaMSjSLdawotI3UqnTbNeFRuHzYMuOKz64LvajoGvNrginInCZfskwpc2TSjk8vao2oVR3eu2HR8U07eU+6yJgrN7287zLnTYBJlebGdnTfzFs9X52lGtdpya6/0pZ7IXbuZ7qr4tbG2jZCG+5+Czcd/Dbx47KD9wb373ngSdI/X6surA72vVC2K6WSnHlr8CPEcye/wfx38Oopz26PPDx8he/nr4ZhPzT+/PoJ0/NX9l+AFhQAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6IEXEOBAqIh1iNDY+DgZCcFYaamDmSmZuMDJeSkK6nmaQEpqaoSa+srAyuoaBDv4OUBLa8uDq6ZKydsbe4u7SVwcKguarLy8AHn1DB2dKxxXXbur7GtTuN16AP0Nsydeuupt/MuWXjnLznzcBd8pT2yuYq9b/vCPnrsx/uYBNIitGZN7jiYElKYLB0MLDxPWa1NQ34X+in6yzZjIgSMdj0Qy8vogMpjCOyavhUTYcaWLltxIpARDEgRIEze15Oyw80TPaYhkkoPJE6nKi3xojpMxNCKFoFCV4jSKwqm6HFF/PqCKoytWTVrjHRHLdEpZfGet+hwLkWRPrm4hgUWCdmA7cPlOcsnLV2BgBXPbahR8Lu7YwnjrTrr71/EpyF0AJ6YsxjI/zJUly+JsRfOIkYvdShG9waLedYcze057FbYBxkJQf13b8IptsnJN99ittnfrxsNjykbMr69LH8Cn4ibuF/noC6BXNKf+/Pfr1diFR59xHWj2qsVJH+9eunyJ8DrHC90+2ET1Cuzlu0cJPzFL78v1N+ZPfsN8DtQnx330/TedDwKy9p1q8fWw4GwIpraQgQNOaMV8BKJhIYP9xcZdE5htWCF/NXl4ooP6iQEWiQSZ+FQ36oH44BkgucgFQzj2A6M1uSl2nja4ffhWkHbouBWQIWaC5I8qAghMkUvKOKOUNSKz1j4JRmnelA0aN2WU22hJIZfSlUmYWWeayVtpZLIZHFx7rQjnnFDGaWSdNNL5pp5F5UmUn30E6qeVfC4VZqF2bonolYq2yRShjzaqn6STUrqZVJdCSoWjm/7ZKaOfOhGqqKOS2ump9lGh6gmstuqqV7BaIOesqJpqa3u45toer74qUAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTaOwDAvPs/aRpIlmYliufKsmnaxjL2vvONN3Wd9/m+8wlbQODwSCoWkUyactmMRp5PqZVRzSivXEPWu+1KvweyeGguh8/CNMLNjsHf67hsnsDbP/
p8fV/SpyAIGPJnCFXogRcQ4ECoiHWI0Nj4OBkJwVhpqYOZKZm4wMl5KQrqeZpASmpqhJr6ysDK6hoEO/g5QEtry4OrpkrJ2xt7i7tJXBwqC5qsvLwAefUMHZ0rHFddu6vsa1O43XoA/Q2zJ1666m38y5ZeOcvOfNwF3ylPbK5ir1v+8I+euzH+5gE0iK0Zk3uOJgSUpgsHQwsPE9ZrU1Dfhf6KfrLNmNhAgAAHHOl4JJKRlwORIkki7KjwTsprCViydKlRILiPM7kxsGkTp8p2O1GeLHkAKFChNE3GDNRz3E+lSxsgBXOSA8ipVKvmG6rzHNSjLxF07crUJ8SsFLYuOHs2rdS1Ty24VQAXrlx1Yflpjcr3bV69VssGqzsFcLyQhPPuXdx3hF3F+ATHTUr4a9PDFzVRbsgVbc3MowxjRWxxoIKrAxxbFq1ZbeqiRMWWzvma6krSq01ryXp39OXdw2+DpduZs+p1uPHyZly8d3OYyYObfU4ctvHNpy9axxw9guvYc2eL/W5gfAX10o+b54e+NXbx8w2w/hKfPf3ww/6mO/X+WXa6TaBff+5Rt9xvqLFWYG5KPVbZe5IhlyA5vjV4HX8W+qccbRJuUBiH6dUnn4b2+SZIfvNh2I2ICiYXGYjkBeZceCzeF9GHEILmoFclatcedy9W+ICKl92IYo61+bWdbMINNuCMkFHoIQoBQgdlUCEe+B+RbV0ZWpY77jMhH2D2aGKLXHZoGwhGIuniNIgseCGca3bn5SJn1hhlk+UhWKUJb2opZYSAtmkUnS4+uKWQcnbw5phLlinRnqNJGuMR8WFKJaI9bOonjYcyqamlnOpIEFkuGuiokj+YytydQwa6EKwnxukqRqquiSNbte5KU6+oWeGWsDCKAaeSsXn2A6w1f3ZJqzaf4errGQy1Wu0704oKrafObMsjqsCwSWqssj5qxz1kyjjuqJSaO6W47cobZLjusjsvvbGum2+RBfHbr7/Z3htwYgPjGW3B3ZYLsMIGD0vuuw43PCm+E3t2EroTw6HxxWl0fPGstoEcssjw5VpyxSSnjDAVFrPshMsMwxyzyzQLavPNOKOsswQ89+zzsUDr6e3Qbs5sdBIvJ510AQAh+QQJFAADACwAAAAAgACAAAAC/pyPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzGQoEoDWplCqzWrEvrZbL8nrBKbGYbDKb0SO1mg1yu+EduZyusdvxl/3AzycB+HcXCDFogGiYoJhYuMj4uNAYSIlgCYd5KcmnGTlXyRnh+UQ6KcqFKSDgYJqkusra4GoEGyvLQDtke4t7ClqKetDb2yp8xEu8agyslKy8PHss9AwdnTv9U219/ftWO83Nzfy9Gy4+Lt2sfW5tIE6+BrQdiwCvXs5D3/2ejr2OY58ve/68ycsh0MG9f/mqtIMGYaHBMzYSRpCoQBcJ/osSMH5q6ILjBI+bAK4QSYHkAY0eUI4smDFbmofKLKh0ZDIMTWIpYX48mGVnsYs+S4KM4VLBTUI5kQq9pbDoSpldntZbsJSlCpdZqQZtOkBiV7A3LI49qs8quoE/KZoDu7YmPqBv0faLy6/tGCQChzKkCw7u2nhunanNaxTwK7WE9wYTzHNu4cd2w/qd6BgLr8Zf0MDivCWT15hkoWjUurh04sl4TKFm4ul1E0yynVCqTRkwbtOSdlPx41uzHtWQpg7PXHzU8dDJKSyf0tzC8egY9FDPMPo6ZujasSPv3oc5+Dzcx5s/jz69+vXs27t/Dz++/Pn069u/jz+//v38BPvfKAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTai7PevPsPhuJIluaJpurKtu4Lx/JM1/aN5/rO9/4PDAqHxKLxiEwql8ym8wmNSqfUqvWKzWq33K73Cw6Lx+Sy+YxOq9fstkEgcF/gcDmFTrdH8Hi9g8/ntwAIKIhASGg4gIgoyMio9/goJynZZrk4qYb5pnnGeQA6JhrqKUZ6aPqFmsCa5dqqqgWrQCtlWytbhTuoO8WbCznr+0dMBZwp3MWLHGXbfGsM/es7fSxrvWuabYXJfVVpXBaeaEfe53ceZ0iu2CnuBk9Z7h6LXh+8jt+73+//DzCgwIEECxo8iDChwoUMGzp8CDGixIkUK1q8iDGjxg2NHDt6/AgypMiRcgoAADs=' # noqa E501 -async def prep_pdf(pdf, knowledge_base, embedding_model, collection_name): +async def prep_pdf(pdf, embedding_model, collection_name): # Streamlit treats function docstrings as magic strings for user display # Describe function via comments instead: # Converts pdf content into chunks according to chunk size & overlap # Vectorizes chunks for sLLM lookup # returns `knowledge_base`, the vector DB with indexed chunks + + # Create in-memory Qdrant instance + knowledge_base = qdrant_collection(collection_name, embedding_model) + pdf_reader = PdfReader(pdf) - + # Collect text from pdf text = ''.join((page.extract_text() for page in pdf_reader.pages)) @@ -101,13 +106,9 @@ async def prep_pdf(pdf, knowledge_base, embedding_model, collection_name): separator='\n' ) + print('\n\n'.join([ch[:100] for ch in chunks])) # Add a new collection for this document, and upsert the chunks into it - knowledge_base = qdrant_add_collection( - client=knowledge_base, - collection_name=collection_name, - chunks=chunks, - embedding_model=embedding_model - ) + knowledge_base.add(texts=chunks) return knowledge_base @@ -119,78 +120,76 @@ async def async_main(openai_api, model, LLM_TEMP): ''' Oori β€” Ask your PDF πŸ“„πŸ’¬ ''' - # Define delimeters in OpenAI's style - openai_delimiters = { - pdelim.PREQUERY: '### USER:', - 
pdelim.POSTQUERY: '### ASSISTANT:', - } - - # LLM will be downloaded from HuggingFace automatically - # There seem to be reentrancy issues with HuggingFace; defer import - from sentence_transformers import SentenceTransformer - embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM) - - # Create in-memory Qdrant instance - knowledge_base = qdrant_init_embedding_db() - # create file upload box on Streamlit, set from the user's upload pdf = st.file_uploader("Upload a PDF", type=["pdf"], accept_multiple_files=False) - + if pdf: # Show throbber, vectorize the PDF, and setup for similarity search - with st.empty(): - st.image(throbber) - kb = await prep_pdf(pdf, knowledge_base, embedding_model, collection_name=pdf.name) - - user_question = st.text_input(PDF_USER_QUESTION_PROMPT) - - embedded_question = embedding_model.encode(user_question) - - docs = None - if user_question: - docs = kb.search( - collection_name=pdf.name, - query_vector=embedded_question, - limit=K + placeholder = st.empty() + with placeholder: + placeholder.image(throbber) + + # LLM will be downloaded from HuggingFace automatically + # There seem to be reentrancy issues with HuggingFace; defer import + from sentence_transformers import SentenceTransformer + embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM) + + kb = await prep_pdf(pdf, embedding_model, collection_name=pdf.name) + placeholder.empty() + + user_question = st.text_input(PDF_USER_QUESTION_PROMPT) + + # docs = None + while not user_question: + await asyncio.sleep(0.1) + + embedded_question = embedding_model.encode(user_question) + docs = kb.db.search( + collection_name=kb.name, + query_vector=embedded_question, + limit=K + ) + + print(kb.name, pdf.name, docs) + if docs: + # Collects "chunked_doc" into "gathered_chunks" + gathered_chunks = '\n\n'.join( + doc.payload['_text'] for doc in docs + ) + + # Build prompt the doc chunks as context + prompt = format( + f'Given the context, {user_question}\n\n' + f'Context: """\n{gathered_chunks}\n"""\n', + preamble='### SYSTEM:\nYou are a helpful assistant, who answers ' + 'questions directly and as briefly as possible. ' + 'If you cannot answer with the given context, just say so.\n', + delimiters=CHATGPT_DELIMITERS + ) + + print(prompt) + # Show throbber, and send LLM prompt + with st.empty(): + st.image(throbber) + response = openai_api.Completion.create( + model=model, # Model (Required) + prompt=prompt, # Prompt (Required) + temperature=LLM_TEMP, # Temp (Default 1) + max_tokens=1024 # Maximum tokens to return (Default 16) ) - if docs: - # Collects "chunked_doc" into "gathered_chunks" - gathered_chunks = '\n\n'.join( - doc.payload['chunk_string'] for doc in docs - ) + # Response is a json-like object; extract the text + print('\nFull response data from LLM:\n', response) - # Build "prompt" with the context of "chunked_doc" - prompt = context_build( - f'Given the context, {user_question}\n\n' - f'Context: """\n{gathered_chunks}\n"""\n', - preamble='### SYSTEM:\nYou are a helpful assistant, who answers ' - 'questions directly and as briefly as possible. 
'
-                         'If you cannot answer with the given context, just say so.\n',
-                delimiters=openai_delimiters
-            )
-
-            print(prompt)
-            # Show throbber, and send LLM prompt
-            with st.empty():
-                st.image(throbber)
-                response = openai_api.Completion.create(
-                    model=model,  # Model (Required)
-                    prompt=prompt,  # Prompt (Required)
-                    temperature=LLM_TEMP,  # Temp (Default 1)
-                    max_tokens=1024  # Maximum tokens to return (Default 16)
-                )
-
-            # Response is a json-like object; extract the text
-            print('\nFull response data from LLM:\n', response)
-
-            # Response is a json-like object;
-            # just get back the text of the response
-            response_text = response.choices[0].text.strip()
-            print('\nResponse text from LLM:\n', response_text)
-
-            # Write the response text to Streamlit
-            st.write(response_text)
+            # Write the response text to Streamlit
+            st.write(response_text)
+        else:
+            st.write('No context info found')
 
 
 def main():
diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py
index 6db0d52..2445358 100644
--- a/pylib/embedding_helper.py
+++ b/pylib/embedding_helper.py
@@ -3,7 +3,7 @@
 # ogbujipt.embedding_helper
 
 '''
-Routines to help with embedding for vector databases such as Qdrant
+Helper for embeddings in vector databases such as Qdrant
 
 Vector DBs are useful when you have a lot of context to use with LLMs,
 e.g. a large document or collection of docs. One common pattern is to create
@@ -38,146 +38,104 @@
     QDRANT_AVAILABLE = False
     QdrantClient = object()  # Set up a dummy to satisfy the type hints
 
-
 # Option for running a Qdrant DB locally in memory
 MEMORY_QDRANT_CONNECTION_PARAMS = {'location': ':memory:'}
 
 
-def qdrant_init_embedding_db(**qdrant_conn_params) -> QdrantClient:
-    '''
-    Initialize a Qdrant client
-
-    Args:
-        qdrant_conn_params (mapping): keyword parameters for setting up QdrantClient
-        See the main docstring (or run `help(QdrantClient)`)
-        https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12
-
-    Returns:
-        QdrantClient: Qdrant client object
-    '''
-    if not QDRANT_AVAILABLE:
-        raise RuntimeError('Qdrant not installed, you can run `pip install qdrant-client`')
-
-    # Create a Qdrant client
-    if not qdrant_conn_params:
-        qdrant_conn_params = MEMORY_QDRANT_CONNECTION_PARAMS
-    client = QdrantClient(**qdrant_conn_params)
-
-    # Return the Qdrant client object
-    return client
-
-
-def qdrant_add_collection(
-        client,
-        chunks,
-        embedding_model,
-        collection_name,
-        distance_function='Cosine'
-    ) -> QdrantClient:
-    '''
-    Add a collection to a Qdrant client, and add some strings (chunks) to that collection
-
-    Args:
-        client (QdrantClient): Initialized Qdrant client object
-
-        chunks (List[str]): List of similar length strings to embed
-
-        embedding (SentenceTransformer): SentenceTransformer object of your choice
-        https://huggingface.co/sentence-transformers
-
-        collection_name (str): Name that describes "chunks"
-
-        distance_function (str): Distance function by which vectors will be compared
-
-        qdrant_conn_params (mapping): keyword parameters for setting up QdrantClient
-        See the main docstring (or run `help(QdrantClient)`)
-        https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12
-
-    Returns:
-        QdrantClient: Qdrant client object with new collection
-    '''
-    if not QDRANT_AVAILABLE:
-        raise RuntimeError('Qdrant not installed, you can run `pip install 
qdrant-client`') - - # Find the size of the first chunk's embedding - partial_embeddings = embedding_model.encode(chunks[0]) - vector_size = len(partial_embeddings) - - # Set the default distance function, and catch for incorrect capitalization - distance_function = distance_function.lower().capitalize() - - ## Create a collection in the Qdrant client, and configure its vectors - # Using REcreate_collection ensures overwrite - client.recreate_collection( - collection_name=collection_name, - vectors_config=models.VectorParams( - size=vector_size, - distance=distance_function - ) - ) - - # Put the chunks in the collection - client = qdrant_upsert_collection( - client=client, - chunks=chunks, - embedding_model=embedding_model, - collection_name=collection_name - ) - - # Return the Qdrant client object - return client - - -def qdrant_upsert_collection( - client, - chunks, - embedding_model, - collection_name - ) -> QdrantClient: - ''' - Update/insert a Qdrant client's collection with the some chunks of text - - Args: - client (QdrantClient): Initialized Qdrant client object - - chunks (List[str]): List of similar length strings to embed - - embedding (SentenceTransformer): SentenceTransformer object of your choice - SentenceTransformer](https://huggingface.co/sentence-transformers) - - collection_name (str): Name of the collection being modified - - Returns: - QdrantClient: Upserted Qdrant client object - ''' - if not QDRANT_AVAILABLE: - raise RuntimeError('Qdrant not installed, you can run `pip install qdrant-client`') - - # Get the current count of chunks in the collection - # TODO: the grossness here is a workaround for client.count() returning - # an object which can then be cast to a string such as "count=0" - # We'll prefer to use a method to get the count directly as an int, - # once one becomes available - current_count = int(str(client.count(collection_name)).partition('=')[-1]) - - for ix, chunk in enumerate(chunks): # For each chunk - # Embeddings as float/vectors - embeddings = list(map(float, embedding_model.encode(chunk))) - - # Create a payload of the (now embedded) chunk - prepped_payload = {'chunk_string': chunk} - - # Upsert the embedded chunk and its payload into the collection - client.upsert( - collection_name=collection_name, - points=[ - models.PointStruct( - id=ix + current_count, # Make sure all chunks have sequential IDs - vector=embeddings, - payload=prepped_payload - ) - ] +class qdrant_collection: + def __init__(self, name, embedding_model, db=None, **conn_params): + ''' + Initialize a Qdrant client + + Args: + name (str): of the collection + + embedding (SentenceTransformer): SentenceTransformer object of your choice + https://huggingface.co/sentence-transformers + + db (optional QdrantClient): existing DB/cliient to use + + conn_params (mapping): keyword parameters for setting up QdrantClient + See the main docstring (or run `help(QdrantClient)`) + https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12 + + ''' + self.name = name + self.db = db + self._embedding_model = embedding_model + if not self.db: + if not QDRANT_AVAILABLE: + raise RuntimeError('Qdrant not installed, you can run `pip install qdrant-client`') + + # Create a Qdrant client + if not conn_params: + conn_params = MEMORY_QDRANT_CONNECTION_PARAMS + self.db = QdrantClient(**conn_params) + + def add(self, texts, distance_function='Cosine', + metas=None): + ''' + Add a collection to a Qdrant client, and add some strings (chunks) to that collection + + Args: + chunks 
(List[str]): List of similar length strings to embed + + distance_function (str): Distance function by which vectors will be compared + + qdrant_conn_params (mapping): keyword parameters for setting up QdrantClient + See the main docstring (or run `help(QdrantClient)`) + https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12 + ''' + metas = metas or [] + # meta is a list of dicts + # Find the size of the first chunk's embedding + partial_embeddings = self._embedding_model.encode(texts[0]) + vector_size = len(partial_embeddings) + + # Set the default distance function, giving grace to capitalization + distance_function = distance_function.lower().capitalize() + + # Create a collection in the Qdrant client, and configure its vectors + # Using REcreate_collection ensures overwrite + self.db.recreate_collection( + collection_name=self.name, + vectors_config=models.VectorParams( + size=vector_size, + distance=distance_function + ) ) - # Return the modified Qdrant client object - return client + # Put the items in the collection + self.upsert(texts=texts, metas=metas) + + def upsert(self, texts, metas=None): + ''' + Update/insert a Qdrant client's collection with the some chunks of text + + Args: + chunks (List[str]): List of similar length strings to embed + + embedding (SentenceTransformer): SentenceTransformer object of your choice + SentenceTransformer](https://huggingface.co/sentence-transformers) + ''' + current_count = int(str(self.db.count(self.name)).partition('=')[-1]) + metas = metas or [] + + for ix, (text, meta) in enumerate(zip(texts, metas)): + # Embeddings as float/vectors + # The inline prints actually turnn into a cool progress indicator in jupyter 😁 + embeddings = list(map(float, self._embedding_model.encode(text))) + + payload = dict(_text=text, **meta) + + # Upsert the embedded chunk and its payload into the collection + self.db.upsert( + collection_name=self.name, + points=[ + models.PointStruct( + id=ix + current_count, # Sequential IDs + vector=embeddings, + payload=payload + ) + ] + ) diff --git a/pylib/prompting/model_style.py b/pylib/prompting/model_style.py index 280eaa1..a7a206b 100644 --- a/pylib/prompting/model_style.py +++ b/pylib/prompting/model_style.py @@ -3,23 +3,9 @@ # ogbujipt.prompting.model_style ''' -Delimiters for common LLM model prompting styles +Delimiters for common LLM model prompting styles. -Plain Alpaca style, e.g.: - -* WizardLM - -Alpaca-instruct style, e.g. - -* Nous-Hermes - -VicuΓ±a style, e.g. - -* Robin - -Also includes Orca & Airoboros - -Useful collection of Alpaca demo prompts: https://huggingface.co/datasets/tatsu-lab/alpaca +For details see https://github.com/uogbuji/OgbujiPT/wiki/Prompt-templates ''' from ogbujipt.prompting.basic import pdelim, ordering @@ -56,62 +42,6 @@ } -''' -Model style for airoboros: https://huggingface.co/jondurbin/airoboros-13b-gpt4 - -Nice, meaty example of context-obedient QA prompting: https://huggingface.co/datasets/jondurbin/airoboros-gpt4/blob/main/full-example.md - -https://www.reddit.com/r/LocalLLaMA/comments/1408ued/airoboros_gpt4_instructed_contextobedient/ - -Example: - -BEGININPUT -BEGINCONTEXT -name: John Doe -date: June 3, 2023 -ticket number: JIRA-12345 -ENDCONTEXT -Summary:Search results missing random items - -Description: -I encountered a bug while performing a search within the application. -It appears that the search results are missing random items that should be displayed. -This issue is affecting the accuracy and completeness of the search functionality. 
-
-Steps to Reproduce:
-1. Log in to the application.
-2. Navigate to the search feature.
-3. Enter a search query that should return multiple results.
-4. Observe the displayed search results.
-
-Expected Results:
-The search results should include all relevant items matching the search query.
-
-Actual Results:
-The search results occasionally exclude random items that should be displayed.
-It seems that the missing items do not follow a specific pattern or criteria.
-Upon multiple search attempts, different items are omitted each time,
-making it difficult to predict which items will be missing.
-ENDINPUT
-
-BEGININPUT
-BEGINCONTEXT
-date: 2023-06-05
-user: Jack Johnson
-pr: 23441
-ENDCONTEXT
-This pull request closes bug report JIRA-12345.
-
-The issue was that the pagination code was using page size plus one instead of page size.
-ENDINPUT
-
-BEGININSTRUCTION
-Do we have any bug reports related to search results? If so, were they fixed? Source?
-ENDINSTRUCTION
-
-
-'''
-
 # Closed-context prompting
 AIROBOROS_OBEDIENT_DELIMITERS = {
     pdelim.PREQUERY: 'BEGININSTRUCTION',
@@ -121,7 +51,7 @@
     pdelim.PRE_ALL_CONTEXT: 'USER:',
 }
 
-# If you're not using the closed-context/obedient prompting, it's just Vicuña style
+# If not using the closed-context/obedient prompting, it's just Vicuña style
 AIROBOROS_DELIMITERS = VICUNA_DELIMITERS
 
 # XXX: Should this just be a FIXED_PREAMBLE?
@@ -134,6 +64,12 @@
 {text}
 '''
 
+# Define delimiters in ChatGPT style
+CHATGPT_DELIMITERS = {
+    pdelim.PREQUERY: '### USER:',
+    pdelim.POSTQUERY: '### ASSISTANT:',
+}
+
 
 def concat_input_prompts(context_content_pairs):
     '''
From 870807f2a12298717dede3a6f2e1d4a0d6191f57 Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Fri, 14 Jul 2023 10:48:05 -0600
Subject: [PATCH 02/39] [#16] Initial swipe at test/test_embedding_helper.py

---
 test/test_embedding_helper.py | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/test/test_embedding_helper.py b/test/test_embedding_helper.py
index 2dbb6f3..f56c56f 100644
--- a/test/test_embedding_helper.py
+++ b/test/test_embedding_helper.py
@@ -10,12 +10,12 @@
 import pytest
 
 from ogbujipt import embedding_helper
-from ogbujipt.embedding_helper import qdrant_init_embedding_db, \
-    qdrant_add_collection, qdrant_upsert_collection
+from ogbujipt.embedding_helper import qdrant_collection
 from ogbujipt.text_helper import text_splitter
 
 embedding_helper.QDRANT_AVAILABLE = True
 
+
 @pytest.fixture
 def CORRECT_STRING():
     return 'And the secret thing in its heaving\nThreatens with iron mask\nThe last lighted torch of the century…'
@@ -41,36 +41,27 @@ def test_embed_poem(mocker, COME_THUNDER_POEM, CORRECT_STRING):
     embedding_helper.models.VectorParams.side_effect = [mock_vparam]
     mocker.patch('ogbujipt.embedding_helper.QdrantClient')
 
-    client = qdrant_init_embedding_db()
+    coll = qdrant_collection(collection_name, embedding)
 
-    #client.count.side_effect = ['count=0']
-    client.count.side_effect = lambda collection_name: 'count=0'
-    client = qdrant_add_collection(
-        client,
-        chunks,
-        embedding,
-        collection_name
-    )
-    client.recreate_collection.assert_called_once_with(
+    # client.count.side_effect = ['count=0']
+    coll.db.count.side_effect = lambda collection_name: 'count=0'
+    coll.add(chunks, collection_name)
+    coll.db.recreate_collection.assert_called_once_with(
        collection_name='test_collection',
        vectors_config=mock_vparam
    )
-
+    
     embedding.encode.assert_called_with(CORRECT_STRING)
     # Test update/insert into the DB
     mock_pstruct = object()
embedding_helper.models.PointStruct.side_effect = lambda id=None, vector=None, payload=None: mock_pstruct - - client.count.reset_mock() - client = qdrant_upsert_collection( - client, - chunks, - embedding, - collection_name - ) - client.upsert.assert_called_with( + coll.db.count.reset_mock() + coll.upsert(chunks) + + # XXX: Add test with metadata + coll.db.upsert.assert_called_with( collection_name=collection_name, points=[mock_pstruct] ) From a35cef1301508578f17094655a0777c2796867e3 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Mon, 17 Jul 2023 10:28:00 -0600 Subject: [PATCH 03/39] [#16] Fixes to async_helper module. Add embedding_helper.qdrant_collection.search() method. Add demo/chat_web_selects.py demo 1st draft --- demo/chat_pdf_streamlit_ui.py | 2 +- demo/chat_web_selects.py | 181 ++++++++++++++++++++++++++++++++++ pylib/async_helper.py | 39 +++++++- pylib/config.py | 14 +-- pylib/embedding_helper.py | 20 +++- 5 files changed, 242 insertions(+), 14 deletions(-) create mode 100644 demo/chat_web_selects.py diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 9643fbf..1a22e48 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -1,5 +1,5 @@ ''' -Advanced, "Chat my PDF" demo, but using self-hosted LLM. +Advanced, "Chat my PDF" demo Use a PDF document as a knowledge base to provide context for natural language Q&A diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py new file mode 100644 index 0000000..7be9016 --- /dev/null +++ b/demo/chat_web_selects.py @@ -0,0 +1,181 @@ +''' +Advanced, "Chat my docs" demo, using docs from the web + +Download one or more web pages and query an LLM using them as context. +Works especially well with airoboros self-hosted LLM. + +Vector store: Qdrant - https://qdrant.tech/ + Alternatives: pgvector, Chroma, Faiss, Weaviate, etc. +Text to vector (embedding) model: + Alternatives: https://www.sbert.net/docs/pretrained_models.html / OpenAI ada002 + +Needs access to an OpenAI-like service. Default assumption is self-hosted +via e.g. 
llama-cpp-python or text-generation-webui
+
+Assume for the following it's at host my-llm-host, port 8000
+
+pip install rerequisites, in addition to OgbujiPT cloned dir:
+
+click sentence_transformers qdrant-client httpx html2text
+
+```sh
+python demo/chat_web_selects.py "www.newworldencyclopedia.org/entry/Igbo_People"
+```
+'''
+# en.wikipedia.org/wiki/Igbo_people|ahiajoku.igbonet.com/2000/|en.wikivoyage.org/wiki/Igbo_phrasebook"
+import asyncio
+import os
+
+import click
+import httpx
+import html2text
+
+from ogbujipt import config
+from ogbujipt.prompting import format, ALPACA_INSTRUCT_DELIMITERS
+from ogbujipt.async_helper import schedule_openai_call, openai_api_surrogate
+from ogbujipt import oapi_choice1_text
+from ogbujipt.text_helper import text_splitter
+from ogbujipt.embedding_helper import qdrant_collection
+
+# Avoid re-entrance complaints from huggingface/tokenizers
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
+# default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+DOC_EMBEDDINGS_LLM = 'all-MiniLM-L6-v2'
+
+COLLECTION_NAME = 'chat-web-selects'
+USER_PROMPT = 'What do you want to know from these sites?:'
+
+# Hard-code for demo
+EMBED_CHUNK_SIZE = 200
+EMBED_CHUNK_OVERLAP = 20
+DOTS_SPACING = 0.5  # Number of seconds between each dot printed to console
+
+
+async def indicate_progress(pause=DOTS_SPACING):
+    while True:
+        print('.', end='', flush=True)
+        await asyncio.sleep(pause)
+
+
+async def read_site(url, collection):
+    # Crude check; good enough for demo
+    if not url.startswith('http'): url = 'https://' + url  # noqa E701
+    print('Downloading & processing', url)
+    async with httpx.AsyncClient(verify=False) as client:
+        resp = await client.get(url)
+        html = resp.content.decode(resp.encoding)
+
+    # with open('/tmp/ahiajoku.igbonet.com-2000.html') as fp:
+    #     html = fp.read()
+
+    text = html2text.html2text(html)
+
+    # Split text into chunks
+    chunks = text_splitter(text, chunk_size=EMBED_CHUNK_SIZE,
+                           chunk_overlap=EMBED_CHUNK_OVERLAP, separator='\n')
+
+    # print('\n\n'.join([ch[:100] for ch in chunks]))
+    # Crude—for demo. Set URL metadata for all chunks to doc URL
+    metas = [{'url': url}]*len(chunks)
+    # Add the text to the collection. 
Blocks, so no reentrancy concern + collection.add(texts=chunks, metas=metas) + + +async def async_main(sites, api_params): + # Automatic download from HuggingFace + # Seem to be reentrancy issues with HuggingFace; defer import + from sentence_transformers import SentenceTransformer + embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM) + # Sites fuel in-memory Qdrant vector DB instance + collection = qdrant_collection(COLLECTION_NAME, embedding_model) + + url_task_group = asyncio.gather(*[ + asyncio.create_task(read_site(site, collection)) for site in sites.split('|')]) + indicator_task = asyncio.create_task(indicate_progress()) + tasks = [indicator_task, url_task_group] + done, _ = await asyncio.wait( + tasks, return_when=asyncio.FIRST_COMPLETED) + + done = False + while not done: + user_question = input(USER_PROMPT) + if user_question == done: + break + + docs = collection.search(user_question, limit=4) + + print(docs) + if docs: + # Collects "chunked_doc" into "gathered_chunks" + gathered_chunks = '\n\n'.join( + doc.payload['_text'] for doc in docs + ) + + # Build prompt the doc chunks as context + prompt = format( + f'Given the context, {user_question}\n\n' + f'Context: """\n{gathered_chunks}\n"""\n', + preamble='### SYSTEM:\nYou are a helpful assistant, who answers ' + 'questions directly and as briefly as possible. ' + 'If you cannot answer with the given context, just say so.\n', + delimiters=ALPACA_INSTRUCT_DELIMITERS) + + print(prompt) + + # The rest is much like in alpaca_multitask_fix_xml.py + model_params = dict( + max_tokens=1024, # Limit number of generated tokens + top_p=1, # AKA nucleus sampling; can increase generated text diversity + frequency_penalty=0, # Favor more or less frequent tokens + presence_penalty=1, # Prefer new, previously unused tokens + temperature=0.1 + ) + + indicator_task = asyncio.create_task(indicate_progress()) + llm_task = asyncio.create_task( + schedule_openai_call(openai_api_surrogate, prompt, **model_params)) + tasks = [indicator_task, llm_task] + done, _ = await asyncio.wait( + tasks, return_when=asyncio.FIRST_COMPLETED) + + # Instance of openai.openai_object.OpenAIObject, with lots of useful info + retval = next(iter(done)).result() + print(type(retval)) + # Response is a json-like object; extract the text + print('\nFull response data from LLM:\n', retval) + + # response is a json-like object; + # just get back the text of the response + response_text = oapi_choice1_text(retval) + print('\nResponse text from LLM:\n\n', response_text) + + +# Command line arguments defined in click decorators +@click.command() +@click.option('--host', default='http://127.0.0.1', help='OpenAI API host') +@click.option('--port', default='8000', help='OpenAI API port') +@click.option('--openai-key', + help='OpenAI API key. Leave blank to specify self-hosted model via --host & --port') +@click.option('--model', default='', type=str, + help='OpenAI model to use (see https://platform.openai.com/docs/models).' + 'Use only with --openai-key') +@click.argument('sites') +def main(host, port, openai_key, model, sites): + # Use OpenAI API if specified, otherwise emulate with supplied host, etc. 
+ if openai_key: + assert not (host or port), 'Don\'t use --host or --port with --openai' + model = model or 'text-davinci-003' + openai_api = config.openai_live( + model=model, debug=True) + else: + # For now the model param is most useful in conjunction with --openai + model = model or config.HOST_DEFAULT + openai_api = config.openai_emulation( + host=host, port=port, model=model, debug=True) + + asyncio.run(async_main(sites, openai_api.params)) + + +if __name__ == '__main__': + main() diff --git a/pylib/async_helper.py b/pylib/async_helper.py index a8c14e4..922436d 100644 --- a/pylib/async_helper.py +++ b/pylib/async_helper.py @@ -16,8 +16,7 @@ async def schedule_callable(callable, *args, **kwargs): ''' - TODO: rename me? this is convenent for more than just LLM calls - Schedule task long-running/blocking LLM requests in a separate process, + Schedule long-running/blocking function call in a separate process, wrapped to work well in an asyncio event loop Basically hides away a bunch of the multiprocessing webbing @@ -38,8 +37,40 @@ async def schedule_callable(callable, *args, **kwargs): # Need to partial execute to get in any kwargs for the target callable prepped_callable = partial(callable, **kwargs) # Spawn a separate process for the LLM call - response = await loop.run_in_executor( - executor, prepped_callable, *args) + response = await loop.run_in_executor(executor, prepped_callable, *args) + return response + + +async def schedule_openai_call(callable, *args, **kwargs): + ''' + Schedule long-running/blocking LLM request in a separate process, + wrapped to work well in an asyncio event loop + + Basically hides away a bunch of the multiprocessing webbing + + e.g. `llm_task = asyncio.create_task(schedule_callable(llm, prompt))` + + Can then use asyncio.wait(), asyncio.gather(), etc. 
with `llm_task`
+
+    Args:
+        callable (callable): Callable to be scheduled
+
+    Returns:
+        response: Response object
+    '''
+    # Link up the current async event loop for multiprocess execution
+    loop = asyncio.get_running_loop()
+    executor = concurrent.futures.ProcessPoolExecutor()
+    # Need to partial execute to get in any kwargs for the target callable
+    if 'model' not in kwargs:
+        kwargs['model'] = ''
+    prepped_callable = partial(
+        callable,
+        api_base=openai.api_base,
+        api_key=openai.api_key,
+        **kwargs)
+    # Spawn a separate process for the LLM call
+    response = await loop.run_in_executor(executor, prepped_callable, *args)
     return response
 
 
diff --git a/pylib/config.py b/pylib/config.py
index e3be66c..15843a0 100644
--- a/pylib/config.py
+++ b/pylib/config.py
@@ -26,6 +26,8 @@ class attr_dict(dict):
 
 
 def openai_live(
+        rev='v1',
+        model='',
         apikey=None,
         debug=True
     ):
@@ -48,16 +50,14 @@
         openai_api (openai): Prepared OpenAI API
     '''
     import openai as openai_api
-    from dotenv import load_dotenv
-    load_dotenv()
 
     # openai_api.api_version
     openai_api.debug = debug
     openai_api.params = attr_dict(
+        rev=rev,
         api_key=apikey,
-        api_base=openai_api.api_base,
-        debug=debug
-    )
+        model=model,
+        debug=debug)
 
     return openai_api
 
@@ -67,7 +67,9 @@ def openai_emulation(
         port='8000',  # llama-cpp-python; for Ooba, use '5001'
         rev='v1',
         model=HOST_DEFAULT,
-        apikey='BOGUS', oaitype='open_ai', debug=True):
+        apikey='BOGUS',
+        oaitype='open_ai',
+        debug=True):
     '''
     Set up emulation, to use a alternative, OpenAI API compatible service
 
diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py
index 2445358..8e1d5d0 100644
--- a/pylib/embedding_helper.py
+++ b/pylib/embedding_helper.py
@@ -113,10 +113,11 @@ def upsert(self, texts, metas=None):
         Update/insert a Qdrant client's collection with the some chunks of text
 
         Args:
-            chunks (List[str]): List of similar length strings to embed
+            texts (List[str]): Strings to be stored and indexed. For best results these should be of similar length.
+            They'll be converted to embeddings for efficient lookup
 
-            embedding (SentenceTransformer): SentenceTransformer object of your choice
-            SentenceTransformer](https://huggingface.co/sentence-transformers)
+            metas (List[dict]): Optional metadata per text, stored with the text and included whenever the text is
+            retrieved via search/query
         '''
         current_count = int(str(self.db.count(self.name)).partition('=')[-1])
         metas = metas or []
@@ -139,3 +140,16 @@
                     )
                 ]
             )
+
+    def search(self, text, **kwargs):
+        '''
+        Perform a search on this Qdrant collection
+
+        Args:
+            text (str): string to compare against items in the collection
+
+            kwargs: other args to be passed to qdrant_client.QdrantClient.search(). 
Common ones: + limit - maximum number of results to return (useful for top-k query) + ''' + embedded_text = self._embedding_model.encode(text) + return self.db.search(collection_name=self.name, query_vector=embedded_text, **kwargs) From 16808111a39569507c3177dd5eeec06427cd3104 Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 10:47:41 -0600 Subject: [PATCH 04/39] comments and consistent single quotes --- demo/chat_pdf_streamlit_ui.py | 23 +++++++++++++---------- demo/demo.env | 4 +++- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 1a22e48..f58a366 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -65,14 +65,17 @@ LLM = os.getenv('LLM', 'LLM') # TODO: get this from non-openai openai api hosts LLM_HOST = os.getenv('LLM_HOST', 'my-llm-host') LLM_PORT = os.getenv('LLM_PORT', '8000') +# LLM "temperature" LLM_TEMP = float(os.getenv('LLM_TEMP', '1')) -N_CTX = int(os.getenv('N_CTX', '2048')) # LLM max context size -K = int(os.getenv('K', '6')) # K - how many chunks to return for query context +# LLM max context size +N_CTX = int(os.getenv('N_CTX', '2048')) +# K - how many chunks to return for query context +K = int(os.getenv('K', '6')) # Chunk size is the number of characters counted in the chunks EMBED_CHUNK_SIZE = int(os.getenv('EMBED_CHUNK_SIZE', '500')) # Chunk Overlap to connect ends of chunks together EMBED_CHUNK_OVERLAP = int(os.getenv('EMBED_CHUNK_OVERLAP', '100')) -# sLLM for embeddings +# small LM for embeddings # default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 DOC_EMBEDDINGS_LLM = os.getenv('EMBED_CHUNK_OVERLAP', 'all-MiniLM-L6-v2') @@ -121,7 +124,7 @@ async def async_main(openai_api, model, LLM_TEMP): Oori β€” Ask your PDF πŸ“„πŸ’¬ ''' # create file upload box on Streamlit, set from the user's upload - pdf = st.file_uploader("Upload a PDF", type=["pdf"], accept_multiple_files=False) + pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False) if pdf: # Show throbber, vectorize the PDF, and setup for similarity search @@ -178,7 +181,7 @@ async def async_main(openai_api, model, LLM_TEMP): max_tokens=1024 # Maximum tokens to return (Default 16) ) - # Response is a json-like object; extract the text + # Response is a json-like object print('\nFull response data from LLM:\n', response) # Response is a json-like object; @@ -196,10 +199,10 @@ def main(): # Describing function via comments instead: # Set up Streamlit page, LLM host connection & launch the main loop st.set_page_config( - page_title="Ask your PDF", - page_icon="πŸ“„πŸ’¬", - layout="wide", - initial_sidebar_state="expanded", + page_title='Ask your PDF', + page_icon='πŸ“„πŸ’¬', + layout='wide', + initial_sidebar_state='expanded', ) # Use OpenAI API if specified, otherwise emulate with supplied host, etc. 
@@ -217,6 +220,6 @@ def main(): asyncio.run(async_main(openai_api, model, LLM_TEMP)) -if __name__ == "__main__": +if __name__ == '__main__': # TODO: Look into isolating huggingface's one time per process setup routines main() diff --git a/demo/demo.env b/demo/demo.env index bf0c0d1..abd2862 100644 --- a/demo/demo.env +++ b/demo/demo.env @@ -1,6 +1,8 @@ # Copy to .env in the same dir, and update as needed +# Used by: +# demo/alpaca_simple_qa_discord.py +# demo/chat_pdf_streamlit_ui -# Used by demo/alpaca_simple_qa_discord.py # DISCORD_TOKEN={REPLACEME} LLM_HOST=http://{my-llm-host} # LLM_PORT=8000 From 9b65a38d82ce322fc7311dfc789b5da75c943ae7 Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 11:00:33 -0600 Subject: [PATCH 05/39] minor fixes --- demo/chat_web_selects.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py index 7be9016..9d82270 100644 --- a/demo/chat_web_selects.py +++ b/demo/chat_web_selects.py @@ -14,7 +14,7 @@ Assume for the following it's at host my-llm-host, port 8000 -pip install rerequisites, in addition to OgbujiPT cloned dir: +pip install prerequisites, in addition to OgbujiPT cloned dir: click sentence_transformers qdrant-client httpx html2text @@ -64,7 +64,7 @@ async def read_site(url, collection): print('Downloading & processing', url) async with httpx.AsyncClient(verify=False) as client: resp = await client.get(url) - html = resp.content.decode(resp.encoding) + html = resp.content.decode(resp.encoding or 'utf-8') # with open('/tmp/ahiajoku.igbonet.com-2000.html') as fp: # html = fp.read() @@ -109,7 +109,7 @@ async def async_main(sites, api_params): if docs: # Collects "chunked_doc" into "gathered_chunks" gathered_chunks = '\n\n'.join( - doc.payload['_text'] for doc in docs + doc.payload['_text'] for doc in docs if doc.payload ) # Build prompt the doc chunks as context From b56708e2c4adaebb2b8faa4ecc3fbf9c5025abcb Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 11:13:10 -0600 Subject: [PATCH 06/39] minor comment spelling fixes --- pylib/embedding_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index 8e1d5d0..583e313 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -53,7 +53,7 @@ def __init__(self, name, embedding_model, db=None, **conn_params): embedding (SentenceTransformer): SentenceTransformer object of your choice https://huggingface.co/sentence-transformers - db (optional QdrantClient): existing DB/cliient to use + db (optional QdrantClient): existing DB/client to use conn_params (mapping): keyword parameters for setting up QdrantClient See the main docstring (or run `help(QdrantClient)`) @@ -124,7 +124,7 @@ def upsert(self, texts, metas=None): for ix, (text, meta) in enumerate(zip(texts, metas)): # Embeddings as float/vectors - # The inline prints actually turnn into a cool progress indicator in jupyter 😁 + # The inline prints actually turn into a cool progress indicator in jupyter 😁 embeddings = list(map(float, self._embedding_model.encode(text))) payload = dict(_text=text, **meta) From d52ffdf5a3f8553e51d48c0f7c49612dec8a1fb1 Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 12:03:11 -0600 Subject: [PATCH 07/39] simple XML to ALPACA_INSTRUCT_INPUT_DELIMETERS --- demo/alpaca_simple_fix_xml.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/demo/alpaca_simple_fix_xml.py b/demo/alpaca_simple_fix_xml.py index 
8675828..51ca060 100644 --- a/demo/alpaca_simple_fix_xml.py +++ b/demo/alpaca_simple_fix_xml.py @@ -14,7 +14,7 @@ from ogbujipt.config import openai_live, openai_emulation from ogbujipt.prompting.basic import context_build -from ogbujipt.prompting.model_style import VICUNA_DELIMITERS +from ogbujipt.prompting.model_style import ALPACA_INSTRUCT_DELIMITERS # Command line arguments defined in click decorators @@ -44,10 +44,11 @@ def main(host, port, llmtemp, openai, model): ''' prompt = context_build( - f'Correct the following XML to make it well-formed\n\n{BAD_XML_CODE}', + 'Correct the given XML to make it well-formed', + contexts= BAD_XML_CODE, preamble='You are a helpful assistant, ' 'who answers questions briefly, in 1st grade language', - delimiters=VICUNA_DELIMITERS) + delimiters=ALPACA_INSTRUCT_DELIMITERS) print(prompt, '\n') response = openai_api.Completion.create( From 4557d4278ed8631ed4118cd5cd8673e69bd23784 Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 12:03:23 -0600 Subject: [PATCH 08/39] input --- demo/alpaca_simple_fix_xml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demo/alpaca_simple_fix_xml.py b/demo/alpaca_simple_fix_xml.py index 51ca060..f01282e 100644 --- a/demo/alpaca_simple_fix_xml.py +++ b/demo/alpaca_simple_fix_xml.py @@ -14,7 +14,7 @@ from ogbujipt.config import openai_live, openai_emulation from ogbujipt.prompting.basic import context_build -from ogbujipt.prompting.model_style import ALPACA_INSTRUCT_DELIMITERS +from ogbujipt.prompting.model_style import ALPACA_INSTRUCT_INPUT_DELIMITERS # Command line arguments defined in click decorators @@ -48,7 +48,7 @@ def main(host, port, llmtemp, openai, model): contexts= BAD_XML_CODE, preamble='You are a helpful assistant, ' 'who answers questions briefly, in 1st grade language', - delimiters=ALPACA_INSTRUCT_DELIMITERS) + delimiters=ALPACA_INSTRUCT_INPUT_DELIMITERS) print(prompt, '\n') response = openai_api.Completion.create( From f02780d2d0ef84f13a58826be023e6f92ca94d82 Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 12:04:06 -0600 Subject: [PATCH 09/39] minor formatting prints --- demo/chat_web_selects.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py index 9d82270..7b6fd5c 100644 --- a/demo/chat_web_selects.py +++ b/demo/chat_web_selects.py @@ -21,6 +21,8 @@ ```sh python demo/chat_web_selects.py "www.newworldencyclopedia.org/entry/Igbo_People" ``` + +An example question might be "Who are the neighbors of the Igbo people?" 
''' # en.wikipedia.org/wiki/Igbo_people|ahiajoku.igbonet.com/2000/|en.wikivoyage.org/wiki/Igbo_phrasebook" import asyncio @@ -44,12 +46,12 @@ DOC_EMBEDDINGS_LLM = 'all-MiniLM-L6-v2' COLLECTION_NAME = 'chat-web-selects' -USER_PROMPT = 'What do you want to know from these sites?:' +USER_PROMPT = 'What do you want to know from these sites?\n' # Hard-code for demo EMBED_CHUNK_SIZE = 200 EMBED_CHUNK_OVERLAP = 20 -DOTS_SPACING = 0.5 # Number of seconds between each dot printed to console +DOTS_SPACING = 0.2 # Number of seconds between each dot printed to console async def indicate_progress(pause=DOTS_SPACING): @@ -99,6 +101,7 @@ async def async_main(sites, api_params): done = False while not done: + print() user_question = input(USER_PROMPT) if user_question == done: break From daaa4fc215ff2e5036a70a3d8ed4e6f8b0f1b04b Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 12:04:50 -0600 Subject: [PATCH 10/39] "x < 2" to "x <= 1" for clarity --- pylib/prompting/model_style.py | 2 +- pylib/text_helper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pylib/prompting/model_style.py b/pylib/prompting/model_style.py index a7a206b..9aa69a5 100644 --- a/pylib/prompting/model_style.py +++ b/pylib/prompting/model_style.py @@ -55,7 +55,7 @@ AIROBOROS_DELIMITERS = VICUNA_DELIMITERS # XXX: Should this just be a FIXED_PREAMBLE? -AIROBOROS_SUGGESTED_PREAMBLE = 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions, and doesn\'t make up answers if it doesn\'t know.' # noqa +AIROBOROS_SUGGESTED_PREAMBLE = 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions, and doesn\'t make up answers if it doesn\'t know.' # noqa E501 AIR_CONOB_INPUT_PRETMPL = '''\ BEGINCONTEXT diff --git a/pylib/text_helper.py b/pylib/text_helper.py index 3b0e038..febb1d2 100644 --- a/pylib/text_helper.py +++ b/pylib/text_helper.py @@ -57,7 +57,7 @@ def text_splitter(text, chunk_size, chunk_overlap, separator='\n\n', fine_split = re.split(sep_pat, text) separator_len = len_func(separator) - if len(fine_split) < 2: + if len(fine_split) <= 1: warnings.warn( f'No splits detected. Problem with separator ({repr(separator)})?') From e6217301068913ebeec2d7564b01515d1eeeb4ee Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Mon, 17 Jul 2023 12:38:49 -0600 Subject: [PATCH 11/39] [#16] Add docstring example for qdrant_collection --- demo/chat_web_selects.py | 14 +++++--------- pylib/embedding_helper.py | 9 +++++++++ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py index 7b6fd5c..557e477 100644 --- a/demo/chat_web_selects.py +++ b/demo/chat_web_selects.py @@ -16,10 +16,10 @@ pip install prerequisites, in addition to OgbujiPT cloned dir: -click sentence_transformers qdrant-client httpx html2text +click sentence_transformers qdrant-client httpx html2text amara3.xml ```sh -python demo/chat_web_selects.py "www.newworldencyclopedia.org/entry/Igbo_People" +python demo/chat_web_selects.py --host http://my-llm-host --port 8000 "www.newworldencyclopedia.org/entry/Igbo_People" ``` An example question might be "Who are the neighbors of the Igbo people?" 
@@ -46,7 +46,7 @@ DOC_EMBEDDINGS_LLM = 'all-MiniLM-L6-v2' COLLECTION_NAME = 'chat-web-selects' -USER_PROMPT = 'What do you want to know from these sites?\n' +USER_PROMPT = 'What do you want to know from this site(s)?: ' # Hard-code for demo EMBED_CHUNK_SIZE = 200 @@ -68,9 +68,6 @@ async def read_site(url, collection): resp = await client.get(url) html = resp.content.decode(resp.encoding or 'utf-8') - # with open('/tmp/ahiajoku.igbonet.com-2000.html') as fp: - # html = fp.read() - text = html2text.html2text(html) # Split text into chunks @@ -103,7 +100,7 @@ async def async_main(sites, api_params): while not done: print() user_question = input(USER_PROMPT) - if user_question == done: + if user_question.strip() == 'done': break docs = collection.search(user_question, limit=4) @@ -112,8 +109,7 @@ async def async_main(sites, api_params): if docs: # Collects "chunked_doc" into "gathered_chunks" gathered_chunks = '\n\n'.join( - doc.payload['_text'] for doc in docs if doc.payload - ) + doc.payload['_text'] for doc in docs if doc.payload) # Build prompt the doc chunks as context prompt = format( diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index 583e313..ab68f14 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -59,6 +59,15 @@ def __init__(self, name, embedding_model, db=None, **conn_params): See the main docstring (or run `help(QdrantClient)`) https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12 + Example: + >>> from ogbujipt.text_helper import text_splitter + >>> from ogbujipt.embedding_helper import qdrant_collection + >>> text = 'The quick brown fox\njumps over the lazy dog,\nthen hides under a log\nwith a frog.' + >>> collection = qdrant_collection('my-text', 'all-MiniLM-L6-v2') + >>> chunks = text_splitter(text, chunk_size=30, chunk_overlap=5, separator='\n') + >>> collection.add(texts=chunks, metas=[{'seq-index': i} for (i, _) in enumerate(chunks)]) + >>> retval = collection.search('what does the fox say?', limit=1) + retval ''' self.name = name self.db = db From 6b57b22aedc15f6ade6f82449c5b17d80f0ae82f Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 12:39:24 -0600 Subject: [PATCH 12/39] minor formatting --- demo/chat_web_selects.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py index 7b6fd5c..5816128 100644 --- a/demo/chat_web_selects.py +++ b/demo/chat_web_selects.py @@ -46,7 +46,7 @@ DOC_EMBEDDINGS_LLM = 'all-MiniLM-L6-v2' COLLECTION_NAME = 'chat-web-selects' -USER_PROMPT = 'What do you want to know from these sites?\n' +USER_PROMPT = 'What do you want to know from the site(s)?\n' # Hard-code for demo EMBED_CHUNK_SIZE = 200 @@ -126,7 +126,7 @@ async def async_main(sites, api_params): print(prompt) - # The rest is much like in alpaca_multitask_fix_xml.py + # The rest is much like in demo/alpaca_multitask_fix_xml.py model_params = dict( max_tokens=1024, # Limit number of generated tokens top_p=1, # AKA nucleus sampling; can increase generated text diversity From 643d701de620bf7b57b0b919b465a6006a2ab956 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Mon, 17 Jul 2023 12:41:32 -0600 Subject: [PATCH 13/39] [#16] Sync back up --- pylib/embedding_helper.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index ab68f14..ea79aa6 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -60,11 +60,14 @@ def __init__(self, name, 
embedding_model, db=None, **conn_params):
         https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12
 
         Example:
+
         >>> from ogbujipt.text_helper import text_splitter
-        >>> from ogbujipt.embedding_helper import qdrant_collection
+        >>> from ogbujipt.embedding_helper import qdrant_collection  # pip install qdrant_client
+        >>> from sentence_transformers import SentenceTransformer  # pip install sentence_transformers
         >>> text = 'The quick brown fox\njumps over the lazy dog,\nthen hides under a log\nwith a frog.'
-        >>> collection = qdrant_collection('my-text', 'all-MiniLM-L6-v2')
-        >>> chunks = text_splitter(text, chunk_size=30, chunk_overlap=5, separator='\n')
+        >>> embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+        >>> collection = qdrant_collection('my-text', embedding_model)
+        >>> chunks = text_splitter(text, chunk_size=20, chunk_overlap=4, separator='\n')
         >>> collection.add(texts=chunks, metas=[{'seq-index': i} for (i, _) in enumerate(chunks)])
         >>> retval = collection.search('what does the fox say?', limit=1)
         retval
From b641dacd6d00b8a57506b7c90b9c268ad4fbed4b Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Mon, 17 Jul 2023 13:00:01 -0600
Subject: [PATCH 14/39] [#16] Extend the qdrant_collection docstring example a bit

---
 pylib/embedding_helper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py
index ea79aa6..a4571c1 100644
--- a/pylib/embedding_helper.py
+++ b/pylib/embedding_helper.py
@@ -64,7 +64,8 @@ def __init__(self, name, embedding_model, db=None, **conn_params):
         >>> from ogbujipt.text_helper import text_splitter
         >>> from ogbujipt.embedding_helper import qdrant_collection  # pip install qdrant_client
         >>> from sentence_transformers import SentenceTransformer  # pip install sentence_transformers
-        >>> text = 'The quick brown fox\njumps over the lazy dog,\nthen hides under a log\nwith a frog.'
+ >>> text = 'The quick brown fox\njumps over the lazy dog,\nthen hides under a log\nwith a frog.\n' + >>> text += 'Should the hound wake up,\nall jumpers beware\nin a log, in a bog\nhe\'ll search everywhere.\n' >>> embedding_model = SentenceTransformer('all-MiniLM-L6-v2') >>> collection = qdrant_collection('my-text', embedding_model) >>> chunks = text_splitter(text, chunk_size=20, chunk_overlap=4, separator='\n') From 7c9b70764a61c828f4ec05582a19f5db2dea0320 Mon Sep 17 00:00:00 2001 From: choccccy Date: Mon, 17 Jul 2023 14:39:46 -0600 Subject: [PATCH 15/39] models.Distance.COSINE --- pylib/embedding_helper.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index a4571c1..104e4e3 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -107,6 +107,7 @@ def add(self, texts, distance_function='Cosine', # Set the default distance function, giving grace to capitalization distance_function = distance_function.lower().capitalize() + distance_function = models.Distance.COSINE # Create a collection in the Qdrant client, and configure its vectors # Using REcreate_collection ensures overwrite @@ -121,6 +122,9 @@ def add(self, texts, distance_function='Cosine', # Put the items in the collection self.upsert(texts=texts, metas=metas) + current_count = int(str(self.db.count(self.name)).partition('=')[-1]) + print('COLLECTION COUNT:', current_count) + def upsert(self, texts, metas=None): ''' Update/insert a Qdrant client's collection with the some chunks of text @@ -132,6 +136,7 @@ def upsert(self, texts, metas=None): metas (List[dict]): Optional metadata per text, stored with the text and included whenever the text is retrieved via search/query ''' + # This ugly declaration just gets the count as an integer current_count = int(str(self.db.count(self.name)).partition('=')[-1]) metas = metas or [] From 7c49d483a1719d5d65b0b6c3fc305dba49d6b1ab Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Mon, 17 Jul 2023 15:07:43 -0600 Subject: [PATCH 16/39] [#16] Add some sanity checks for bad params to qdrant_collection.add(). Implement qdrant_collection.count() --- pylib/embedding_helper.py | 59 +++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index 104e4e3..1abb65b 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -30,6 +30,9 @@ responses to similar questions without having to use the most powerful LLM ''' +import warnings +import itertools + try: from qdrant_client import QdrantClient from qdrant_client.http import models @@ -84,9 +87,13 @@ def __init__(self, name, embedding_model, db=None, **conn_params): if not conn_params: conn_params = MEMORY_QDRANT_CONNECTION_PARAMS self.db = QdrantClient(**conn_params) + self._vector_size = -1 + + def _determine_vector_size(self, text): + partial_embeddings = self._embedding_model.encode(text) + self._vector_size = len(partial_embeddings) - def add(self, texts, distance_function='Cosine', - metas=None): + def add(self, texts, distance_function='Cosine', metas=None): ''' Add a collection to a Qdrant client, and add some strings (chunks) to that collection @@ -99,11 +106,25 @@ def add(self, texts, distance_function='Cosine', See the main docstring (or run `help(QdrantClient)`) https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12 ''' - metas = metas or [] + if len(texts) == 0: + warnings.warn(f'Empty sequence of texts provided. 
No action will be taken.') + return + + if metas is None: + metas = [] + else: + if len(texts) > len(metas): + warnings.warn(f'More texts ({len(texts)} provided than metadata {len(metas)}). Extra metadata items will be ignored.') + metas = itertools.chain(metas, [{}]*(len(texts)-len(texts))) + elif len(metas) > len(texts): + warnings.warn(f'Fewer texts ({len(texts)} provided than metadata {len(metas)}). ' + 'The extra text will be given empty metadata.') + metas = itertools.islice(metas, len(texts)) + # meta is a list of dicts # Find the size of the first chunk's embedding - partial_embeddings = self._embedding_model.encode(texts[0]) - vector_size = len(partial_embeddings) + if self._vector_size == -1: + self._determine_vector_size(texts[0]) # Set the default distance function, giving grace to capitalization distance_function = distance_function.lower().capitalize() @@ -114,7 +135,7 @@ def add(self, texts, distance_function='Cosine', self.db.recreate_collection( collection_name=self.name, vectors_config=models.VectorParams( - size=vector_size, + size=self._vector_size, distance=distance_function ) ) @@ -122,8 +143,8 @@ def add(self, texts, distance_function='Cosine', # Put the items in the collection self.upsert(texts=texts, metas=metas) - current_count = int(str(self.db.count(self.name)).partition('=')[-1]) - print('COLLECTION COUNT:', current_count) + # current_count = int(str(self.db.count(self.name)).partition('=')[-1]) + # print('COLLECTION COUNT:', current_count) def upsert(self, texts, metas=None): ''' @@ -136,10 +157,18 @@ def upsert(self, texts, metas=None): metas (List[dict]): Optional metadata per text, stored with the text and included whenever the text is retrieved via search/query ''' - # This ugly declaration just gets the count as an integer - current_count = int(str(self.db.count(self.name)).partition('=')[-1]) metas = metas or [] + if len(texts) > len(metas): + warnings.warn(f'More texts ({len(texts)} provided than metadata {len(metas)}). Extra metadata items will be ignored.') + metas = itertools.chain(metas, [{}]*(len(texts)-len(texts))) + elif len(metas) > len(texts): + warnings.warn(f'Fewer texts ({len(texts)} provided than metadata {len(metas)}). 
' + 'The extra text will be given empty metadata.') + metas = itertools.islice(metas, len(texts)) + + before_count = self.count() + for ix, (text, meta) in enumerate(zip(texts, metas)): # Embeddings as float/vectors # The inline prints actually turn into a cool progress indicator in jupyter 😁 @@ -152,7 +181,7 @@ def upsert(self, texts, metas=None): collection_name=self.name, points=[ models.PointStruct( - id=ix + current_count, # Sequential IDs + id=ix + before_count, # Insistenmtly sequential IDs vector=embeddings, payload=payload ) @@ -171,3 +200,11 @@ def search(self, text, **kwargs): ''' embedded_text = self._embedding_model.encode(text) return self.db.search(collection_name=self.name, query_vector=embedded_text, **kwargs) + + def count(self): + ''' + Return the count of items in this Qdrant collection + ''' + # This ugly declaration just gets the count as an integer + current_count = int(str(self.db.count(self.name)).partition('=')[-1]) + return current_count From ff62150c70c020ca775f3073d43cbbc370e06eea Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Mon, 17 Jul 2023 15:32:34 -0600 Subject: [PATCH 17/39] [#16] Eliminate qdrant_collection.add() and consolidate into update() --- demo/chat_web_selects.py | 2 +- pylib/embedding_helper.py | 87 ++++++++++++++------------------------- 2 files changed, 31 insertions(+), 58 deletions(-) diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py index ccf18a4..d8e2f97 100644 --- a/demo/chat_web_selects.py +++ b/demo/chat_web_selects.py @@ -78,7 +78,7 @@ async def read_site(url, collection): # Crudeβ€”for demo. Set URL metadata for all chunks to doc URL metas = [{'url': url}]*len(chunks) # Add the text to the collection. Blocks, so no reentrancy concern - collection.add(texts=chunks, metas=metas) + collection.update(texts=chunks, metas=metas) async def async_main(sites, api_params): diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index 1abb65b..80b717b 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -46,7 +46,8 @@ class qdrant_collection: - def __init__(self, name, embedding_model, db=None, **conn_params): + def __init__(self, name, embedding_model, db=None, + distance_function=None, **conn_params): ''' Initialize a Qdrant client @@ -58,6 +59,8 @@ def __init__(self, name, embedding_model, db=None, **conn_params): db (optional QdrantClient): existing DB/client to use + distance_function (str): Distance function by which vectors will be compared + conn_params (mapping): keyword parameters for setting up QdrantClient See the main docstring (or run `help(QdrantClient)`) https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12 @@ -87,68 +90,29 @@ def __init__(self, name, embedding_model, db=None, **conn_params): if not conn_params: conn_params = MEMORY_QDRANT_CONNECTION_PARAMS self.db = QdrantClient(**conn_params) - self._vector_size = -1 + self._distance_function = distance_function or models.Distance.COSINE + self._db_initialized = False - def _determine_vector_size(self, text): + def _first_update_prep(self, text): + # Make sure we have a vector size set; use a sample embedding if need be partial_embeddings = self._embedding_model.encode(text) self._vector_size = len(partial_embeddings) - def add(self, texts, distance_function='Cosine', metas=None): - ''' - Add a collection to a Qdrant client, and add some strings (chunks) to that collection - - Args: - chunks (List[str]): List of similar length strings to embed - - distance_function (str): Distance function by 
which vectors will be compared - - qdrant_conn_params (mapping): keyword parameters for setting up QdrantClient - See the main docstring (or run `help(QdrantClient)`) - https://github.com/qdrant/qdrant-client/blob/master/qdrant_client/qdrant_client.py#L12 - ''' - if len(texts) == 0: - warnings.warn(f'Empty sequence of texts provided. No action will be taken.') - return - - if metas is None: - metas = [] - else: - if len(texts) > len(metas): - warnings.warn(f'More texts ({len(texts)} provided than metadata {len(metas)}). Extra metadata items will be ignored.') - metas = itertools.chain(metas, [{}]*(len(texts)-len(texts))) - elif len(metas) > len(texts): - warnings.warn(f'Fewer texts ({len(texts)} provided than metadata {len(metas)}). ' - 'The extra text will be given empty metadata.') - metas = itertools.islice(metas, len(texts)) - - # meta is a list of dicts - # Find the size of the first chunk's embedding - if self._vector_size == -1: - self._determine_vector_size(texts[0]) - - # Set the default distance function, giving grace to capitalization - distance_function = distance_function.lower().capitalize() - distance_function = models.Distance.COSINE - # Create a collection in the Qdrant client, and configure its vectors # Using REcreate_collection ensures overwrite self.db.recreate_collection( collection_name=self.name, vectors_config=models.VectorParams( size=self._vector_size, - distance=distance_function + distance=self._distance_function ) ) - # Put the items in the collection - self.upsert(texts=texts, metas=metas) - - # current_count = int(str(self.db.count(self.name)).partition('=')[-1]) - # print('COLLECTION COUNT:', current_count) + self._db_initialized = True - def upsert(self, texts, metas=None): + def update(self, texts, metas=None): ''' - Update/insert a Qdrant client's collection with the some chunks of text + Update/insert into a Qdrant client's collection with the some chunks of text Args: texts (List[str]): Strings to be stored and indexed. For best results these should be of similar length. @@ -157,17 +121,26 @@ def upsert(self, texts, metas=None): metas (List[dict]): Optional metadata per text, stored with the text and included whenever the text is retrieved via search/query ''' - metas = metas or [] + if len(texts) == 0: + warnings.warn('Empty sequence of texts provided. No action will be taken.') + return - if len(texts) > len(metas): - warnings.warn(f'More texts ({len(texts)} provided than metadata {len(metas)}). Extra metadata items will be ignored.') - metas = itertools.chain(metas, [{}]*(len(texts)-len(texts))) - elif len(metas) > len(texts): - warnings.warn(f'Fewer texts ({len(texts)} provided than metadata {len(metas)}). ' - 'The extra text will be given empty metadata.') - metas = itertools.islice(metas, len(texts)) + if metas is None: + metas = [] + else: + if len(texts) > len(metas): + warnings.warn(f'More texts ({len(texts)} provided than metadata {len(metas)}). Extra metadata items will be ignored.') + metas = itertools.chain(metas, [{}]*(len(texts)-len(texts))) + elif len(metas) > len(texts): + warnings.warn(f'Fewer texts ({len(texts)} provided than metadata {len(metas)}). 
' + 'The extra text will be given empty metadata.') + metas = itertools.islice(metas, len(texts)) - before_count = self.count() + if not self._db_initialized: + self._first_update_prep(texts[0]) + before_count = 0 + else: + before_count = self.count() for ix, (text, meta) in enumerate(zip(texts, metas)): # Embeddings as float/vectors From 6602503205eee9615f24dffee3537f671c72a3ab Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Tue, 18 Jul 2023 14:24:55 -0600 Subject: [PATCH 18/39] [#16] Bug fixes to qdrant_collection.update() --- demo/chat_web_selects.py | 1 + pylib/embedding_helper.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py index d8e2f97..77fd281 100644 --- a/demo/chat_web_selects.py +++ b/demo/chat_web_selects.py @@ -79,6 +79,7 @@ async def read_site(url, collection): metas = [{'url': url}]*len(chunks) # Add the text to the collection. Blocks, so no reentrancy concern collection.update(texts=chunks, metas=metas) + print(f'{collection.count()} chunks added to collection') async def async_main(sites, api_params): diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index 80b717b..39902a4 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -126,11 +126,11 @@ def update(self, texts, metas=None): return if metas is None: - metas = [] + metas = [{}]*(len(texts)) else: if len(texts) > len(metas): warnings.warn(f'More texts ({len(texts)} provided than metadata {len(metas)}). Extra metadata items will be ignored.') - metas = itertools.chain(metas, [{}]*(len(texts)-len(texts))) + metas = itertools.chain(metas, [{}]*(len(texts)-len(metas))) elif len(metas) > len(texts): warnings.warn(f'Fewer texts ({len(texts)} provided than metadata {len(metas)}). 
' 'The extra text will be given empty metadata.') From 43726d3fa632164a0a2ba653fcd9c2500af10089 Mon Sep 17 00:00:00 2001 From: choccccy Date: Wed, 19 Jul 2023 14:04:20 -0600 Subject: [PATCH 19/39] miniLM-L12 --- demo/chat_web_selects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/chat_web_selects.py b/demo/chat_web_selects.py index 77fd281..725893e 100644 --- a/demo/chat_web_selects.py +++ b/demo/chat_web_selects.py @@ -43,7 +43,7 @@ os.environ['TOKENIZERS_PARALLELISM'] = 'false' # default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 -DOC_EMBEDDINGS_LLM = 'all-MiniLM-L6-v2' +DOC_EMBEDDINGS_LLM = 'all-MiniLM-L12-v2' COLLECTION_NAME = 'chat-web-selects' USER_PROMPT = 'What do you want to know from the site(s)?\n' From e1b1f4335998d1c2a20c3cbc386d3ad6f42af390 Mon Sep 17 00:00:00 2001 From: choccccy Date: Wed, 19 Jul 2023 14:04:51 -0600 Subject: [PATCH 20/39] minor commenting improvement --- pylib/embedding_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index 39902a4..e7b6154 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -99,7 +99,7 @@ def _first_update_prep(self, text): self._vector_size = len(partial_embeddings) # Create a collection in the Qdrant client, and configure its vectors - # Using REcreate_collection ensures overwrite + # Using REcreate_collection ensures overwrite for a clean, fresh, new collection self.db.recreate_collection( collection_name=self.name, vectors_config=models.VectorParams( From ef5ab21befc25e069fb3a7e22d3426e75c198046 Mon Sep 17 00:00:00 2001 From: choccccy Date: Wed, 19 Jul 2023 14:05:02 -0600 Subject: [PATCH 21/39] new streamlit demo --- demo/chat_pdf_streamlit_ui.py | 189 +++++++++++++------------ demo/chat_pdf_streamlit_ui_OLD.py | 221 ++++++++++++++++++++++++++++++ 2 files changed, 314 insertions(+), 96 deletions(-) create mode 100644 demo/chat_pdf_streamlit_ui_OLD.py diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index f58a366..359476a 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -53,8 +53,15 @@ from ogbujipt.text_helper import text_splitter from ogbujipt.embedding_helper import qdrant_collection +# LLM will be downloaded from HuggingFace automatically +# There seem to be reentrancy issues with HuggingFace; defer import +from sentence_transformers import SentenceTransformer + +# import zlib for crc32 checksums +import zlib + # Avoid re-entrace complaints from huggingface/tokenizers -os.environ['TOKENIZERS_PARALLELISM'] = 'false' +#os.environ['TOKENIZERS_PARALLELISM'] = 'false' # Load the main parameters from .env file load_dotenv() @@ -70,23 +77,32 @@ # LLM max context size N_CTX = int(os.getenv('N_CTX', '2048')) # K - how many chunks to return for query context -K = int(os.getenv('K', '6')) +K = int(os.getenv('K', '3')) # Chunk size is the number of characters counted in the chunks EMBED_CHUNK_SIZE = int(os.getenv('EMBED_CHUNK_SIZE', '500')) # Chunk Overlap to connect ends of chunks together EMBED_CHUNK_OVERLAP = int(os.getenv('EMBED_CHUNK_OVERLAP', '100')) # small LM for embeddings -# default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 -DOC_EMBEDDINGS_LLM = os.getenv('EMBED_CHUNK_OVERLAP', 'all-MiniLM-L6-v2') +# default https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2 +DOC_EMBEDDINGS_LLM = os.getenv('EMBED_CHUNK_OVERLAP', 'all-MiniLM-L12-v2') + +PDF_USER_QUERY_PROMPT = 'Ask a question about your PDF:' 
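The PDF re-upload check introduced further down in this patch hinges on comparing checksums of the stringified upload object. In isolation, the idea is roughly this sketch (it uses the same zlib.adler32 call as the patch; same_upload is a hypothetical helper name, not part of the demo):

    import zlib

    def same_upload(old_pdf, new_pdf):
        # Checksum the stringified Streamlit upload objects and compare them,
        # to decide whether the PDF changed and so must be re-chunked and re-embedded
        old_sum = zlib.adler32(str(old_pdf).encode('utf-8'))
        new_sum = zlib.adler32(str(new_pdf).encode('utf-8'))
        return old_sum == new_sum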
-PDF_USER_QUESTION_PROMPT = 'Ask a question about your PDF:' +# Streamlit uses caching to save on repeat loads +@st.cache_data +def load_throbber(): + # throbber.gif as a data URL + # TODO: use a less archaic format than GIF; perhaps some sort of animatable vector image + return 'data:image/gif;base64,R0lGODlhgACAAKEDAGwZDPPVSf///wAAACH/C05FVFNDQVBFMi4wAwEAAAAh+QQFFAADACwAAAAAgACAAAACm5yPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzCY1Kp9Sq9YrNarfcrvcLDovH5LL5jE6r1+y2+w2Py+f0uv2Oz+v3/L7/DxgoOEhYaHiImKi4yNjo+AgZKTlJWWl5iZmpucnZ6fkJGio6SlpqeoqaqrrK2ur6ChsrO0tba3uLK1YAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6EGIdahIwajj+AgR6WD5iHk5STnIyaEZF1r5STk6VVp4KrHa1MqayvaKaNQ5Cxl7dYuSG7Vr8bsSDNyLVnx3/JOMnKi7PDPs1NwVzTstVg17fZb90B34vBh+9O23DVgOdq46ThvUaV4r125VnX4zfI/zq6+8TvwPXrx34gIKHMijQz9y4RYi0eSQiaWIrnpR9BXr4pj+UhrrcerobBtILmZGUltjEluYlCcTsfQ47WVFkfQOLpgjsw3Hmja1GPQpr6e0oN542jqWk8iypCdGMU3S7mlBokMJCr1pVCpAqlOtXgXqFepPRfaMSjSLdawotI3UqnTbNeFRuHzYMuOKz64LvajoGvNrginInCZfskwpc2TSjk8vao2oVR3eu2HR8U07eU+6yJgrN7287zLnTYBJlebGdnTfzFs9X52lGtdpya6/0pZ7IXbuZ7qr4tbG2jZCG+5+Czcd/Dbx47KD9wb373ngSdI/X6surA72vVC2K6WSnHlr8CPEcye/wfx38Oopz26PPDx8he/nr4ZhPzT+/PoJ0/NX9l+AFhQAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6IEXEOBAqIh1iNDY+DgZCcFYaamDmSmZuMDJeSkK6nmaQEpqaoSa+srAyuoaBDv4OUBLa8uDq6ZKydsbe4u7SVwcKguarLy8AHn1DB2dKxxXXbur7GtTuN16AP0Nsydeuupt/MuWXjnLznzcBd8pT2yuYq9b/vCPnrsx/uYBNIitGZN7jiYElKYLB0MLDxPWa1NQ34X+in6yzZjIgSMdj0Qy8vogMpjCOyavhUTYcaWLltxIpARDEgRIEze15Oyw80TPaYhkkoPJE6nKi3xojpMxNCKFoFCV4jSKwqm6HFF/PqCKoytWTVrjHRHLdEpZfGet+hwLkWRPrm4hgUWCdmA7cPlOcsnLV2BgBXPbahR8Lu7YwnjrTrr71/EpyF0AJ6YsxjI/zJUly+JsRfOIkYvdShG9waLedYcze057FbYBxkJQf13b8IptsnJN99ittnfrxsNjykbMr69LH8Cn4ibuF/noC6BXNKf+/Pfr1diFR59xHWj2qsVJH+9eunyJ8DrHC90+2ET1Cuzlu0cJPzFL78v1N+ZPfsN8DtQnx330/TedDwKy9p1q8fWw4GwIpraQgQNOaMV8BKJhIYP9xcZdE5htWCF/NXl4ooP6iQEWiQSZ+FQ36oH44BkgucgFQzj2A6M1uSl2nja4ffhWkHbouBWQIWaC5I8qAghMkUvKOKOUNSKz1j4JRmnelA0aN2WU22hJIZfSlUmYWWeayVtpZLIZHFx7rQjnnFDGaWSdNNL5pp5F5UmUn30E6qeVfC4VZqF2bonolYq2yRShjzaqn6STUrqZVJdCSoWjm/7ZKaOfOhGqqKOS2ump9lGh6gmstuqqV7BaIOesqJpqa3u45toer74qUAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTaOwDAvPs/aRpIlmYliufKsmnaxjL2vvONN3Wd9/m+8wlbQODwSCoWkUyactmMRp5PqZVRzSivXEPWu+1KvweyeGguh8/CNMLNjsHf67hsnsDbP/p8fV/SpyAIGPJnCFXogRcQ4ECoiHWI0Nj4OBkJwVhpqYOZKZm4wMl5KQrqeZpASmpqhJr6ysDK6hoEO/g5QEtry4OrpkrJ2xt7i7tJXBwqC5qsvLwAefUMHZ0rHFddu6vsa1O43XoA/Q2zJ1666m38y5ZeOcvOfNwF3ylPbK5ir1v+8I+euzH+5gE0iK0Zk3uOJgSUpgsHQwsPE9ZrU1Dfhf6KfrLNmNhAgAAHHOl4JJKRlwORIkki7KjwTsprCViydKlRILiPM7kxsGkTp8p2O1GeLHkAKFChNE3GDNRz3E+lSxsgBXOSA8ipVKvmG6rzHNSjLxF07crUJ8SsFLYuOHs2rdS1Ty24VQAXrlx1Yflpjcr3bV69VssGqzsFcLyQhPPuXdx3hF3F+ATHTUr4a9PDFzVRbsgVbc3MowxjRWxxoIKrAxxbFq1ZbeqiRMWWzvma6krSq01ryXp39OXdw2+DpduZs+p1uPHyZly8d3OYyYObfU4ctvHNpy9axxw9guvYc2eL/W5gfAX10o+b54e+NXbx8w2w/hKfPf3ww/6mO/X+WXa6TaBff+5Rt9xvqLFWYG5KPVbZe5IhlyA5vjV4HX8W+qccbRJuUBiH6dUnn4b2+SZIfvNh2I2ICiYXGYjkBeZceCzeF9GHEILmoFclatcedy9W+ICKl92IYo61+bWdbMINNuCMkFHoIQoBQgdlUCEe+B+RbV0ZWpY77jMhH2D2aGKLXHZoGwhGIuniNIgseCGca3bn5SJn1hhlk+UhWKUJb2opZYSAtmkUnS4+uKWQcnbw5phLlinRnqNJGuMR8WFKJaI9bOonjYcyqamlnOpIEFkuGuiokj+YytydQwa6EKwnxukqRqquiSNbte5KU6+oWeGWsDCKAaeSsXn2A6w1f3ZJqzaf4errGQy1Wu0704oKrafObMsjqsCwSWqssj5qxz1kyjjuqJSaO6W47cobZLjusjsvvbGum2+RBfHbr7/Z3htwYgPjGW3B3ZYLsMIGD0vuuw43PCm+E3t2EroTw6HxxWl0fPGstoEcssjw
5VpyxSSnjDAVFrPshMsMwxyzyzQLavPNOKOsswQ89+zzsUDr6e3Qbs5sdBIvJ510AQAh+QQJFAADACwAAAAAgACAAAAC/pyPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzGQoEoDWplCqzWrEvrZbL8nrBKbGYbDKb0SO1mg1yu+EduZyusdvxl/3AzycB+HcXCDFogGiYoJhYuMj4uNAYSIlgCYd5KcmnGTlXyRnh+UQ6KcqFKSDgYJqkusra4GoEGyvLQDtke4t7ClqKetDb2yp8xEu8agyslKy8PHss9AwdnTv9U219/ftWO83Nzfy9Gy4+Lt2sfW5tIE6+BrQdiwCvXs5D3/2ejr2OY58ve/68ycsh0MG9f/mqtIMGYaHBMzYSRpCoQBcJ/osSMH5q6ILjBI+bAK4QSYHkAY0eUI4smDFbmofKLKh0ZDIMTWIpYX48mGVnsYs+S4KM4VLBTUI5kQq9pbDoSpldntZbsJSlCpdZqQZtOkBiV7A3LI49qs8quoE/KZoDu7YmPqBv0faLy6/tGCQChzKkCw7u2nhunanNaxTwK7WE9wYTzHNu4cd2w/qd6BgLr8Zf0MDivCWT15hkoWjUurh04sl4TKFm4ul1E0yynVCqTRkwbtOSdlPx41uzHtWQpg7PXHzU8dDJKSyf0tzC8egY9FDPMPo6ZujasSPv3oc5+Dzcx5s/jz69+vXs27t/Dz++/Pn069u/jz+//v38BPvfKAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTai7PevPsPhuJIluaJpurKtu4Lx/JM1/aN5/rO9/4PDAqHxKLxiEwql8ym8wmNSqfUqvWKzWq33K73Cw6Lx+Sy+YxOq9fstkEgcF/gcDmFTrdH8Hi9g8/ntwAIKIhASGg4gIgoyMio9/goJynZZrk4qYb5pnnGeQA6JhrqKUZ6aPqFmsCa5dqqqgWrQCtlWytbhTuoO8WbCznr+0dMBZwp3MWLHGXbfGsM/es7fSxrvWuabYXJfVVpXBaeaEfe53ceZ0iu2CnuBk9Z7h6LXh+8jt+73+//DzCgwIEECxo8iDChwoUMGzp8CDGixIkUK1q8iDGjxg2NHDt6/AgypMiRcgoAADs=' # noqa E501 -# throbber.gif as a data URL -# TODO: use a less archaic format than GIF; perhaps some sort of animatable vector image -throbber = 'data:image/gif;base64,R0lGODlhgACAAKEDAGwZDPPVSf///wAAACH/C05FVFNDQVBFMi4wAwEAAAAh+QQFFAADACwAAAAAgACAAAACm5yPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzCY1Kp9Sq9YrNarfcrvcLDovH5LL5jE6r1+y2+w2Py+f0uv2Oz+v3/L7/DxgoOEhYaHiImKi4yNjo+AgZKTlJWWl5iZmpucnZ6fkJGio6SlpqeoqaqrrK2ur6ChsrO0tba3uLK1YAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6EGIdahIwajj+AgR6WD5iHk5STnIyaEZF1r5STk6VVp4KrHa1MqayvaKaNQ5Cxl7dYuSG7Vr8bsSDNyLVnx3/JOMnKi7PDPs1NwVzTstVg17fZb90B34vBh+9O23DVgOdq46ThvUaV4r125VnX4zfI/zq6+8TvwPXrx34gIKHMijQz9y4RYi0eSQiaWIrnpR9BXr4pj+UhrrcerobBtILmZGUltjEluYlCcTsfQ47WVFkfQOLpgjsw3Hmja1GPQpr6e0oN542jqWk8iypCdGMU3S7mlBokMJCr1pVCpAqlOtXgXqFepPRfaMSjSLdawotI3UqnTbNeFRuHzYMuOKz64LvajoGvNrginInCZfskwpc2TSjk8vao2oVR3eu2HR8U07eU+6yJgrN7287zLnTYBJlebGdnTfzFs9X52lGtdpya6/0pZ7IXbuZ7qr4tbG2jZCG+5+Czcd/Dbx47KD9wb373ngSdI/X6surA72vVC2K6WSnHlr8CPEcye/wfx38Oopz26PPDx8he/nr4ZhPzT+/PoJ0/NX9l+AFhQAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6IEXEOBAqIh1iNDY+DgZCcFYaamDmSmZuMDJeSkK6nmaQEpqaoSa+srAyuoaBDv4OUBLa8uDq6ZKydsbe4u7SVwcKguarLy8AHn1DB2dKxxXXbur7GtTuN16AP0Nsydeuupt/MuWXjnLznzcBd8pT2yuYq9b/vCPnrsx/uYBNIitGZN7jiYElKYLB0MLDxPWa1NQ34X+in6yzZjIgSMdj0Qy8vogMpjCOyavhUTYcaWLltxIpARDEgRIEze15Oyw80TPaYhkkoPJE6nKi3xojpMxNCKFoFCV4jSKwqm6HFF/PqCKoytWTVrjHRHLdEpZfGet+hwLkWRPrm4hgUWCdmA7cPlOcsnLV2BgBXPbahR8Lu7YwnjrTrr71/EpyF0AJ6YsxjI/zJUly+JsRfOIkYvdShG9waLedYcze057FbYBxkJQf13b8IptsnJN99ittnfrxsNjykbMr69LH8Cn4ibuF/noC6BXNKf+/Pfr1diFR59xHWj2qsVJH+9eunyJ8DrHC90+2ET1Cuzlu0cJPzFL78v1N+ZPfsN8DtQnx330/TedDwKy9p1q8fWw4GwIpraQgQNOaMV8BKJhIYP9xcZdE5htWCF/NXl4ooP6iQEWiQSZ+FQ36oH44BkgucgFQzj2A6M1uSl2nja4ffhWkHbouBWQIWaC5I8qAghMkUvKOKOUNSKz1j4JRmnelA0aN2WU22hJIZfSlUmYWWeayVtpZLIZHFx7rQjnnFDGaWSdNNL5pp5F5UmUn30E6qeVfC4VZqF2bonolYq2yRShjzaqn6STUrqZVJdCSoWjm/7ZKaOfOhGqqKOS2ump9lGh6gmstuqqV7BaIOesqJpqa3u45toer74qUAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTaOwDAvPs/aRpIlmYliufKsmnaxjL2vvONN3Wd9/m+8wlbQODwSCoWkUyactmMRp5PqZVRzSivXEPWu+1KvweyeGguh8/CNMLNjsHf67hsnsDbP/p8fV/SpyAIGPJnCFXogR
cQ4ECoiHWI0Nj4OBkJwVhpqYOZKZm4wMl5KQrqeZpASmpqhJr6ysDK6hoEO/g5QEtry4OrpkrJ2xt7i7tJXBwqC5qsvLwAefUMHZ0rHFddu6vsa1O43XoA/Q2zJ1666m38y5ZeOcvOfNwF3ylPbK5ir1v+8I+euzH+5gE0iK0Zk3uOJgSUpgsHQwsPE9ZrU1Dfhf6KfrLNmNhAgAAHHOl4JJKRlwORIkki7KjwTsprCViydKlRILiPM7kxsGkTp8p2O1GeLHkAKFChNE3GDNRz3E+lSxsgBXOSA8ipVKvmG6rzHNSjLxF07crUJ8SsFLYuOHs2rdS1Ty24VQAXrlx1Yflpjcr3bV69VssGqzsFcLyQhPPuXdx3hF3F+ATHTUr4a9PDFzVRbsgVbc3MowxjRWxxoIKrAxxbFq1ZbeqiRMWWzvma6krSq01ryXp39OXdw2+DpduZs+p1uPHyZly8d3OYyYObfU4ctvHNpy9axxw9guvYc2eL/W5gfAX10o+b54e+NXbx8w2w/hKfPf3ww/6mO/X+WXa6TaBff+5Rt9xvqLFWYG5KPVbZe5IhlyA5vjV4HX8W+qccbRJuUBiH6dUnn4b2+SZIfvNh2I2ICiYXGYjkBeZceCzeF9GHEILmoFclatcedy9W+ICKl92IYo61+bWdbMINNuCMkFHoIQoBQgdlUCEe+B+RbV0ZWpY77jMhH2D2aGKLXHZoGwhGIuniNIgseCGca3bn5SJn1hhlk+UhWKUJb2opZYSAtmkUnS4+uKWQcnbw5phLlinRnqNJGuMR8WFKJaI9bOonjYcyqamlnOpIEFkuGuiokj+YytydQwa6EKwnxukqRqquiSNbte5KU6+oWeGWsDCKAaeSsXn2A6w1f3ZJqzaf4errGQy1Wu0704oKrafObMsjqsCwSWqssj5qxz1kyjjuqJSaO6W47cobZLjusjsvvbGum2+RBfHbr7/Z3htwYgPjGW3B3ZYLsMIGD0vuuw43PCm+E3t2EroTw6HxxWl0fPGstoEcssjw5VpyxSSnjDAVFrPshMsMwxyzyzQLavPNOKOsswQ89+zzsUDr6e3Qbs5sdBIvJ510AQAh+QQJFAADACwAAAAAgACAAAAC/pyPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzGQoEoDWplCqzWrEvrZbL8nrBKbGYbDKb0SO1mg1yu+EduZyusdvxl/3AzycB+HcXCDFogGiYoJhYuMj4uNAYSIlgCYd5KcmnGTlXyRnh+UQ6KcqFKSDgYJqkusra4GoEGyvLQDtke4t7ClqKetDb2yp8xEu8agyslKy8PHss9AwdnTv9U219/ftWO83Nzfy9Gy4+Lt2sfW5tIE6+BrQdiwCvXs5D3/2ejr2OY58ve/68ycsh0MG9f/mqtIMGYaHBMzYSRpCoQBcJ/osSMH5q6ILjBI+bAK4QSYHkAY0eUI4smDFbmofKLKh0ZDIMTWIpYX48mGVnsYs+S4KM4VLBTUI5kQq9pbDoSpldntZbsJSlCpdZqQZtOkBiV7A3LI49qs8quoE/KZoDu7YmPqBv0faLy6/tGCQChzKkCw7u2nhunanNaxTwK7WE9wYTzHNu4cd2w/qd6BgLr8Zf0MDivCWT15hkoWjUurh04sl4TKFm4ul1E0yynVCqTRkwbtOSdlPx41uzHtWQpg7PXHzU8dDJKSyf0tzC8egY9FDPMPo6ZujasSPv3oc5+Dzcx5s/jz69+vXs27t/Dz++/Pn069u/jz+//v38BPvfKAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTai7PevPsPhuJIluaJpurKtu4Lx/JM1/aN5/rO9/4PDAqHxKLxiEwql8ym8wmNSqfUqvWKzWq33K73Cw6Lx+Sy+YxOq9fstkEgcF/gcDmFTrdH8Hi9g8/ntwAIKIhASGg4gIgoyMio9/goJynZZrk4qYb5pnnGeQA6JhrqKUZ6aPqFmsCa5dqqqgWrQCtlWytbhTuoO8WbCznr+0dMBZwp3MWLHGXbfGsM/es7fSxrvWuabYXJfVVpXBaeaEfe53ceZ0iu2CnuBk9Z7h6LXh+8jt+73+//DzCgwIEECxo8iDChwoUMGzp8CDGixIkUK1q8iDGjxg2NHDt6/AgypMiRcgoAADs=' # noqa E501 +# Streamlit uses caching to save on repeat loads +@st.cache_resource +def load_embedding_model(embedding_model_name): + # LLM will be downloaded from HuggingFace automatically + return SentenceTransformer(embedding_model_name) -async def prep_pdf(pdf, embedding_model, collection_name): +def prep_pdf(pdf, embedding_model, collection_name): # Streamlit treats function docstrings as magic strings for user display # Describe function via comments instead: # Converts pdf content into chunks according to chunk size & overlap @@ -109,95 +125,19 @@ async def prep_pdf(pdf, embedding_model, collection_name): separator='\n' ) - print('\n\n'.join([ch[:100] for ch in chunks])) - # Add a new collection for this document, and upsert the chunks into it - knowledge_base.add(texts=chunks) + #print('\n\n'.join([ch[:100] for ch in chunks])) + # Add a new collection for this document, and insert the chunks into it + knowledge_base.update(texts=chunks) return knowledge_base -# Schedule one task to do a long-running/blocking LLM request, -# and another to chat the PDF -async def async_main(openai_api, model, LLM_TEMP): - # Doc strings turn into streamlit headers +def streamlit_loop(openai_api, model, LLM_TEMP): + # Streamlit treats function docstrings as magic strings for user display ''' Oori β€” Ask your PDF πŸ“„πŸ’¬ ''' - # create file 
upload box on Streamlit, set from the user's upload - pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False) - - if pdf: - # Show throbber, vectorize the PDF, and setup for similarity search - placeholder = st.empty() - with placeholder: - placeholder.image(throbber) - - # LLM will be downloaded from HuggingFace automatically - # There seem to be reentrancy issues with HuggingFace; defer import - from sentence_transformers import SentenceTransformer - embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM) - - kb = await prep_pdf(pdf, embedding_model, collection_name=pdf.name) - placeholder.empty() - - user_question = st.text_input(PDF_USER_QUESTION_PROMPT) - - # docs = None - while not user_question: - await asyncio.sleep(0.1) - - embedded_question = embedding_model.encode(user_question) - docs = kb.db.search( - collection_name=kb.name, - query_vector=embedded_question, - limit=K - ) - - print(kb.name, pdf.name, docs) - if docs: - # Collects "chunked_doc" into "gathered_chunks" - gathered_chunks = '\n\n'.join( - doc.payload['_text'] for doc in docs - ) - - # Build prompt the doc chunks as context - prompt = format( - f'Given the context, {user_question}\n\n' - f'Context: """\n{gathered_chunks}\n"""\n', - preamble='### SYSTEM:\nYou are a helpful assistant, who answers ' - 'questions directly and as briefly as possible. ' - 'If you cannot answer with the given context, just say so.\n', - delimiters=CHATGPT_DELIMITERS - ) - - print(prompt) - # Show throbber, and send LLM prompt - with st.empty(): - st.image(throbber) - response = openai_api.Completion.create( - model=model, # Model (Required) - prompt=prompt, # Prompt (Required) - temperature=LLM_TEMP, # Temp (Default 1) - max_tokens=1024 # Maximum tokens to return (Default 16) - ) - - # Response is a json-like object - print('\nFull response data from LLM:\n', response) - - # Response is a json-like object; - # just get back the text of the response - response_text = oapi_choice1_text(response).strip() - print('\nResponse text from LLM:\n', response_text) - - # Write the response text to Streamlit - st.write(response_text) - else: - st.write('No context info found') - - -def main(): - # Describing function via comments instead: - # Set up Streamlit page, LLM host connection & launch the main loop + # Set up Streamlit page st.set_page_config( page_title='Ask your PDF', page_icon='πŸ“„πŸ’¬', @@ -205,6 +145,64 @@ def main(): initial_sidebar_state='expanded', ) + # Create file upload box on Streamlit, set from the user's upload + # Use st.session_state to avoid unnessisary reprocessing/reloading + if 'pdf' not in st.session_state: # First use and need to init the PDF + pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False) + st.session_state['pdf'] = pdf + + new_pdf = True # Flag to know if the new pdf needs to be embedded + + else: # PDF does exist + temp_pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False) + + # Encode PDF content and encode them for comparison + pdf_new_checksum = zlib.adler32(str(temp_pdf).encode('utf-8')) + pdf_old_checksum = zlib.adler32(str(st.session_state['pdf']).encode('utf-8')) + + + if pdf_new_checksum == pdf_old_checksum: # PDF is the same + pdf = st.session_state['pdf'] + new_pdf = False # Flag to know if the new pdf needs to be embedded + + else: # PDF is now different and needs to swap out session_state + pdf, st.session_state['pdf'] = temp_pdf, temp_pdf + new_pdf = True # Flag to know if the new pdf needs to be embedded + + if 
pdf: # Only run once the program has a "pdf" loaded + # Show throbber, embed the PDF, and get ready for similarity search + placeholder = st.empty() + + # Load throbber from cache + throbber = load_throbber() + placeholder.image(throbber) + + # Get the embedding model + embedding_model = load_embedding_model(DOC_EMBEDDINGS_LLM) + + # Prepare and embed a knowledgebase (kb) with the pdf as its contents + # Use st.session_state to avoid unnessisary reprocessing/reloading + if new_pdf: + kb = prep_pdf(pdf, embedding_model, collection_name=pdf.name) + st.session_state['kb'] = kb + else: + kb = st.session_state['kb'] + + # Clear all elements in placeholder (in this case, just the throbber) + placeholder.empty() + + # Get the user query + user_query = st.text_input(PDF_USER_QUERY_PROMPT) + if user_query: # Only run once the program has a "user_query" + docs = kb.search(user_query, limit=K) + serg_rezulds = '\n\n'.join(doc.payload['_text'] for doc in docs if doc.payload) + st.code(serg_rezulds, language="python", line_numbers=True) + + +def main(): + # Streamlit treats function docstrings as magic strings for user display + # Describing function via comments instead: + # Set up LLM host connection & launch the main loop # Use OpenAI API if specified, otherwise emulate with supplied host, etc. if OPENAI: assert not (LLM_HOST or LLM_PORT), 'Don\'t use --host or --port with --openai' @@ -216,10 +214,9 @@ def main(): model = LLM or HOST_DEFAULT openai_api = openai_emulation( host=LLM_HOST, port=LLM_PORT, model=LLM, debug=True) - - asyncio.run(async_main(openai_api, model, LLM_TEMP)) + + streamlit_loop(openai_api, model, LLM_TEMP) -if __name__ == '__main__': - # TODO: Look into isolating huggingface's one time per process setup routines - main() +# Streamlit requires a main call(?????) +main() \ No newline at end of file diff --git a/demo/chat_pdf_streamlit_ui_OLD.py b/demo/chat_pdf_streamlit_ui_OLD.py new file mode 100644 index 0000000..bb43cc5 --- /dev/null +++ b/demo/chat_pdf_streamlit_ui_OLD.py @@ -0,0 +1,221 @@ +''' +Advanced, "Chat my PDF" demo + +Use a PDF document as a knowledge base to provide context for natural language Q&A + +UI: Streamlit - streamlit.io +Vector store: Qdrant - https://qdrant.tech/ + Alternatives: pgvector, Chroma, Faiss, Weaviate, etc. +PDF to text: PyPDF2 + Alternatives: pdfplumber +Text to vector (embedding) model: + Alternatives: https://www.sbert.net/docs/pretrained_models.html / OpenAI ada002 + +Single-PDF support, for now, to keep the demo code simple, +though you can easily extend it to e.g. work with multiple docs +dropped in a directory + +Based on https://github.com/wafflecomposite/langchain-ask-pdf-local +but taking advantage of OgbujiPT + +You need access to an OpenAI-like service. Default assumption is that you +have a self-hosted framework such as llama-cpp-python or text-generation-webui +running. Assume for the following it's at my-llm-host:8000 + +Prerequisites. From OgbujiPT cloned dir:. + +```sh +pip install --upgrade . +pip install streamlit watchdog PyPDF2 PyCryptodome sentence_transformers qdrant-client tiktoken +``` + +You'll probably need a .env file. See demo/.env for an example to copy. Run the demo: + +```sh +streamlit run demo/chat_pdf_streamlit_ui.py +``` + +Something about Streamlit's event loop is interacting with asyncio to cause the `async_main` +coroutine to be invoked twice. We might need to perhaps bundle our coroutines and +`streamlit.bootstrap.run()` together in an `asyncio.gather()`,or something of the sort. 
+''' +import asyncio +import os + +from dotenv import load_dotenv + +import streamlit as st +from PyPDF2 import PdfReader + +from ogbujipt.config import openai_emulation, openai_live, HOST_DEFAULT +from ogbujipt.prompting import format, CHATGPT_DELIMITERS +from ogbujipt import oapi_choice1_text +from ogbujipt.text_helper import text_splitter +from ogbujipt.embedding_helper import qdrant_collection + +# Avoid re-entrace complaints from huggingface/tokenizers +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + +# Load the main parameters from .env file +load_dotenv() +# User can set a variety of likely values to trigger use of OpenAI full-service +OPENAI = os.getenv('OPENAI', 'False') in \ + ['True', 'true', 'TRUE', 'Yes', 'yes', 'YES'] +OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') +LLM = os.getenv('LLM', 'LLM') # TODO: get this from non-openai openai api hosts +LLM_HOST = os.getenv('LLM_HOST', 'my-llm-host') +LLM_PORT = os.getenv('LLM_PORT', '8000') +# LLM "temperature" +LLM_TEMP = float(os.getenv('LLM_TEMP', '1')) +# LLM max context size +N_CTX = int(os.getenv('N_CTX', '2048')) +# K - how many chunks to return for query context +K = int(os.getenv('K', '6')) +# Chunk size is the number of characters counted in the chunks +EMBED_CHUNK_SIZE = int(os.getenv('EMBED_CHUNK_SIZE', '500')) +# Chunk Overlap to connect ends of chunks together +EMBED_CHUNK_OVERLAP = int(os.getenv('EMBED_CHUNK_OVERLAP', '100')) +# small LM for embeddings +# default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 +DOC_EMBEDDINGS_LLM = os.getenv('EMBED_CHUNK_OVERLAP', 'all-MiniLM-L6-v2') + +PDF_USER_QUESTION_PROMPT = 'Ask a question about your PDF:' + +# throbber.gif as a data URL +# TODO: use a less archaic format than GIF; perhaps some sort of animatable vector image +throbber = 
'data:image/gif;base64,R0lGODlhgACAAKEDAGwZDPPVSf///wAAACH/C05FVFNDQVBFMi4wAwEAAAAh+QQFFAADACwAAAAAgACAAAACm5yPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzCY1Kp9Sq9YrNarfcrvcLDovH5LL5jE6r1+y2+w2Py+f0uv2Oz+v3/L7/DxgoOEhYaHiImKi4yNjo+AgZKTlJWWl5iZmpucnZ6fkJGio6SlpqeoqaqrrK2ur6ChsrO0tba3uLK1YAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6EGIdahIwajj+AgR6WD5iHk5STnIyaEZF1r5STk6VVp4KrHa1MqayvaKaNQ5Cxl7dYuSG7Vr8bsSDNyLVnx3/JOMnKi7PDPs1NwVzTstVg17fZb90B34vBh+9O23DVgOdq46ThvUaV4r125VnX4zfI/zq6+8TvwPXrx34gIKHMijQz9y4RYi0eSQiaWIrnpR9BXr4pj+UhrrcerobBtILmZGUltjEluYlCcTsfQ47WVFkfQOLpgjsw3Hmja1GPQpr6e0oN542jqWk8iypCdGMU3S7mlBokMJCr1pVCpAqlOtXgXqFepPRfaMSjSLdawotI3UqnTbNeFRuHzYMuOKz64LvajoGvNrginInCZfskwpc2TSjk8vao2oVR3eu2HR8U07eU+6yJgrN7287zLnTYBJlebGdnTfzFs9X52lGtdpya6/0pZ7IXbuZ7qr4tbG2jZCG+5+Czcd/Dbx47KD9wb373ngSdI/X6surA72vVC2K6WSnHlr8CPEcye/wfx38Oopz26PPDx8he/nr4ZhPzT+/PoJ0/NX9l+AFhQAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6IEXEOBAqIh1iNDY+DgZCcFYaamDmSmZuMDJeSkK6nmaQEpqaoSa+srAyuoaBDv4OUBLa8uDq6ZKydsbe4u7SVwcKguarLy8AHn1DB2dKxxXXbur7GtTuN16AP0Nsydeuupt/MuWXjnLznzcBd8pT2yuYq9b/vCPnrsx/uYBNIitGZN7jiYElKYLB0MLDxPWa1NQ34X+in6yzZjIgSMdj0Qy8vogMpjCOyavhUTYcaWLltxIpARDEgRIEze15Oyw80TPaYhkkoPJE6nKi3xojpMxNCKFoFCV4jSKwqm6HFF/PqCKoytWTVrjHRHLdEpZfGet+hwLkWRPrm4hgUWCdmA7cPlOcsnLV2BgBXPbahR8Lu7YwnjrTrr71/EpyF0AJ6YsxjI/zJUly+JsRfOIkYvdShG9waLedYcze057FbYBxkJQf13b8IptsnJN99ittnfrxsNjykbMr69LH8Cn4ibuF/noC6BXNKf+/Pfr1diFR59xHWj2qsVJH+9eunyJ8DrHC90+2ET1Cuzlu0cJPzFL78v1N+ZPfsN8DtQnx330/TedDwKy9p1q8fWw4GwIpraQgQNOaMV8BKJhIYP9xcZdE5htWCF/NXl4ooP6iQEWiQSZ+FQ36oH44BkgucgFQzj2A6M1uSl2nja4ffhWkHbouBWQIWaC5I8qAghMkUvKOKOUNSKz1j4JRmnelA0aN2WU22hJIZfSlUmYWWeayVtpZLIZHFx7rQjnnFDGaWSdNNL5pp5F5UmUn30E6qeVfC4VZqF2bonolYq2yRShjzaqn6STUrqZVJdCSoWjm/7ZKaOfOhGqqKOS2ump9lGh6gmstuqqV7BaIOesqJpqa3u45toer74qUAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTaOwDAvPs/aRpIlmYliufKsmnaxjL2vvONN3Wd9/m+8wlbQODwSCoWkUyactmMRp5PqZVRzSivXEPWu+1KvweyeGguh8/CNMLNjsHf67hsnsDbP/p8fV/SpyAIGPJnCFXogRcQ4ECoiHWI0Nj4OBkJwVhpqYOZKZm4wMl5KQrqeZpASmpqhJr6ysDK6hoEO/g5QEtry4OrpkrJ2xt7i7tJXBwqC5qsvLwAefUMHZ0rHFddu6vsa1O43XoA/Q2zJ1666m38y5ZeOcvOfNwF3ylPbK5ir1v+8I+euzH+5gE0iK0Zk3uOJgSUpgsHQwsPE9ZrU1Dfhf6KfrLNmNhAgAAHHOl4JJKRlwORIkki7KjwTsprCViydKlRILiPM7kxsGkTp8p2O1GeLHkAKFChNE3GDNRz3E+lSxsgBXOSA8ipVKvmG6rzHNSjLxF07crUJ8SsFLYuOHs2rdS1Ty24VQAXrlx1Yflpjcr3bV69VssGqzsFcLyQhPPuXdx3hF3F+ATHTUr4a9PDFzVRbsgVbc3MowxjRWxxoIKrAxxbFq1ZbeqiRMWWzvma6krSq01ryXp39OXdw2+DpduZs+p1uPHyZly8d3OYyYObfU4ctvHNpy9axxw9guvYc2eL/W5gfAX10o+b54e+NXbx8w2w/hKfPf3ww/6mO/X+WXa6TaBff+5Rt9xvqLFWYG5KPVbZe5IhlyA5vjV4HX8W+qccbRJuUBiH6dUnn4b2+SZIfvNh2I2ICiYXGYjkBeZceCzeF9GHEILmoFclatcedy9W+ICKl92IYo61+bWdbMINNuCMkFHoIQoBQgdlUCEe+B+RbV0ZWpY77jMhH2D2aGKLXHZoGwhGIuniNIgseCGca3bn5SJn1hhlk+UhWKUJb2opZYSAtmkUnS4+uKWQcnbw5phLlinRnqNJGuMR8WFKJaI9bOonjYcyqamlnOpIEFkuGuiokj+YytydQwa6EKwnxukqRqquiSNbte5KU6+oWeGWsDCKAaeSsXn2A6w1f3ZJqzaf4errGQy1Wu0704oKrafObMsjqsCwSWqssj5qxz1kyjjuqJSaO6W47cobZLjusjsvvbGum2+RBfHbr7/Z3htwYgPjGW3B3ZYLsMIGD0vuuw43PCm+E3t2EroTw6HxxWl0fPGstoEcssjw5VpyxSSnjDAVFrPshMsMwxyzyzQLavPNOKOsswQ89+zzsUDr6e3Qbs5sdBIvJ510AQAh+QQJFAADACwAAAAAgACAAAAC/pyPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzGQoEoDWplCqzWrEvrZbL8nrBKbGYbDKb0SO1mg1yu+EduZyusdvxl/3AzycB+HcXCDFogGiYoJhYuMj4uNAYSIlgCYd5KcmnGTlXyRnh+UQ6KcqFKSDg
YJqkusra4GoEGyvLQDtke4t7ClqKetDb2yp8xEu8agyslKy8PHss9AwdnTv9U219/ftWO83Nzfy9Gy4+Lt2sfW5tIE6+BrQdiwCvXs5D3/2ejr2OY58ve/68ycsh0MG9f/mqtIMGYaHBMzYSRpCoQBcJ/osSMH5q6ILjBI+bAK4QSYHkAY0eUI4smDFbmofKLKh0ZDIMTWIpYX48mGVnsYs+S4KM4VLBTUI5kQq9pbDoSpldntZbsJSlCpdZqQZtOkBiV7A3LI49qs8quoE/KZoDu7YmPqBv0faLy6/tGCQChzKkCw7u2nhunanNaxTwK7WE9wYTzHNu4cd2w/qd6BgLr8Zf0MDivCWT15hkoWjUurh04sl4TKFm4ul1E0yynVCqTRkwbtOSdlPx41uzHtWQpg7PXHzU8dDJKSyf0tzC8egY9FDPMPo6ZujasSPv3oc5+Dzcx5s/jz69+vXs27t/Dz++/Pn069u/jz+//v38BPvfKAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTai7PevPsPhuJIluaJpurKtu4Lx/JM1/aN5/rO9/4PDAqHxKLxiEwql8ym8wmNSqfUqvWKzWq33K73Cw6Lx+Sy+YxOq9fstkEgcF/gcDmFTrdH8Hi9g8/ntwAIKIhASGg4gIgoyMio9/goJynZZrk4qYb5pnnGeQA6JhrqKUZ6aPqFmsCa5dqqqgWrQCtlWytbhTuoO8WbCznr+0dMBZwp3MWLHGXbfGsM/es7fSxrvWuabYXJfVVpXBaeaEfe53ceZ0iu2CnuBk9Z7h6LXh+8jt+73+//DzCgwIEECxo8iDChwoUMGzp8CDGixIkUK1q8iDGjxg2NHDt6/AgypMiRcgoAADs=' # noqa E501 + + +async def prep_pdf(pdf, embedding_model, collection_name): + # Streamlit treats function docstrings as magic strings for user display + # Describe function via comments instead: + # Converts pdf content into chunks according to chunk size & overlap + # Vectorizes chunks for sLLM lookup + # returns `knowledge_base`, the vector DB with indexed chunks + + # Create in-memory Qdrant instance + knowledge_base = qdrant_collection(collection_name, embedding_model) + + pdf_reader = PdfReader(pdf) + + # Collect text from pdf + text = ''.join((page.extract_text() for page in pdf_reader.pages)) + + # Split the text into chunks + chunks = text_splitter( + text, + chunk_size=EMBED_CHUNK_SIZE, + chunk_overlap=EMBED_CHUNK_OVERLAP, + separator='\n' + ) + + print('\n\n'.join([ch[:100] for ch in chunks])) + # Add a new collection for this document, and upsert the chunks into it + knowledge_base.add(texts=chunks) + + return knowledge_base + + +# Schedule one task to do a long-running/blocking LLM request, +# and another to chat the PDF +async def async_main(openai_api, model, LLM_TEMP): + # Doc strings turn into streamlit headers + ''' + Oori β€” Ask your PDF πŸ“„πŸ’¬ + ''' + # create file upload box on Streamlit, set from the user's upload + pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False) + + if pdf: + # Show throbber, vectorize the PDF, and setup for similarity search + placeholder = st.empty() + with placeholder: + placeholder.image(throbber) + + # LLM will be downloaded from HuggingFace automatically + # There seem to be reentrancy issues with HuggingFace; defer import + from sentence_transformers import SentenceTransformer + embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM) + + kb = await prep_pdf(pdf, embedding_model, collection_name=pdf.name) + placeholder.empty() + + user_question = st.text_input(PDF_USER_QUESTION_PROMPT) + + embedded_question = embedding_model.encode(user_question) + docs = kb.db.search( + collection_name=kb.name, + query_vector=embedded_question, + limit=K + ) + + print(kb.name, pdf.name, docs) + if docs: + # Collects "chunked_doc" into "gathered_chunks" + gathered_chunks = '\n\n'.join( + doc.payload['_text'] for doc in docs + ) + + # Build prompt the doc chunks as context + prompt = format( + f'Given the context, {user_question}\n\n' + f'Context: """\n{gathered_chunks}\n"""\n', + preamble='### SYSTEM:\nYou are a helpful assistant, who answers ' + 'questions directly and as briefly as possible. 
' + 'If you cannot answer with the given context, just say so.\n', + delimiters=CHATGPT_DELIMITERS + ) + + print(prompt) + # Show throbber, and send LLM prompt + with st.empty(): + st.image(throbber) + response = openai_api.Completion.create( + model=model, # Model (Required) + prompt=prompt, # Prompt (Required) + temperature=LLM_TEMP, # Temp (Default 1) + max_tokens=1024 # Maximum tokens to return (Default 16) + ) + + # Response is a json-like object + print('\nFull response data from LLM:\n', response) + + # Response is a json-like object; + # just get back the text of the response + response_text = oapi_choice1_text(response).strip() + print('\nResponse text from LLM:\n', response_text) + + # Write the response text to Streamlit + st.write(response_text) + else: + st.write('No context info found') + + +def main(): + # Describing function via comments instead: + # Set up Streamlit page, LLM host connection & launch the main loop + st.set_page_config( + page_title='Ask your PDF', + page_icon='πŸ“„πŸ’¬', + layout='wide', + initial_sidebar_state='expanded', + ) + + # Use OpenAI API if specified, otherwise emulate with supplied host, etc. + if OPENAI: + assert not (LLM_HOST or LLM_PORT), 'Don\'t use --host or --port with --openai' + model = LLM + openai_api = openai_live( + model=LLM, debug=True) + else: + # For now the model param is most useful when OPENAI is True + model = LLM or HOST_DEFAULT + openai_api = openai_emulation( + host=LLM_HOST, port=LLM_PORT, model=LLM, debug=True) + print('oooh im tryin') + asyncio.run(async_main(openai_api, model, LLM_TEMP)) + + +if __name__ == '__main__': + # TODO: Look into isolating huggingface's one time per process setup routines + main() From 7b70dde316ff928974e43bb379e5f089e092a897 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Wed, 19 Jul 2023 14:30:01 -0600 Subject: [PATCH 22/39] [#16] Fix up test/test_embedding_helper.py --- pylib/embedding_helper.py | 2 +- test/test_embedding_helper.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py index 39902a4..2c53c49 100644 --- a/pylib/embedding_helper.py +++ b/pylib/embedding_helper.py @@ -75,7 +75,7 @@ def __init__(self, name, embedding_model, db=None, >>> embedding_model = SentenceTransformer('all-MiniLM-L6-v2') >>> collection = qdrant_collection('my-text', embedding_model) >>> chunks = text_splitter(text, chunk_size=20, chunk_overlap=4, separator='\n') - >>> collection.add(texts=chunks, metas=[{'seq-index': i} for (i, _) in enumerate(chunks)]) + >>> collection.update(texts=chunks, metas=[{'seq-index': i} for (i, _) in enumerate(chunks)]) >>> retval = collection.search('what does the fox say?', limit=1) retval ''' diff --git a/test/test_embedding_helper.py b/test/test_embedding_helper.py index f56c56f..3a6b73a 100644 --- a/test/test_embedding_helper.py +++ b/test/test_embedding_helper.py @@ -23,7 +23,7 @@ def CORRECT_STRING(): def test_embed_poem(mocker, COME_THUNDER_POEM, CORRECT_STRING): # LLM will be downloaded from HuggingFace automatically - # FIXME: We want to mock this instead + # FIXME: We want to mock this instead, or rather just have a fixture with the results # Split the chunks chunks = text_splitter( COME_THUNDER_POEM, @@ -45,7 +45,7 @@ def test_embed_poem(mocker, COME_THUNDER_POEM, CORRECT_STRING): # client.count.side_effect = ['count=0'] coll.db.count.side_effect = lambda collection_name: 'count=0' - coll.add(chunks, collection_name) + coll.update(chunks) coll.db.recreate_collection.assert_called_once_with( 
collection_name='test_collection', vectors_config=mock_vparam @@ -58,7 +58,7 @@ def test_embed_poem(mocker, COME_THUNDER_POEM, CORRECT_STRING): embedding_helper.models.PointStruct.side_effect = lambda id=None, vector=None, payload=None: mock_pstruct coll.db.count.reset_mock() - coll.upsert(chunks) + coll.update(chunks) # XXX: Add test with metadata coll.db.upsert.assert_called_with( From e016d760a5515026358101dcb62fb0a49c3385ee Mon Sep 17 00:00:00 2001 From: choccccy Date: Wed, 19 Jul 2023 14:30:35 -0600 Subject: [PATCH 23/39] working streamlit! --- demo/chat_pdf_streamlit_ui.py | 50 ++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 359476a..1316196 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -39,7 +39,6 @@ coroutine to be invoked twice. We might need to perhaps bundle our coroutines and `streamlit.bootstrap.run()` together in an `asyncio.gather()`,or something of the sort. ''' -import asyncio import os from dotenv import load_dotenv @@ -50,6 +49,7 @@ from ogbujipt.config import openai_emulation, openai_live, HOST_DEFAULT from ogbujipt.prompting import format, CHATGPT_DELIMITERS from ogbujipt import oapi_choice1_text +from ogbujipt.async_helper import schedule_openai_call, openai_api_surrogate from ogbujipt.text_helper import text_splitter from ogbujipt.embedding_helper import qdrant_collection @@ -171,11 +171,11 @@ def streamlit_loop(openai_api, model, LLM_TEMP): if pdf: # Only run once the program has a "pdf" loaded # Show throbber, embed the PDF, and get ready for similarity search - placeholder = st.empty() + embedding_placeholder = st.empty() # Load throbber from cache throbber = load_throbber() - placeholder.image(throbber) + embedding_placeholder.image(throbber) # Get the embedding model embedding_model = load_embedding_model(DOC_EMBEDDINGS_LLM) @@ -189,14 +189,52 @@ def streamlit_loop(openai_api, model, LLM_TEMP): kb = st.session_state['kb'] # Clear all elements in placeholder (in this case, just the throbber) - placeholder.empty() + embedding_placeholder.empty() # Get the user query user_query = st.text_input(PDF_USER_QUERY_PROMPT) if user_query: # Only run once the program has a "user_query" + response_placeholder = st.empty() + + # Load throbber from cache + throbber = load_throbber() + response_placeholder.image(throbber) + docs = kb.search(user_query, limit=K) - serg_rezulds = '\n\n'.join(doc.payload['_text'] for doc in docs if doc.payload) - st.code(serg_rezulds, language="python", line_numbers=True) + + # Collects "chunked_doc" into "gathered_chunks" + gathered_chunks = '\n\n'.join( + doc.payload['_text'] for doc in docs if doc.payload) + + # Build prompt the doc chunks as context + prompt = format( + f'Given the context, {user_query}\n\n' + f'Context: """\n{gathered_chunks}\n"""\n', + preamble='### SYSTEM:\nYou are a helpful assistant, who answers ' + 'questions directly and as briefly as possible. 
' + 'If you cannot answer with the given context, just say so.', + delimiters=CHATGPT_DELIMITERS) + + print(prompt) + + response = openai_api.Completion.create( + model=model, # Model (Required) + prompt=prompt, # Prompt (Required) + temperature=LLM_TEMP, # Temp (Default 1) + max_tokens=1024, # Max Token length of generated text (Default 16) + ) + + # Response is a json-like object; extract the text + print('\nFull response data from LLM:\n', response) + + # response is a json-like object; + # just get back the text of the response + response_text = oapi_choice1_text(response) + print('\nResponse text from LLM:\n', response_text) + + response_placeholder.write(response_text) + + user_query = None def main(): From fe923353b333447c45e3b1a1a26d6445741e56b0 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Wed, 19 Jul 2023 14:49:18 -0600 Subject: [PATCH 24/39] [#16] Merge --- demo/chat_pdf_streamlit_ui.py | 85 ++++++++++++----------------------- 1 file changed, 28 insertions(+), 57 deletions(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 359476a..b00c2bc 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -11,12 +11,8 @@ Text to vector (embedding) model: Alternatives: https://www.sbert.net/docs/pretrained_models.html / OpenAI ada002 -Single-PDF support, for now, to keep the demo code simple, -though you can easily extend it to e.g. work with multiple docs -dropped in a directory - -Based on https://github.com/wafflecomposite/langchain-ask-pdf-local -but taking advantage of OgbujiPT +Single-PDF support, for now, to keep the demo code simple. Can easily extend to +e.g. work with multiple docs dropped in a directory You need access to an OpenAI-like service. Default assumption is that you have a self-hosted framework such as llama-cpp-python or text-generation-webui @@ -34,10 +30,6 @@ ```sh streamlit run demo/chat_pdf_streamlit_ui.py ``` - -Something about Streamlit's event loop is interacting with asyncio to cause the `async_main` -coroutine to be invoked twice. We might need to perhaps bundle our coroutines and -`streamlit.bootstrap.run()` together in an `asyncio.gather()`,or something of the sort. 
''' import asyncio import os @@ -54,14 +46,9 @@ from ogbujipt.embedding_helper import qdrant_collection # LLM will be downloaded from HuggingFace automatically -# There seem to be reentrancy issues with HuggingFace; defer import from sentence_transformers import SentenceTransformer -# import zlib for crc32 checksums -import zlib - -# Avoid re-entrace complaints from huggingface/tokenizers -#os.environ['TOKENIZERS_PARALLELISM'] = 'false' +import zlib # for crc32 checksums # Load the main parameters from .env file load_dotenv() @@ -72,30 +59,24 @@ LLM = os.getenv('LLM', 'LLM') # TODO: get this from non-openai openai api hosts LLM_HOST = os.getenv('LLM_HOST', 'my-llm-host') LLM_PORT = os.getenv('LLM_PORT', '8000') -# LLM "temperature" -LLM_TEMP = float(os.getenv('LLM_TEMP', '1')) -# LLM max context size -N_CTX = int(os.getenv('N_CTX', '2048')) -# K - how many chunks to return for query context -K = int(os.getenv('K', '3')) -# Chunk size is the number of characters counted in the chunks -EMBED_CHUNK_SIZE = int(os.getenv('EMBED_CHUNK_SIZE', '500')) -# Chunk Overlap to connect ends of chunks together -EMBED_CHUNK_OVERLAP = int(os.getenv('EMBED_CHUNK_OVERLAP', '100')) -# small LM for embeddings -# default https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2 +LLM_TEMP = float(os.getenv('LLM_TEMP', '1')) # LLM temperature (randomness) +N_CTX = int(os.getenv('N_CTX', '2048')) # LLM max context size +K = int(os.getenv('K', '3')) # how many chunks to return for query context +EMBED_CHUNK_SIZE = int(os.getenv('EMBED_CHUNK_SIZE', '500')) # Character count used in slicing up the document +EMBED_CHUNK_OVERLAP = int(os.getenv('EMBED_CHUNK_OVERLAP', '100')) # Character count overlap between chunks +# LLM used for vector DB embeddings: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2 DOC_EMBEDDINGS_LLM = os.getenv('EMBED_CHUNK_OVERLAP', 'all-MiniLM-L12-v2') PDF_USER_QUERY_PROMPT = 'Ask a question about your PDF:' -# Streamlit uses caching to save on repeat loads +# Streamlit caching to save on repeat loads @st.cache_data def load_throbber(): # throbber.gif as a data URL # TODO: use a less archaic format than GIF; perhaps some sort of animatable vector image return 
'data:image/gif;base64,R0lGODlhgACAAKEDAGwZDPPVSf///wAAACH/C05FVFNDQVBFMi4wAwEAAAAh+QQFFAADACwAAAAAgACAAAACm5yPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzCY1Kp9Sq9YrNarfcrvcLDovH5LL5jE6r1+y2+w2Py+f0uv2Oz+v3/L7/DxgoOEhYaHiImKi4yNjo+AgZKTlJWWl5iZmpucnZ6fkJGio6SlpqeoqaqrrK2ur6ChsrO0tba3uLK1YAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6EGIdahIwajj+AgR6WD5iHk5STnIyaEZF1r5STk6VVp4KrHa1MqayvaKaNQ5Cxl7dYuSG7Vr8bsSDNyLVnx3/JOMnKi7PDPs1NwVzTstVg17fZb90B34vBh+9O23DVgOdq46ThvUaV4r125VnX4zfI/zq6+8TvwPXrx34gIKHMijQz9y4RYi0eSQiaWIrnpR9BXr4pj+UhrrcerobBtILmZGUltjEluYlCcTsfQ47WVFkfQOLpgjsw3Hmja1GPQpr6e0oN542jqWk8iypCdGMU3S7mlBokMJCr1pVCpAqlOtXgXqFepPRfaMSjSLdawotI3UqnTbNeFRuHzYMuOKz64LvajoGvNrginInCZfskwpc2TSjk8vao2oVR3eu2HR8U07eU+6yJgrN7287zLnTYBJlebGdnTfzFs9X52lGtdpya6/0pZ7IXbuZ7qr4tbG2jZCG+5+Czcd/Dbx47KD9wb373ngSdI/X6surA72vVC2K6WSnHlr8CPEcye/wfx38Oopz26PPDx8he/nr4ZhPzT+/PoJ0/NX9l+AFhQAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6IEXEOBAqIh1iNDY+DgZCcFYaamDmSmZuMDJeSkK6nmaQEpqaoSa+srAyuoaBDv4OUBLa8uDq6ZKydsbe4u7SVwcKguarLy8AHn1DB2dKxxXXbur7GtTuN16AP0Nsydeuupt/MuWXjnLznzcBd8pT2yuYq9b/vCPnrsx/uYBNIitGZN7jiYElKYLB0MLDxPWa1NQ34X+in6yzZjIgSMdj0Qy8vogMpjCOyavhUTYcaWLltxIpARDEgRIEze15Oyw80TPaYhkkoPJE6nKi3xojpMxNCKFoFCV4jSKwqm6HFF/PqCKoytWTVrjHRHLdEpZfGet+hwLkWRPrm4hgUWCdmA7cPlOcsnLV2BgBXPbahR8Lu7YwnjrTrr71/EpyF0AJ6YsxjI/zJUly+JsRfOIkYvdShG9waLedYcze057FbYBxkJQf13b8IptsnJN99ittnfrxsNjykbMr69LH8Cn4ibuF/noC6BXNKf+/Pfr1diFR59xHWj2qsVJH+9eunyJ8DrHC90+2ET1Cuzlu0cJPzFL78v1N+ZPfsN8DtQnx330/TedDwKy9p1q8fWw4GwIpraQgQNOaMV8BKJhIYP9xcZdE5htWCF/NXl4ooP6iQEWiQSZ+FQ36oH44BkgucgFQzj2A6M1uSl2nja4ffhWkHbouBWQIWaC5I8qAghMkUvKOKOUNSKz1j4JRmnelA0aN2WU22hJIZfSlUmYWWeayVtpZLIZHFx7rQjnnFDGaWSdNNL5pp5F5UmUn30E6qeVfC4VZqF2bonolYq2yRShjzaqn6STUrqZVJdCSoWjm/7ZKaOfOhGqqKOS2ump9lGh6gmstuqqV7BaIOesqJpqa3u45toer74qUAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTaOwDAvPs/aRpIlmYliufKsmnaxjL2vvONN3Wd9/m+8wlbQODwSCoWkUyactmMRp5PqZVRzSivXEPWu+1KvweyeGguh8/CNMLNjsHf67hsnsDbP/p8fV/SpyAIGPJnCFXogRcQ4ECoiHWI0Nj4OBkJwVhpqYOZKZm4wMl5KQrqeZpASmpqhJr6ysDK6hoEO/g5QEtry4OrpkrJ2xt7i7tJXBwqC5qsvLwAefUMHZ0rHFddu6vsa1O43XoA/Q2zJ1666m38y5ZeOcvOfNwF3ylPbK5ir1v+8I+euzH+5gE0iK0Zk3uOJgSUpgsHQwsPE9ZrU1Dfhf6KfrLNmNhAgAAHHOl4JJKRlwORIkki7KjwTsprCViydKlRILiPM7kxsGkTp8p2O1GeLHkAKFChNE3GDNRz3E+lSxsgBXOSA8ipVKvmG6rzHNSjLxF07crUJ8SsFLYuOHs2rdS1Ty24VQAXrlx1Yflpjcr3bV69VssGqzsFcLyQhPPuXdx3hF3F+ATHTUr4a9PDFzVRbsgVbc3MowxjRWxxoIKrAxxbFq1ZbeqiRMWWzvma6krSq01ryXp39OXdw2+DpduZs+p1uPHyZly8d3OYyYObfU4ctvHNpy9axxw9guvYc2eL/W5gfAX10o+b54e+NXbx8w2w/hKfPf3ww/6mO/X+WXa6TaBff+5Rt9xvqLFWYG5KPVbZe5IhlyA5vjV4HX8W+qccbRJuUBiH6dUnn4b2+SZIfvNh2I2ICiYXGYjkBeZceCzeF9GHEILmoFclatcedy9W+ICKl92IYo61+bWdbMINNuCMkFHoIQoBQgdlUCEe+B+RbV0ZWpY77jMhH2D2aGKLXHZoGwhGIuniNIgseCGca3bn5SJn1hhlk+UhWKUJb2opZYSAtmkUnS4+uKWQcnbw5phLlinRnqNJGuMR8WFKJaI9bOonjYcyqamlnOpIEFkuGuiokj+YytydQwa6EKwnxukqRqquiSNbte5KU6+oWeGWsDCKAaeSsXn2A6w1f3ZJqzaf4errGQy1Wu0704oKrafObMsjqsCwSWqssj5qxz1kyjjuqJSaO6W47cobZLjusjsvvbGum2+RBfHbr7/Z3htwYgPjGW3B3ZYLsMIGD0vuuw43PCm+E3t2EroTw6HxxWl0fPGstoEcssjw5VpyxSSnjDAVFrPshMsMwxyzyzQLavPNOKOsswQ89+zzsUDr6e3Qbs5sdBIvJ510AQAh+QQJFAADACwAAAAAgACAAAAC/pyPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzGQoEoDWplCqzWrEvrZbL8nrBKbGYbDKb0SO1mg1yu+EduZyusdvxl/3AzycB+HcXCDFogGiYoJhYuMj4uNAYSIlgCYd5KcmnGTlXyRnh+UQ6KcqFKSDg
YJqkusra4GoEGyvLQDtke4t7ClqKetDb2yp8xEu8agyslKy8PHss9AwdnTv9U219/ftWO83Nzfy9Gy4+Lt2sfW5tIE6+BrQdiwCvXs5D3/2ejr2OY58ve/68ycsh0MG9f/mqtIMGYaHBMzYSRpCoQBcJ/osSMH5q6ILjBI+bAK4QSYHkAY0eUI4smDFbmofKLKh0ZDIMTWIpYX48mGVnsYs+S4KM4VLBTUI5kQq9pbDoSpldntZbsJSlCpdZqQZtOkBiV7A3LI49qs8quoE/KZoDu7YmPqBv0faLy6/tGCQChzKkCw7u2nhunanNaxTwK7WE9wYTzHNu4cd2w/qd6BgLr8Zf0MDivCWT15hkoWjUurh04sl4TKFm4ul1E0yynVCqTRkwbtOSdlPx41uzHtWQpg7PXHzU8dDJKSyf0tzC8egY9FDPMPo6ZujasSPv3oc5+Dzcx5s/jz69+vXs27t/Dz++/Pn069u/jz+//v38BPvfKAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTai7PevPsPhuJIluaJpurKtu4Lx/JM1/aN5/rO9/4PDAqHxKLxiEwql8ym8wmNSqfUqvWKzWq33K73Cw6Lx+Sy+YxOq9fstkEgcF/gcDmFTrdH8Hi9g8/ntwAIKIhASGg4gIgoyMio9/goJynZZrk4qYb5pnnGeQA6JhrqKUZ6aPqFmsCa5dqqqgWrQCtlWytbhTuoO8WbCznr+0dMBZwp3MWLHGXbfGsM/es7fSxrvWuabYXJfVVpXBaeaEfe53ceZ0iu2CnuBk9Z7h6LXh+8jt+73+//DzCgwIEECxo8iDChwoUMGzp8CDGixIkUK1q8iDGjxg2NHDt6/AgypMiRcgoAADs=' # noqa E501 -# Streamlit uses caching to save on repeat loads +# Streamlit caching to save on repeat loads @st.cache_resource def load_embedding_model(embedding_model_name): # LLM will be downloaded from HuggingFace automatically @@ -103,14 +84,12 @@ def load_embedding_model(embedding_model_name): def prep_pdf(pdf, embedding_model, collection_name): - # Streamlit treats function docstrings as magic strings for user display - # Describe function via comments instead: + # Streamlit treats function docstrings as magic strings for user display. Use comments instead # Converts pdf content into chunks according to chunk size & overlap # Vectorizes chunks for sLLM lookup # returns `knowledge_base`, the vector DB with indexed chunks - # Create in-memory Qdrant instance - knowledge_base = qdrant_collection(collection_name, embedding_model) + knowledge_base = qdrant_collection(collection_name, embedding_model) # in-memory vector DB instance pdf_reader = PdfReader(pdf) @@ -122,13 +101,10 @@ def prep_pdf(pdf, embedding_model, collection_name): text, chunk_size=EMBED_CHUNK_SIZE, chunk_overlap=EMBED_CHUNK_OVERLAP, - separator='\n' - ) + separator='\n') - #print('\n\n'.join([ch[:100] for ch in chunks])) - # Add a new collection for this document, and insert the chunks into it + # New collection for this document, and insert the chunks into it knowledge_base.update(texts=chunks) - return knowledge_base @@ -137,8 +113,7 @@ def streamlit_loop(openai_api, model, LLM_TEMP): ''' Oori β€” Ask your PDF πŸ“„πŸ’¬ ''' - # Set up Streamlit page - st.set_page_config( + st.set_page_config( # Set up Streamlit page page_title='Ask your PDF', page_icon='πŸ“„πŸ’¬', layout='wide', @@ -153,35 +128,33 @@ def streamlit_loop(openai_api, model, LLM_TEMP): new_pdf = True # Flag to know if the new pdf needs to be embedded - else: # PDF does exist + else: # No PDF has yet been uploaded temp_pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False) # Encode PDF content and encode them for comparison pdf_new_checksum = zlib.adler32(str(temp_pdf).encode('utf-8')) pdf_old_checksum = zlib.adler32(str(st.session_state['pdf']).encode('utf-8')) - - + if pdf_new_checksum == pdf_old_checksum: # PDF is the same pdf = st.session_state['pdf'] new_pdf = False # Flag to know if the new pdf needs to be embedded - - else: # PDF is now different and needs to swap out session_state + else: # PDF is now different and needs to swap out session_state pdf, st.session_state['pdf'] = temp_pdf, temp_pdf new_pdf = True # Flag to know if the new pdf needs to be embedded - + if pdf: # Only run once the program has a "pdf" loaded # Show throbber, embed the PDF, 
and get ready for similarity search
         placeholder = st.empty()
-        
+
         # Load throbber from cache
         throbber = load_throbber()
         placeholder.image(throbber)
 
         # Get the embedding model
         embedding_model = load_embedding_model(DOC_EMBEDDINGS_LLM)
-        
-        # Prepare and embed a knowledgebase (kb) with the pdf as its contents
-        # Use st.session_state to avoid unnessisary reprocessing/reloading
+
+        # Prepare a vector knowledgebase based on the pdf contents
+        # Use st.session_state to avoid unnecessary reprocessing/reloading
         if new_pdf:
             kb = prep_pdf(pdf, embedding_model, collection_name=pdf.name)
             st.session_state['kb'] = kb
@@ -200,10 +173,9 @@ def streamlit_loop(openai_api, model, LLM_TEMP):
 
 
 def main():
-    # Streamlit treats function docstrings as magic strings for user display
-    # Describing function via comments instead:
+    # Streamlit treats function docstrings as magic strings for user display. Use comments instead
     # Set up LLM host connection & launch the main loop
-    # Use OpenAI API if specified, otherwise emulate with supplied host, etc.
+    # Use OpenAI API if specified, otherwise emulate with supplied host, etc. for self-hosted LLM
     if OPENAI:
         assert not (LLM_HOST or LLM_PORT), 'Don\'t use --host or --port with --openai'
         model = LLM
@@ -214,9 +186,8 @@ def main():
         model = LLM or HOST_DEFAULT
         openai_api = openai_emulation(
             host=LLM_HOST, port=LLM_PORT, model=LLM, debug=True)
-    
+
     streamlit_loop(openai_api, model, LLM_TEMP)
 
-# Streamlit requires a main call(?????)
-main()
\ No newline at end of file
+main()  # Code to execute

From c1452c92675efeb80dfa15e48b1110eb86eea3b4 Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Wed, 19 Jul 2023 14:54:46 -0600
Subject: [PATCH 25/39] [#16] Fix linter errors

---
 demo/chat_pdf_streamlit_ui.py | 1 -
 pylib/embedding_helper.py     | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py
index 3eeb4be..65368c4 100644
--- a/demo/chat_pdf_streamlit_ui.py
+++ b/demo/chat_pdf_streamlit_ui.py
@@ -41,7 +41,6 @@
 from ogbujipt.config import openai_emulation, openai_live, HOST_DEFAULT
 from ogbujipt.prompting import format, CHATGPT_DELIMITERS
 from ogbujipt import oapi_choice1_text
-from ogbujipt.async_helper import schedule_openai_call, openai_api_surrogate
 from ogbujipt.text_helper import text_splitter
 from ogbujipt.embedding_helper import qdrant_collection
 
diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py
index 54ad1dd..de5cb3a 100644
--- a/pylib/embedding_helper.py
+++ b/pylib/embedding_helper.py
@@ -129,7 +129,8 @@ def update(self, texts, metas=None):
             metas = [{}]*(len(texts))
         else:
             if len(texts) > len(metas):
-                warnings.warn(f'More texts ({len(texts)} provided than metadata {len(metas)}). Extra metadata items will be ignored.')
+                warnings.warn(f'More texts ({len(texts)}) provided than metadata ({len(metas)}). '
+                              'Extra metadata items will be ignored.')
                 metas = itertools.chain(metas, [{}]*(len(texts)-len(metas)))
             elif len(metas) > len(texts):
                 warnings.warn(f'Fewer texts ({len(texts)} provided than metadata {len(metas)}). '

From a7c565a8483b525065dd9b38f644728869597b32 Mon Sep 17 00:00:00 2001
From: choccccy
Date: Wed, 19 Jul 2023 17:32:05 -0600
Subject: [PATCH 26/39] oh well, can't fix it

---
 demo/chat_pdf_streamlit_ui.py | 128 +++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 56 deletions(-)

diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py
index 65368c4..310effa 100644
--- a/demo/chat_pdf_streamlit_ui.py
+++ b/demo/chat_pdf_streamlit_ui.py
@@ -49,6 +49,9 @@
 
 import zlib  # for crc32 checksums
 
+# Avoid re-entrance complaints from huggingface/tokenizers
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+
 # Load the main parameters from .env file
 load_dotenv()
 # User can set a variety of likely values to trigger use of OpenAI full-service
@@ -66,6 +69,8 @@
 # LLM used for vector DB embeddings: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2
 DOC_EMBEDDINGS_LLM = os.getenv('EMBED_CHUNK_OVERLAP', 'all-MiniLM-L12-v2')
 
+CONSOLE_WIDTH = 80
+
 PDF_USER_QUERY_PROMPT = 'Ask a question about your PDF:'
 
 # Streamlit caching to save on repeat loads
@@ -107,6 +112,52 @@ def prep_pdf(pdf, embedding_model, collection_name):
     return knowledge_base
 
 
+def query_llm(kb, openai_api, model):
+    user_query = st.session_state['user_query_str']
+
+    # Create placeholder st.empty() for throbber and LLM response
+    response_placeholder = st.empty()
+
+    # Load throbber from cache
+    throbber = load_throbber()
+    response_placeholder.image(throbber)
+
+    docs = kb.search(user_query, limit=K)
+
+    # Collect the doc chunks into "gathered_chunks"
+    gathered_chunks = '\n\n'.join(
+        doc.payload['_text'] for doc in docs if doc.payload)
+
+    # Build prompt with the doc chunks as context
+    prompt = format(
+        f'Given the context, {user_query}\n\n'
+        f'Context: """\n{gathered_chunks}\n"""\n',
+        preamble='### SYSTEM:\nYou are a helpful assistant, who answers '
+        'questions directly and as briefly as possible. '
' + 'If you cannot answer with the given context, just say so.', + delimiters=CHATGPT_DELIMITERS) + + print(' PROMPT FOR LLM: '.center(CONSOLE_WIDTH, '=')) + print(prompt) + + response = openai_api.Completion.create( + model=model, # Model (Required) + prompt=prompt, # Prompt (Required) + temperature=LLM_TEMP, # Temp (Default 1) + max_tokens=1024, # Max Token length of generated text (Default 16) + ) + + # Response is a json-like object; extract the text + print('\nFull response data from LLM:\n', response) + + # response is a json-like object; + # just get back the text of the response + response_text = oapi_choice1_text(response) + print('\nResponse text from LLM:\n', response_text) + + response_placeholder.write(response_text) + + def streamlit_loop(openai_api, model, LLM_TEMP): # Streamlit treats function docstrings as magic strings for user display ''' @@ -142,71 +193,34 @@ def streamlit_loop(openai_api, model, LLM_TEMP): new_pdf = True # Flag to know if the new pdf needs to be embedded if pdf: # Only run once the program has a "pdf" loaded - # Show throbber, embed the PDF, and get ready for similarity search - embedding_placeholder = st.empty() - - # Load throbber from cache - throbber = load_throbber() - embedding_placeholder.image(throbber) + if st.session_state['embedding_model']: + # Show throbber, embed the PDF, and get ready for similarity search + embedding_placeholder = st.container() - # Get the embedding model - embedding_model = load_embedding_model(DOC_EMBEDDINGS_LLM) + embedding_placeholder.write('Embedding PDF...') - # Prepare a vector knowledgebase based on the pdf contents - # Use st.session_state to avoid unnecessary reprocessing/reloading - if new_pdf: - kb = prep_pdf(pdf, embedding_model, collection_name=pdf.name) - st.session_state['kb'] = kb - else: - kb = st.session_state['kb'] - - # Clear all elements in placeholder (in this case, just the throbber) - embedding_placeholder.empty() - - # Get the user query - user_query = st.text_input(PDF_USER_QUERY_PROMPT) - if user_query: # Only run once the program has a "user_query" - response_placeholder = st.empty() - # Load throbber from cache throbber = load_throbber() - response_placeholder.image(throbber) - - docs = kb.search(user_query, limit=K) + embedding_placeholder.image(throbber) - # Collects "chunked_doc" into "gathered_chunks" - gathered_chunks = '\n\n'.join( - doc.payload['_text'] for doc in docs if doc.payload) + # Get the embedding model + embedding_model = load_embedding_model(embedding_model_name=DOC_EMBEDDINGS_LLM) - # Build prompt the doc chunks as context - prompt = format( - f'Given the context, {user_query}\n\n' - f'Context: """\n{gathered_chunks}\n"""\n', - preamble='### SYSTEM:\nYou are a helpful assistant, who answers ' - 'questions directly and as briefly as possible. 
' - 'If you cannot answer with the given context, just say so.', - delimiters=CHATGPT_DELIMITERS) + # Prepare a vector knowledgebase based on the pdf contents + # Use st.session_state to avoid unnecessary reprocessing/reloading + if new_pdf: + kb = prep_pdf(pdf, embedding_model, collection_name=pdf.name) + st.session_state['kb'] = kb + else: + kb = st.session_state['kb'] - print(prompt) + st.session_state['embedding_model'] = False - response = openai_api.Completion.create( - model=model, # Model (Required) - prompt=prompt, # Prompt (Required) - temperature=LLM_TEMP, # Temp (Default 1) - max_tokens=1024, # Max Token length of generated text (Default 16) - ) + # Rerun the app to hide the embedding throbber + st.experimental_rerun() - # Response is a json-like object; extract the text - print('\nFull response data from LLM:\n', response) - - # response is a json-like object; - # just get back the text of the response - response_text = oapi_choice1_text(response) - print('\nResponse text from LLM:\n', response_text) - - response_placeholder.write(response_text) - - user_query = None + # Get the user query + st.text_input(label=PDF_USER_QUERY_PROMPT, key='user_query_str', on_change=query_llm, args=(kb, openai_api, model)) def main(): @@ -223,6 +237,8 @@ def main(): model = LLM or HOST_DEFAULT openai_api = openai_emulation( host=LLM_HOST, port=LLM_PORT, model=LLM, debug=True) + + st.session_state['embedding_model'] = True streamlit_loop(openai_api, model, LLM_TEMP) From d323996fde459f4d9fe9810f9e77610c1d194ac9 Mon Sep 17 00:00:00 2001 From: choccccy Date: Wed, 19 Jul 2023 17:44:54 -0600 Subject: [PATCH 27/39] oori favicon --- demo/chat_pdf_streamlit_ui.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 310effa..d5df4c4 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -73,10 +73,15 @@ PDF_USER_QUERY_PROMPT = 'Ask a question about your PDF:' +@st.cache_data +def load_favicon(): + # oori_logo[32px].png as a data URL + return 
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAyHpUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBBDsQgCLz7in2CMqjwHLttk/3BPn9RbNM2O4kjMGREwvb97OHVQYkD5ypFS4kGVlZqFkh0tMEp8uABmpLlt3o4BbIS7IanUmb/UU+ngV/NonwxkvcUlrugPP3lYTQfQp+oT7FOI51GIBfSNGj+rVhU6vULyxbvED+h0/4m7bW8uPbMudr21mzvgGhDQjQG2AdAPwhoFlTjBLXGNGJAjDOOndhC/u3pQPgBeFtaVQmTEesAAAGDaUNDUElDQyBwcm9maWxlAAB4nH2RPUjDQBzFX1OlRSoOLSjikKF2souKONYqFKFCqBVadTC5fkKThiTFxVFwLTj4sVh1cHHW1cFVEAQ/QFxdnBRdpMT/JYUWMR4c9+PdvcfdO0Bo1Zhq9iUAVbOMTCop5vKrYuAVAYQxjBiCMjP1OUlKw3N83cPH17s4z/I+9+cYLBRNBvhE4gTTDYt4g3hm09I57xNHWEUuEJ8TTxh0QeJHrisuv3EuOyzwzIiRzcwTR4jFcg8rPcwqhko8TRwtqBrlCzmXC5y3OKu1Buvck78wVNRWlrlOcwwpLGIJEkQoaKCKGizEadVIMZGh/aSHf9TxS+RSyFUFI8cC6lAhO37wP/jdrVmamnSTQkmg/8W2P8aBwC7Qbtr297Ftt08A/zNwpXX99RYw+0l6s6tFj4ChbeDiuqspe8DlDjDypMuG7Eh+mkKpBLyf0TflgfAtMLDm9tbZx+kDkKWu0jfAwSEQK1P2use7g729/Xum098PiNlyrwXMQ1IAAA16aVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/Pgo8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA0LjQuMC1FeGl2MiI+CiA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIKICAgIHhtbG5zOnN0RXZ0PSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VFdmVudCMiCiAgICB4bWxuczpHSU1QPSJodHRwOi8vd3d3LmdpbXAub3JnL3htcC8iCiAgICB4bWxuczpkYz0iaHR0cDovL3B1cmwub3JnL2RjL2VsZW1lbnRzLzEuMS8iCiAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyIKICAgIHhtbG5zOnhtcD0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLyIKICAgeG1wTU06RG9jdW1lbnRJRD0iZ2ltcDpkb2NpZDpnaW1wOjRmNjYxN2E1LTdhMmUtNGQ3Zi1iMDljLWU4ZjVjMTg3MDI5MCIKICAgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDo3ZTJmNTIxZS02YWExLTQ4MTEtYmYyYS02NmEyZTE3NjIyODYiCiAgIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDo5YjI3YTZjMC0wNDBjLTQ0ZmYtYjJjYi05NDJlYjUxYjM4YzIiCiAgIEdJTVA6QVBJPSIyLjAiCiAgIEdJTVA6UGxhdGZvcm09Ik1hYyBPUyIKICAgR0lNUDpUaW1lU3RhbXA9IjE2ODk4MDk5Mzk0MDEyNzAiCiAgIEdJTVA6VmVyc2lvbj0iMi4xMC4zNCIKICAgZGM6Rm9ybWF0PSJpbWFnZS9wbmciCiAgIHRpZmY6T3JpZW50YXRpb249IjEiCiAgIHhtcDpDcmVhdG9yVG9vbD0iR0lNUCAyLjEwIgogICB4bXA6TWV0YWRhdGFEYXRlPSIyMDIzOjA3OjE5VDE3OjM4OjUwLTA2OjAwIgogICB4bXA6TW9kaWZ5RGF0ZT0iMjAyMzowNzoxOVQxNzozODo1MC0wNjowMCI+CiAgIDx4bXBNTTpIaXN0b3J5PgogICAgPHJkZjpTZXE+CiAgICAgPHJkZjpsaQogICAgICBzdEV2dDphY3Rpb249InNhdmVkIgogICAgICBzdEV2dDpjaGFuZ2VkPSIvIgogICAgICBzdEV2dDppbnN0YW5jZUlEPSJ4bXAuaWlkOjBhMGMxOWQ5LWQ4ZDUtNDE4MC1iNGNiLTA0MjkyMTM3Y2UzMSIKICAgICAgc3RFdnQ6c29mdHdhcmVBZ2VudD0iR2ltcCAyLjEwIChNYWMgT1MpIgogICAgICBzdEV2dDp3aGVuPSIyMDIzLTA3LTE5VDE3OjM4OjU5LTA2OjAwIi8+CiAgICA8L3JkZjpTZXE+CiAgIDwveG1wTU06SGlzdG9yeT4KICA8L3JkZjpEZXNjcmlwdGlvbj4KIDwvcmRmOlJERj4KPC94OnhtcG1ldGE+CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAg
ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgIC' + # Streamlit caching to save on repeat loads @st.cache_data def load_throbber(): - # throbber.gif as a data URL + # oori_throbber.gif as a data URL # TODO: use a less archaic format than GIF; perhaps some sort of animatable vector image return 'data:image/gif;base64,R0lGODlhgACAAKEDAGwZDPPVSf///wAAACH/C05FVFNDQVBFMi4wAwEAAAAh+QQFFAADACwAAAAAgACAAAACm5yPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzCY1Kp9Sq9YrNarfcrvcLDovH5LL5jE6r1+y2+w2Py+f0uv2Oz+v3/L7/DxgoOEhYaHiImKi4yNjo+AgZKTlJWWl5iZmpucnZ6fkJGio6SlpqeoqaqrrK2ur6ChsrO0tba3uLK1YAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6EGIdahIwajj+AgR6WD5iHk5STnIyaEZF1r5STk6VVp4KrHa1MqayvaKaNQ5Cxl7dYuSG7Vr8bsSDNyLVnx3/JOMnKi7PDPs1NwVzTstVg17fZb90B34vBh+9O23DVgOdq46ThvUaV4r125VnX4zfI/zq6+8TvwPXrx34gIKHMijQz9y4RYi0eSQiaWIrnpR9BXr4pj+UhrrcerobBtILmZGUltjEluYlCcTsfQ47WVFkfQOLpgjsw3Hmja1GPQpr6e0oN542jqWk8iypCdGMU3S7mlBokMJCr1pVCpAqlOtXgXqFepPRfaMSjSLdawotI3UqnTbNeFRuHzYMuOKz64LvajoGvNrginInCZfskwpc2TSjk8vao2oVR3eu2HR8U07eU+6yJgrN7287zLnTYBJlebGdnTfzFs9X52lGtdpya6/0pZ7IXbuZ7qr4tbG2jZCG+5+Czcd/Dbx47KD9wb373ngSdI/X6surA72vVC2K6WSnHlr8CPEcye/wfx38Oopz26PPDx8he/nr4ZhPzT+/PoJ0/NX9l+AFhQAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6IEXEOBAqIh1iNDY+DgZCcFYaamDmSmZuMDJeSkK6nmaQEpqaoSa+srAyuoaBDv4OUBLa8uDq6ZKydsbe4u7SVwcKguarLy8AHn1DB2dKxxXXbur7GtTuN16AP0Nsydeuupt/MuWXjnLznzcBd8pT2yuYq9b/vCPnrsx/uYBNIitGZN7jiYElKYLB0MLDxPWa1NQ34X+in6yzZjIgSMdj0Qy8vogMpjCOyavhUTYcaWLltxIpARDEgRIEze15Oyw80TPaYhkkoPJE6nKi3xojpMxNCKFoFCV4jSKwqm6HFF/PqCKoytWTVrjHRHLdEpZfGet+hwLkWRPrm4hgUWCdmA7cPlOcsnLV2BgBXPbahR8Lu7YwnjrTrr71/EpyF0AJ6YsxjI/zJUly+JsRfOIkYvdShG9waLedYcze057FbYBxkJQf13b8IptsnJN99ittnfrxsNjykbMr69LH8Cn4ibuF/noC6BXNKf+/Pfr1diFR59xHWj2qsVJH+9eunyJ8DrHC90+2ET1Cuzlu0cJPzFL78v1N+ZPfsN8DtQnx330/TedDwKy9p1q8fWw4GwIpraQgQNOaMV8BKJhIYP9xcZdE5htWCF/NXl4ooP6iQEWiQSZ+FQ36oH44BkgucgFQzj2A6M1uSl2nja4ffhWkHbouBWQIWaC5I8qAghMkUvKOKOUNSKz1j4JRmnelA0aN2WU22hJIZfSlUmYWWeayVtpZLIZHFx7rQjnnFDGaWSdNNL5pp5F5UmUn30E6qeVfC4VZqF2bonolYq2yRShjzaqn6STUrqZVJdCSoWjm/7ZKaOfOhGqqKOS2ump9lGh6gmstuqqV7BaIOesqJpqa3u45toer74qUAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTaOwDAvPs/aRpIlmYliufKsmnaxjL2vvONN3Wd9/m+8wlbQODwSCoWkUyactmMRp5PqZVRzSivXEPWu+1KvweyeGguh8/CNMLNjsHf67hsnsDbP/p8fV/SpyAIGPJnCFXogRcQ4ECoiHWI0Nj4OBkJwVhpqYOZKZm4wMl5KQrqeZpASmpqhJr6ysDK6hoEO/g5QEtry4OrpkrJ2xt7i7tJXBwqC5qsvLwAefUMHZ0rHFddu6vsa1O43XoA/Q2zJ1666m38y5ZeOcvOfNwF3ylPbK5ir1v+8I+euzH+5gE0iK0Zk3uOJgSUpgsHQwsPE9ZrU1Dfhf6KfrLNmNhAgAAHHOl4JJKRlwORIkki7KjwTsprCViydKlRILiPM7kxsGkTp8p2O1GeLHkAKFChNE3GDNRz3E+lSxsgBXOSA8ipVKvmG6rzHNSjLxF07crUJ8SsFLYuOHs2rdS1Ty24VQAXrlx1Yflpjcr3bV69VssGqzsFcLyQhPPuXdx3hF3F+ATHTUr4a9PDFzVRbsgVbc3MowxjRWxxoIKrAxxbFq1ZbeqiRMWWzvma6krSq01ryXp39OXdw2+DpduZs+p1uPHyZly8d3OYyYObfU4ctvHNpy9axxw9guvYc2eL/W5gfAX10o+b54e+NXbx8w2
w/hKfPf3ww/6mO/X+WXa6TaBff+5Rt9xvqLFWYG5KPVbZe5IhlyA5vjV4HX8W+qccbRJuUBiH6dUnn4b2+SZIfvNh2I2ICiYXGYjkBeZceCzeF9GHEILmoFclatcedy9W+ICKl92IYo61+bWdbMINNuCMkFHoIQoBQgdlUCEe+B+RbV0ZWpY77jMhH2D2aGKLXHZoGwhGIuniNIgseCGca3bn5SJn1hhlk+UhWKUJb2opZYSAtmkUnS4+uKWQcnbw5phLlinRnqNJGuMR8WFKJaI9bOonjYcyqamlnOpIEFkuGuiokj+YytydQwa6EKwnxukqRqquiSNbte5KU6+oWeGWsDCKAaeSsXn2A6w1f3ZJqzaf4errGQy1Wu0704oKrafObMsjqsCwSWqssj5qxz1kyjjuqJSaO6W47cobZLjusjsvvbGum2+RBfHbr7/Z3htwYgPjGW3B3ZYLsMIGD0vuuw43PCm+E3t2EroTw6HxxWl0fPGstoEcssjw5VpyxSSnjDAVFrPshMsMwxyzyzQLavPNOKOsswQ89+zzsUDr6e3Qbs5sdBIvJ510AQAh+QQJFAADACwAAAAAgACAAAAC/pyPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzGQoEoDWplCqzWrEvrZbL8nrBKbGYbDKb0SO1mg1yu+EduZyusdvxl/3AzycB+HcXCDFogGiYoJhYuMj4uNAYSIlgCYd5KcmnGTlXyRnh+UQ6KcqFKSDgYJqkusra4GoEGyvLQDtke4t7ClqKetDb2yp8xEu8agyslKy8PHss9AwdnTv9U219/ftWO83Nzfy9Gy4+Lt2sfW5tIE6+BrQdiwCvXs5D3/2ejr2OY58ve/68ycsh0MG9f/mqtIMGYaHBMzYSRpCoQBcJ/osSMH5q6ILjBI+bAK4QSYHkAY0eUI4smDFbmofKLKh0ZDIMTWIpYX48mGVnsYs+S4KM4VLBTUI5kQq9pbDoSpldntZbsJSlCpdZqQZtOkBiV7A3LI49qs8quoE/KZoDu7YmPqBv0faLy6/tGCQChzKkCw7u2nhunanNaxTwK7WE9wYTzHNu4cd2w/qd6BgLr8Zf0MDivCWT15hkoWjUurh04sl4TKFm4ul1E0yynVCqTRkwbtOSdlPx41uzHtWQpg7PXHzU8dDJKSyf0tzC8egY9FDPMPo6ZujasSPv3oc5+Dzcx5s/jz69+vXs27t/Dz++/Pn069u/jz+//v38BPvfKAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTai7PevPsPhuJIluaJpurKtu4Lx/JM1/aN5/rO9/4PDAqHxKLxiEwql8ym8wmNSqfUqvWKzWq33K73Cw6Lx+Sy+YxOq9fstkEgcF/gcDmFTrdH8Hi9g8/ntwAIKIhASGg4gIgoyMio9/goJynZZrk4qYb5pnnGeQA6JhrqKUZ6aPqFmsCa5dqqqgWrQCtlWytbhTuoO8WbCznr+0dMBZwp3MWLHGXbfGsM/es7fSxrvWuabYXJfVVpXBaeaEfe53ceZ0iu2CnuBk9Z7h6LXh+8jt+73+//DzCgwIEECxo8iDChwoUMGzp8CDGixIkUK1q8iDGjxg2NHDt6/AgypMiRcgoAADs=' # noqa E501 @@ -163,9 +168,10 @@ def streamlit_loop(openai_api, model, LLM_TEMP): ''' Oori β€” Ask your PDF πŸ“„πŸ’¬ ''' + favicon = load_favicon() st.set_page_config( # Set up Streamlit page page_title='Ask your PDF', - page_icon='πŸ“„πŸ’¬', + page_icon=favicon, layout='wide', initial_sidebar_state='expanded', ) From 303b81abb047466a475888c16862910c107c9ccb Mon Sep 17 00:00:00 2001 From: choccccy Date: Wed, 19 Jul 2023 17:45:26 -0600 Subject: [PATCH 28/39] centered page layout --- demo/chat_pdf_streamlit_ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index d5df4c4..6001ec0 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -172,7 +172,7 @@ def streamlit_loop(openai_api, model, LLM_TEMP): st.set_page_config( # Set up Streamlit page page_title='Ask your PDF', page_icon=favicon, - layout='wide', + layout='centered', initial_sidebar_state='expanded', ) From 3a8fe7fe02bf711508e82f26a4fc4b992c290b1a Mon Sep 17 00:00:00 2001 From: choccccy Date: Wed, 19 Jul 2023 17:46:14 -0600 Subject: [PATCH 29/39] noqa E501 on favicon data URL --- demo/chat_pdf_streamlit_ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 6001ec0..bdf3714 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -76,7 +76,7 @@ @st.cache_data def load_favicon(): # oori_logo[32px].png as a data URL - return 
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAyHpUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBBDsQgCLz7in2CMqjwHLttk/3BPn9RbNM2O4kjMGREwvb97OHVQYkD5ypFS4kGVlZqFkh0tMEp8uABmpLlt3o4BbIS7IanUmb/UU+ngV/NonwxkvcUlrugPP3lYTQfQp+oT7FOI51GIBfSNGj+rVhU6vULyxbvED+h0/4m7bW8uPbMudr21mzvgGhDQjQG2AdAPwhoFlTjBLXGNGJAjDOOndhC/u3pQPgBeFtaVQmTEesAAAGDaUNDUElDQyBwcm9maWxlAAB4nH2RPUjDQBzFX1OlRSoOLSjikKF2souKONYqFKFCqBVadTC5fkKThiTFxVFwLTj4sVh1cHHW1cFVEAQ/QFxdnBRdpMT/JYUWMR4c9+PdvcfdO0Bo1Zhq9iUAVbOMTCop5vKrYuAVAYQxjBiCMjP1OUlKw3N83cPH17s4z/I+9+cYLBRNBvhE4gTTDYt4g3hm09I57xNHWEUuEJ8TTxh0QeJHrisuv3EuOyzwzIiRzcwTR4jFcg8rPcwqhko8TRwtqBrlCzmXC5y3OKu1Buvck78wVNRWlrlOcwwpLGIJEkQoaKCKGizEadVIMZGh/aSHf9TxS+RSyFUFI8cC6lAhO37wP/jdrVmamnSTQkmg/8W2P8aBwC7Qbtr297Ftt08A/zNwpXX99RYw+0l6s6tFj4ChbeDiuqspe8DlDjDypMuG7Eh+mkKpBLyf0TflgfAtMLDm9tbZx+kDkKWu0jfAwSEQK1P2use7g729/Xum098PiNlyrwXMQ1IAAA16aVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/Pgo8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA0LjQuMC1FeGl2MiI+CiA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIKICAgIHhtbG5zOnN0RXZ0PSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VFdmVudCMiCiAgICB4bWxuczpHSU1QPSJodHRwOi8vd3d3LmdpbXAub3JnL3htcC8iCiAgICB4bWxuczpkYz0iaHR0cDovL3B1cmwub3JnL2RjL2VsZW1lbnRzLzEuMS8iCiAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyIKICAgIHhtbG5zOnhtcD0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLyIKICAgeG1wTU06RG9jdW1lbnRJRD0iZ2ltcDpkb2NpZDpnaW1wOjRmNjYxN2E1LTdhMmUtNGQ3Zi1iMDljLWU4ZjVjMTg3MDI5MCIKICAgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDo3ZTJmNTIxZS02YWExLTQ4MTEtYmYyYS02NmEyZTE3NjIyODYiCiAgIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDo5YjI3YTZjMC0wNDBjLTQ0ZmYtYjJjYi05NDJlYjUxYjM4YzIiCiAgIEdJTVA6QVBJPSIyLjAiCiAgIEdJTVA6UGxhdGZvcm09Ik1hYyBPUyIKICAgR0lNUDpUaW1lU3RhbXA9IjE2ODk4MDk5Mzk0MDEyNzAiCiAgIEdJTVA6VmVyc2lvbj0iMi4xMC4zNCIKICAgZGM6Rm9ybWF0PSJpbWFnZS9wbmciCiAgIHRpZmY6T3JpZW50YXRpb249IjEiCiAgIHhtcDpDcmVhdG9yVG9vbD0iR0lNUCAyLjEwIgogICB4bXA6TWV0YWRhdGFEYXRlPSIyMDIzOjA3OjE5VDE3OjM4OjUwLTA2OjAwIgogICB4bXA6TW9kaWZ5RGF0ZT0iMjAyMzowNzoxOVQxNzozODo1MC0wNjowMCI+CiAgIDx4bXBNTTpIaXN0b3J5PgogICAgPHJkZjpTZXE+CiAgICAgPHJkZjpsaQogICAgICBzdEV2dDphY3Rpb249InNhdmVkIgogICAgICBzdEV2dDpjaGFuZ2VkPSIvIgogICAgICBzdEV2dDppbnN0YW5jZUlEPSJ4bXAuaWlkOjBhMGMxOWQ5LWQ4ZDUtNDE4MC1iNGNiLTA0MjkyMTM3Y2UzMSIKICAgICAgc3RFdnQ6c29mdHdhcmVBZ2VudD0iR2ltcCAyLjEwIChNYWMgT1MpIgogICAgICBzdEV2dDp3aGVuPSIyMDIzLTA3LTE5VDE3OjM4OjU5LTA2OjAwIi8+CiAgICA8L3JkZjpTZXE+CiAgIDwveG1wTU06SGlzdG9yeT4KICA8L3JkZjpEZXNjcmlwdGlvbj4KIDwvcmRmOlJERj4KPC94OnhtcG1ldGE+CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAg
ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgIC' + return 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAyHpUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBBDsQgCLz7in2CMqjwHLttk/3BPn9RbNM2O4kjMGREwvb97OHVQYkD5ypFS4kGVlZqFkh0tMEp8uABmpLlt3o4BbIS7IanUmb/UU+ngV/NonwxkvcUlrugPP3lYTQfQp+oT7FOI51GIBfSNGj+rVhU6vULyxbvED+h0/4m7bW8uPbMudr21mzvgGhDQjQG2AdAPwhoFlTjBLXGNGJAjDOOndhC/u3pQPgBeFtaVQmTEesAAAGDaUNDUElDQyBwcm9maWxlAAB4nH2RPUjDQBzFX1OlRSoOLSjikKF2souKONYqFKFCqBVadTC5fkKThiTFxVFwLTj4sVh1cHHW1cFVEAQ/QFxdnBRdpMT/JYUWMR4c9+PdvcfdO0Bo1Zhq9iUAVbOMTCop5vKrYuAVAYQxjBiCMjP1OUlKw3N83cPH17s4z/I+9+cYLBRNBvhE4gTTDYt4g3hm09I57xNHWEUuEJ8TTxh0QeJHrisuv3EuOyzwzIiRzcwTR4jFcg8rPcwqhko8TRwtqBrlCzmXC5y3OKu1Buvck78wVNRWlrlOcwwpLGIJEkQoaKCKGizEadVIMZGh/aSHf9TxS+RSyFUFI8cC6lAhO37wP/jdrVmamnSTQkmg/8W2P8aBwC7Qbtr297Ftt08A/zNwpXX99RYw+0l6s6tFj4ChbeDiuqspe8DlDjDypMuG7Eh+mkKpBLyf0TflgfAtMLDm9tbZx+kDkKWu0jfAwSEQK1P2use7g729/Xum098PiNlyrwXMQ1IAAA16aVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/Pgo8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA0LjQuMC1FeGl2MiI+CiA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIKICAgIHhtbG5zOnN0RXZ0PSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VFdmVudCMiCiAgICB4bWxuczpHSU1QPSJodHRwOi8vd3d3LmdpbXAub3JnL3htcC8iCiAgICB4bWxuczpkYz0iaHR0cDovL3B1cmwub3JnL2RjL2VsZW1lbnRzLzEuMS8iCiAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyIKICAgIHhtbG5zOnhtcD0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLyIKICAgeG1wTU06RG9jdW1lbnRJRD0iZ2ltcDpkb2NpZDpnaW1wOjRmNjYxN2E1LTdhMmUtNGQ3Zi1iMDljLWU4ZjVjMTg3MDI5MCIKICAgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDo3ZTJmNTIxZS02YWExLTQ4MTEtYmYyYS02NmEyZTE3NjIyODYiCiAgIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDo5YjI3YTZjMC0wNDBjLTQ0ZmYtYjJjYi05NDJlYjUxYjM4YzIiCiAgIEdJTVA6QVBJPSIyLjAiCiAgIEdJTVA6UGxhdGZvcm09Ik1hYyBPUyIKICAgR0lNUDpUaW1lU3RhbXA9IjE2ODk4MDk5Mzk0MDEyNzAiCiAgIEdJTVA6VmVyc2lvbj0iMi4xMC4zNCIKICAgZGM6Rm9ybWF0PSJpbWFnZS9wbmciCiAgIHRpZmY6T3JpZW50YXRpb249IjEiCiAgIHhtcDpDcmVhdG9yVG9vbD0iR0lNUCAyLjEwIgogICB4bXA6TWV0YWRhdGFEYXRlPSIyMDIzOjA3OjE5VDE3OjM4OjUwLTA2OjAwIgogICB4bXA6TW9kaWZ5RGF0ZT0iMjAyMzowNzoxOVQxNzozODo1MC0wNjowMCI+CiAgIDx4bXBNTTpIaXN0b3J5PgogICAgPHJkZjpTZXE+CiAgICAgPHJkZjpsaQogICAgICBzdEV2dDphY3Rpb249InNhdmVkIgogICAgICBzdEV2dDpjaGFuZ2VkPSIvIgogICAgICBzdEV2dDppbnN0YW5jZUlEPSJ4bXAuaWlkOjBhMGMxOWQ5LWQ4ZDUtNDE4MC1iNGNiLTA0MjkyMTM3Y2UzMSIKICAgICAgc3RFdnQ6c29mdHdhcmVBZ2VudD0iR2ltcCAyLjEwIChNYWMgT1MpIgogICAgICBzdEV2dDp3aGVuPSIyMDIzLTA3LTE5VDE3OjM4OjU5LTA2OjAwIi8+CiAgICA8L3JkZjpTZXE+CiAgIDwveG1wTU06SGlzdG9yeT4KICA8L3JkZjpEZXNjcmlwdGlvbj4KIDwvcmRmOlJERj4KPC94OnhtcG1ldGE+CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICA
gIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgIC' # noqa E501 # Streamlit caching to save on repeat loads @st.cache_data From ce4fa27bb117a887be9179aa3d6b1d1cae7f8e9f Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Wed, 19 Jul 2023 19:38:42 -0600 Subject: [PATCH 30/39] [#16] Preliminary code review --- demo/chat_pdf_streamlit_ui.py | 54 +++++++++++++---------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index bdf3714..7f9f7ca 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -44,7 +44,6 @@ from ogbujipt.text_helper import text_splitter from ogbujipt.embedding_helper import qdrant_collection -# LLM will be downloaded from HuggingFace automatically from sentence_transformers import SentenceTransformer import zlib # for crc32 checksums @@ -66,26 +65,27 @@ K = int(os.getenv('K', '3')) # how many chunks to return for query context EMBED_CHUNK_SIZE = int(os.getenv('EMBED_CHUNK_SIZE', '500')) # Character count used in slicing up the document EMBED_CHUNK_OVERLAP = int(os.getenv('EMBED_CHUNK_OVERLAP', '100')) # Character count overlap between chunks -# LLM used for vector DB embeddings: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2 +# LLM for vector DB embeddings; will be d/led: https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2 DOC_EMBEDDINGS_LLM = os.getenv('EMBED_CHUNK_OVERLAP', 'all-MiniLM-L12-v2') CONSOLE_WIDTH = 80 PDF_USER_QUERY_PROMPT = 'Ask a question about your PDF:' -@st.cache_data + +@st.cache_data # Streamlit caching to save on repeat loads def load_favicon(): # oori_logo[32px].png as a data URL return 
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAyHpUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBBDsQgCLz7in2CMqjwHLttk/3BPn9RbNM2O4kjMGREwvb97OHVQYkD5ypFS4kGVlZqFkh0tMEp8uABmpLlt3o4BbIS7IanUmb/UU+ngV/NonwxkvcUlrugPP3lYTQfQp+oT7FOI51GIBfSNGj+rVhU6vULyxbvED+h0/4m7bW8uPbMudr21mzvgGhDQjQG2AdAPwhoFlTjBLXGNGJAjDOOndhC/u3pQPgBeFtaVQmTEesAAAGDaUNDUElDQyBwcm9maWxlAAB4nH2RPUjDQBzFX1OlRSoOLSjikKF2souKONYqFKFCqBVadTC5fkKThiTFxVFwLTj4sVh1cHHW1cFVEAQ/QFxdnBRdpMT/JYUWMR4c9+PdvcfdO0Bo1Zhq9iUAVbOMTCop5vKrYuAVAYQxjBiCMjP1OUlKw3N83cPH17s4z/I+9+cYLBRNBvhE4gTTDYt4g3hm09I57xNHWEUuEJ8TTxh0QeJHrisuv3EuOyzwzIiRzcwTR4jFcg8rPcwqhko8TRwtqBrlCzmXC5y3OKu1Buvck78wVNRWlrlOcwwpLGIJEkQoaKCKGizEadVIMZGh/aSHf9TxS+RSyFUFI8cC6lAhO37wP/jdrVmamnSTQkmg/8W2P8aBwC7Qbtr297Ftt08A/zNwpXX99RYw+0l6s6tFj4ChbeDiuqspe8DlDjDypMuG7Eh+mkKpBLyf0TflgfAtMLDm9tbZx+kDkKWu0jfAwSEQK1P2use7g729/Xum098PiNlyrwXMQ1IAAA16aVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/Pgo8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA0LjQuMC1FeGl2MiI+CiA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIKICAgIHhtbG5zOnN0RXZ0PSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VFdmVudCMiCiAgICB4bWxuczpHSU1QPSJodHRwOi8vd3d3LmdpbXAub3JnL3htcC8iCiAgICB4bWxuczpkYz0iaHR0cDovL3B1cmwub3JnL2RjL2VsZW1lbnRzLzEuMS8iCiAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyIKICAgIHhtbG5zOnhtcD0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLyIKICAgeG1wTU06RG9jdW1lbnRJRD0iZ2ltcDpkb2NpZDpnaW1wOjRmNjYxN2E1LTdhMmUtNGQ3Zi1iMDljLWU4ZjVjMTg3MDI5MCIKICAgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDo3ZTJmNTIxZS02YWExLTQ4MTEtYmYyYS02NmEyZTE3NjIyODYiCiAgIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDo5YjI3YTZjMC0wNDBjLTQ0ZmYtYjJjYi05NDJlYjUxYjM4YzIiCiAgIEdJTVA6QVBJPSIyLjAiCiAgIEdJTVA6UGxhdGZvcm09Ik1hYyBPUyIKICAgR0lNUDpUaW1lU3RhbXA9IjE2ODk4MDk5Mzk0MDEyNzAiCiAgIEdJTVA6VmVyc2lvbj0iMi4xMC4zNCIKICAgZGM6Rm9ybWF0PSJpbWFnZS9wbmciCiAgIHRpZmY6T3JpZW50YXRpb249IjEiCiAgIHhtcDpDcmVhdG9yVG9vbD0iR0lNUCAyLjEwIgogICB4bXA6TWV0YWRhdGFEYXRlPSIyMDIzOjA3OjE5VDE3OjM4OjUwLTA2OjAwIgogICB4bXA6TW9kaWZ5RGF0ZT0iMjAyMzowNzoxOVQxNzozODo1MC0wNjowMCI+CiAgIDx4bXBNTTpIaXN0b3J5PgogICAgPHJkZjpTZXE+CiAgICAgPHJkZjpsaQogICAgICBzdEV2dDphY3Rpb249InNhdmVkIgogICAgICBzdEV2dDpjaGFuZ2VkPSIvIgogICAgICBzdEV2dDppbnN0YW5jZUlEPSJ4bXAuaWlkOjBhMGMxOWQ5LWQ4ZDUtNDE4MC1iNGNiLTA0MjkyMTM3Y2UzMSIKICAgICAgc3RFdnQ6c29mdHdhcmVBZ2VudD0iR2ltcCAyLjEwIChNYWMgT1MpIgogICAgICBzdEV2dDp3aGVuPSIyMDIzLTA3LTE5VDE3OjM4OjU5LTA2OjAwIi8+CiAgICA8L3JkZjpTZXE+CiAgIDwveG1wTU06SGlzdG9yeT4KICA8L3JkZjpEZXNjcmlwdGlvbj4KIDwvcmRmOlJERj4KPC94OnhtcG1ldGE+CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAg
ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgIC' # noqa E501 -# Streamlit caching to save on repeat loads + @st.cache_data def load_throbber(): # oori_throbber.gif as a data URL # TODO: use a less archaic format than GIF; perhaps some sort of animatable vector image return 'data:image/gif;base64,R0lGODlhgACAAKEDAGwZDPPVSf///wAAACH/C05FVFNDQVBFMi4wAwEAAAAh+QQFFAADACwAAAAAgACAAAACm5yPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzCY1Kp9Sq9YrNarfcrvcLDovH5LL5jE6r1+y2+w2Py+f0uv2Oz+v3/L7/DxgoOEhYaHiImKi4yNjo+AgZKTlJWWl5iZmpucnZ6fkJGio6SlpqeoqaqrrK2ur6ChsrO0tba3uLK1YAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6EGIdahIwajj+AgR6WD5iHk5STnIyaEZF1r5STk6VVp4KrHa1MqayvaKaNQ5Cxl7dYuSG7Vr8bsSDNyLVnx3/JOMnKi7PDPs1NwVzTstVg17fZb90B34vBh+9O23DVgOdq46ThvUaV4r125VnX4zfI/zq6+8TvwPXrx34gIKHMijQz9y4RYi0eSQiaWIrnpR9BXr4pj+UhrrcerobBtILmZGUltjEluYlCcTsfQ47WVFkfQOLpgjsw3Hmja1GPQpr6e0oN542jqWk8iypCdGMU3S7mlBokMJCr1pVCpAqlOtXgXqFepPRfaMSjSLdawotI3UqnTbNeFRuHzYMuOKz64LvajoGvNrginInCZfskwpc2TSjk8vao2oVR3eu2HR8U07eU+6yJgrN7287zLnTYBJlebGdnTfzFs9X52lGtdpya6/0pZ7IXbuZ7qr4tbG2jZCG+5+Czcd/Dbx47KD9wb373ngSdI/X6surA72vVC2K6WSnHlr8CPEcye/wfx38Oopz26PPDx8he/nr4ZhPzT+/PoJ0/NX9l+AFhQAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6IEXEOBAqIh1iNDY+DgZCcFYaamDmSmZuMDJeSkK6nmaQEpqaoSa+srAyuoaBDv4OUBLa8uDq6ZKydsbe4u7SVwcKguarLy8AHn1DB2dKxxXXbur7GtTuN16AP0Nsydeuupt/MuWXjnLznzcBd8pT2yuYq9b/vCPnrsx/uYBNIitGZN7jiYElKYLB0MLDxPWa1NQ34X+in6yzZjIgSMdj0Qy8vogMpjCOyavhUTYcaWLltxIpARDEgRIEze15Oyw80TPaYhkkoPJE6nKi3xojpMxNCKFoFCV4jSKwqm6HFF/PqCKoytWTVrjHRHLdEpZfGet+hwLkWRPrm4hgUWCdmA7cPlOcsnLV2BgBXPbahR8Lu7YwnjrTrr71/EpyF0AJ6YsxjI/zJUly+JsRfOIkYvdShG9waLedYcze057FbYBxkJQf13b8IptsnJN99ittnfrxsNjykbMr69LH8Cn4ibuF/noC6BXNKf+/Pfr1diFR59xHWj2qsVJH+9eunyJ8DrHC90+2ET1Cuzlu0cJPzFL78v1N+ZPfsN8DtQnx330/TedDwKy9p1q8fWw4GwIpraQgQNOaMV8BKJhIYP9xcZdE5htWCF/NXl4ooP6iQEWiQSZ+FQ36oH44BkgucgFQzj2A6M1uSl2nja4ffhWkHbouBWQIWaC5I8qAghMkUvKOKOUNSKz1j4JRmnelA0aN2WU22hJIZfSlUmYWWeayVtpZLIZHFx7rQjnnFDGaWSdNNL5pp5F5UmUn30E6qeVfC4VZqF2bonolYq2yRShjzaqn6STUrqZVJdCSoWjm/7ZKaOfOhGqqKOS2ump9lGh6gmstuqqV7BaIOesqJpqa3u45toer74qUAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTaOwDAvPs/aRpIlmYliufKsmnaxjL2vvONN3Wd9/m+8wlbQODwSCoWkUyactmMRp5PqZVRzSivXEPWu+1KvweyeGguh8/CNMLNjsHf67hsnsDbP/p8fV/SpyAIGPJnCFXogRcQ4ECoiHWI0Nj4OBkJwVhpqYOZKZm4wMl5KQrqeZpASmpqhJr6ysDK6hoEO/g5QEtry4OrpkrJ2xt7i7tJXBwqC5qsvLwAefUMHZ0rHFddu6vsa1O43XoA/Q2zJ1666m38y5ZeOcvOfNwF3ylPbK5ir1v+8I+euzH+5gE0iK0Zk3uOJgSUpgsHQwsPE9ZrU1Dfhf6KfrLNmNhAgAAHHOl4JJKRlwORIkki7KjwTsprCViydKlRILiPM7kxsGkTp8p2O1GeLHkAKFChNE3GDNRz3E+lSxsgBXOSA8ipVKvmG6rzHNSjLxF07crUJ8SsFLYuOHs2rdS1Ty24VQAXrlx1Yflpjcr3bV69VssGqzsFcLyQhPPuXdx3hF3F+ATHTUr4a9PDFzVRbsgVbc3MowxjRWxxoIKrAxxbFq1ZbeqiRMWWzvma6krSq01ryXp39OXdw2+DpduZs+p1uPHyZly8d3OYyYObfU4ctvHNpy9axxw9guvYc2eL/W5gfAX10o+b54e+NXbx8w2w/hKfPf3ww/6mO/X+WXa
6TaBff+5Rt9xvqLFWYG5KPVbZe5IhlyA5vjV4HX8W+qccbRJuUBiH6dUnn4b2+SZIfvNh2I2ICiYXGYjkBeZceCzeF9GHEILmoFclatcedy9W+ICKl92IYo61+bWdbMINNuCMkFHoIQoBQgdlUCEe+B+RbV0ZWpY77jMhH2D2aGKLXHZoGwhGIuniNIgseCGca3bn5SJn1hhlk+UhWKUJb2opZYSAtmkUnS4+uKWQcnbw5phLlinRnqNJGuMR8WFKJaI9bOonjYcyqamlnOpIEFkuGuiokj+YytydQwa6EKwnxukqRqquiSNbte5KU6+oWeGWsDCKAaeSsXn2A6w1f3ZJqzaf4errGQy1Wu0704oKrafObMsjqsCwSWqssj5qxz1kyjjuqJSaO6W47cobZLjusjsvvbGum2+RBfHbr7/Z3htwYgPjGW3B3ZYLsMIGD0vuuw43PCm+E3t2EroTw6HxxWl0fPGstoEcssjw5VpyxSSnjDAVFrPshMsMwxyzyzQLavPNOKOsswQ89+zzsUDr6e3Qbs5sdBIvJ510AQAh+QQJFAADACwAAAAAgACAAAAC/pyPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzGQoEoDWplCqzWrEvrZbL8nrBKbGYbDKb0SO1mg1yu+EduZyusdvxl/3AzycB+HcXCDFogGiYoJhYuMj4uNAYSIlgCYd5KcmnGTlXyRnh+UQ6KcqFKSDgYJqkusra4GoEGyvLQDtke4t7ClqKetDb2yp8xEu8agyslKy8PHss9AwdnTv9U219/ftWO83Nzfy9Gy4+Lt2sfW5tIE6+BrQdiwCvXs5D3/2ejr2OY58ve/68ycsh0MG9f/mqtIMGYaHBMzYSRpCoQBcJ/osSMH5q6ILjBI+bAK4QSYHkAY0eUI4smDFbmofKLKh0ZDIMTWIpYX48mGVnsYs+S4KM4VLBTUI5kQq9pbDoSpldntZbsJSlCpdZqQZtOkBiV7A3LI49qs8quoE/KZoDu7YmPqBv0faLy6/tGCQChzKkCw7u2nhunanNaxTwK7WE9wYTzHNu4cd2w/qd6BgLr8Zf0MDivCWT15hkoWjUurh04sl4TKFm4ul1E0yynVCqTRkwbtOSdlPx41uzHtWQpg7PXHzU8dDJKSyf0tzC8egY9FDPMPo6ZujasSPv3oc5+Dzcx5s/jz69+vXs27t/Dz++/Pn069u/jz+//v38BPvfKAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTai7PevPsPhuJIluaJpurKtu4Lx/JM1/aN5/rO9/4PDAqHxKLxiEwql8ym8wmNSqfUqvWKzWq33K73Cw6Lx+Sy+YxOq9fstkEgcF/gcDmFTrdH8Hi9g8/ntwAIKIhASGg4gIgoyMio9/goJynZZrk4qYb5pnnGeQA6JhrqKUZ6aPqFmsCa5dqqqgWrQCtlWytbhTuoO8WbCznr+0dMBZwp3MWLHGXbfGsM/es7fSxrvWuabYXJfVVpXBaeaEfe53ceZ0iu2CnuBk9Z7h6LXh+8jt+73+//DzCgwIEECxo8iDChwoUMGzp8CDGixIkUK1q8iDGjxg2NHDt6/AgypMiRcgoAADs=' # noqa E501 -# Streamlit caching to save on repeat loads + @st.cache_resource def load_embedding_model(embedding_model_name): # LLM will be downloaded from HuggingFace automatically @@ -96,42 +96,35 @@ def prep_pdf(pdf, embedding_model, collection_name): # Streamlit treats function docstrings as magic strings for user display. 
Use comments instead
     # Converts pdf content into chunks according to chunk size & overlap
     # Vectorizes chunks for sLLM lookup
-    # returns `knowledge_base`, the vector DB with indexed chunks
-
-    knowledge_base = qdrant_collection(collection_name, embedding_model)  # in-memory vector DB instance
+    kb = qdrant_collection(collection_name, embedding_model)  # in-memory vector DB instance
 
+    # Load PDF & collect its text & split it into chunks
     pdf_reader = PdfReader(pdf)
-
-    # Collect text from pdf
     text = ''.join((page.extract_text() for page in pdf_reader.pages))
-
-    # Split the text into chunks
     chunks = text_splitter(
         text,
         chunk_size=EMBED_CHUNK_SIZE,
         chunk_overlap=EMBED_CHUNK_OVERLAP,
         separator='\n')
 
-    # New collection for this document, and insert the chunks into it
-    knowledge_base.update(texts=chunks)
-    return knowledge_base
+    # Update vector DB collection, insert the text chunks & update app state
+    kb.update(texts=chunks)
+    st.session_state['kb'] = kb  # Update state
 
 
 def query_llm(kb, openai_api, model):
     user_query = st.session_state['user_query_str']
 
-    # Create placeholder st.empty() for throbber and LLM response
+    # Placeholder for throbber & LLM response
     response_placeholder = st.empty()
 
-    # Load throbber from cache
     throbber = load_throbber()
     response_placeholder.image(throbber)
 
     docs = kb.search(user_query, limit=K)
 
-    # Collect the doc chunks into "gathered_chunks"
-    gathered_chunks = '\n\n'.join(
-        doc.payload['_text'] for doc in docs if doc.payload)
+    # Concatenate text chunks for insertion into prompt
+    gathered_chunks = '\n\n'.join(doc.payload['_text'] for doc in docs if doc.payload)
 
     # Build prompt with the doc chunks as context
     prompt = format(
@@ -142,21 +135,14 @@ def query_llm(kb, openai_api, model):
         'If you cannot answer with the given context, just say so.',
         delimiters=CHATGPT_DELIMITERS)
 
-    print(' PROMPT FOR LLM: '.center(CONSOLE_WIDTH, '='))
-    print(prompt)
+    print(' PROMPT FOR LLM: '.center(CONSOLE_WIDTH, '='), '\n', prompt)
 
-    response = openai_api.Completion.create(
-        model=model,  # Model (Required)
-        prompt=prompt,  # Prompt (Required)
-        temperature=LLM_TEMP,  # Temp (Default 1)
-        max_tokens=1024,  # Max Token length of generated text (Default 16)
-        )
+    # Remember: max token length of the generated response defaults to just 16
+    response = openai_api.Completion.create(model=model, prompt=prompt, temperature=LLM_TEMP, max_tokens=1024)
 
-    # Response is a json-like object; extract the text
     print('\nFull response data from LLM:\n', response)
 
-    # response is a json-like object;
-    # just get back the text of the response
+    # Response is a json-like object; extract the text
     response_text = oapi_choice1_text(response)
     print('\nResponse text from LLM:\n', response_text)
 
@@ -164,7 +150,6 @@ def query_llm(kb, openai_api, model):
     response_placeholder.write(response_text)
 
 
 def streamlit_loop(openai_api, model, LLM_TEMP):
-    # Streamlit treats function docstrings as magic strings for user display
     '''
     Oori — Ask your PDF 📄💬
     '''
@@ -182,13 +167,14 @@ def streamlit_loop(openai_api, model, LLM_TEMP):
         pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False)
         st.session_state['pdf'] = pdf
 
-        new_pdf = True  # Flag to know if the new pdf needs to be embedded
+        new_pdf = True  # Are embeddings needed for the pdf?
else: # No PDF has yet been uploaded temp_pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False) - # Encode PDF content and encode them for comparison + # Encode PDF content and compare CRCs to see if it's changed pdf_new_checksum = zlib.adler32(str(temp_pdf).encode('utf-8')) + # FIXME: Store the old checksum in state, so it needn't be recomputed each time pdf_old_checksum = zlib.adler32(str(st.session_state['pdf']).encode('utf-8')) if pdf_new_checksum == pdf_old_checksum: # PDF is the same From 254bd2f5aeafac3ae9514c3b38b6bb0fcddcf887 Mon Sep 17 00:00:00 2001 From: choccccy Date: Thu, 20 Jul 2023 11:06:25 -0600 Subject: [PATCH 31/39] add aidan and osi to pyproject.toml --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 47923e3..1d8a341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,8 @@ license = "Apache-2.0" keywords = [] authors = [ { name = "Uche Ogbuji", email = "uche@ogbuji.net" }, + { name = "Osi Ogbuji", email = "osita@ogbuji.net" }, + { name = "Aidan Reese", email = "aidanreese.professional@gmail.com" }, ] classifiers = [ "Development Status :: 4 - Beta", From 4156f59605a4aa5eafc44725e30904669caa408f Mon Sep 17 00:00:00 2001 From: choccccy Date: Thu, 20 Jul 2023 11:06:43 -0600 Subject: [PATCH 32/39] add aidan and osi to the pyproject.toml --- demo/chat_pdf_streamlit_ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index bdf3714..5cd1b25 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -76,7 +76,7 @@ @st.cache_data def load_favicon(): # oori_logo[32px].png as a data URL - return 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAyHpUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBBDsQgCLz7in2CMqjwHLttk/3BPn9RbNM2O4kjMGREwvb97OHVQYkD5ypFS4kGVlZqFkh0tMEp8uABmpLlt3o4BbIS7IanUmb/UU+ngV/NonwxkvcUlrugPP3lYTQfQp+oT7FOI51GIBfSNGj+rVhU6vULyxbvED+h0/4m7bW8uPbMudr21mzvgGhDQjQG2AdAPwhoFlTjBLXGNGJAjDOOndhC/u3pQPgBeFtaVQmTEesAAAGDaUNDUElDQyBwcm9maWxlAAB4nH2RPUjDQBzFX1OlRSoOLSjikKF2souKONYqFKFCqBVadTC5fkKThiTFxVFwLTj4sVh1cHHW1cFVEAQ/QFxdnBRdpMT/JYUWMR4c9+PdvcfdO0Bo1Zhq9iUAVbOMTCop5vKrYuAVAYQxjBiCMjP1OUlKw3N83cPH17s4z/I+9+cYLBRNBvhE4gTTDYt4g3hm09I57xNHWEUuEJ8TTxh0QeJHrisuv3EuOyzwzIiRzcwTR4jFcg8rPcwqhko8TRwtqBrlCzmXC5y3OKu1Buvck78wVNRWlrlOcwwpLGIJEkQoaKCKGizEadVIMZGh/aSHf9TxS+RSyFUFI8cC6lAhO37wP/jdrVmamnSTQkmg/8W2P8aBwC7Qbtr297Ftt08A/zNwpXX99RYw+0l6s6tFj4ChbeDiuqspe8DlDjDypMuG7Eh+mkKpBLyf0TflgfAtMLDm9tbZx+kDkKWu0jfAwSEQK1P2use7g729/Xum098PiNlyrwXMQ1IAAA16aVRYdFhNTDpjb20uYWRvYmUueG1wAAAAAAA8P3hwYWNrZXQgYmVnaW49Iu+7vyIgaWQ9Ilc1TTBNcENlaGlIenJlU3pOVGN6a2M5ZCI/Pgo8eDp4bXBtZXRhIHhtbG5zOng9ImFkb2JlOm5zOm1ldGEvIiB4OnhtcHRrPSJYTVAgQ29yZSA0LjQuMC1FeGl2MiI+CiA8cmRmOlJERiB4bWxuczpyZGY9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkvMDIvMjItcmRmLXN5bnRheC1ucyMiPgogIDxyZGY6RGVzY3JpcHRpb24gcmRmOmFib3V0PSIiCiAgICB4bWxuczp4bXBNTT0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wL21tLyIKICAgIHhtbG5zOnN0RXZ0PSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VFdmVudCMiCiAgICB4bWxuczpHSU1QPSJodHRwOi8vd3d3LmdpbXAub3JnL3htcC8iCiAgICB4bWxuczpkYz0iaHR0cDovL3B1cmwub3JnL2RjL2VsZW1lbnRzLzEuMS8iCiAgICB4bWxuczp0aWZmPSJodHRwOi8vbnMuYWRvYmUuY29tL3RpZmYvMS4wLyIKICAgIHhtbG5zOnhtcD0iaHR0cDovL25zLmFkb2JlLmNvbS94YXAvMS4wLyIKICAgeG1wTU06RG9jdW1lbnRJRD0iZ2ltcDpkb2NpZDpnaW1wOjRmNjYxN2E1LTdhMmUtNGQ3Zi1iMDljLWU4ZjVjMTg3MDI5MCIKICAgeG1wTU06SW5zdGFuY2VJRD0ieG1wLmlpZDo3ZTJmNTIxZS02YWExLTQ4MTEtYmYyYS02NmEyZTE3NjIyODYiCiAgIHhtcE1NOk9yaWdpbm
FsRG9jdW1lbnRJRD0ieG1wLmRpZDo5YjI3YTZjMC0wNDBjLTQ0ZmYtYjJjYi05NDJlYjUxYjM4YzIiCiAgIEdJTVA6QVBJPSIyLjAiCiAgIEdJTVA6UGxhdGZvcm09Ik1hYyBPUyIKICAgR0lNUDpUaW1lU3RhbXA9IjE2ODk4MDk5Mzk0MDEyNzAiCiAgIEdJTVA6VmVyc2lvbj0iMi4xMC4zNCIKICAgZGM6Rm9ybWF0PSJpbWFnZS9wbmciCiAgIHRpZmY6T3JpZW50YXRpb249IjEiCiAgIHhtcDpDcmVhdG9yVG9vbD0iR0lNUCAyLjEwIgogICB4bXA6TWV0YWRhdGFEYXRlPSIyMDIzOjA3OjE5VDE3OjM4OjUwLTA2OjAwIgogICB4bXA6TW9kaWZ5RGF0ZT0iMjAyMzowNzoxOVQxNzozODo1MC0wNjowMCI+CiAgIDx4bXBNTTpIaXN0b3J5PgogICAgPHJkZjpTZXE+CiAgICAgPHJkZjpsaQogICAgICBzdEV2dDphY3Rpb249InNhdmVkIgogICAgICBzdEV2dDpjaGFuZ2VkPSIvIgogICAgICBzdEV2dDppbnN0YW5jZUlEPSJ4bXAuaWlkOjBhMGMxOWQ5LWQ4ZDUtNDE4MC1iNGNiLTA0MjkyMTM3Y2UzMSIKICAgICAgc3RFdnQ6c29mdHdhcmVBZ2VudD0iR2ltcCAyLjEwIChNYWMgT1MpIgogICAgICBzdEV2dDp3aGVuPSIyMDIzLTA3LTE5VDE3OjM4OjU5LTA2OjAwIi8+CiAgICA8L3JkZjpTZXE+CiAgIDwveG1wTU06SGlzdG9yeT4KICA8L3JkZjpEZXNjcmlwdGlvbj4KIDwvcmRmOlJERj4KPC94OnhtcG1ldGE+CiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgIC' # noqa E501 + return 
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAx3pUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBbDsMwCPvPKXaEBMiD46RrK+0GO/6cQKu2mqU4gJFDCNv3s4fXACUJkmsrWkoEREWpI2jR0CenKJMnyCXkt3o4BUKJcbOlrXj/UU+ngV0dUb4YtbcLy11Qcf/2MPKHeEw0pljdSN2IyYTkBt2+FYu2ev3CssU7mp0waH+TjlpeTHvmUrG9NeMdJto4cQQziw3A43DgjqCCEysawbMi4MzHJFjIvz0dCD94G1pUNtnUqQAAAYNpQ0NQSUNDIHByb2ZpbGUAAHicfZE9SMNAHMVfU6VFKg4tKOKQoXayi4o41ioUoUKoFVp1MLl+QpOGJMXFUXAtOPixWHVwcdbVwVUQBD9AXF2cFF2kxP8lhRYxHhz34929x907QGjVmGr2JQBVs4xMKinm8qti4BUBhDGMGIIyM/U5SUrDc3zdw8fXuzjP8j735xgsFE0G+ETiBNMNi3iDeGbT0jnvE0dYRS4QnxNPGHRB4keuKy6/cS47LPDMiJHNzBNHiMVyDys9zCqGSjxNHC2oGuULOZcLnLc4q7UG69yTvzBU1FaWuU5zDCksYgkSRChooIoaLMRp1UgxkaH9pId/1PFL5FLIVQUjxwLqUCE7fvA/+N2tWZqadJNCSaD/xbY/xoHALtBu2vb3sW23TwD/M3Cldf31FjD7SXqzq0WPgKFt4OK6qyl7wOUOMPKky4bsSH6aQqkEvJ/RN+WB8C0wsOb21tnH6QOQpa7SN8DBIRArU/a6x7uDvb39e6bT3w+I2XKvBcxDUgAADXppVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+Cjx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IlhNUCBDb3JlIDQuNC4wLUV4aXYyIj4KIDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+CiAgPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgIHhtbG5zOnhtcE1NPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvbW0vIgogICAgeG1sbnM6c3RFdnQ9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZUV2ZW50IyIKICAgIHhtbG5zOkdJTVA9Imh0dHA6Ly93d3cuZ2ltcC5vcmcveG1wLyIKICAgIHhtbG5zOmRjPSJodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLyIKICAgIHhtbG5zOnRpZmY9Imh0dHA6Ly9ucy5hZG9iZS5jb20vdGlmZi8xLjAvIgogICAgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIgogICB4bXBNTTpEb2N1bWVudElEPSJnaW1wOmRvY2lkOmdpbXA6MmVhNWM3OTUtMGJmNi00ODc1LWE5YzgtZWY1M2FjYWUwZDBkIgogICB4bXBNTTpJbnN0YW5jZUlEPSJ4bXAuaWlkOmIxOTFhYWIxLWYxMzAtNDEwZC05M2U2LTgxNTI2MWNlMTA1NSIKICAgeG1wTU06T3JpZ2luYWxEb2N1bWVudElEPSJ4bXAuZGlkOjY0YjU3MjEwLTg4ZmUtNDRjZS1iODJlLTE2MTU3ODJjZWNmZSIKICAgR0lNUDpBUEk9IjIuMCIKICAgR0lNUDpQbGF0Zm9ybT0iTWFjIE9TIgogICBHSU1QOlRpbWVTdGFtcD0iMTY4OTgxNTA5NTk2Nzk5NyIKICAgR0lNUDpWZXJzaW9uPSIyLjEwLjM0IgogICBkYzpGb3JtYXQ9ImltYWdlL3BuZyIKICAgdGlmZjpPcmllbnRhdGlvbj0iMSIKICAgeG1wOkNyZWF0b3JUb29sPSJHSU1QIDIuMTAiCiAgIHhtcDpNZXRhZGF0YURhdGU9IjIwMjM6MDc6MTlUMTk6MDQ6NTQtMDY6MDAiCiAgIHhtcDpNb2RpZnlEYXRlPSIyMDIzOjA3OjE5VDE5OjA0OjU0LTA2OjAwIj4KICAgPHhtcE1NOkhpc3Rvcnk+CiAgICA8cmRmOlNlcT4KICAgICA8cmRmOmxpCiAgICAgIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiCiAgICAgIHN0RXZ0OmNoYW5nZWQ9Ii8iCiAgICAgIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6ZTY2MzFlOTctN2UyMC00ZWU2LTk0N2EtMDNhZDBkMGFhNmJiIgogICAgICBzdEV2dDpzb2Z0d2FyZUFnZW50PSJHaW1wIDIuMTAgKE1hYyBPUykiCiAgICAgIHN0RXZ0OndoZW49IjIwMjMtMDctMTlUMTk6MDQ6NTUtMDY6MDAiLz4KICAgIDwvcmRmOlNlcT4KICAgPC94bXBNTTpIaXN0b3J5PgogIDwvcmRmOkRlc2NyaXB0aW9uPgogPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAg
ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgIAo8P3hwYWNrZXQgZW5kPSJ3Ij8+2ofEMAAAAAZiS0dEADUAOQA+ellPPQAAASxJREFUWMPNlzFuwkAQAGeD4gL5HuCKB9C7s9JCkYYmPzCNn0NDPgJt5M4SJQ8gjR+A5cIUTnOREFKiaDcJe+X59mZ0tnf3wDCqLC2rLC0te0wscGALPOchaZtuOPybwBW8By7ASishRvgiTu+BKbDetN3rn53ALXzTdnXTDe95SGrgRXMSYoGfj8sZQJjvTlWWFpqTECP8LS550kqIET6Ly05aCTHCPzcvtRJigYf5bg1wPi63WgmxwMdxnAKISK+VECN8H8MWWgkxwosYWmslxAivY3ihlRAj/DoVqyTEAheRHuCrZz+ReODO4/6vwMVH6OI3dJGIXKRiF8XIRTl20ZC4aMlcNKXfScR5FdzFxUQ0BeRGAi1cfTdsuuGQh6QFVsCjFm4ev3E9/wD9cjGz2siT7AAAAABJRU5ErkJggg==' # noqa E501 # Streamlit caching to save on repeat loads @st.cache_data From 0500f5cfc6c18eb127d1d37dad82f894485f3833 Mon Sep 17 00:00:00 2001 From: choccccy Date: Thu, 20 Jul 2023 11:08:53 -0600 Subject: [PATCH 33/39] remove llama index from pyproject.toml dependancies --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1d8a341..37edaa1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,7 @@ classifiers = [ ] dependencies = [ "openai", - "python-dotenv", - "llama_index", + "python-dotenv" ] [project.urls] From 6ef7ddaaad138d8ed71b75a115939ce24848cde6 Mon Sep 17 00:00:00 2001 From: Uche Ogbuji Date: Thu, 20 Jul 2023 11:35:40 -0600 Subject: [PATCH 34/39] [#16] Sync up. Deal with "Calling st.experimental_rerun() within a callback is a no-op." 
& sew up the use of session data & call-backs --- demo/chat_pdf_streamlit_ui.py | 156 ++++++++++++++-------------------- 1 file changed, 65 insertions(+), 91 deletions(-) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 6504669..2f7fc20 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -72,11 +72,11 @@ PDF_USER_QUERY_PROMPT = 'Ask a question about your PDF:' - -@st.cache_data # Streamlit caching to save on repeat loads -def load_favicon(): - # oori_logo[32px].png as a data URL - return 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAx3pUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBbDsMwCPvPKXaEBMiD46RrK+0GO/6cQKu2mqU4gJFDCNv3s4fXACUJkmsrWkoEREWpI2jR0CenKJMnyCXkt3o4BUKJcbOlrXj/UU+ngV0dUb4YtbcLy11Qcf/2MPKHeEw0pljdSN2IyYTkBt2+FYu2ev3CssU7mp0waH+TjlpeTHvmUrG9NeMdJto4cQQziw3A43DgjqCCEysawbMi4MzHJFjIvz0dCD94G1pUNtnUqQAAAYNpQ0NQSUNDIHByb2ZpbGUAAHicfZE9SMNAHMVfU6VFKg4tKOKQoXayi4o41ioUoUKoFVp1MLl+QpOGJMXFUXAtOPixWHVwcdbVwVUQBD9AXF2cFF2kxP8lhRYxHhz34929x907QGjVmGr2JQBVs4xMKinm8qti4BUBhDGMGIIyM/U5SUrDc3zdw8fXuzjP8j735xgsFE0G+ETiBNMNi3iDeGbT0jnvE0dYRS4QnxNPGHRB4keuKy6/cS47LPDMiJHNzBNHiMVyDys9zCqGSjxNHC2oGuULOZcLnLc4q7UG69yTvzBU1FaWuU5zDCksYgkSRChooIoaLMRp1UgxkaH9pId/1PFL5FLIVQUjxwLqUCE7fvA/+N2tWZqadJNCSaD/xbY/xoHALtBu2vb3sW23TwD/M3Cldf31FjD7SXqzq0WPgKFt4OK6qyl7wOUOMPKky4bsSH6aQqkEvJ/RN+WB8C0wsOb21tnH6QOQpa7SN8DBIRArU/a6x7uDvb39e6bT3w+I2XKvBcxDUgAADXppVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+Cjx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IlhNUCBDb3JlIDQuNC4wLUV4aXYyIj4KIDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+CiAgPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgIHhtbG5zOnhtcE1NPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvbW0vIgogICAgeG1sbnM6c3RFdnQ9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZUV2ZW50IyIKICAgIHhtbG5zOkdJTVA9Imh0dHA6Ly93d3cuZ2ltcC5vcmcveG1wLyIKICAgIHhtbG5zOmRjPSJodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLyIKICAgIHhtbG5zOnRpZmY9Imh0dHA6Ly9ucy5hZG9iZS5jb20vdGlmZi8xLjAvIgogICAgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIgogICB4bXBNTTpEb2N1bWVudElEPSJnaW1wOmRvY2lkOmdpbXA6MmVhNWM3OTUtMGJmNi00ODc1LWE5YzgtZWY1M2FjYWUwZDBkIgogICB4bXBNTTpJbnN0YW5jZUlEPSJ4bXAuaWlkOmIxOTFhYWIxLWYxMzAtNDEwZC05M2U2LTgxNTI2MWNlMTA1NSIKICAgeG1wTU06T3JpZ2luYWxEb2N1bWVudElEPSJ4bXAuZGlkOjY0YjU3MjEwLTg4ZmUtNDRjZS1iODJlLTE2MTU3ODJjZWNmZSIKICAgR0lNUDpBUEk9IjIuMCIKICAgR0lNUDpQbGF0Zm9ybT0iTWFjIE9TIgogICBHSU1QOlRpbWVTdGFtcD0iMTY4OTgxNTA5NTk2Nzk5NyIKICAgR0lNUDpWZXJzaW9uPSIyLjEwLjM0IgogICBkYzpGb3JtYXQ9ImltYWdlL3BuZyIKICAgdGlmZjpPcmllbnRhdGlvbj0iMSIKICAgeG1wOkNyZWF0b3JUb29sPSJHSU1QIDIuMTAiCiAgIHhtcDpNZXRhZGF0YURhdGU9IjIwMjM6MDc6MTlUMTk6MDQ6NTQtMDY6MDAiCiAgIHhtcDpNb2RpZnlEYXRlPSIyMDIzOjA3OjE5VDE5OjA0OjU0LTA2OjAwIj4KICAgPHhtcE1NOkhpc3Rvcnk+CiAgICA8cmRmOlNlcT4KICAgICA8cmRmOmxpCiAgICAgIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiCiAgICAgIHN0RXZ0OmNoYW5nZWQ9Ii8iCiAgICAgIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6ZTY2MzFlOTctN2UyMC00ZWU2LTk0N2EtMDNhZDBkMGFhNmJiIgogICAgICBzdEV2dDpzb2Z0d2FyZUFnZW50PSJHaW1wIDIuMTAgKE1hYyBPUykiCiAgICAgIHN0RXZ0OndoZW49IjIwMjMtMDctMTlUMTk6MDQ6NTUtMDY6MDAiLz4KICAgIDwvcmRmOlNlcT4KICAgPC94bXBNTTpIaXN0b3J5PgogIDwvcmRmOkRlc2NyaXB0aW9uPgogPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgI
CAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgIAo8P3hwYWNrZXQgZW5kPSJ3Ij8+2ofEMAAAAAZiS0dEADUAOQA+ellPPQAAASxJREFUWMPNlzFuwkAQAGeD4gL5HuCKB9C7s9JCkYYmPzCNn0NDPgJt5M4SJQ8gjR+A5cIUTnOREFKiaDcJe+X59mZ0tnf3wDCqLC2rLC0te0wscGALPOchaZtuOPybwBW8By7ASishRvgiTu+BKbDetN3rn53ALXzTdnXTDe95SGrgRXMSYoGfj8sZQJjvTlWWFpqTECP8LS550kqIET6Ly05aCTHCPzcvtRJigYf5bg1wPi63WgmxwMdxnAKISK+VECN8H8MWWgkxwosYWmslxAivY3ihlRAj/DoVqyTEAheRHuCrZz+ReODO4/6vwMVH6OI3dJGIXKRiF8XIRTl20ZC4aMlcNKXfScR5FdzFxUQ0BeRGAi1cfTdsuuGQh6QFVsCjFm4ev3E9/wD9cjGz2siT7AAAAABJRU5ErkJggg==' # noqa E501 +st.set_page_config( # Set up Streamlit page + page_title='Ask your PDF', + layout='centered', + initial_sidebar_state='expanded', + ) @st.cache_data @@ -92,27 +92,59 @@ def load_embedding_model(embedding_model_name): return SentenceTransformer(embedding_model_name) -def prep_pdf(pdf, embedding_model, collection_name): - # Streamlit treats function docstrings as magic strings for user display. 
Use comments instead
-    # Converts pdf content into chunks according to chunk size & overlap
-    # Vectorizes chunks for sLLM lookup
-    kb = qdrant_collection(collection_name, embedding_model)  # in-memory vector DB instance
+def prep_pdf():
+    'Convert pdf content into chunks according to chunk size & overlap'
+    placeholder = st.empty()
 
     # Load PDF & collect its text & split it into chunks
-    pdf_reader = PdfReader(pdf)
-    text = ''.join((page.extract_text() for page in pdf_reader.pages))
-    chunks = text_splitter(
-        text,
-        chunk_size=EMBED_CHUNK_SIZE,
-        chunk_overlap=EMBED_CHUNK_OVERLAP,
-        separator='\n')
-
-    # Update vector DB collection, insert the text chunks & update app state
-    kb.update(texts=chunks)
-    st.session_state['kb'] = kb  # Update state
-
-
-def query_llm(kb, openai_api, model):
+    pdf = st.session_state['pdf']
+    if not pdf:
+        return
+
+    with placeholder.container():
+        # Get the embedding model
+        if not st.session_state['embedding_model']:
+            st.session_state['embedding_model'] = load_embedding_model(embedding_model_name=DOC_EMBEDDINGS_LLM)
+        emb_model = st.session_state['embedding_model']
+
+        # Vectorizes chunks for sLLM lookup
+        # XXX: Look up rules around uploaded object names
+        kb = qdrant_collection(pdf.name, emb_model)  # in-memory vector DB instance
+
+        # Show throbber, embed the PDF, and get ready for similarity search
+        embedding_placeholder = st.container()
+        embedding_placeholder.write('Embedding PDF...')
+
+        # Load throbber from cache
+        throbber = load_throbber()
+        embedding_placeholder.image(throbber)
+
+        # Prepare a vector knowledgebase based on the pdf contents
+        # Use st.session_state to avoid unnecessary reprocessing/reloading
+        pdf_reader = PdfReader(pdf)
+        text = ''.join((page.extract_text() for page in pdf_reader.pages))
+        chunks = text_splitter(
+            text,
+            chunk_size=EMBED_CHUNK_SIZE,
+            chunk_overlap=EMBED_CHUNK_OVERLAP,
+            separator='\n')
+
+        # Update vector DB collection, insert the text chunks & update app state
+        kb.update(texts=chunks)
+        st.session_state['kb'] = kb  # Update state
+    placeholder.empty()
+
+    # Get the user query
+    st.text_input(
+        label=PDF_USER_QUERY_PROMPT,
+        key='user_query_str',
+        on_change=query_llm,
+        args=(st.session_state['openai_api'], st.session_state['model']))
+
+
+
+def query_llm(openai_api, model):
+    kb = st.session_state['kb']
     user_query = st.session_state['user_query_str']
 
     # Placeholder for throbber & LLM response
@@ -149,73 +181,10 @@ def query_llm(kb, openai_api, model):
     response_placeholder.write(response_text)
 
 
-def streamlit_loop(openai_api, model, LLM_TEMP):
+def main():
     '''
     Oori — Ask your PDF 📄💬
     '''
-    favicon = load_favicon()
-    st.set_page_config(  # Set up Streamlit page
-        page_title='Ask your PDF',
-        page_icon=favicon,
-        layout='centered',
-        initial_sidebar_state='expanded',
-        )
-
-    # Create file upload box on Streamlit, set from the user's upload
-    # Use st.session_state to avoid unnecessary reprocessing/reloading
-    if 'pdf' not in st.session_state:  # First use and need to init the PDF
-        pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False)
-        st.session_state['pdf'] = pdf
-
-        new_pdf = True  # Are embeddings needed for the pdf?
-
-    else:  # No PDF has yet been uploaded
-        temp_pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False)
-
-        # Encode PDF content and compare CRCs to see if it's changed
-        pdf_new_checksum = zlib.adler32(str(temp_pdf).encode('utf-8'))
-        # FIXME: Store the old checksum in state, so it needn't be recomputed each time
-        pdf_old_checksum = zlib.adler32(str(st.session_state['pdf']).encode('utf-8'))
-
-        if pdf_new_checksum == pdf_old_checksum:  # PDF is the same
-            pdf = st.session_state['pdf']
-            new_pdf = False  # Flag to know if the new pdf needs to be embedded
-        else:  # PDF is now different and needs to swap out session_state
-            pdf, st.session_state['pdf'] = temp_pdf, temp_pdf
-            new_pdf = True  # Flag to know if the new pdf needs to be embedded
-
-    if pdf:  # Only run once the program has a "pdf" loaded
-        if st.session_state['embedding_model']:
-            # Show throbber, embed the PDF, and get ready for similarity search
-            embedding_placeholder = st.container()
-
-            embedding_placeholder.write('Embedding PDF...')
-
-            # Load throbber from cache
-            throbber = load_throbber()
-            embedding_placeholder.image(throbber)
-
-            # Get the embedding model
-            embedding_model = load_embedding_model(embedding_model_name=DOC_EMBEDDINGS_LLM)
-
-            # Prepare a vector knowledgebase based on the pdf contents
-            # Use st.session_state to avoid unnecessary reprocessing/reloading
-            if new_pdf:
-                kb = prep_pdf(pdf, embedding_model, collection_name=pdf.name)
-                st.session_state['kb'] = kb
-            else:
-                kb = st.session_state['kb']
-
-            st.session_state['embedding_model'] = False
-
-            # Rerun the app to hide the embedding throbber
-            st.experimental_rerun()
-
-        # Get the user query
-        st.text_input(label=PDF_USER_QUERY_PROMPT, key='user_query_str', on_change=query_llm, args=(kb, openai_api, model))
-
-
-def main():
     # Streamlit treats function docstrings as magic strings for user display. Use comments instead
     # Set up LLM host connection & launch the main loop
     # Use OpenAI API if specified, otherwise emulate with supplied host, etc. for self-hosted LLM
@@ -229,10 +198,15 @@ def main():
         model = LLM or HOST_DEFAULT
         openai_api = openai_emulation(
            host=LLM_HOST, port=LLM_PORT, model=LLM, debug=True)
-
-    st.session_state['embedding_model'] = True
-    streamlit_loop(openai_api, model, LLM_TEMP)
+    st.session_state['embedding_model'] = None
+    st.session_state['openai_api'] = openai_api
+    st.session_state['model'] = model
+
+    # Create file upload box on Streamlit, set from the user's upload
+    # Use st.session_state to avoid unnecessary reprocessing/reloading
+    st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False,
+                     on_change=prep_pdf, key='pdf')


 main()  # Code to execute

From d3556ad62d73ce67c0c6f48335781f9aa0d0a3a4 Mon Sep 17 00:00:00 2001
From: choccccy
Date: Thu, 20 Jul 2023 13:47:03 -0600
Subject: [PATCH 35/39] search using the keyword "query" rather than "text"

---
 pylib/embedding_helper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pylib/embedding_helper.py b/pylib/embedding_helper.py
index de5cb3a..4dfdaeb 100644
--- a/pylib/embedding_helper.py
+++ b/pylib/embedding_helper.py
@@ -162,7 +162,7 @@ def update(self, texts, metas=None):
             ]
         )
 
-    def search(self, text, **kwargs):
+    def search(self, query, **kwargs):
        '''
        Perform a search on this Qdrant collection

        kwargs: other args to be passed to qdrant_client.QdrantClient.search().
Common ones: limit - maximum number of results to return (useful for top-k query) ''' - embedded_text = self._embedding_model.encode(text) + embedded_text = self._embedding_model.encode(query) return self.db.search(collection_name=self.name, query_vector=embedded_text, **kwargs) def count(self): From 85e9875d2224e4f7a42eca0c5d5fb7c58310fe76 Mon Sep 17 00:00:00 2001 From: choccccy Date: Thu, 20 Jul 2023 14:17:23 -0600 Subject: [PATCH 36/39] favicon --- demo/chat_pdf_streamlit_ui.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py index 2f7fc20..a90995d 100644 --- a/demo/chat_pdf_streamlit_ui.py +++ b/demo/chat_pdf_streamlit_ui.py @@ -72,8 +72,11 @@ PDF_USER_QUERY_PROMPT = 'Ask a question about your PDF:' +favicon = 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAx3pUWHRSYXcgcHJvZmlsZSB0eXBlIGV4aWYAAHjabVBbDsMwCPvPKXaEBMiD46RrK+0GO/6cQKu2mqU4gJFDCNv3s4fXACUJkmsrWkoEREWpI2jR0CenKJMnyCXkt3o4BUKJcbOlrXj/UU+ngV0dUb4YtbcLy11Qcf/2MPKHeEw0pljdSN2IyYTkBt2+FYu2ev3CssU7mp0waH+TjlpeTHvmUrG9NeMdJto4cQQziw3A43DgjqCCEysawbMi4MzHJFjIvz0dCD94G1pUNtnUqQAAAYNpQ0NQSUNDIHByb2ZpbGUAAHicfZE9SMNAHMVfU6VFKg4tKOKQoXayi4o41ioUoUKoFVp1MLl+QpOGJMXFUXAtOPixWHVwcdbVwVUQBD9AXF2cFF2kxP8lhRYxHhz34929x907QGjVmGr2JQBVs4xMKinm8qti4BUBhDGMGIIyM/U5SUrDc3zdw8fXuzjP8j735xgsFE0G+ETiBNMNi3iDeGbT0jnvE0dYRS4QnxNPGHRB4keuKy6/cS47LPDMiJHNzBNHiMVyDys9zCqGSjxNHC2oGuULOZcLnLc4q7UG69yTvzBU1FaWuU5zDCksYgkSRChooIoaLMRp1UgxkaH9pId/1PFL5FLIVQUjxwLqUCE7fvA/+N2tWZqadJNCSaD/xbY/xoHALtBu2vb3sW23TwD/M3Cldf31FjD7SXqzq0WPgKFt4OK6qyl7wOUOMPKky4bsSH6aQqkEvJ/RN+WB8C0wsOb21tnH6QOQpa7SN8DBIRArU/a6x7uDvb39e6bT3w+I2XKvBcxDUgAADXppVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+Cjx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IlhNUCBDb3JlIDQuNC4wLUV4aXYyIj4KIDxyZGY6UkRGIHhtbG5zOnJkZj0iaHR0cDovL3d3dy53My5vcmcvMTk5OS8wMi8yMi1yZGYtc3ludGF4LW5zIyI+CiAgPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIKICAgIHhtbG5zOnhtcE1NPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvbW0vIgogICAgeG1sbnM6c3RFdnQ9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9zVHlwZS9SZXNvdXJjZUV2ZW50IyIKICAgIHhtbG5zOkdJTVA9Imh0dHA6Ly93d3cuZ2ltcC5vcmcveG1wLyIKICAgIHhtbG5zOmRjPSJodHRwOi8vcHVybC5vcmcvZGMvZWxlbWVudHMvMS4xLyIKICAgIHhtbG5zOnRpZmY9Imh0dHA6Ly9ucy5hZG9iZS5jb20vdGlmZi8xLjAvIgogICAgeG1sbnM6eG1wPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvIgogICB4bXBNTTpEb2N1bWVudElEPSJnaW1wOmRvY2lkOmdpbXA6MmVhNWM3OTUtMGJmNi00ODc1LWE5YzgtZWY1M2FjYWUwZDBkIgogICB4bXBNTTpJbnN0YW5jZUlEPSJ4bXAuaWlkOmIxOTFhYWIxLWYxMzAtNDEwZC05M2U2LTgxNTI2MWNlMTA1NSIKICAgeG1wTU06T3JpZ2luYWxEb2N1bWVudElEPSJ4bXAuZGlkOjY0YjU3MjEwLTg4ZmUtNDRjZS1iODJlLTE2MTU3ODJjZWNmZSIKICAgR0lNUDpBUEk9IjIuMCIKICAgR0lNUDpQbGF0Zm9ybT0iTWFjIE9TIgogICBHSU1QOlRpbWVTdGFtcD0iMTY4OTgxNTA5NTk2Nzk5NyIKICAgR0lNUDpWZXJzaW9uPSIyLjEwLjM0IgogICBkYzpGb3JtYXQ9ImltYWdlL3BuZyIKICAgdGlmZjpPcmllbnRhdGlvbj0iMSIKICAgeG1wOkNyZWF0b3JUb29sPSJHSU1QIDIuMTAiCiAgIHhtcDpNZXRhZGF0YURhdGU9IjIwMjM6MDc6MTlUMTk6MDQ6NTQtMDY6MDAiCiAgIHhtcDpNb2RpZnlEYXRlPSIyMDIzOjA3OjE5VDE5OjA0OjU0LTA2OjAwIj4KICAgPHhtcE1NOkhpc3Rvcnk+CiAgICA8cmRmOlNlcT4KICAgICA8cmRmOmxpCiAgICAgIHN0RXZ0OmFjdGlvbj0ic2F2ZWQiCiAgICAgIHN0RXZ0OmNoYW5nZWQ9Ii8iCiAgICAgIHN0RXZ0Omluc3RhbmNlSUQ9InhtcC5paWQ6ZTY2MzFlOTctN2UyMC00ZWU2LTk0N2EtMDNhZDBkMGFhNmJiIgogICAgICBzdEV2dDpzb2Z0d2FyZUFnZW50PSJHaW1wIDIuMTAgKE1hYyBPUykiCiAgICAgIHN0RXZ0OndoZW49IjIwMjMtMDctMTlUMTk6MDQ6NTUtMDY6MDAiLz4KICAgIDwvcmRmOlNlcT4KICAgPC94bXBNTTpIaXN0b3J5PgogIDwvcmRmOkRlc2NyaXB0aW9uPgogPC9yZGY6UkRGPgo8L3g6eG1wbWV0YT4KICAgICAgICAgICAgICAgICAg
ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgIAo8P3hwYWNrZXQgZW5kPSJ3Ij8+2ofEMAAAAAZiS0dEADUAOQA+ellPPQAAASxJREFUWMPNlzFuwkAQAGeD4gL5HuCKB9C7s9JCkYYmPzCNn0NDPgJt5M4SJQ8gjR+A5cIUTnOREFKiaDcJe+X59mZ0tnf3wDCqLC2rLC0te0wscGALPOchaZtuOPybwBW8By7ASishRvgiTu+BKbDetN3rn53ALXzTdnXTDe95SGrgRXMSYoGfj8sZQJjvTlWWFpqTECP8LS550kqIET6Ly05aCTHCPzcvtRJigYf5bg1wPi63WgmxwMdxnAKISK+VECN8H8MWWgkxwosYWmslxAivY3ihlRAj/DoVqyTEAheRHuCrZz+ReODO4/6vwMVH6OI3dJGIXKRiF8XIRTl20ZC4aMlcNKXfScR5FdzFxUQ0BeRGAi1cfTdsuuGQh6QFVsCjFm4ev3E9/wD9cjGz2siT7AAAAABJRU5ErkJggg==' # noqa E501 + st.set_page_config( # Set up Streamlit page page_title='Ask your PDF', + page_icon=favicon, layout='centered', initial_sidebar_state='expanded', ) @@ -93,6 +96,9 @@ def load_embedding_model(embedding_model_name): def prep_pdf(): + ''' + Oori β€” Ask your PDF πŸ“„πŸ’¬ + ''' 'Convert pdf content into chunks according to chunk size & overlap' 
     placeholder = st.empty()

@@ -144,6 +150,9 @@ def prep_pdf():


 def query_llm(openai_api, model):
+    '''
+    Oori — Ask your PDF 📄💬
+    '''
     kb = st.session_state['kb']
     user_query = st.session_state['user_query_str']

From d50ead3dd369d674dba061a293e84fb5ac02cf26 Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Thu, 20 Jul 2023 16:05:05 -0600
Subject: [PATCH 37/39] [#16] Lint fix

---
 demo/chat_pdf_streamlit_ui.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/demo/chat_pdf_streamlit_ui.py b/demo/chat_pdf_streamlit_ui.py
index 2f7fc20..7af1705 100644
--- a/demo/chat_pdf_streamlit_ui.py
+++ b/demo/chat_pdf_streamlit_ui.py
@@ -21,7 +21,7 @@ Prerequisites. From OgbujiPT cloned dir:
 ```sh
-pip install --upgrade .
+pip install --upgrade https://github.com/uogbuji/OgbujiPT.git@16-better-vector-db
 pip install streamlit watchdog PyPDF2 PyCryptodome sentence_transformers qdrant-client tiktoken
 ```

@@ -46,8 +46,6 @@ from sentence_transformers import SentenceTransformer
 
-import zlib  # for crc32 checksums
-
 # Avoid re-entrance complaints from huggingface/tokenizers
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'

From cd14fa33b57bb727b36b515989dac202304910ac Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Thu, 20 Jul 2023 17:17:06 -0600
Subject: [PATCH 38/39] Delete chat_pdf_streamlit_ui_OLD.py

---
 demo/chat_pdf_streamlit_ui_OLD.py | 221 ------------------------------
 1 file changed, 221 deletions(-)
 delete mode 100644 demo/chat_pdf_streamlit_ui_OLD.py

diff --git a/demo/chat_pdf_streamlit_ui_OLD.py b/demo/chat_pdf_streamlit_ui_OLD.py
deleted file mode 100644
index bb43cc5..0000000
--- a/demo/chat_pdf_streamlit_ui_OLD.py
+++ /dev/null
@@ -1,221 +0,0 @@
-'''
-Advanced, "Chat my PDF" demo
-
-Use a PDF document as a knowledge base to provide context for natural language Q&A
-
-UI: Streamlit - streamlit.io
-Vector store: Qdrant - https://qdrant.tech/
-    Alternatives: pgvector, Chroma, Faiss, Weaviate, etc.
-PDF to text: PyPDF2
-    Alternatives: pdfplumber
-Text to vector (embedding) model:
-    Alternatives: https://www.sbert.net/docs/pretrained_models.html / OpenAI ada002
-
-Single-PDF support, for now, to keep the demo code simple,
-though you can easily extend it to e.g. work with multiple docs
-dropped in a directory
-
-Based on https://github.com/wafflecomposite/langchain-ask-pdf-local
-but taking advantage of OgbujiPT
-
-You need access to an OpenAI-like service. Default assumption is that you
-have a self-hosted framework such as llama-cpp-python or text-generation-webui
-running. Assume for the following it's at my-llm-host:8000
-
-Prerequisites. From OgbujiPT cloned dir:
-
-```sh
-pip install --upgrade .
-pip install streamlit watchdog PyPDF2 PyCryptodome sentence_transformers qdrant-client tiktoken
-```
-
-You'll probably need a .env file. See demo/.env for an example to copy. Run the demo:
-
-```sh
-streamlit run demo/chat_pdf_streamlit_ui.py
-```
-
-Something about Streamlit's event loop is interacting with asyncio to cause the `async_main`
-coroutine to be invoked twice. We might need to bundle our coroutines and
-`streamlit.bootstrap.run()` together in an `asyncio.gather()`, or something of the sort.
-'''
-import asyncio
-import os
-
-from dotenv import load_dotenv
-
-import streamlit as st
-from PyPDF2 import PdfReader
-
-from ogbujipt.config import openai_emulation, openai_live, HOST_DEFAULT
-from ogbujipt.prompting import format, CHATGPT_DELIMITERS
-from ogbujipt import oapi_choice1_text
-from ogbujipt.text_helper import text_splitter
-from ogbujipt.embedding_helper import qdrant_collection
-
-# Avoid re-entrance complaints from huggingface/tokenizers
-os.environ['TOKENIZERS_PARALLELISM'] = 'false'
-
-# Load the main parameters from .env file
-load_dotenv()
-# User can set a variety of likely values to trigger use of OpenAI full-service
-OPENAI = os.getenv('OPENAI', 'False') in \
-    ['True', 'true', 'TRUE', 'Yes', 'yes', 'YES']
-OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
-LLM = os.getenv('LLM', 'LLM')  # TODO: get this from non-openai openai api hosts
-LLM_HOST = os.getenv('LLM_HOST', 'my-llm-host')
-LLM_PORT = os.getenv('LLM_PORT', '8000')
-# LLM "temperature"
-LLM_TEMP = float(os.getenv('LLM_TEMP', '1'))
-# LLM max context size
-N_CTX = int(os.getenv('N_CTX', '2048'))
-# K - how many chunks to return for query context
-K = int(os.getenv('K', '6'))
-# Chunk size is the number of characters counted in the chunks
-EMBED_CHUNK_SIZE = int(os.getenv('EMBED_CHUNK_SIZE', '500'))
-# Chunk Overlap to connect ends of chunks together
-EMBED_CHUNK_OVERLAP = int(os.getenv('EMBED_CHUNK_OVERLAP', '100'))
-# small LM for embeddings
-# default https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
-DOC_EMBEDDINGS_LLM = os.getenv('EMBED_CHUNK_OVERLAP', 'all-MiniLM-L6-v2')
-
-PDF_USER_QUESTION_PROMPT = 'Ask a question about your PDF:'
-
-# throbber.gif as a data URL
-# TODO: use a less archaic format than GIF; perhaps some sort of animatable vector image
-throbber =
'data:image/gif;base64,R0lGODlhgACAAKEDAGwZDPPVSf///wAAACH/C05FVFNDQVBFMi4wAwEAAAAh+QQFFAADACwAAAAAgACAAAACm5yPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzCY1Kp9Sq9YrNarfcrvcLDovH5LL5jE6r1+y2+w2Py+f0uv2Oz+v3/L7/DxgoOEhYaHiImKi4yNjo+AgZKTlJWWl5iZmpucnZ6fkJGio6SlpqeoqaqrrK2ur6ChsrO0tba3uLK1YAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6EGIdahIwajj+AgR6WD5iHk5STnIyaEZF1r5STk6VVp4KrHa1MqayvaKaNQ5Cxl7dYuSG7Vr8bsSDNyLVnx3/JOMnKi7PDPs1NwVzTstVg17fZb90B34vBh+9O23DVgOdq46ThvUaV4r125VnX4zfI/zq6+8TvwPXrx34gIKHMijQz9y4RYi0eSQiaWIrnpR9BXr4pj+UhrrcerobBtILmZGUltjEluYlCcTsfQ47WVFkfQOLpgjsw3Hmja1GPQpr6e0oN542jqWk8iypCdGMU3S7mlBokMJCr1pVCpAqlOtXgXqFepPRfaMSjSLdawotI3UqnTbNeFRuHzYMuOKz64LvajoGvNrginInCZfskwpc2TSjk8vao2oVR3eu2HR8U07eU+6yJgrN7287zLnTYBJlebGdnTfzFs9X52lGtdpya6/0pZ7IXbuZ7qr4tbG2jZCG+5+Czcd/Dbx47KD9wb373ngSdI/X6surA72vVC2K6WSnHlr8CPEcye/wfx38Oopz26PPDx8he/nr4ZhPzT+/PoJ0/NX9l+AFhQAACH5BAUUAAMALAAAAACAAIAAAAL+nI+py+0Po5y02jsAwLz7P2kaSJZmJYrnyrJp2sYy9r7zjTd1nff5vvMJW0Dg8EgqFpFMmnLZjEaeT6mVUc0or1xD1rvtSr8HsnhoLofPwjTCzY7B3+u4bJ7A2z/6fH1f0qcgCBjyZwhV6IEXEOBAqIh1iNDY+DgZCcFYaamDmSmZuMDJeSkK6nmaQEpqaoSa+srAyuoaBDv4OUBLa8uDq6ZKydsbe4u7SVwcKguarLy8AHn1DB2dKxxXXbur7GtTuN16AP0Nsydeuupt/MuWXjnLznzcBd8pT2yuYq9b/vCPnrsx/uYBNIitGZN7jiYElKYLB0MLDxPWa1NQ34X+in6yzZjIgSMdj0Qy8vogMpjCOyavhUTYcaWLltxIpARDEgRIEze15Oyw80TPaYhkkoPJE6nKi3xojpMxNCKFoFCV4jSKwqm6HFF/PqCKoytWTVrjHRHLdEpZfGet+hwLkWRPrm4hgUWCdmA7cPlOcsnLV2BgBXPbahR8Lu7YwnjrTrr71/EpyF0AJ6YsxjI/zJUly+JsRfOIkYvdShG9waLedYcze057FbYBxkJQf13b8IptsnJN99ittnfrxsNjykbMr69LH8Cn4ibuF/noC6BXNKf+/Pfr1diFR59xHWj2qsVJH+9eunyJ8DrHC90+2ET1Cuzlu0cJPzFL78v1N+ZPfsN8DtQnx330/TedDwKy9p1q8fWw4GwIpraQgQNOaMV8BKJhIYP9xcZdE5htWCF/NXl4ooP6iQEWiQSZ+FQ36oH44BkgucgFQzj2A6M1uSl2nja4ffhWkHbouBWQIWaC5I8qAghMkUvKOKOUNSKz1j4JRmnelA0aN2WU22hJIZfSlUmYWWeayVtpZLIZHFx7rQjnnFDGaWSdNNL5pp5F5UmUn30E6qeVfC4VZqF2bonolYq2yRShjzaqn6STUrqZVJdCSoWjm/7ZKaOfOhGqqKOS2ump9lGh6gmstuqqV7BaIOesqJpqa3u45toer74qUAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTaOwDAvPs/aRpIlmYliufKsmnaxjL2vvONN3Wd9/m+8wlbQODwSCoWkUyactmMRp5PqZVRzSivXEPWu+1KvweyeGguh8/CNMLNjsHf67hsnsDbP/p8fV/SpyAIGPJnCFXogRcQ4ECoiHWI0Nj4OBkJwVhpqYOZKZm4wMl5KQrqeZpASmpqhJr6ysDK6hoEO/g5QEtry4OrpkrJ2xt7i7tJXBwqC5qsvLwAefUMHZ0rHFddu6vsa1O43XoA/Q2zJ1666m38y5ZeOcvOfNwF3ylPbK5ir1v+8I+euzH+5gE0iK0Zk3uOJgSUpgsHQwsPE9ZrU1Dfhf6KfrLNmNhAgAAHHOl4JJKRlwORIkki7KjwTsprCViydKlRILiPM7kxsGkTp8p2O1GeLHkAKFChNE3GDNRz3E+lSxsgBXOSA8ipVKvmG6rzHNSjLxF07crUJ8SsFLYuOHs2rdS1Ty24VQAXrlx1Yflpjcr3bV69VssGqzsFcLyQhPPuXdx3hF3F+ATHTUr4a9PDFzVRbsgVbc3MowxjRWxxoIKrAxxbFq1ZbeqiRMWWzvma6krSq01ryXp39OXdw2+DpduZs+p1uPHyZly8d3OYyYObfU4ctvHNpy9axxw9guvYc2eL/W5gfAX10o+b54e+NXbx8w2w/hKfPf3ww/6mO/X+WXa6TaBff+5Rt9xvqLFWYG5KPVbZe5IhlyA5vjV4HX8W+qccbRJuUBiH6dUnn4b2+SZIfvNh2I2ICiYXGYjkBeZceCzeF9GHEILmoFclatcedy9W+ICKl92IYo61+bWdbMINNuCMkFHoIQoBQgdlUCEe+B+RbV0ZWpY77jMhH2D2aGKLXHZoGwhGIuniNIgseCGca3bn5SJn1hhlk+UhWKUJb2opZYSAtmkUnS4+uKWQcnbw5phLlinRnqNJGuMR8WFKJaI9bOonjYcyqamlnOpIEFkuGuiokj+YytydQwa6EKwnxukqRqquiSNbte5KU6+oWeGWsDCKAaeSsXn2A6w1f3ZJqzaf4errGQy1Wu0704oKrafObMsjqsCwSWqssj5qxz1kyjjuqJSaO6W47cobZLjusjsvvbGum2+RBfHbr7/Z3htwYgPjGW3B3ZYLsMIGD0vuuw43PCm+E3t2EroTw6HxxWl0fPGstoEcssjw5VpyxSSnjDAVFrPshMsMwxyzyzQLavPNOKOsswQ89+zzsUDr6e3Qbs5sdBIvJ510AQAh+QQJFAADACwAAAAAgACAAAAC/pyPqcvtD6OctNqLs968+w+G4kiW5omm6sq27gvH8kzX9o3n+s73/g8MCofEovGITCqXzKbzGQoEoDWplCqzWrEvrZbL8nrBKbGYbDKb0SO1mg1yu+EduZyusdvxl/3AzycB+HcXCDFogGiYoJhYuMj4uNAYSIlgCYd5KcmnGTlXyRnh+UQ6KcqFKSDg
YJqkusra4GoEGyvLQDtke4t7ClqKetDb2yp8xEu8agyslKy8PHss9AwdnTv9U219/ftWO83Nzfy9Gy4+Lt2sfW5tIE6+BrQdiwCvXs5D3/2ejr2OY58ve/68ycsh0MG9f/mqtIMGYaHBMzYSRpCoQBcJ/osSMH5q6ILjBI+bAK4QSYHkAY0eUI4smDFbmofKLKh0ZDIMTWIpYX48mGVnsYs+S4KM4VLBTUI5kQq9pbDoSpldntZbsJSlCpdZqQZtOkBiV7A3LI49qs8quoE/KZoDu7YmPqBv0faLy6/tGCQChzKkCw7u2nhunanNaxTwK7WE9wYTzHNu4cd2w/qd6BgLr8Zf0MDivCWT15hkoWjUurh04sl4TKFm4ul1E0yynVCqTRkwbtOSdlPx41uzHtWQpg7PXHzU8dDJKSyf0tzC8egY9FDPMPo6ZujasSPv3oc5+Dzcx5s/jz69+vXs27t/Dz++/Pn069u/jz+//v38BPvfKAAAIfkECRQAAwAsAAAAAIAAgAAAAv6cj6nL7Q+jnLTai7PevPsPhuJIluaJpurKtu4Lx/JM1/aN5/rO9/4PDAqHxKLxiEwql8ym8wmNSqfUqvWKzWq33K73Cw6Lx+Sy+YxOq9fstkEgcF/gcDmFTrdH8Hi9g8/ntwAIKIhASGg4gIgoyMio9/goJynZZrk4qYb5pnnGeQA6JhrqKUZ6aPqFmsCa5dqqqgWrQCtlWytbhTuoO8WbCznr+0dMBZwp3MWLHGXbfGsM/es7fSxrvWuabYXJfVVpXBaeaEfe53ceZ0iu2CnuBk9Z7h6LXh+8jt+73+//DzCgwIEECxo8iDChwoUMGzp8CDGixIkUK1q8iDGjxg2NHDt6/AgypMiRcgoAADs=' # noqa E501 - - -async def prep_pdf(pdf, embedding_model, collection_name): - # Streamlit treats function docstrings as magic strings for user display - # Describe function via comments instead: - # Converts pdf content into chunks according to chunk size & overlap - # Vectorizes chunks for sLLM lookup - # returns `knowledge_base`, the vector DB with indexed chunks - - # Create in-memory Qdrant instance - knowledge_base = qdrant_collection(collection_name, embedding_model) - - pdf_reader = PdfReader(pdf) - - # Collect text from pdf - text = ''.join((page.extract_text() for page in pdf_reader.pages)) - - # Split the text into chunks - chunks = text_splitter( - text, - chunk_size=EMBED_CHUNK_SIZE, - chunk_overlap=EMBED_CHUNK_OVERLAP, - separator='\n' - ) - - print('\n\n'.join([ch[:100] for ch in chunks])) - # Add a new collection for this document, and upsert the chunks into it - knowledge_base.add(texts=chunks) - - return knowledge_base - - -# Schedule one task to do a long-running/blocking LLM request, -# and another to chat the PDF -async def async_main(openai_api, model, LLM_TEMP): - # Doc strings turn into streamlit headers - ''' - Oori β€” Ask your PDF πŸ“„πŸ’¬ - ''' - # create file upload box on Streamlit, set from the user's upload - pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False) - - if pdf: - # Show throbber, vectorize the PDF, and setup for similarity search - placeholder = st.empty() - with placeholder: - placeholder.image(throbber) - - # LLM will be downloaded from HuggingFace automatically - # There seem to be reentrancy issues with HuggingFace; defer import - from sentence_transformers import SentenceTransformer - embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM) - - kb = await prep_pdf(pdf, embedding_model, collection_name=pdf.name) - placeholder.empty() - - user_question = st.text_input(PDF_USER_QUESTION_PROMPT) - - embedded_question = embedding_model.encode(user_question) - docs = kb.db.search( - collection_name=kb.name, - query_vector=embedded_question, - limit=K - ) - - print(kb.name, pdf.name, docs) - if docs: - # Collects "chunked_doc" into "gathered_chunks" - gathered_chunks = '\n\n'.join( - doc.payload['_text'] for doc in docs - ) - - # Build prompt the doc chunks as context - prompt = format( - f'Given the context, {user_question}\n\n' - f'Context: """\n{gathered_chunks}\n"""\n', - preamble='### SYSTEM:\nYou are a helpful assistant, who answers ' - 'questions directly and as briefly as possible. 
'
-        'If you cannot answer with the given context, just say so.\n',
-        delimiters=CHATGPT_DELIMITERS
-    )
-
-    print(prompt)
-    # Show throbber, and send LLM prompt
-    with st.empty():
-        st.image(throbber)
-        response = openai_api.Completion.create(
-            model=model,  # Model (Required)
-            prompt=prompt,  # Prompt (Required)
-            temperature=LLM_TEMP,  # Temp (Default 1)
-            max_tokens=1024  # Maximum tokens to return (Default 16)
-            )
-
-        # Response is a json-like object
-        print('\nFull response data from LLM:\n', response)
-
-        # Response is a json-like object;
-        # just get back the text of the response
-        response_text = oapi_choice1_text(response).strip()
-        print('\nResponse text from LLM:\n', response_text)
-
-        # Write the response text to Streamlit
-        st.write(response_text)
-    else:
-        st.write('No context info found')
-
-
-def main():
-    # Describing function via comments instead:
-    # Set up Streamlit page, LLM host connection & launch the main loop
-    st.set_page_config(
-        page_title='Ask your PDF',
-        page_icon='📄💬',
-        layout='wide',
-        initial_sidebar_state='expanded',
-    )
-
-    # Use OpenAI API if specified, otherwise emulate with supplied host, etc.
-    if OPENAI:
-        assert not (LLM_HOST or LLM_PORT), 'Don\'t use --host or --port with --openai'
-        model = LLM
-        openai_api = openai_live(
-            model=LLM, debug=True)
-    else:
-        # For now the model param is most useful when OPENAI is True
-        model = LLM or HOST_DEFAULT
-        openai_api = openai_emulation(
-            host=LLM_HOST, port=LLM_PORT, model=LLM, debug=True)
-    print('oooh im tryin')
-    asyncio.run(async_main(openai_api, model, LLM_TEMP))
-
-
-if __name__ == '__main__':
-    # TODO: Look into isolating huggingface's one time per process setup routines
-    main()

From e81ff3ee1d18b4273c861aff80a0466de5d90fbf Mon Sep 17 00:00:00 2001
From: Uche Ogbuji
Date: Thu, 20 Jul 2023 17:20:28 -0600
Subject: [PATCH 39/39] Update README.md

---
 demo/README.md | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/demo/README.md b/demo/README.md
index 24f7b88..2e9eb79 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -2,7 +2,7 @@
 ## alpaca_simple_fix_xml.py

-Quick demo, sending an Alpaca-compatible LLm some bad XML & asking it to make corrections.
+Quick demo, sending an Alpaca-compatible LLM some bad XML & asking it to make corrections.

 # Intermediate

@@ -30,12 +30,23 @@ Demonstrates:
 consideration until more server-side LLM hosting frameworks reliably
 support multiprocessing

+## chat_web_selects.py
+
+Simple, command-line "chat my web site" demo, but supporting self-hosted LLM.
+
+Definitely a good idea for you to understand demos/alpaca_multitask_fix_xml.py
+before swapping this in.
+
+Vector store: Qdrant - https://qdrant.tech/
+
+Supports multiple web URLs, specified on cmdline
+
 ## chat_pdf_streamlit_ui.py

 image

-"Chat my PDF" demo, but using self-hosted LLM. Definitely a good idea for you to understand
-demos/alpaca_multitask_fix_xml.py
+"Chat my PDF" demo, supporting self-hosted LLM. Definitely a good idea for you to understand
+alpaca_multitask_fix_xml.py & chat_web_selects.py
 before swapping this in.

 UI: Streamlit - streamlit.io
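
As a quick orientation to the `embedding_helper` API that the patches above settle on, here is a minimal sketch: `update()` indexes text chunks, and `search()` now takes the `query` keyword (per PATCH 35), with `limit` capping the number of results (top-k). The collection name, sample text, and question below are illustrative placeholders, not taken from the patches.

```python
# Illustrative sketch only: 'scratch_docs' and the sample text/question
# are placeholders; the API calls follow the patch series above.
from sentence_transformers import SentenceTransformer

from ogbujipt.text_helper import text_splitter
from ogbujipt.embedding_helper import qdrant_collection

# Default embedding model named in the demos
emb_model = SentenceTransformer('all-MiniLM-L6-v2')
kb = qdrant_collection('scratch_docs', emb_model)  # in-memory vector DB instance

text = 'Paris is the capital of France.\nIt is known for the Eiffel Tower.'
# Same chunking defaults the demos read from .env
chunks = text_splitter(text, chunk_size=500, chunk_overlap=100, separator='\n')
kb.update(texts=chunks)  # embed & upsert the chunks into the collection

# PATCH 35: the keyword is `query`, not `text`; other kwargs pass through
# to qdrant_client.QdrantClient.search()
docs = kb.search(query='What is the capital of France?', limit=4)
gathered_chunks = '\n\n'.join(doc.payload['_text'] for doc in docs)
print(gathered_chunks)
```

Note that `update()` here supersedes the `add()` call seen in the now-deleted `chat_pdf_streamlit_ui_OLD.py`.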