Skip to content

Commit

Permalink
[#16] Replace routines in embedding_helper with a class, qdrant_colle…
Browse files Browse the repository at this point in the history
…ction. Improvements to model_styles.py, and other tweaks. Begin porting demo/chat_pdf_streamlit_ui.py
  • Loading branch information
uogbuji committed Jul 14, 2023
1 parent 392d8ef commit 86af70b
Show file tree
Hide file tree
Showing 3 changed files with 178 additions and 285 deletions.
147 changes: 73 additions & 74 deletions demo/chat_pdf_streamlit_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@
from PyPDF2 import PdfReader

from ogbujipt.config import openai_emulation, openai_live, HOST_DEFAULT
from ogbujipt.prompting.basic import context_build, pdelim
from ogbujipt.prompting import format, CHATGPT_DELIMITERS
from ogbujipt import oapi_choice1_text
from ogbujipt.text_helper import text_splitter
from ogbujipt.embedding_helper import qdrant_init_embedding_db, qdrant_add_collection
from ogbujipt.embedding_helper import qdrant_collection

# Avoid re-entrancy complaints from huggingface/tokenizers
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
Expand Down Expand Up @@ -82,14 +83,18 @@
throbber = '' # noqa E501


async def prep_pdf(pdf, knowledge_base, embedding_model, collection_name):
async def prep_pdf(pdf, embedding_model, collection_name):
# Streamlit treats function docstrings as magic strings for user display
# Describe function via comments instead:
# Converts pdf content into chunks according to chunk size & overlap
# Vectorizes chunks for sLLM lookup
# returns `knowledge_base`, the vector DB with indexed chunks

# Create in-memory Qdrant instance
knowledge_base = qdrant_collection(collection_name, embedding_model)

pdf_reader = PdfReader(pdf)

# Collect text from pdf
text = ''.join((page.extract_text() for page in pdf_reader.pages))

Expand All @@ -101,13 +106,9 @@ async def prep_pdf(pdf, knowledge_base, embedding_model, collection_name):
separator='\n'
)

print('\n\n'.join([ch[:100] for ch in chunks]))
# Add a new collection for this document, and upsert the chunks into it
knowledge_base = qdrant_add_collection(
client=knowledge_base,
collection_name=collection_name,
chunks=chunks,
embedding_model=embedding_model
)
knowledge_base.add(texts=chunks)

return knowledge_base

Expand All @@ -119,78 +120,76 @@ async def async_main(openai_api, model, LLM_TEMP):
'''
Oori — Ask your PDF 📄💬
'''
# Define delimiters in OpenAI's style
openai_delimiters = {
pdelim.PREQUERY: '### USER:',
pdelim.POSTQUERY: '### ASSISTANT:',
}

# LLM will be downloaded from HuggingFace automatically
# There seem to be reentrancy issues with HuggingFace; defer import
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM)

# Create in-memory Qdrant instance
knowledge_base = qdrant_init_embedding_db()

# create file upload box on Streamlit, set from the user's upload
pdf = st.file_uploader("Upload a PDF", type=["pdf"], accept_multiple_files=False)

if pdf:
# Show throbber, vectorize the PDF, and setup for similarity search
with st.empty():
st.image(throbber)
kb = await prep_pdf(pdf, knowledge_base, embedding_model, collection_name=pdf.name)

user_question = st.text_input(PDF_USER_QUESTION_PROMPT)

embedded_question = embedding_model.encode(user_question)

docs = None
if user_question:
docs = kb.search(
collection_name=pdf.name,
query_vector=embedded_question,
limit=K
placeholder = st.empty()
with placeholder:
placeholder.image(throbber)

# LLM will be downloaded from HuggingFace automatically
# There seem to be reentrancy issues with HuggingFace; defer import
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(DOC_EMBEDDINGS_LLM)

kb = await prep_pdf(pdf, embedding_model, collection_name=pdf.name)
placeholder.empty()

user_question = st.text_input(PDF_USER_QUESTION_PROMPT)

# docs = None
while not user_question:
await asyncio.sleep(0.1)

embedded_question = embedding_model.encode(user_question)
docs = kb.db.search(
collection_name=kb.name,
query_vector=embedded_question,
limit=K
)

print(kb.name, pdf.name, docs)
if docs:
# Collects "chunked_doc" into "gathered_chunks"
gathered_chunks = '\n\n'.join(
doc.payload['_text'] for doc in docs
)

# Build the prompt, using the doc chunks as context
prompt = format(
f'Given the context, {user_question}\n\n'
f'Context: """\n{gathered_chunks}\n"""\n',
preamble='### SYSTEM:\nYou are a helpful assistant, who answers '
'questions directly and as briefly as possible. '
'If you cannot answer with the given context, just say so.\n',
delimiters=CHATGPT_DELIMITERS
)

print(prompt)
# Show throbber, and send LLM prompt
with st.empty():
st.image(throbber)
response = openai_api.Completion.create(
model=model, # Model (Required)
prompt=prompt, # Prompt (Required)
temperature=LLM_TEMP, # Temp (Default 1)
max_tokens=1024 # Maximum tokens to return (Default 16)
)

if docs:
# Collects "chunked_doc" into "gathered_chunks"
gathered_chunks = '\n\n'.join(
doc.payload['chunk_string'] for doc in docs
)
# Response is a json-like object; extract the text
print('\nFull response data from LLM:\n', response)

# Build "prompt" with the context of "chunked_doc"
prompt = context_build(
f'Given the context, {user_question}\n\n'
f'Context: """\n{gathered_chunks}\n"""\n',
preamble='### SYSTEM:\nYou are a helpful assistant, who answers '
'questions directly and as briefly as possible. '
'If you cannot answer with the given context, just say so.\n',
delimiters=openai_delimiters
)
# Response is a json-like object;
# just get back the text of the response
response_text = oapi_choice1_text(response).strip()
print('\nResponse text from LLM:\n', response_text)

print(prompt)
# Show throbber, and send LLM prompt
with st.empty():
st.image(throbber)
response = openai_api.Completion.create(
model=model, # Model (Required)
prompt=prompt, # Prompt (Required)
temperature=LLM_TEMP, # Temp (Default 1)
max_tokens=1024 # Maximum tokens to return (Default 16)
)

# Response is a json-like object; extract the text
print('\nFull response data from LLM:\n', response)

# Response is a json-like object;
# just get back the text of the response
response_text = response.choices[0].text.strip()
print('\nResponse text from LLM:\n', response_text)

# Write the response text to Streamlit
st.write(response_text)
# Write the response text to Streamlit
st.write(response_text)
else:
st.write('No context info found')


def main():
Expand Down
Loading

0 comments on commit 86af70b

Please sign in to comment.