Skip to content

Commit

Permalink
[#16] Sync up. Deal with "Calling st.experimental_rerun() within a callback is a no-op." & sew up the use of session data & call-backs
Browse files Browse the repository at this point in the history
  • Loading branch information
uogbuji committed Jul 20, 2023
1 parent 241b4bc commit 6ef7dda
Showing 1 changed file with 65 additions and 91 deletions.
156 changes: 65 additions & 91 deletions demo/chat_pdf_streamlit_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,11 @@

PDF_USER_QUERY_PROMPT = 'Ask a question about your PDF:'


@st.cache_data # Streamlit caching to save on repeat loads
def load_favicon():
# oori_logo[32px].png as a data URL
return '' # noqa E501
st.set_page_config( # Set up Streamlit page
page_title='Ask your PDF',
layout='centered',
initial_sidebar_state='expanded',
)


@st.cache_data
Expand All @@ -92,27 +92,59 @@ def load_embedding_model(embedding_model_name):
return SentenceTransformer(embedding_model_name)


def prep_pdf(pdf, embedding_model, collection_name):
# Streamlit treats function docstrings as magic strings for user display. Use comments instead
# Converts pdf content into chunks according to chunk size & overlap
# Vectorizes chunks for sLLM lookup
kb = qdrant_collection(collection_name, embedding_model) # in-memory vector DB instance
def prep_pdf():
'Convert pdf content into chunks according to chunk size & overlap'
placeholder = st.empty()

# Load PDF & collect its text & split it into chunks
pdf_reader = PdfReader(pdf)
text = ''.join((page.extract_text() for page in pdf_reader.pages))
chunks = text_splitter(
text,
chunk_size=EMBED_CHUNK_SIZE,
chunk_overlap=EMBED_CHUNK_OVERLAP,
separator='\n')

# Update vector DB collection, insert the text chunks & update app state
kb.update(texts=chunks)
st.session_state['kb'] = kb # Update state


def query_llm(kb, openai_api, model):
pdf = st.session_state['pdf']
if not pdf:
return

with placeholder.container():
# Get the embedding model
if not st.session_state['embedding_model']:
st.session_state['embedding_model'] = load_embedding_model(embedding_model_name=DOC_EMBEDDINGS_LLM)
emb_model = st.session_state['embedding_model']

# Vectorizes chunks for sLLM lookup
# XXX: Look up rules around uploaded object names
kb = qdrant_collection(pdf.name, emb_model) # in-memory vector DB instance

# Show throbber, embed the PDF, and get ready for similarity search
embedding_placeholder = st.container()
embedding_placeholder.write('Embedding PDF...')

# Load throbber from cache
throbber = load_throbber()
embedding_placeholder.image(throbber)

# Prepare a vector knowledgebase based on the pdf contents
# Use st.session_state to avoid unnecessary reprocessing/reloading
pdf_reader = PdfReader(pdf)
text = ''.join((page.extract_text() for page in pdf_reader.pages))
chunks = text_splitter(
text,
chunk_size=EMBED_CHUNK_SIZE,
chunk_overlap=EMBED_CHUNK_OVERLAP,
separator='\n')

# Update vector DB collection, insert the text chunks & update app state
kb.update(texts=chunks)
st.session_state['kb'] = kb # Update state
placeholder.empty()

# Get the user query
st.text_input(
label=PDF_USER_QUERY_PROMPT,
key='user_query_str',
on_change=query_llm,
args=(st.session_state['openai_api'], st.session_state['model']))



def query_llm(openai_api, model):
kb = st.session_state['kb']
user_query = st.session_state['user_query_str']

# Placeholder for throbber & LLM response
Expand Down Expand Up @@ -149,73 +181,10 @@ def query_llm(kb, openai_api, model):
response_placeholder.write(response_text)


def streamlit_loop(openai_api, model, LLM_TEMP):
def main():
'''
Oori — Ask your PDF 📄💬
'''
favicon = load_favicon()
st.set_page_config( # Set up Streamlit page
page_title='Ask your PDF',
page_icon=favicon,
layout='centered',
initial_sidebar_state='expanded',
)

# Create file upload box on Streamlit, set from the user's upload
# Use st.session_state to avoid unnecessary reprocessing/reloading
if 'pdf' not in st.session_state: # First use and need to init the PDF
pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False)
st.session_state['pdf'] = pdf

new_pdf = True # Are embeddings needed for the pdf?

else: # Session state already has a 'pdf' entry; check whether the upload changed
temp_pdf = st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False)

# Encode PDF content and compare CRCs to see if it's changed
pdf_new_checksum = zlib.adler32(str(temp_pdf).encode('utf-8'))
# FIXME: Store the old checksum in state, so it needn't be recomputed each time
pdf_old_checksum = zlib.adler32(str(st.session_state['pdf']).encode('utf-8'))

if pdf_new_checksum == pdf_old_checksum: # PDF is the same
pdf = st.session_state['pdf']
new_pdf = False # Flag to know if the new pdf needs to be embedded
else: # PDF is now different and needs to swap out session_state
pdf, st.session_state['pdf'] = temp_pdf, temp_pdf
new_pdf = True # Flag to know if the new pdf needs to be embedded

if pdf: # Only run once the program has a "pdf" loaded
if st.session_state['embedding_model']:
# Show throbber, embed the PDF, and get ready for similarity search
embedding_placeholder = st.container()

embedding_placeholder.write('Embedding PDF...')

# Load throbber from cache
throbber = load_throbber()
embedding_placeholder.image(throbber)

# Get the embedding model
embedding_model = load_embedding_model(embedding_model_name=DOC_EMBEDDINGS_LLM)

# Prepare a vector knowledgebase based on the pdf contents
# Use st.session_state to avoid unnecessary reprocessing/reloading
if new_pdf:
kb = prep_pdf(pdf, embedding_model, collection_name=pdf.name)
st.session_state['kb'] = kb
else:
kb = st.session_state['kb']

st.session_state['embedding_model'] = False

# Rerun the app to hide the embedding throbber
st.experimental_rerun()

# Get the user query
st.text_input(label=PDF_USER_QUERY_PROMPT, key='user_query_str', on_change=query_llm, args=(kb, openai_api, model))


def main():
# Streamlit treats function docstrings as magic strings for user display. Use comments instead
# Set up LLM host connection & launch the main loop
# Use OpenAI API if specified, otherwise emulate with supplied host, etc. for self-hosted LLM
Expand All @@ -229,10 +198,15 @@ def main():
model = LLM or HOST_DEFAULT
openai_api = openai_emulation(
host=LLM_HOST, port=LLM_PORT, model=LLM, debug=True)

st.session_state['embedding_model'] = True

streamlit_loop(openai_api, model, LLM_TEMP)
st.session_state['embedding_model'] = None
st.session_state['openai_api'] = openai_api
st.session_state['model'] = model

# Create file upload box on Streamlit, set from the user's upload
# Use st.session_state to avoid unnecessary reprocessing/reloading
st.file_uploader('Upload a PDF', type=['pdf'], accept_multiple_files=False,
on_change=prep_pdf, key='pdf')


main() # Code to execute

0 comments on commit 6ef7dda

Please sign in to comment.