From 60c4cafa1ed8b2cd2059f83f2ecdc80d38fd07d6 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 22 Nov 2023 07:58:40 +0900
Subject: [PATCH 1/4] include title, authors and year in the data store

---
 document_qa/document_qa_engine.py | 38 ++++++++++++++++++++++++-------
 streamlit_app.py                  |  3 ++-
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index 196e84c..292080b 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -1,4 +1,5 @@
 import copy
+import json
 import os
 from pathlib import Path
 from typing import Union, Any
@@ -173,8 +174,10 @@ def _get_context_multiquery(self, doc_id, query, context_size=4):
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents
 
-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
-        """Extract text from documents using Grobid, if chunk_size is < 0 it keep each paragraph separately"""
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
+        """
+        Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
+        """
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
@@ -189,6 +192,7 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
         texts = []
         metadatas = []
         ids = []
+
         if chunk_size < 0:
             for passage in structure['passages']:
                 biblio_copy = copy.copy(biblio)
@@ -212,10 +216,25 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
             metadatas = [biblio for _ in range(len(texts))]
             ids = [id for id, t in enumerate(texts)]
 
+        if "biblio" in include:
+            biblio_metadata = copy.copy(biblio)
+            biblio_metadata['type'] = "biblio"
+            biblio_metadata['section'] = "header"
+            for key in ['title', 'authors', 'year']:
+                if key in biblio_metadata:
+                    texts.append("{}: {}".format(key, biblio_metadata[key]))
+                    metadatas.append(biblio_metadata)
+                    ids.append(key)
+
         return texts, metadatas, ids
 
-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
-        texts, metadata, ids = self.get_text_from_document(pdf_path, chunk_size=chunk_size, perc_overlap=perc_overlap)
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
+        include = ["biblio"] if include_biblio else []
+        texts, metadata, ids = self.get_text_from_document(
+            pdf_path,
+            chunk_size=chunk_size,
+            perc_overlap=perc_overlap,
+            include=include)
         if doc_id:
             hash = doc_id
         else:
@@ -233,7 +252,7 @@ def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_o
 
         return hash
 
-    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1):
+    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1, include_biblio=False):
        input_files = []
         for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
             for file_ in files:
@@ -250,9 +269,12 @@ def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.
             if os.path.exists(data_path):
                 print(data_path, "exists. Skipping it ")
                 continue
-
-            texts, metadata, ids = self.get_text_from_document(input_file, chunk_size=chunk_size,
-                                                               perc_overlap=perc_overlap)
+            include = ["biblio"] if include_biblio else []
+            texts, metadata, ids = self.get_text_from_document(
+                input_file,
+                chunk_size=chunk_size,
+                perc_overlap=perc_overlap,
+                include=include)
 
             filename = metadata[0]['filename']
             vector_db_document = Chroma.from_texts(texts,
diff --git a/streamlit_app.py b/streamlit_app.py
index 8f5b172..35d76b8 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -283,7 +283,8 @@ def play_old_messages():
                 # hash = get_file_hash(tmp_file.name)[:10]
                 st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
                                                                                                             chunk_size=chunk_size,
-                                                                                                            perc_overlap=0.1)
+                                                                                                            perc_overlap=0.1,
+                                                                                                            include_biblio=True)
         st.session_state['loaded_embeddings'] = True
         st.session_state.messages = []
 

From b0a0e1a2045965b3707c6a68ab0be6e857eb01f5 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 22 Nov 2023 09:01:29 +0900
Subject: [PATCH 2/4] change from year to publication year

---
 document_qa/document_qa_engine.py | 6 ++----
 document_qa/grobid_processors.py  | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index 292080b..14d165c 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -1,9 +1,9 @@
 import copy
-import json
 import os
 from pathlib import Path
 from typing import Union, Any
 
+from document_qa.grobid_processors import GrobidProcessor
 from grobid_client.grobid_client import GrobidClient
 from langchain.chains import create_extraction_chain
 from langchain.chains.question_answering import load_qa_chain
@@ -13,8 +13,6 @@
 from langchain.vectorstores import Chroma
 from tqdm import tqdm
 
-from document_qa.grobid_processors import GrobidProcessor
-
 
 class DocumentQAEngine:
     llm = None
diff --git a/document_qa/grobid_processors.py b/document_qa/grobid_processors.py
index e21b1f1..4d8f36b 100644
--- a/document_qa/grobid_processors.py
+++ b/document_qa/grobid_processors.py
@@ -171,7 +171,7 @@ def parse_grobid_xml(self, text):
         }
         try:
             year = dateparser.parse(doc_biblio.header.date).year
-            biblio["year"] = year
+            biblio["publication_year"] = year
         except:
             pass

From 398556b4a91a2a41dda9aeed1ae6ea8d8cbba5c6 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 22 Nov 2023 09:14:01 +0900
Subject: [PATCH 3/4] fix signature

---
 document_qa/document_qa_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index 14d165c..49e5f5f 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -143,7 +143,7 @@ def _parse_json(self, response, output_parser):
 
         return parsed_output
 
-    def _run_query(self, doc_id, query, memory=None, context_size=4):
+    def _run_query(self, doc_id, query, context_size=4, memory=None):
         relevant_documents = self._get_context(doc_id, query, context_size)
         if memory:
             return self.chain.run(input_documents=relevant_documents,
From 55e39a2934fe8e58f9c6788334b06f0a6ab79382 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 22 Nov 2023 09:14:29 +0900
Subject: [PATCH 4/4] fix memory wrongly reset at every reload

---
 streamlit_app.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/streamlit_app.py b/streamlit_app.py
index 35d76b8..458fb3b 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -217,7 +217,8 @@ def play_old_messages():
 
 st.button(
     'Reset chat memory.',
-    on_click=clear_memory(),
+    key="reset-memory-button",
+    on_click=clear_memory,
     help="Clear the conversational memory. Currently implemented to retrain the 4 most recent messages.")
 
 st.title("📝 Scientific Document Insights Q/A")
@@ -226,7 +227,9 @@
 st.markdown(
     ":warning: Do not upload sensitive data. We **temporarily** store text from the uploaded PDF documents solely for the purpose of processing your request, and we **do not assume responsibility** for any subsequent use or handling of the data submitted to third parties LLMs.")
 
-uploaded_file = st.file_uploader("Upload an article", type=("pdf", "txt"), on_change=new_file,
+uploaded_file = st.file_uploader("Upload an article",
+                                 type=("pdf", "txt"),
+                                 on_change=new_file,
                                  disabled=st.session_state['model'] is not None and st.session_state['model'] not in
                                           st.session_state['api_keys'],
                                  help="The full-text is extracted using Grobid. ")
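
Note on usage (reviewer sketch, not part of the patches above): the "biblio" branch added in PATCH 1 and renamed in PATCH 2 appends up to three extra passages per document (title, authors, publication year) alongside the body chunks, and callers opt in through create_memory_embeddings(..., include_biblio=True), as streamlit_app.py now does. Below is a minimal, self-contained Python sketch of just that branch, assuming a biblio dict shaped like the one parse_grobid_xml builds; the helper name biblio_entries and the sample record are hypothetical, for illustration only:

    import copy

    def biblio_entries(biblio, include=("biblio",)):
        # Mirrors the branch added to get_text_from_document: one passage
        # per available header field, with the field name reused as the id.
        texts, metadatas, ids = [], [], []
        if "biblio" in include:
            biblio_metadata = copy.copy(biblio)
            biblio_metadata['type'] = "biblio"
            biblio_metadata['section'] = "header"
            for key in ['title', 'authors', 'publication_year']:
                if key in biblio_metadata:
                    texts.append("{}: {}".format(key, biblio_metadata[key]))
                    metadatas.append(biblio_metadata)
                    ids.append(key)
        return texts, metadatas, ids

    if __name__ == '__main__':
        # Hypothetical record; field names follow grobid_processors.py after PATCH 2.
        sample = {'title': 'Example title', 'authors': 'Luca Foppiano', 'publication_year': 2023}
        texts, metadatas, ids = biblio_entries(sample)
        print(texts)  # ['title: Example title', 'authors: Luca Foppiano', 'publication_year: 2023']
        print(ids)    # ['title', 'authors', 'publication_year']

One design consequence visible in the diffs: body chunks are numbered positionally (ids = [id for id, t in enumerate(texts)]), while the bibliographic entries use the field name as their id, which keeps the two kinds of entries distinguishable in the store.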