From 60c4cafa1ed8b2cd2059f83f2ecdc80d38fd07d6 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 22 Nov 2023 07:58:40 +0900
Subject: [PATCH 1/4] include title, authors and year in the data store

---
 document_qa/document_qa_engine.py | 38 ++++++++++++++++++++++++-------
 streamlit_app.py                  |  3 ++-
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index 196e84c..292080b 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -1,4 +1,5 @@
 import copy
+import json
 import os
 from pathlib import Path
 from typing import Union, Any
@@ -173,8 +174,10 @@ def _get_context_multiquery(self, doc_id, query, context_size=4):
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents
 
-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
-        """Extract text from documents using Grobid, if chunk_size is < 0 it keep each paragraph separately"""
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
+        """
+        Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
+        """
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
@@ -189,6 +192,7 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
         texts = []
         metadatas = []
         ids = []
+
         if chunk_size < 0:
             for passage in structure['passages']:
                 biblio_copy = copy.copy(biblio)
@@ -212,10 +216,25 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
             metadatas = [biblio for _ in range(len(texts))]
             ids = [id for id, t in enumerate(texts)]
 
+        if "biblio" in include:
+            biblio_metadata = copy.copy(biblio)
+            biblio_metadata['type'] = "biblio"
+            biblio_metadata['section'] = "header"
+            for key in ['title', 'authors', 'year']:
+                if key in biblio_metadata:
+                    texts.append("{}: {}".format(key, biblio_metadata[key]))
+                    metadatas.append(biblio_metadata)
+                    ids.append(key)
+
         return texts, metadatas, ids
 
-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
-        texts, metadata, ids = self.get_text_from_document(pdf_path, chunk_size=chunk_size, perc_overlap=perc_overlap)
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
+        include = ["biblio"] if include_biblio else []
+        texts, metadata, ids = self.get_text_from_document(
+            pdf_path,
+            chunk_size=chunk_size,
+            perc_overlap=perc_overlap,
+            include=include)
         if doc_id:
             hash = doc_id
         else:
@@ -233,7 +252,7 @@ def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_o
 
         return hash
 
-    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1):
+    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1, include_biblio=False):
        input_files = []
         for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
             for file_ in files:
@@ -250,9 +269,12 @@ def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.
             if os.path.exists(data_path):
                 print(data_path, "exists. Skipping it ")
                 continue
-
-            texts, metadata, ids = self.get_text_from_document(input_file, chunk_size=chunk_size,
-                                                               perc_overlap=perc_overlap)
+            include = ["biblio"] if include_biblio else []
+            texts, metadata, ids = self.get_text_from_document(
+                input_file,
+                chunk_size=chunk_size,
+                perc_overlap=perc_overlap,
+                include=include)
 
             filename = metadata[0]['filename']
             vector_db_document = Chroma.from_texts(texts,
diff --git a/streamlit_app.py b/streamlit_app.py
index 8f5b172..35d76b8 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -283,7 +283,8 @@ def play_old_messages():
                 # hash = get_file_hash(tmp_file.name)[:10]
                 st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
                                                                                                             chunk_size=chunk_size,
-                                                                                                            perc_overlap=0.1)
+                                                                                                            perc_overlap=0.1,
+                                                                                                            include_biblio=True)
         st.session_state['loaded_embeddings'] = True
         st.session_state.messages = []
 

From b0a0e1a2045965b3707c6a68ab0be6e857eb01f5 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 22 Nov 2023 09:01:29 +0900
Subject: [PATCH 2/4] change from year to publication year

---
 document_qa/document_qa_engine.py | 6 ++----
 document_qa/grobid_processors.py  | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index 292080b..14d165c 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -1,9 +1,9 @@
 import copy
-import json
 import os
 from pathlib import Path
 from typing import Union, Any
 
+from document_qa.grobid_processors import GrobidProcessor
 from grobid_client.grobid_client import GrobidClient
 from langchain.chains import create_extraction_chain
 from langchain.chains.question_answering import load_qa_chain
@@ -13,8 +13,6 @@
 from langchain.vectorstores import Chroma
 from tqdm import tqdm
 
-from document_qa.grobid_processors import GrobidProcessor
-
 
 class DocumentQAEngine:
     llm = None
diff --git a/document_qa/grobid_processors.py b/document_qa/grobid_processors.py
index e21b1f1..4d8f36b 100644
--- a/document_qa/grobid_processors.py
+++ b/document_qa/grobid_processors.py
@@ -171,7 +171,7 @@ def parse_grobid_xml(self, text):
         }
         try:
             year = dateparser.parse(doc_biblio.header.date).year
-            biblio["year"] = year
+            biblio["publication_year"] = year
         except:
             pass

From 398556b4a91a2a41dda9aeed1ae6ea8d8cbba5c6 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 22 Nov 2023 09:14:01 +0900
Subject: [PATCH 3/4] fix signature

---
 document_qa/document_qa_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/document_qa/document_qa_engine.py b/document_qa/document_qa_engine.py
index 14d165c..49e5f5f 100644
--- a/document_qa/document_qa_engine.py
+++ b/document_qa/document_qa_engine.py
@@ -143,7 +143,7 @@ def _parse_json(self, response, output_parser):
 
         return parsed_output
 
-    def _run_query(self, doc_id, query, memory=None, context_size=4):
+    def _run_query(self, doc_id, query, context_size=4, memory=None):
         relevant_documents = self._get_context(doc_id, query, context_size)
         if memory:
             return self.chain.run(input_documents=relevant_documents,
From 55e39a2934fe8e58f9c6788334b06f0a6ab79382 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Wed, 22 Nov 2023 09:14:29 +0900
Subject: [PATCH 4/4] fix memory wrongly reset at every reload

---
 streamlit_app.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/streamlit_app.py b/streamlit_app.py
index 35d76b8..458fb3b 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -217,7 +217,8 @@ def play_old_messages():
 
 st.button(
     'Reset chat memory.',
-    on_click=clear_memory(),
+    key="reset-memory-button",
+    on_click=clear_memory,
     help="Clear the conversational memory. Currently implemented to retrain the 4 most recent messages.")
 
 st.title("📝 Scientific Document Insights Q/A")
@@ -226,7 +227,9 @@
 st.markdown(
     ":warning: Do not upload sensitive data. We **temporarily** store text from the uploaded PDF documents solely for the purpose of processing your request, and we **do not assume responsibility** for any subsequent use or handling of the data submitted to third parties LLMs.")
 
-uploaded_file = st.file_uploader("Upload an article", type=("pdf", "txt"), on_change=new_file,
+uploaded_file = st.file_uploader("Upload an article",
+                                 type=("pdf", "txt"),
+                                 on_change=new_file,
                                  disabled=st.session_state['model'] is not None and st.session_state['model'] not in
                                           st.session_state['api_keys'],
                                  help="The full-text is extracted using Grobid. ")
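
Note on usage (reviewer sketch, not part of the patches above): the "biblio" branch added in PATCH 1 and renamed in PATCH 2 appends up to three extra passages per document (title, authors, publication year) alongside the body chunks, and callers opt in through create_memory_embeddings(..., include_biblio=True), as streamlit_app.py now does. Below is a minimal, self-contained Python sketch of just that branch, assuming a biblio dict shaped like the one parse_grobid_xml builds; the helper name biblio_entries and the sample record are hypothetical, for illustration only:

    import copy

    def biblio_entries(biblio, include=("biblio",)):
        # Mirrors the branch added to get_text_from_document: one passage
        # per available header field, with the field name reused as the id.
        texts, metadatas, ids = [], [], []
        if "biblio" in include:
            biblio_metadata = copy.copy(biblio)
            biblio_metadata['type'] = "biblio"
            biblio_metadata['section'] = "header"
            for key in ['title', 'authors', 'publication_year']:
                if key in biblio_metadata:
                    texts.append("{}: {}".format(key, biblio_metadata[key]))
                    metadatas.append(biblio_metadata)
                    ids.append(key)
        return texts, metadatas, ids

    if __name__ == '__main__':
        # Hypothetical record; field names follow grobid_processors.py after PATCH 2.
        sample = {'title': 'Example title', 'authors': 'Luca Foppiano', 'publication_year': 2023}
        texts, metadatas, ids = biblio_entries(sample)
        print(texts)  # ['title: Example title', 'authors: Luca Foppiano', 'publication_year: 2023']
        print(ids)    # ['title', 'authors', 'publication_year']

One design consequence visible in the diffs: body chunks are numbered positionally (ids = [id for id, t in enumerate(texts)]), while the bibliographic entries use the field name as their id, which keeps the two kinds of entries distinguishable in the store.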