From 01d1d6b282c1a435bff195f7268b0989544e8198 Mon Sep 17 00:00:00 2001
From: Uche Ogbuji <uche@ogbuji.net>
Date: Thu, 18 Jul 2024 11:53:02 -0600
Subject: [PATCH] Collection management hint

---
 demo/chat_doc_folder.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/demo/chat_doc_folder.py b/demo/chat_doc_folder.py
index e0b4976..76b3919 100644
--- a/demo/chat_doc_folder.py
+++ b/demo/chat_doc_folder.py
@@ -27,6 +27,8 @@
 python demo/chat_doc_folder.py --apibase http://localhost:8000 demo/sample_docs 
 ```
 
+Sample query: Tell me about the Calabar Kingdom
+
 You can specify your document directory, and/or tweak it with the following command line options:
 --verbose - print more information while processing (for debugging)
 --limit (max number of chunks to retrieve for use as context)
@@ -53,6 +55,8 @@
 USER_PROMPT = 'What do you want to know from the documents?\n'
 
 
+# Note: simple demo mode, so no duplicate management, cleanup, etc. of the Chroma DB
+# You can always add self.coll.delete_collection(name='chat_doc_folder'), but take care!
 class vector_store:
     '''Encapsulates Chroma the vector store and its parameters (e.g. for doc chunking)'''
     def __init__(self, chunk_size, chunk_overlap):
@@ -86,7 +90,7 @@ def read_word_doc(fpath, store):
     with docx2python(fpath) as docx_content:
         doctext = docx_content.text
     chunks = list(store.text_split(doctext))
-    metas = [{'source': fpath}]*len(chunks)
+    metas = [{'source': str(fpath)}]*len(chunks)
     store.update(chunks, metas=metas)
 
 
@@ -96,7 +100,7 @@ def read_pdf_doc(fpath, store):
     pdf_reader = PdfReader(fpath)
     doctext = ''.join((page.extract_text() for page in pdf_reader.pages))
     chunks = list(store.text_split(doctext))
-    metas = [{'source': fpath}]*len(chunks)
+    metas = [{'source': str(fpath)}]*len(chunks)
     store.update(chunks, metas=metas)