Skip to content

Commit

Permalink
Collection management hint
Browse files Browse the repository at this point in the history
  • Loading branch information
uogbuji committed Jul 18, 2024
1 parent 91276c3 commit 01d1d6b
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions demo/chat_doc_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
python demo/chat_doc_folder.py --apibase http://localhost:8000 demo/sample_docs
```
Sample query: Tell me about the Calabar Kingdom
You can specify your document directory, and/or tweak it with the following command line options:
--verbose - print more information while processing (for debugging)
--limit - max number of chunks to retrieve for use as context
Expand All @@ -53,6 +55,8 @@
USER_PROMPT = 'What do you want to know from the documents?\n'


# Note: simple demo mode, so no duplicate management, cleanup, etc of the chroma DB
# You can always add self.coll.delete_collection(name='chat_doc_folder'), but take care!
class vector_store:
'''Encapsulates Chroma the vector store and its parameters (e.g. for doc chunking)'''
def __init__(self, chunk_size, chunk_overlap):
Expand Down Expand Up @@ -86,7 +90,7 @@ def read_word_doc(fpath, store):
with docx2python(fpath) as docx_content:
doctext = docx_content.text
chunks = list(store.text_split(doctext))
metas = [{'source': fpath}]*len(chunks)
metas = [{'source': str(fpath)}]*len(chunks)
store.update(chunks, metas=metas)


Expand All @@ -96,7 +100,7 @@ def read_pdf_doc(fpath, store):
pdf_reader = PdfReader(fpath)
doctext = ''.join((page.extract_text() for page in pdf_reader.pages))
chunks = list(store.text_split(doctext))
metas = [{'source': fpath}]*len(chunks)
metas = [{'source': str(fpath)}]*len(chunks)
store.update(chunks, metas=metas)


Expand Down

0 comments on commit 01d1d6b

Please sign in to comment.