-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_vdb.py
38 lines (30 loc) · 1.13 KB
/
generate_vdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
### from langchain_cohere import CohereEmbeddings
def vdb():
    """Return a retriever over the ``rag-chroma`` Chroma collection.

    The vector store is created with ``persist_directory`` set to
    ``chroma_langchain_db`` and a default-constructed ``OpenAIEmbeddings``
    instance as its embedding function. No documents are loaded, split, or
    added by this function.

    Returns:
        The result of calling ``as_retriever()`` with no arguments on the
        ``Chroma`` vector store.
    """
    # NOTE(review): the ingestion path is disabled here, so the collection is
    # presumably expected to have been populated by an earlier run -- confirm.
    # For reference, the disabled steps were:
    #   1. Load each URL with WebBaseLoader and flatten the results:
    #        https://www.justice.gov/crt/statutes-enforced-criminal-section
    #        https://www.justice.gov/sites/default/files/ag/legacy/2014/03/12/apr2013-section1.pdf
    #   2. Split with RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    #        chunk_size=500, chunk_overlap=0).
    #   3. Hand the resulting splits to the vector store as its documents.
    embedding_model = OpenAIEmbeddings()

    return Chroma(
        persist_directory="chroma_langchain_db",
        collection_name="rag-chroma",
        embedding_function=embedding_model,
    ).as_retriever()