clsFeedVectorDB.py
#########################################################
#### Written By: SATYAKI DE                          ####
#### Written On: 27-Jun-2023                         ####
#### Modified On 28-Sep-2023                         ####
####                                                 ####
#### Objective: This is the main calling             ####
#### python script that will invoke the              ####
#### haystack framework to contextualize the docs    ####
#### inside the vector DB.                           ####
####                                                 ####
#########################################################
from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever
import openai
import pandas as pd
import os

import clsCreateList as ccl
from clsConfigClient import clsConfigClient as cf
import clsL as log

from datetime import datetime, timedelta

# Disabling Warnings
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn
###############################################
###           Global Section                ###
###############################################
Ind = cf.conf['DEBUG_IND']
openAIKey = cf.conf['OPEN_AI_KEY']

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initiating Logging Instances
clog = log.clsL()
cl = ccl.clsCreateList()

var = datetime.now().strftime(".%H.%M.%S")

# Encode your data to create embeddings
documents = []

var_1 = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
print('*'*120)
print('Start Time: ' + str(var_1))
print('*'*120)

print('*'*240)
print('Creating Index store:: ')
print('*'*240)

# createRec() is expected to return Haystack-compatible documents
# (dicts or Document objects) built from the source feed.
documents = cl.createRec()

print('Inserted Sample Records: ')
print(documents[:5])
print('\n')
print('Type:')
print(type(documents))

r1 = len(documents)

if r1 > 0:
    print()
    print('Successfully indexed records!')
else:
    print()
    print('Failed to index records!')

print('*'*120)
var_2 = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
print('End Time: ' + str(var_2))

# Passing OpenAI API Key
openai.api_key = openAIKey

###############################################
###        End of Global Section            ###
###############################################
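
# A minimal sketch of the configuration keys this script reads from
# clsConfigClient.conf. The key names come from the global section above and
# from clsFeedVectorDB.__init__ below; the sample values are hypothetical and
# only illustrate the expected structure.
#
# conf = {
#     'DEBUG_IND': 'Y',
#     'OPEN_AI_KEY': '<your-openai-api-key>',
#     'DATA_PATH': 'data/',
#     'CACHE_FILE': 'cachedData.csv',
#     'VECTORDB_PATH': 'vectorDB/',
#     'VECTORDB_FILE_NM': 'kbVectorStore',
#     'QUERY_MODEL': 'facebook/dpr-question_encoder-single-nq-base',
#     'PASSAGE_MODEL': 'facebook/dpr-ctx_encoder-single-nq-base',
# }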
class clsFeedVectorDB:
    def __init__(self):
        self.basePath = cf.conf['DATA_PATH']
        self.modelFileName = cf.conf['CACHE_FILE']
        self.vectorDBPath = cf.conf['VECTORDB_PATH']
        self.vectorDBFileName = cf.conf['VECTORDB_FILE_NM']
        self.queryModel = cf.conf['QUERY_MODEL']
        self.passageModel = cf.conf['PASSAGE_MODEL']

    def retrieveDocuments(self, question, retriever, top_k=3):
        return retriever.retrieve(question, top_k=top_k)

    def generateAnswerWithGPT3(self, retrievedDocs, question):
        documents_text = " ".join([doc.content for doc in retrievedDocs])
        prompt = f"Given the following documents: {documents_text}, answer the question: {question}"

        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            max_tokens=150
        )

        return response.choices[0].text.strip()

    def ragAnswerWithHaystackAndGPT3(self, question, retriever):
        retrievedDocs = self.retrieveDocuments(question, retriever)
        return self.generateAnswerWithGPT3(retrievedDocs, question)

    def genData(self, strVal):
        try:
            basePath = self.basePath
            modelFileName = self.modelFileName
            vectorDBPath = self.vectorDBPath
            vectorDBFileName = self.vectorDBFileName
            queryModel = self.queryModel
            passageModel = self.passageModel

            print('*'*120)
            print('Index Your Data for Retrieval:')
            print('*'*120)

            FullFileName = basePath + modelFileName
            FullVectorDBname = vectorDBPath + vectorDBFileName

            sqlite_path = "sqlite:///" + FullVectorDBname + '.db'
            print('Vector DB Path: ', str(sqlite_path))

            indexFile = "vectorDB/" + str(vectorDBFileName) + '.faiss'
            indexConfig = "vectorDB/" + str(vectorDBFileName) + ".json"

            print('File: ', str(indexFile))
            print('Config: ', str(indexConfig))

            # Initialize DocumentStore
            document_store = FAISSDocumentStore(sql_url=sqlite_path)

            libName = "vectorDB/" + str(vectorDBFileName) + '.faiss'

            document_store.write_documents(documents)

            # Initialize Retriever
            retriever = DensePassageRetriever(document_store=document_store,
                                              query_embedding_model=queryModel,
                                              passage_embedding_model=passageModel,
                                              use_gpu=False)

            document_store.update_embeddings(retriever=retriever)
            document_store.save(index_path=libName, config_path="vectorDB/" + str(vectorDBFileName) + ".json")

            print('*'*120)
            print('Testing with RAG & OpenAI...')
            print('*'*120)

            answer = self.ragAnswerWithHaystackAndGPT3(strVal, retriever)

            print('*'*120)
            print('Testing Answer:: ')
            print(answer)
            print('*'*120)

            return 0

        except Exception as e:
            x = str(e)
            print('Error: ', x)

            return 1
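
# A minimal usage sketch, assuming this module is run directly rather than
# imported by a separate driver script (the original only defines the class).
# The sample question string is hypothetical and purely illustrative.
if __name__ == "__main__":
    vectorDB = clsFeedVectorDB()

    # genData() indexes the documents built in the global section above and
    # runs one test query through the RAG pipeline; it returns 0 on success
    # and 1 on failure.
    ret = vectorDB.genData("What are the top highlights from the latest feed?")

    if ret == 0:
        print('Successfully populated the vector DB and ran the test query.')
    else:
        print('Failed to populate the vector DB.')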