# get_data.py
import json
import random
import re

import spacy
import torch
import wikipedia
from sentence_transformers import SentenceTransformer, util
nlp = spacy.load("en_core_web_sm")  # en_core_web_trf is slower but likely more accurate (and may be faster with spaCy GPU support)

def read_data_from_file(filename):
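    """Read a plain-text file and split it into sentences with spaCy."""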
    sentences = []
    with open(filename, "r", encoding="UTF-8") as input_file:
        doc = input_file.read()
    doc = doc.rstrip()
    parsed_doc = nlp(doc)
    for sent in parsed_doc.sents:
        sentences.append(sent.text.strip())
    return sentences

def read_data_from_wiki(personality, summary=False):
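    """Fetch a Wikipedia article (or only its summary) for `personality` and split it into sentences."""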
    sentences = []
    if not summary:
        agent_name, content = get_wikipedia_article(personality)
    else:
        agent_name, content = get_wikipedia_article_summary(personality)
    doc = content.rstrip()
    parsed_doc = nlp(doc)
    for sent in parsed_doc.sents:
        sentences.append(sent.text.strip())
    return agent_name, sentences, content

def get_paragraphs(text):
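    """Split `text` into sentence-like passages.

    Each passage ends at a period that is not preceded by a capital letter
    (to avoid splitting on initials) and is followed by a capital letter or
    the end of the text.
    """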
    text = text.replace('\n', '')
    paragraphs = re.findall(r'.*?(?<![A-Z])\.(?=[A-Z]|$)', text, re.DOTALL)
    return paragraphs

def get_paragraphs_history(text):
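    """Split a dialogue history into passages, one per non-empty line."""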
    lines = text.split('\n')
    paragraphs = [t for t in lines if t != '']
    return paragraphs

def get_context_based_on_question1(question, text, bi_encoder=None):
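    """Debug variant: print the passages most similar to `question` instead of returning them."""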
    if bi_encoder is None:
        model_name = 'nq-distilbert-base-v1'
        bi_encoder = SentenceTransformer(model_name)
    paragraphs = get_paragraphs(text)
    paragraphs_embeddings = bi_encoder.encode(paragraphs, convert_to_tensor=True, show_progress_bar=True)
    top_k = min(5, len(paragraphs))  # number of passages to retrieve with the bi-encoder
    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)
    # Use cosine similarity and torch.topk to find the top_k highest-scoring passages
    cos_scores = util.cos_sim(question_embedding, paragraphs_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    print("\n\n======================\n\n")
    print("Query:", question)
    print("\nTop {} most similar passages in corpus:".format(top_k))
    for score, idx in zip(top_results[0], top_results[1]):
        print(paragraphs[idx], "(Score: {:.4f})".format(score))

def get_context_based_on_question2(question, text, top_k=5, bi_encoder=None, paragraphs_embeddings=None, is_history=False):
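    """Return the `top_k` passages of `text` most similar to `question`, concatenated into one context string."""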
    if bi_encoder is None:
        model_name = "multi-qa-mpnet-base-cos-v1"
        bi_encoder = SentenceTransformer(model_name)
    if not is_history:
        paragraphs = get_paragraphs(text)
    else:
        paragraphs = get_paragraphs_history(text)
    if paragraphs_embeddings is None:
        paragraphs_embeddings = bi_encoder.encode(paragraphs, convert_to_tensor=True, show_progress_bar=True)
    # Encode the query with the bi-encoder and retrieve the top_k most relevant passages
    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)
    top_k = min(top_k, len(paragraphs))  # number of passages to retrieve with the bi-encoder
    hits = util.semantic_search(question_embedding, paragraphs_embeddings, top_k=top_k)
    hits = hits[0]  # hits for the first (and only) query
    # Concatenate the retrieved passages into a single context string
    context = ""
    for hit in hits:
        context += paragraphs[hit['corpus_id']] + "\n"
    return context

def get_wikipedia_article(personality, language="en"):
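    """Download the full Wikipedia article for `personality` and strip references and section titles."""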
    wikipedia.set_lang(language)
    page = wikipedia.page(personality, auto_suggest=False)
    page_title = page.title
    page_content = page.content
    page_content = remove_references(page_content)
    page_content = remove_paragraph_titles(page_content)
    return page_title, page_content

def get_wikipedia_article_summary(personality, language="en"):
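    """Download only the Wikipedia summary for `personality` and strip references and section titles."""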
    wikipedia.set_lang(language)
    summary_text = wikipedia.summary(personality, auto_suggest=False)
    page_content = remove_references(summary_text)
    page_content = remove_paragraph_titles(page_content)
    return personality, page_content

def remove_references(page_content):
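    """Drop the '== See also ==' section and everything after it."""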
    page_content = re.sub(r'(=)*(?<=(=))( See also )(?=(=))(.\n*)*', "", page_content)
    page_content = page_content.strip()
    return page_content

def remove_paragraph_titles(page_content):
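    """Remove section headings of the form '== Title =='."""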
    page_content = re.sub(r'(?<=(=)).*(?=(=))', "", page_content)
    page_content = re.sub(r'(==)*', "", page_content)
    page_content = page_content.strip()
    return page_content

def remove_paragraph_titles2(page_content):
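    """Stricter variant of remove_paragraph_titles: only removes headings made of alphanumerics and spaces."""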
    page_content = re.sub(r'(?<=(=))[a-zA-Z0-9 ]*(?=(=))', "", page_content)
    page_content = re.sub(r'(==)*', "", page_content)
    page_content = page_content.strip()
    return page_content

def get_intents():
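    """Load the intents file and return it together with a flat list of all patterns."""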
    with open("data/my_intents.json", encoding="UTF-8") as data_file:
        intents = json.load(data_file)
    patterns = []
    for intent in intents['intents']:
        for pattern in intent['patterns']:
            patterns.append(pattern)
    return intents, patterns

def get_label_by_pattern(intents, pattern):
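    """Return the label of the intent containing the first matched pattern, or 'noanswer'."""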
    for intent in intents['intents']:
        for pattern_el in intent['patterns']:
            if pattern[0] == pattern_el:
                return intent["label"]
    return "noanswer"

def get_response_by_question(intents, patterns, question, bi_encoder=None, patterns_embeddings=None):
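    """Find the intent pattern closest to `question` and return a random response from that intent."""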
    if bi_encoder is None:
        model_name = "multi-qa-mpnet-base-cos-v1"
        bi_encoder = SentenceTransformer(model_name)
    top_k = 1  # retrieve only the single best-matching pattern
    if patterns_embeddings is None:
        patterns_embeddings = bi_encoder.encode(patterns, convert_to_tensor=True, show_progress_bar=True)
    # Encode the query with the bi-encoder and find the closest pattern
    question_embedding = bi_encoder.encode(question, convert_to_tensor=True)
    top_k = min(top_k, len(patterns))
    hits = util.semantic_search(question_embedding, patterns_embeddings, top_k=top_k)
    hits = hits[0]  # hits for the first (and only) query
    context = []
    for hit in hits:
        context.append(patterns[hit['corpus_id']])
    label = get_label_by_pattern(intents, context)
    # Pick a random canned response for the matched intent
    response = None
    for intent in intents['intents']:
        if intent['label'] == label:
            response = random.choice(intent['responses'])
            break
    return response

def is_relevant_question(question, text, top_k=1, bi_encoder=None, paragraphs_embeddings=None, is_history=False):
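    """Score how similar `question` is to the most relevant passage of `text`; a low score suggests an off-topic question."""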
    if bi_encoder is None:
        model_name = "multi-qa-mpnet-base-cos-v1"
        bi_encoder = SentenceTransformer(model_name)
    if paragraphs_embeddings is None:
        if not is_history:
            paragraphs = get_paragraphs(text)
        else:
            paragraphs = get_paragraphs_history(text)
        paragraphs_embeddings = bi_encoder.encode(paragraphs, show_progress_bar=True)
    # Encode the query with the bi-encoder and score it against the passages
    question_embedding = bi_encoder.encode(question)
    hits = util.semantic_search(question_embedding, paragraphs_embeddings, top_k=top_k)
    hits = hits[0]  # hits for the first (and only) query
    # Similarity score of the best hit (0 if there are no hits)
    score = hits[0]['score'] if hits else 0
    return score

if __name__ == '__main__':
    # get_wikipedia_article("Nikola Tesla")

    # Example: retrieve context for a question from a Wikipedia article.
    # agent_name, sentences, content = read_data_from_wiki("Nikola Tesla")
    # context = get_context_based_on_question2("Who was Nikola Tesla?", content)
    # print(context)

    # Example: match a small-talk question against the intents file.
    # intents, patterns = get_intents()
    # question = "hello!"
    # model_name = "multi-qa-mpnet-base-cos-v1"
    # bi_encoder = SentenceTransformer(model_name)
    # patterns_embeddings = bi_encoder.encode(patterns, convert_to_tensor=True, show_progress_bar=True)
    # response = get_response_by_question(intents, patterns, question, bi_encoder=bi_encoder,
    #                                     patterns_embeddings=patterns_embeddings)
    # print("response:", response)

    # Check whether an off-topic question scores low against the article.
    agent_name, sentences, content = read_data_from_wiki("Nikola Tesla")
    score = is_relevant_question("How are you?", content)
    print(score)