-
Notifications
You must be signed in to change notification settings - Fork 0
/
answerSelection.py
184 lines (164 loc) · 8.02 KB
/
answerSelection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import spacy
# spaCy English model, loaded once at import time; selectSentences, getNamedEntities
# and getNounChunks all call this module-level `nlp`.
nlp = spacy.load('en_core_web_lg')
import nltk
from nltk.corpus import stopwords
# NOTE: this rebinds the imported name `stopwords` to the English stopword collection
# returned by NLTK; processPhrase tests token membership against it.
stopwords = stopwords.words('english')
import question_generation.pipelines as QGPipelines #adapted from https://github.com/patil-suraj/question_generation
# NOTE(review): `logger`, used throughout this module, is presumably exported by this
# star import -- confirm against loggerLog.
from loggerLog import *
""" Main method to get potencial answers from a text excerpt.
we also try to obtain the named entity label of the answer (or, if not possible, of one of its tokens).
Options: 0 -> named entities, 1 -> noun chunks, 2 -> transformer """
def excerptAnswers(excerpt, option):
    """Select potential answers from every sentence of a text excerpt.

    Args:
        excerpt: Text that is split into sentences with ``selectSentences``.
        option: Answer-selection method: 0 -> named entities, 1 -> noun chunks,
            2 -> transformer pipeline ("question-generationAnswer").

    Returns:
        A list with one dict per sentence that yielded at least one answer:
        ``{"sentence": <sentence>, "answers": [{"answer": ..., "label": ...}, ...]}``.
        An empty list is returned when sentence splitting fails or, for option 2,
        when the pipeline cannot be loaded. A sentence whose answer selection
        fails (or an unknown ``option``) contributes nothing to the output.
        Every failure is logged through ``logger.error``.
    """
    try:
        sentences = selectSentences(excerpt)
    except Exception as e:
        logger.error("Exception in function selectSentences in excerptAnswers (answerSelection.py): %s", e)
        # without sentences there is nothing to select answers from
        return []

    answerPipeline = None
    if option == 2:
        # load pipeline ("question-generationAnswer" was adapted from "question-generation" to only perform answer selection)
        try:
            answerPipeline = QGPipelines.pipeline("question-generationAnswer")
        except Exception as e:
            logger.error("Exception loading model question-generationAnswer in excerptAnswers (answerSelection.py): %s", e)
            # every sentence would fail without the pipeline, so stop here
            return []

    outputAnswerSelection = []
    for sentence in sentences:
        # reset for each sentence so that a failure below can never re-use the
        # answers selected for the previous sentence
        answers, answersLabels = [], []
        if option in (0, 1):
            try:
                answers, answersLabels = answersAndLabels(sentence, option)
            except Exception as e:
                logger.error("Exception in function answersAndLabels in excerptAnswers (answerSelection.py): %s", e)
        elif option == 2:
            try:
                answers, answersLabels = answersTransformer(sentence, answerPipeline)
            except Exception as e:
                logger.error("Exception in function answersTransformer in excerptAnswers (answerSelection.py): %s", e)

        if answers:
            outputAnswerSelection.append({
                "sentence": sentence,
                "answers": [
                    {"answer": answer, "label": label}
                    for answer, label in zip(answers, answersLabels)
                ],
            })
    return outputAnswerSelection
""" Returns a list of the sentences. Removes whitespaces from the beginning and end of each sentence, and eliminates
possible strings only constituted by whitespaces """
def selectSentences(excerpt):
    """Split an excerpt into sentence strings using the module-level spaCy model.

    A sentence is kept only when something remains after stripping leading and
    trailing whitespace. The kept strings are returned as produced by
    ``str(sentence)``; they are not themselves stripped.
    """
    sentenceStrings = (str(span) for span in nlp(excerpt).sents)
    # drop sentences that consist only of whitespace
    return [text for text in sentenceStrings if text.strip()]
""" used to get named entities (NEs, method 0) or noun chunks (NCs, method 1) as potential answers.
In future steps of the workflow (e.g. distractor selection), it is useful to have the answers' NE labels. Answers obtained
by analyzing NEs already have a corresponding label; we also try to assign a label to the ones obtained from NCs. """
def answersAndLabels(sentence, method):
    """Return potential answers of a sentence together with a label for each.

    Args:
        sentence: Sentence text to analyse.
        method: 0 -> answers are the named entities (NEs) of the sentence,
            1 -> answers are its noun chunks (NCs).

    Returns:
        A tuple ``(answers, labels)`` of two lists with the same length.
        For method 0 the labels are the NE labels. For method 1 a noun chunk
        that equals an NE gets that NE's label; otherwise it gets the first NE
        label found inside the cleaned chunk, or "" when none is found.
        ``([], [])`` is returned for any other method, or when the answers
        themselves cannot be extracted. Failures are logged, not raised.
    """
    if method == 0:
        # answers and their labels according to named entity recognition
        try:
            neTokens, neLabels = getNamedEntities(sentence)
        except Exception as e:
            logger.error("Exception in function getNamedEntities in answersAndLabels (answerSelection.py): %s", e)
            return [], []
        return neTokens, neLabels

    if method == 1:
        # noun chunks as answers
        try:
            nounChunks = getNounChunks(sentence)
        except Exception as e:
            logger.error("Exception in function getNounChunks in answersAndLabels (answerSelection.py): %s", e)
            return [], []

        # we still get the answers according to NER to then compare with NCs;
        # if that fails, every chunk simply falls back to the inner lookup
        neTokens, neLabels = [], []
        try:
            neTokens, neLabels = getNamedEntities(sentence)
        except Exception as e:
            logger.error("Exception in function getNamedEntities in answersAndLabels (answerSelection.py): %s", e)

        # the first occurrence wins, matching a lookup with list.index
        labelByEntity = {}
        for entityText, entityLabel in zip(neTokens, neLabels):
            labelByEntity.setdefault(entityText, entityLabel)

        ncLabels = [
            # if a noun chunk is equal to a named entity, we simply attribute the same label
            labelByEntity[nounChunk] if nounChunk in labelByEntity
            else _innerEntityLabel(nounChunk)
            for nounChunk in nounChunks
        ]
        return nounChunks, ncLabels

    return [], []


def _innerEntityLabel(phrase):
    """Return the first NE label found inside the cleaned phrase, or "" if none."""
    # when cleaning fails, the lookup is attempted on the phrase as given
    try:
        phrase = processPhrase(phrase)
    except Exception as e:
        logger.error("Exception in function processPhrase in answersAndLabels (answerSelection.py): %s", e)
    try:
        _, innerNELabels = getNamedEntities(phrase)
    except Exception as e:
        logger.error("Exception in function getNamedEntities in answersAndLabels (answerSelection.py): %s", e)
        return ""
    # if multiple labels are found, we simply attribute the first to the whole phrase
    return innerNELabels[0] if innerNELabels else ""
# named entities of a string, plus the label of each one
def getNamedEntities(text):
    """Run the module-level spaCy model on ``text`` and collect its entities.

    Returns:
        A tuple ``(neTokens, neLabels)``: the text of every entity and, at the
        same index, its label. Both lists are empty when no entity is found.
    """
    entities = nlp(text).ents
    neTokens = [entity.text for entity in entities]
    neLabels = [entity.label_ for entity in entities]
    return neTokens, neLabels
# noun chunks of a string
def getNounChunks(text):
    """Run the module-level spaCy model on ``text`` and return the text of each noun chunk."""
    return [nounChunk.text for nounChunk in nlp(text).noun_chunks]
# removes punctuation and stopwords from a string
def processPhrase(phrase):
    """Strip punctuation (except the apostrophe) and stopwords from a phrase.

    Args:
        phrase: Text to clean.

    Returns:
        The remaining tokens joined by single spaces. Tokens come from
        ``nltk.word_tokenize`` and are compared case-sensitively against the
        module-level ``stopwords``.
    """
    # same as string.punctuation but without the apostrophe; the backslash is
    # written as "\\" because "\]" is an invalid escape sequence in a string literal
    punctuation = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~"
    phrase = phrase.translate(str.maketrans('', '', punctuation))
    keptTokens = [token for token in nltk.word_tokenize(phrase) if token not in stopwords]
    return " ".join(keptTokens)
""" Uses the transformer (directory question_generation) to obtain potential answers (method 2).
The process to attribute a named entity label to an answer is the same as the one performed for noun chunks. """
def answersTransformer(text, nlp):
    """Return the answers proposed by the transformer pipeline and a label for each.

    Args:
        text: Sentence text to analyse.
        nlp: Callable answer-selection pipeline; it is called as ``nlp(text)``.
            Inside this function the name shadows the module-level spaCy model,
            which is still used indirectly through ``getNamedEntities``.
            NOTE(review): presumably returns a list of answer strings -- confirm
            against the "question-generationAnswer" pipeline.

    Returns:
        A tuple ``(answers, labels)`` where ``labels`` has one entry per answer.
        An answer equal to a named entity of ``text`` gets that entity's label;
        otherwise it gets the first NE label found inside the cleaned answer,
        or "" when none is found. Labelling failures are logged, not raised;
        an exception raised by the pipeline call itself propagates to the caller.
    """
    # potential answers obtained using the pipeline "question-generationAnswer"
    answers = nlp(text)

    # we still get the answers according to NER to then compare with the ones
    # obtained with the transformer; on failure no answer matches an entity
    neTokens, neLabels = [], []
    try:
        neTokens, neLabels = getNamedEntities(text)
    except Exception as e:
        logger.error("Exception in function getNamedEntities in answersTransformer (answerSelection.py): %s", e)

    labels = []
    for answer in answers:
        # if an answer is equal to a named entity, we simply attribute the same label
        if answer in neTokens:
            labels.append(neLabels[neTokens.index(answer)])
            continue

        # if not, after processing the answer, we repeat the process to try
        # getting a label that corresponds to one of its tokens
        try:
            answer = processPhrase(answer)
        except Exception as e:
            logger.error("Exception in function processPhrase in answersTransformer (answerSelection.py): %s", e)

        # reset for each answer so a failed lookup never re-uses the labels
        # found for the previous answer
        innerNELabels = []
        try:
            _, innerNELabels = getNamedEntities(answer)
        except Exception as e:
            logger.error("Exception in function getNamedEntities in answersTransformer (answerSelection.py): %s", e)

        # if multiple labels are found, we simply attribute the first to the
        # whole answer; if none is found, we attribute an empty string
        labels.append(innerNELabels[0] if innerNELabels else "")
    return answers, labels