-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpostingsList.py
262 lines (219 loc) · 10.8 KB
/
postingsList.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#!/usr/bin/python3
import string
class Node():
def __init__(self, value, dataCount = None, fileName = None):
self.data = value
self.dataFileName = fileName
self.next = None
self.dataCount = dataCount
#Function points current node to another node
def addNode(self, node2):
self.next = node2
#Getter function for next
def getNext(self):
return self.next
#Getter function for data
def getData(self):
return self.data
#Getter function for dataFileName
def getDataFileName(self):
return self.dataFileName
class PostingsList():
def __init__(self):
self.numNodes = 0
self.indivNodeLengths = []
self.nodes = []
#Function adds a string to the postings list as a branch
def addTokenToList(self, tokenTouple):
#Creating linked list head
#Text added to Node.data contains punctuation, newline characters, spaces
#Allows text, in its entirety, to be displayed to user if matched
nodeHead = Node(tokenTouple[0], None, tokenTouple[1])
nodeTail = nodeHead
#Postings have newline characters, spaces, punctuation removed
#Replacing newline characters with spaces
tokenString = tokenTouple[0]
if '\n' in tokenString:
tokenString = tokenString.replace('\n', ' ')
#Removing punctuation from string
#Lowercasing string
tokenString = tokenString.translate(str.maketrans('', '', string.punctuation)).lower()
wordList = tokenString.split(' ')
#Not adding spaces or empty strings as postings
wordCountDict = {}
for word in wordList:
if (word == ' ') or (word == ''):
continue
cleaned = word.strip()
#Though this is a boolean retrieval matrix
#incrementing counter allows for 'ordering results' functionality to be added later
if cleaned not in wordCountDict:
wordCountDict[cleaned] = 1
else:
wordCountDict[cleaned] += 1
nodeCount = 0
#Sorting postings, allows for more efficent search when looking for a search term
#Adding each posting to head node in linked list
for word in sorted(wordCountDict.keys()):
tempNode = Node(word, wordCountDict[word])
nodeTail.addNode(tempNode)
nodeTail = tempNode
nodeCount += 1
self.nodes.append(nodeHead)
#Storing some metadata on each branch, allows for further functionality in the future
self.numNodes += 1
self.indivNodeLengths.append(nodeCount)
#Function builds postings list from list of tuples
#Tuples in format: [([Text1,...,TextN], file1),...,([Text1,...,TextN], fileN)]
def buildPostingsList(self, fileTokenTouples):
for stringList, fileVal in fileTokenTouples:
for stringVal in stringList:
#Every string has a file associated with it when passed to branch building function
self.addTokenToList((stringVal, fileVal))
#Function searches postings list for an individual search term, ie: 'cat'
#If inverseFlag=False, find all node heads whose postings contain 'cat'
#If inverseFlag=True, find all node heads whose postings do not contain 'cat'
def searchAllBranches(self, term, inverseFlag = False):
headNodesContainingTerm = []
#All search is done on lowercase search terms
term = term.lower()
#Iterating over postings list
for nodeHead in self.nodes:
avoid = False
traversalNode = nodeHead
#Iterating over branches
while traversalNode.getNext() != None:
if traversalNode.getData() == term:
#Inverse flag is disabled, if search term is found, include node data and file name info
if inverseFlag == False:
headNodesContainingTerm.append((nodeHead.getData(), nodeHead.getDataFileName()))
else:
avoid = True
break
else:
traversalNode = traversalNode.getNext()
#Inverse flag is true, end of branch was reached, include node data and file name info
if (inverseFlag == True) & (avoid == False):
headNodesContainingTerm.append((nodeHead.getData(), nodeHead.getDataFileName()))
return headNodesContainingTerm
#Function invertes list of node head data, finds all node heads that are not in passed-in list
def inverseAllBranches(self, branches):
invertedBranches = []
#Iterating over postings list
for nodeHead in self.nodes:
currentNode = (nodeHead.getData(), nodeHead.getDataFileName())
#Current node is not a node in passed in list, include this node
if currentNode not in branches:
invertedBranches.append(currentNode)
return invertedBranches
#Function applies 'AND' operator to search term or list of node head data
def AND(self, stack1, stack2):
#If stack1 is a string, ie: 'cat', get node head result list for that term
if isinstance(stack1, list) == False:
stack1 = self.searchAllBranches(stack1)
#If stack2 is a string, ie: 'cat', get node head result list for that term
if isinstance(stack2, list) == False:
stack2 = self.searchAllBranches(stack2)
#stack1 and stack2 might already be lists
#Returns elements found in both stack1 and stack2
return [match for match in stack1 if match in stack2]
#Function applies 'OR' operator to search term or list of node head data
def OR(self, stack1, stack2):
#If stack1 is a string, ie: 'cat', get node head result list for that term
if isinstance(stack1, list) == False:
stack1 = self.searchAllBranches(stack1)
#If stack2 is a string, ie: 'cat', get node head result list for that term
if isinstance(stack2, list) == False:
stack2 = self.searchAllBranches(stack2)
#stack1 and stack2 might already be lists
#Simply add lists and remove duplicate strings
return list(set(stack1 + stack2))
#Function applies 'NOT' operator to search term or list of node head data
def NOT(self, stack1):
#If stack1 is a string, ie: 'cat'
#Search postings list for individual term, but 'True' flag inverts results
if isinstance(stack1, list) == False:
return self.searchAllBranches(stack1, True)
#If stack1 is a list of node head data, find all other node heads not in this list
else:
return self.inverseAllBranches(stack1)
#Function searches for individual search term in postings list, ie: 'cat'
def searchIndivTerm(self, term):
return self.searchAllBranches(term)
#Function uses prefix list of boolean operations and search terms to search postings list
#Returns tuple in format:
#({0 or 1, 0 if search is invalid, 1 if search is valid}, [strings of matching head nodes in postings list])
def searchPostingList(self, parsedQuery):
operatorList = ['not', 'and', 'or']
operandStack = []
operatorStack = []
#If prefix list is a single search time, ie: ['cat']
if (len(parsedQuery) == 1) & (parsedQuery[0] not in operatorList):
return (1, self.searchIndivTerm(parsedQuery[0]))
while True:
if len(parsedQuery) == 0:
#When prefix list is empty, operator stack has to be 0, operand stack has to be 1
if (len(operatorStack) != 0) & (len(operandStack) != 1):
return (0, None)
else:
return (1, operandStack.pop())
#If prefix element is operand, add to operand stack
if parsedQuery[-1] not in operatorList:
operandStack.append(parsedQuery.pop())
#Prefix element is operator
else:
#Adding to operator stack
operatorStack.append(parsedQuery.pop())
#'NOT' operator operates on single operand
#'OR' & 'AND' operators operate on two operands
if operatorStack[-1] != 'not':
#At least two operands
if len(operandStack) >= 2:
operator = operatorStack.pop()
if operator == 'and':
result = self.AND(operandStack.pop(), operandStack.pop())
elif operator == 'or':
result = self.OR(operandStack.pop(), operandStack.pop())
#Re-adding result to operand stack
operandStack.append(result)
#At least one operand
elif len(operandStack) >= 1:
operatorStack.pop()
result = self.NOT(operandStack.pop())
#Re-adding result to operand stack
operandStack.append(result)
#Function is used for debugging, iterates over postings list
#At each iteration, prints node head data and iterates over each branch, printing postings
def printBranches(self):
for count, node in enumerate(self.nodes):
#Printing node head data
print('ON HEAD NODE:', node.getData(), '\nFROM FILE:', node.getDataFileName())
#Traversing branches
traversalNode = node.getNext()
print('Branch Data,', str(self.indivNodeLengths[count]), 'in total:')
#Printing postings
while traversalNode != None:
print(traversalNode.getData())
traversalNode = traversalNode.getNext()
print('')
#Function prints results of boolean search query for the user
def printResults(self, resultList):
print('\n-----------------------------------------------')
#Different versions to be gramatically correct
if len(resultList) > 1:
print('FOUND '+str(len(resultList))+' TOTAL RESULTS\n')
elif len(resultList) == 1:
print('FOUND '+str(len(resultList))+' TOTAL RESULT\n')
else:
print('FOUND '+str(len(resultList))+' TOTAL RESULTS')
if len(resultList) != 0:
for count, result in enumerate(resultList):
#Printing text
print('RESULT '+str(count+1)+':\n'+result[0])
#Printing file text was found in
if count+1 != len(resultList):
print('IN FILE:\n'+result[1]+'\n')
#Last element lacks newline character for appearance
else:
print('IN FILE:\n'+result[1])
print('-----------------------------------------------\n')