-
Notifications
You must be signed in to change notification settings - Fork 0
/
InvertedIndex.py
212 lines (183 loc) · 7.57 KB
/
InvertedIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
from collections import OrderedDict
import xml.etree.ElementTree as ET
from Preprocessor import *
import collections
import os
class InvertedIndex(object):
'''Class that implements the positional inverted index'''
"""Class of type object implementing a query processor engine
Fields
------
invertedIndexDictionary : Object of type InvertedIndex()
The positional inverted index data structure
ppr : Object of type Preprocessor
The preprocessing toolkit
"""
ppr = Preprocessor()
def __init__(self):
"""Constructor of Class InvertedIndex initializes inverted index dictionary structure
"""
self.invertedIndexDictionary = {}
def buildIndexFromFile(self, pathToFile):
"""Invokes parsing method in order to build the positional inverted index from a given collection (file)
Parameters
----------
pathToFile : String type
The path leading to the collection file
"""
self.parseXMLFile(pathToFile)
def initializeTerm(self, term):
"""Initializes the dictionary structure for the documents in which the term is located
Parameters
----------
term : String type
A given term
"""
self.invertedIndexDictionary[term] = OrderedDict()
def initializeDoc(self, term , docID):
"""Initializes the list of positions that the term occurs within a document
Parameters
----------
term : String type
A given term
docID : Int type
A given document ID
"""
self.invertedIndexDictionary[term][docID] = []
def insertTermOccurrence(self, term, docID, position):
"""Inserts an occurence of a term in the inverted index
Parameters
----------
term : String type
A given term
docID : Int type
A given document ID
position : Int type
The position in which the term appears in the given document
"""
if term not in self.invertedIndexDictionary:
self.initializeTerm(term)
if docID not in self.invertedIndexDictionary[term]:
self.initializeDoc(term, docID)
self.invertedIndexDictionary[term][docID].append(position)
def insertMultipleTermOccurrences(self, term, docID, listOfPositions):
"""Inserts a list of occurences of a term in the inverted index - useful for externally building the inverted index
Parameters
----------
term : String type
A given term
docID : Int type
A given document ID
listOfPositions : List type of Ints
The list of positions in which the term appears in the given document
"""
if term not in self.invertedIndexDictionary:
self.initializeTerm(term)
if docID not in self.invertedIndexDictionary[term]:
self.initializeDoc(term, docID)
self.invertedIndexDictionary[term][docID].extend(listOfPositions)
def getTermDocumentSet(self, term):
"""Returns the set of document IDs for a given term
Parameters
----------
term : String type
A given term whose document IDs need to be retrieved
Returns
-------
documentSet : Set type
A set containing the documents that contain the given term
"""
if term not in self.invertedIndexDictionary:
emptySet = ()
return emptySet
else:
return set(self.invertedIndexDictionary[term].keys())
def getTermDocumentDictionary(self, term):
"""Returns the dictionary of document IDs and list of positions for a given term
Parameters
----------
term : String type
A given term whose document IDs need to be retrieved
Returns
-------
termDictionary : Dictionary type
A dictionary containing the documents that contain the given term and the list of positions for each document in which the term appears
"""
if term not in self.invertedIndexDictionary:
emptyDictionary = {}
return emptyDictionary
else:
return self.invertedIndexDictionary[term]
def getIndexDictionary(self):
"""Returns the inverted index dictionary
Returns
-------
invertedIndex : Dictionary type
A dictionary containing terms, docs and postings
"""
return self.invertedIndexDictionary
def parseXMLFile(self, pathToFile):
"""Parses the collection of documents and updates the inverted index
Parameters
----------
pathToFile : String type
Path leading to the file with the collection
"""
tree = ET.parse(pathToFile)
root = tree.getroot()
hasHeadline = False
for child in root:
for node in child:
node_tag = node.tag
if node_tag == 'DOCNO':
docID = int(node.text)
if node_tag == 'HEADLINE':
hasHeadline = True
headlineText = node.text.split("/", 1)[1].strip()
if node_tag == 'TEXT' or node_tag == 'Text': # So that I can parse both input files
position = 1
text = node.text
if hasHeadline:
headlineAndText = headlineText + ' ' + text
else:
headlineAndText = text
for word in self.ppr.tokenize(self.ppr.toLowerCase(headlineAndText)):
if (len(word) > 0) and (self.ppr.isNotAStopword(word)):
self.insertTermOccurrence(self.ppr.stemWordPorter(word), docID, position)
position += 1
def exportInvertedIndexToDirectory(self, pathToFile):
"""Exports the positional inverted index to a file within a specified directory
Parameters
----------
pathToFile : String type
Path leading to the output file
"""
path = pathToFile.rsplit('/', 1)[0]
if not os.path.exists(path): # Check whether the directory exists or not
os.makedirs(path)
# Write operations
with open(pathToFile, 'w') as output:
ordered = self.orderIndex(self.invertedIndexDictionary) # shouldn't be like this!!!
for term in ordered:#self.invertedIndexDictionary:
output.write('{}:\n'.format(term))
termNumOfOccurr = 0
for doc in ordered[term]:#self.invertedIndexDictionary[term]:
positionList = ",".join(map(str, ordered[term][doc]))#self.invertedIndexDictionary[term][doc])) # Formatting the string with the positions
output.write('\t{}: {}\n'.format(doc, positionList))
output.write('\n')
def orderIndex(self, invertedIndex):
"""Sorts the inverted index based on its keys - used when writing the index to a file in key order (alphanumeric)
Parameters
----------
invertedIndex : InvertedIndex object type
A positional inverted index structure
Returns
-------
sortedII : InvertedIndex object type
The key sorted version of the inverted index
"""
return collections.OrderedDict(sorted(invertedIndex.items()))
def printLength(self):
'''Method that prints the number of items in the index
'''
print('Number of terms in II: {}'.format(len(self.invertedIndexDictionary)))