#Animesh Agrawal animesha 50254531
#Micheal Kirk kirkmc 49847974
#Rachel Lam rslam 24554220
import json
import math
import os
import re
from collections import defaultdict

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from corpus import Corpus


class Inverter:
    """
    Builds an inverted index, weighted with TF-IDF, from all tokens
    extracted from the given URLs.
    """

    def __init__(self):
        self.corpus = Corpus()
        self.wordCountDict = defaultdict(dict)
        self.documentFrequencyDict = dict()
        self.tfidfDict = defaultdict(dict)
        self.lemmatizer = WordNetLemmatizer()
        self.stopWords = set(stopwords.words('english'))
        self.porterStemmer = PorterStemmer()
        self.KEYWORDS = dict()
        self.h2key = dict()
        self.h3key = dict()
        self.h4key = dict()
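        # Shape of the index structures above (summary comment, added for
        # clarity):
        #   wordCountDict[term][url]    -> raw term frequency (TF)
        #   documentFrequencyDict[term] -> number of documents containing the term (DF)
        #   tfidfDict[term][url]        -> final TF-IDF weight
        #   KEYWORDS / h2key / h3key / h4key map a term to the URLs where it
        #   appeared in <title>/<h1>, <h2>, <h3>, <h4> respectively.
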
    def calculate_word_count(self, url, url_text):
        '''Takes in a URL and its extracted text, tokenizes the text, and
        counts how many times each token appears in that URL.
        This is the TF in TF-IDF.'''
        # Split on any run of non-alphanumeric characters. (The original
        # pattern used the range A-z, which also matches punctuation such
        # as [ and ^; A-Z is what is intended.)
        tokenized = re.split('[^a-zA-Z0-9]+', url_text)
        for token in tokenized:
            token = token.lower()
            # Skip empty strings and stopwords; stopwords.words() is all
            # lowercase, so the comparison must happen after lower().
            if token and token not in self.stopWords:
                token = self.lemmatizer.lemmatize(token)  # lemmatize each token
                token = self.porterStemmer.stem(token)    # stem each token
                if url in self.wordCountDict[token]:
                    self.wordCountDict[token][url] += 1
                else:
                    self.wordCountDict[token][url] = 1

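    # Illustrative sketch (not in the original file): how one token flows
    # through the normalization above, assuming the NLTK 'wordnet' and
    # 'stopwords' data are downloaded.
    #   "Crawling" -> lower() -> "crawling" -> lemmatize() -> "crawling"
    #              -> PorterStemmer.stem() -> "crawl"
    # so "Crawling", "crawls", and "crawl" all map to the index term "crawl".
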
    def calculate_document_frequency(self):
        '''Calculates how many documents each token occurs in.
        This is the DF used for the IDF in TF-IDF.'''
        for term, dictionary in self.wordCountDict.items():
            self.documentFrequencyDict[term] = len(dictionary)

    def calculate_tfidf(self):
        '''Combines the TF and DF into a TF-IDF score per (term, url).
        Terms found in the <title> or <h1> add an extra 5 points to the
        weight, <h2> adds 4 points, <h3> adds 3 points, and <h4> adds
        2 points.'''
        corpus_length = self.corpus.get_corpus_length()
        print(corpus_length)
        for term, dictionary in self.wordCountDict.items():
            for url, freq in dictionary.items():
                # weight = (1 + log(TF)) * log(corpus_size / DF)
                weight = (1 + math.log(freq)) * math.log(corpus_length / self.documentFrequencyDict[term])
                self.tfidfDict[term][url] = weight
                if term in self.KEYWORDS and url in self.KEYWORDS[term]:
                    self.tfidfDict[term][url] += 5
                if term in self.h2key and url in self.h2key[term]:
                    self.tfidfDict[term][url] += 4
                if term in self.h3key and url in self.h3key[term]:
                    self.tfidfDict[term][url] += 3
                if term in self.h4key and url in self.h4key[term]:
                    self.tfidfDict[term][url] += 2

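    # Worked example of the weighting above (numbers are illustrative, not
    # from the real corpus): with 1000 documents, a term appearing 10 times
    # in one page and in 100 pages overall gets
    #   (1 + ln(10)) * ln(1000 / 100) ~= (1 + 2.30) * 2.30 ~= 7.6
    # (math.log is the natural log), plus 5 more if the term also appeared
    # in that page's <title> or <h1>.
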
    def fix_keywords(self, keyword_string):
        '''Takes in a string, breaks it into tokens, standardizes them, and
        returns the set of resulting tokens.'''
        tokens = set()
        s = re.sub(r'[^a-zA-Z0-9]+', ' ', keyword_string)
        for word in s.split(' '):
            word = word.strip().lower()
            word = self.lemmatizer.lemmatize(word)
            word = self.porterStemmer.stem(word)
            if word != '':
                tokens.add(word)
        return tokens

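    # Example behaviour of fix_keywords (illustrative input, not from the
    # corpus): "Machine-Learning, Courses!" is reduced to the token set
    # {"machin", "learn", "cours"} after splitting on non-alphanumerics,
    # lowercasing, lemmatizing, and Porter stemming.
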
    def get_html_text(self, url, url_file):
        '''Takes a URL and the local path to its file, extracts the <title>,
        <h1>, <h2>, <h3>, and <h4> text along with the page's full visible
        text, and saves the heading terms into their respective dictionaries
        so they can boost weights when the index is built later.'''
        with open(url_file, "rb") as f:
            content = f.read()
        soup = BeautifulSoup(content, "lxml")
        # Get rid of all the script and style elements
        for script in soup(["script", "style"]):
            script.extract()
        keys = ''
        h2keys = ''
        h3keys = ''
        h4keys = ''
        for tag in soup.find_all(['title', 'h1']):
            keys += tag.get_text() + " "
        for tag in soup.find_all('h2'):
            h2keys += tag.get_text() + " "
        for tag in soup.find_all('h3'):
            h3keys += tag.get_text() + " "
        for tag in soup.find_all('h4'):
            h4keys += tag.get_text() + " "
        key_set = self.fix_keywords(keys)
        h2keys = self.fix_keywords(h2keys)
        h3keys = self.fix_keywords(h3keys)
        h4keys = self.fix_keywords(h4keys)
        for key in h2keys:
            self.h2key.setdefault(key, []).append(url)
        for key in h3keys:
            self.h3key.setdefault(key, []).append(url)
        for key in h4keys:
            self.h4key.setdefault(key, []).append(url)
        for key in key_set:
            self.KEYWORDS.setdefault(key, []).append(url)
        # Get the rest of the visible page text, dropping blank lines and
        # breaking multi-phrase lines apart on double spaces
        url_text = soup.get_text()
        lines = (line.strip() for line in url_text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        url_text = '\n'.join(chunk for chunk in chunks if chunk)
        return (url, url_text)

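    # Illustrative example (hypothetical HTML, not from the corpus): given
    #   <title>Search Engines</title> ... <h2>Crawling</h2>
    # "search" and "engin" land in KEYWORDS (+5 in calculate_tfidf) and
    # "crawl" lands in h2key (+4) for this URL.
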
    def start_indexing(self):
        '''
        Reads the bookkeeping JSON and builds the index for each of the URLs
        listed in it.
        '''
        counter = 0
        # The corpus directory name
        WEBPAGES_RAW_NAME = "WEBPAGES_RAW"
        # The corpus JSON mapping file (relative file path -> URL)
        JSON_FILE_NAME = os.path.join(".", WEBPAGES_RAW_NAME, "bookkeeping.json")
        with open(JSON_FILE_NAME, encoding="utf-8") as json_file:
            file_url_map = json.load(json_file)
        for loc, url in file_url_map.items():
            url = url.strip()
            dir_name, file_name = loc.split("/")
            url_file = os.path.join(".", WEBPAGES_RAW_NAME, dir_name, file_name)
            counter += 1
            print(url, "---------", counter)
            url, url_text = self.get_html_text(url, url_file)
            self.calculate_word_count(url, url_text)

    def get_wordCountDict(self):
        '''Prints the term frequency (TF) pairs for each term, sorted by
        count in descending order.'''
        for key, val in self.wordCountDict.items():
            docFreqPair = sorted(val.items(), key=lambda x: x[1], reverse=True)
            print(key, docFreqPair)

    def get_documentFrequencyDict(self):
        '''Prints the document frequency (DF) for each term.'''
        for key, val in self.documentFrequencyDict.items():
            print(key, val)

    def get_tfidfDict(self):
        '''Returns the TF-IDF weights for each term as a dictionary.'''
        return self.tfidfDict

if __name__ == '__main__':
    i = Inverter()
    i.start_indexing()
    i.calculate_document_frequency()
    i.calculate_tfidf()
    inverted = i.get_tfidfDict()
    # Save the TF-IDF dict in a JSON file for later use
    with open('invertedIndex.json', 'w') as f:
        json.dump(inverted, f)
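
# A minimal sketch (not part of the original module) of how a downstream
# search component might consume the saved index; note the query term must
# be normalized the same way the index was built (lowercase, lemmatize,
# stem):
#
#   with open('invertedIndex.json') as f:
#       index = json.load(f)
#   # top 10 URLs for the stemmed query term "crawl", highest weight first
#   hits = sorted(index.get('crawl', {}).items(),
#                 key=lambda x: x[1], reverse=True)[:10]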