similaritySearch.py
import math
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix


class SimilaritySearch:
    def __init__(self, question_tokens):
        # question_tokens is a list of documents, each a list of word tokens.
        self.question_tokens = question_tokens
        self.term_document_matrix, self.mapWordToId, self.word_list = self.get_tf_idf_weights(question_tokens)
        self.inverted_index = self.initialise_inverted_index(self.word_list)

    def initialise_inverted_index(self, word_list):
        '''Initialises an empty inverted index: one posting list per word.'''
        inverted_index = {}
        for word in word_list.keys():
            inverted_index[word] = []
        return inverted_index

    def get_tf_idf_weights(self, tokens):
        '''Builds a TF-IDF weighted term-document matrix from tokenised documents.'''
        # Collect the vocabulary.
        word_list = {}
        for document in tokens:
            for word in document:
                word_list[word] = True
        # Assign each word a row index in the term-document matrix.
        mapWordToId = {}
        count = 0
        for word in word_list.keys():
            mapWordToId[word] = count
            count = count + 1
        # Count raw term frequencies per document (terms x documents).
        termDocumentMatrix = np.zeros((len(word_list.keys()), len(tokens)))
        document_id = 0
        for document in tokens:
            for word in document:
                termDocumentMatrix[mapWordToId[word]][document_id] += 1
            document_id += 1
        # Inverse document frequency per term: log10(N / df),
        # where df is the number of documents containing the term.
        inverse_document_frequency = np.log10(len(tokens) * 1.0 / ((termDocumentMatrix != 0).sum(1)))
        # Scale each term row of the count matrix by its IDF weight.
        termDocumentMatrix = (termDocumentMatrix.T * inverse_document_frequency).T
        # Return as documents x terms, plus the vocabulary mappings.
        return termDocumentMatrix.T, mapWordToId, word_list
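
A minimal usage sketch, assuming the input documents have already been tokenised into lists of words; the example documents below are illustrative only and not part of the original file:

# Hypothetical tokenised documents (illustrative data only).
documents = [
    ["how", "do", "i", "reset", "my", "password"],
    ["how", "to", "change", "my", "email", "address"],
    ["reset", "password", "link", "not", "working"],
]

search = SimilaritySearch(documents)

# term_document_matrix is documents x terms, TF-IDF weighted.
print(search.term_document_matrix.shape)  # (3, number_of_unique_words)

# cosine_similarity (imported above) compares the document vectors pairwise.
print(cosine_similarity(search.term_document_matrix))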