-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvectorspacemodelbackend_query.py
128 lines (117 loc) · 4.97 KB
/
vectorspacemodelbackend_query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import math
from prettytable import PrettyTable
def getVectorSpaceModel(queries, query_weight, document_weight, vsm_type):
    """Score every document against the query with a vector space model.

    Args:
        queries: Iterable of query terms; passed through to
            ``print_tabel_hasil`` to label the rows of the printed table.
        query_weight: Mapping of term -> weight for the query.
        document_weight: Mapping of document id -> mapping of term -> weight.
        vsm_type: ``'dice'`` selects the Dice coefficient; any other value
            selects cosine similarity.

    Returns:
        dict mapping each document id to its similarity score.
    """
    query_matrix, query_result = getQueryDistance(query_weight)
    print("query_matrix",query_matrix)
    print("query_result",query_result)
    document_matrix, document_result = getDocumentDistance(document_weight)
    print("document_matrix",document_matrix)
    print("document_result",document_result)
    # The inner product has to be taken over the raw weights (sum of q*d).
    # Feeding the squared matrices here would yield sum(q^2 * d^2), which is
    # not a dot product and lets the "cosine" exceed 1.
    dot_document_matrix, dot_document_result = getDotProduct(query_weight, document_weight)
    print("dot_document_matrix",dot_document_matrix)
    print("dot_document_result",dot_document_result)
    print_tabel_hasil(queries, query_matrix, query_result, document_matrix, document_result, dot_document_matrix,
                      dot_document_result)
    # Compare by value: `is` tests object identity and only matches string
    # literals by accident of interning.
    if vsm_type == 'dice':
        doc_similarity = diceSimilarity(query_result,document_result,dot_document_result)
    else:
        doc_similarity = cosineSimilarity(query_result,document_result,dot_document_result)
    print("doc_similarity",doc_similarity)
    return doc_similarity
def getQueryDistance(term_weight):
    """Square each query weight and compute the Euclidean length.

    Args:
        term_weight: Mapping of term -> numeric weight.

    Returns:
        A ``(squares, length)`` tuple: ``squares`` maps each term to its
        weight squared (as a float), and ``length`` is the square root of the
        sum of those squares.
    """
    squares = {term: math.pow(value, 2) for term, value in term_weight.items()}
    return squares, math.sqrt(sum(squares.values()))
def getDocumentDistance(term_weights):
    """Square every document's term weights and compute each document's length.

    Args:
        term_weights: Mapping of document id -> mapping of term -> weight.

    Returns:
        A ``(squares_by_doc, length_by_doc)`` tuple. The first maps each
        document id to a dict of term -> weight squared; the second maps each
        document id to the square root of the sum of its squared weights.
    """
    squares_by_doc = {}
    length_by_doc = {}
    for doc_id, weights in term_weights.items():
        squares = {term: math.pow(value, 2) for term, value in weights.items()}
        squares_by_doc[doc_id] = squares
        length_by_doc[doc_id] = math.sqrt(sum(squares.values()))
    return squares_by_doc, length_by_doc
def getDotProduct(query_matrix, document_matrix):
    """Multiply matching query/document term values and total them per document.

    Args:
        query_matrix: Mapping of term -> value for the query.
        document_matrix: Mapping of document id -> mapping of term -> value.

    Returns:
        A ``(products_by_doc, total_by_doc)`` tuple. The first maps each
        document id to a dict holding, for every document term that also
        occurs in the query, the product of the two values; the second maps
        each document id to the sum of those products.
    """
    products_by_doc = {}
    total_by_doc = {}
    for doc_id, doc_terms in document_matrix.items():
        # Terms absent from the query contribute nothing and are left out.
        products = {
            term: value * query_matrix[term]
            for term, value in doc_terms.items()
            if term in query_matrix
        }
        products_by_doc[doc_id] = products
        total_by_doc[doc_id] = sum(products.values())
    return products_by_doc, total_by_doc
def cosineSimilarity(query_result,document_result,dot_document_result):
    """Compute the cosine score of each document against the query.

    Args:
        query_result: Length (norm) of the query vector.
        document_result: Mapping of document id -> document length.
        dot_document_result: Mapping of document id -> query/document
            product total.

    Returns:
        dict mapping each document id to
        ``dot / (query_length * document_length)``, or ``0`` when either
        length is zero.
    """
    print("query_result", query_result)
    print("document_result", document_result)
    print("dot_document_result", dot_document_result)
    scores = {}
    for doc_id, doc_length in document_result.items():
        print("term_weight", doc_length)
        if doc_length != 0 and query_result != 0:
            scores[doc_id] = dot_document_result[doc_id] / (query_result * doc_length)
        else:
            # A zero-length vector has no direction; score it as 0 rather
            # than dividing by zero.
            scores[doc_id] = 0
    return scores
def diceSimilarity(query_result,document_result,dot_document_result):
    """Compute the Dice score of each document against the query.

    Args:
        query_result: Length (norm) of the query vector.
        document_result: Mapping of document id -> document length.
        dot_document_result: Mapping of document id -> query/document
            product total.

    Returns:
        dict mapping each document id to
        ``2 * |dot| / (|query_length|**2 + |document_length|**2)``, or ``0``
        when either length is zero.
    """
    print("query_result", query_result)
    print("document_result", document_result)
    print("dot_document_result", dot_document_result)
    scores = {}
    for doc_id, doc_length in document_result.items():
        print("term_weight", doc_length)
        if doc_length != 0 and query_result != 0:
            numerator = 2 * abs(dot_document_result[doc_id])
            denominator = abs(query_result) ** 2 + abs(doc_length) ** 2
            scores[doc_id] = numerator / denominator
        else:
            # Mirror the cosine variant: an empty vector scores 0.
            scores[doc_id] = 0
    return scores
def print_tabel_hasil(queries, query_matrix, query_result, document_matrix, document_result, dot_document_matrix, dot_document_result):
    """Print the intermediate calculation table ("tabel hasil" = result table).

    One row is printed per query term, showing the query value, each
    document's value, and each document's product value for that term
    (the string ``'0'`` where the term is absent). Two footer rows follow:
    a label row and a row with the query length plus the per-document
    lengths and product totals rounded to 4 decimals.

    Args:
        queries: Iterable of query terms, one table row each.
        query_matrix: Mapping of term -> query value.
        query_result: Query length, shown unrounded in the footer.
        document_matrix: Mapping of document id -> mapping of term -> value.
        document_result: Mapping of document id -> document length.
        dot_document_matrix: Mapping of document id -> mapping of
            term -> product value.
        dot_document_result: Mapping of document id -> product total.
    """
    doc_count = len(document_matrix)
    # Header columns are numbered by position, not by document id.
    column_names = ['Query', 'Q^2']
    column_names += ['D' + str(pos) + '^2' for pos in range(1, doc_count + 1)]
    column_names += ['Q*D' + str(pos) for pos in range(1, doc_count + 1)]
    table = PrettyTable(column_names)

    for term in queries:
        cells = [term, query_matrix.get(term, '0')]
        cells.extend(doc_terms.get(term, '0') for doc_terms in document_matrix.values())
        cells.extend(products.get(term, '0') for products in dot_document_matrix.values())
        table.add_row(cells)

    # Footer: a label row followed by the matching value row.
    labels = ['', 'sqrt(Q)']
    values = ['', query_result]
    for doc_id, length in document_result.items():
        # Footer labels use the document id itself (+1), so ids must be numeric.
        labels.append('sqrt(D' + str(doc_id + 1) + ')')
        values.append(round(length, 4))
    for doc_id, total in dot_document_result.items():
        labels.append('sum(Q*D' + str(doc_id + 1) + ')')
        values.append(round(total, 4))
    table.add_row(labels)
    table.add_row(values)
    print(table)