
Commit b362cc7

synonyms added

1 parent f80756a commit b362cc7

14 files changed: 148 additions, 141 deletions

.gitignore
Lines changed: 4 additions & 1 deletion

@@ -7,9 +7,12 @@ __pycache__/
 *.so
 .vscode/
 .idea/
-nltk_data
+nltk_data/
 #raw data this lesson provide
 data/
+#test tools
+test.py
+dataset/
 #processed data by process.py
 output/
 # Distribution / packaging

bool_search.py
Lines changed: 23 additions & 21 deletions

@@ -2,16 +2,12 @@
 import re
 from utils.data_process import Data
 from re import L, sub
+
+
 class Bool_Search(object):
-    def __init__(self, inverted_index, look_up_dict):
+    def __init__(self, inverted_index, dictionary):
         self.ii = inverted_index
-        self.lookup = {look_up_dict[wordid]:wordid for wordid in range(len(look_up_dict))}
-
-    def _get_ii_(self, word):
-        # query word not found in documentss
-        if word not in self.lookup:
-            return None
-        return self.ii[self.lookup[word]]
+        self.dict = dictionary
 
     def search(self, query):
         l_bracket_stack = []
@@ -30,17 +26,21 @@ def search(self, query):
                 query.pop(l_bracket_index)
                 # pop expression in between
                 # replace right bracket with procssed single inverted index
-                to_process = [query.pop(l_bracket_index) for i in range(0, index-l_bracket_index-1)]
+                to_process = [query.pop(l_bracket_index)
+                              for i in range(0, index-l_bracket_index-1)]
                 query[l_bracket_index] = Bool_Search.process(to_process)
                 index = l_bracket_index + 1
             # pass bool keyword to process to handle, ignore
             elif query[index] == 'and' or query[index] == 'not' or query[index] == 'or':
                 index += 1
             # replace word with corresponding inverted index
             else:
-                query[index] = self._get_ii_(query[index])
+                if query[index] in self.dict:
+                    query[index] = self.ii[self.dict[query[index]]]
+                else:
+                    query[index] = None
                 index += 1
-
+
         if index > 1:
             query[0] = Bool_Search.process(query)
         return query[0]
@@ -83,7 +83,7 @@ def strip(iia, iib):
                 res.pop(i)
                 j += 1
         return res
-
+
     @staticmethod
     def complement(iia, iib):
         # handle circumstances in which one or more word is not found
@@ -116,15 +116,16 @@ def complement(iia, iib):
     # generate key word list from query string
     def _preprocess_(query):
         # add space before & after bracket for split
-        query = sub('\(', ' ( ',query)
-        query = sub('\)', ' ) ',query)
+        query = sub('\(', ' ( ', query)
+        query = sub('\)', ' ) ', query)
         # remove continuous spaces
-        query = sub(' {2,}', ' ',query)
+        query = sub(' {2,}', ' ', query)
         query = query.lower()
         query = query.split()
         query = Data.lemma(query)
         return query
     # calculate target inverted index by operator
+
     @staticmethod
     def process(ii_list):
         while len(ii_list) > 1:
@@ -137,18 +138,19 @@ def process(ii_list):
             ii_list[0] = Bool_Search.strip(iia, iib)
         return ii_list[0]
 
+
 def load():
     import zstd
     import pickle
-    with open('output/inverted_index.zstd','rb') as f:
+    with open('output/inverted_index.zstd', 'rb') as f:
         ii = zstd.decompress(f.read())
         ii = pickle.loads(ii)
         f.close()
-    with open('output/dictionary.zstd','rb') as f:
+    with open('output/dictionary.zstd', 'rb') as f:
        dictionary = zstd.decompress(f.read())
        dictionary = pickle.loads(dictionary)
        f.close()
-    with open('output/metadata.zstd','rb') as f:
+    with open('output/metadata.zstd', 'rb') as f:
        metadata = zstd.decompress(f.read())
        metadata = pickle.loads(metadata)
        f.close()
@@ -165,8 +167,8 @@ def load():
 while True:
     query = input("Enter expression for bool search: ")
     res = bs.search(query)
-    if res!= None:
+    if res != None:
         for docid in range(len(res)):
-            print('{}\t{}'.format(docid+1,metadata[res[docid]]['title']))
+            print('{}\t{}'.format(docid+1, metadata[res[docid]]['title']))
     else:
-        print('Not found')
+        print('Not found')
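
The substantive change above swaps the derived reverse-lookup table for the word-to-wordid dictionary that process.py already saves, so unknown query words now resolve to None inline. A toy sketch of the before/after lookup; all values below are invented for illustration, the real structures come from output/inverted_index.zstd and output/dictionary.zstd:

inverted_index = [[0, 2], [1], [0, 1, 2]]          # wordid -> sorted docid list
dictionary = {'market': 0, 'stock': 1, 'bank': 2}  # word -> wordid

# before: invert a wordid -> word list, then look up through the inversion
look_up_dict = ['market', 'stock', 'bank']
lookup = {look_up_dict[wid]: wid for wid in range(len(look_up_dict))}
assert inverted_index[lookup['stock']] == [1]

# after: use the saved word -> wordid dictionary directly; unknown words give None
word = 'stock'
postings = inverted_index[dictionary[word]] if word in dictionary else None
assert postings == [1]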

output/dictionary.zstd

-46.5 KB
Binary file not shown.

output/header_tf_idf_matrix.zstd

-283 KB
Binary file not shown.

output/inverted_index.zstd

-42.5 KB
Binary file not shown.

output/metadata.zstd

-18.4 KB
Binary file not shown.

output/tf_idf_matrix.zstd

-276 KB
Binary file not shown.

process.py
Lines changed: 11 additions & 11 deletions

@@ -9,37 +9,37 @@
 logging.basicConfig(level=logging.DEBUG)
 logging.info("Loading data from files")
 data = Data()
-data.load("data",300)
+data.load("data", 10)
 data.process()
 wordcount = None
 logging.info("Writing data to output")
-with open(f'{save_dir}/inverted_index.zstd','wb') as f:
+with open(f'{save_dir}/inverted_index.zstd', 'wb') as f:
     # include words in title
-    ii = Inverted_Index(data.data,data.headerdata,data.dict)
+    ii = Inverted_Index(data.data, data.headerdata, data.dict)
     ii.procecss()
     wordcount = ii.word_count
     logging.info("Compressing and saving inverted index")
     ii_data = pickle.dumps(ii.inverted_index)
     f.write(zstd.compress(ii_data))
     f.close()
-with open(f'{save_dir}/tf_idf_matrix.zstd','wb') as f:
-    tf_idf = TF_IDF(data.data,data.headerdata,data.dict,wordcount)
+with open(f'{save_dir}/tf_idf_matrix.zstd', 'wb') as f:
+    tf_idf = TF_IDF(data.data, data.headerdata, data.dict, wordcount)
     tf_idf.process()
     logging.info("Compressing and saving tf-idf matrix")
     tf_idf_data = pickle.dumps(tf_idf.tf_idf)
     f.write(zstd.compress(tf_idf_data))
     f.close()
-with open(f'{save_dir}/header_tf_idf_matrix.zstd','wb') as f:
+with open(f'{save_dir}/header_tf_idf_matrix.zstd', 'wb') as f:
     header_tf_idf_data = pickle.dumps(tf_idf.header_tf_idf)
     f.write(zstd.compress(header_tf_idf_data))
     f.close()
-with open(f'{save_dir}/dictionary.zstd','wb') as f:
-    logging.info("Compressing and saving dictionary")
+with open(f'{save_dir}/dictionary.zstd', 'wb') as f:
+    logging.info("Compressing and saving dictionary")
     dict_data = pickle.dumps(data.dict)
     f.write(zstd.compress(dict_data))
     f.close()
-with open(f'{save_dir}/metadata.zstd','wb') as f:
-    logging.info("Compressing and saving metadata")
+with open(f'{save_dir}/metadata.zstd', 'wb') as f:
+    logging.info("Compressing and saving metadata")
     meta_data = pickle.dumps(data.metadata)
     f.write(zstd.compress(meta_data))
-    f.close()
+    f.close()
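
All five artifacts are written with the same pickle-then-zstd pattern and read back in reverse by the load() functions. If that pattern were ever factored out, a helper pair might look like the sketch below; save_compressed and load_compressed are hypothetical names, not part of this commit:

import pickle
import zstd  # same module the repo imports; only compress/decompress are used


def save_compressed(obj, path):
    # serialize with pickle, then zstd-compress, mirroring process.py above
    with open(path, 'wb') as f:
        f.write(zstd.compress(pickle.dumps(obj)))


def load_compressed(path):
    # inverse operation, mirroring load() in bool_search.py
    with open(path, 'rb') as f:
        return pickle.loads(zstd.decompress(f.read()))


# usage in the style of process.py, e.g.:
# save_compressed(data.dict, f'{save_dir}/dictionary.zstd')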

report/report.md
Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+# Web Information Processing and Applications, Lab 1
+
+## Objective
+
+This lab builds a news search engine on top of a given dataset of financial news. For a given
+query, it should return the most relevant news documents via exact or fuzzy semantic matching.
+
+## Requirements
+
+The lab requires implementing both boolean retrieval and semantic retrieval.
+
semantic_search.py
Lines changed: 15 additions & 25 deletions

@@ -9,36 +9,23 @@
 
 class Semantic_Search(object):
     # threshold is used to filter out documents which only include less than threshold * words in query
-    def __init__(self, tf_idf_table, header_tf_idf_table, look_up_dict):
+    def __init__(self, tf_idf_table, header_tf_idf_table, dictionary):
         self.ths = 0.5
         self.res = 10
         self.tf_idf = tf_idf_table
         self.header_tf_idf = header_tf_idf_table
         # dict constructed from database
-        self.lookup = {}
-        for wordid in range(len(look_up_dict)):
-            self.lookup[look_up_dict[wordid]] = wordid
-        # dict constructed from query
-        self.dict = []
-
-    # convert wordid in query to id in database
-    def _convert_(self, word):
-        if word not in self.lookup.keys():
-            return -1
-        else:
-            return self.lookup[word]
+        self.dict = dictionary
 
     def _gen_tf_(self, query):
         query = Data.dump(query)
-        tf = TF_IDF([query], None, None, None)
+        query_list = []
+        for word in query:
+            if word in self.dict:
+                query_list.append(self.dict[word])
+        tf = TF_IDF([query_list], None, None, None)
         tf.gen_tf()
-        query_tf = {}
-        for word in tf.tf[0].keys():
-            # ignore those words which are not in database
-            if self._convert_(word) >= 0:
-                query_tf[self._convert_(word)] = tf.tf[0][word]
-        self.dict = list(set(query))
-        return query_tf
+        return tf.tf[0]
 
     # add tf-idf length into calculation
     def search(self, query, threshold=0.5, return_results=10, len_weight=0.5, header_weight=0.3):
@@ -114,7 +101,10 @@ def load():
 while True:
     query = input("Enter words for semantic search: ")
     res = ss.search(query, 0.5, 10, 0.6)
-    for docid in range(len(res)):
-        if(res[docid][0] > 0):
-            print('{}:\t{}'.format(res[docid][0],
-                  metadata[res[docid][1]]['title']))
+    if res[0][0] == 0:
+        print('Not found')
+    else:
+        for docid in range(len(res)):
+            if(res[docid][0] > 0):
+                print('{}:\t{}'.format(res[docid][0],
+                      metadata[res[docid][1]]['title']))
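
The rewritten _gen_tf_ filters unknown query words through the corpus dictionary up front, so the term-frequency table it returns is keyed by wordid and the old _convert_ post-filtering step disappears. A toy walk-through, assuming TF_IDF.gen_tf simply counts occurrences (collections.Counter stands in for it here, and the dictionary values are invented):

from collections import Counter

dictionary = {'stock': 0, 'market': 1}           # word -> wordid from the corpus
query = ['stock', 'market', 'stock', 'bitcoin']  # already lemmatized

# words absent from the corpus dictionary are dropped before counting
query_list = [dictionary[w] for w in query if w in dictionary]
tf = Counter(query_list)
print(tf)  # prints Counter({0: 2, 1: 1}): term frequency keyed by wordid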
