
Commit b362cc7

synonyms added

1 parent f80756a commit b362cc7

14 files changed: 148 additions, 141 deletions

.gitignore
Lines changed: 4 additions & 1 deletion

@@ -7,9 +7,12 @@ __pycache__/
 *.so
 .vscode/
 .idea/
-nltk_data
+nltk_data/
 #raw data this lesson provide
 data/
+#test tools
+test.py
+dataset/
 #processed data by process.py
 output/
 # Distribution / packaging

bool_search.py
Lines changed: 23 additions & 21 deletions

@@ -2,16 +2,12 @@
 import re
 from utils.data_process import Data
 from re import L, sub
+
+
 class Bool_Search(object):
-    def __init__(self, inverted_index, look_up_dict):
+    def __init__(self, inverted_index, dictionary):
         self.ii = inverted_index
-        self.lookup = {look_up_dict[wordid]:wordid for wordid in range(len(look_up_dict))}
-
-    def _get_ii_(self, word):
-        # query word not found in documentss
-        if word not in self.lookup:
-            return None
-        return self.ii[self.lookup[word]]
+        self.dict = dictionary
 
     def search(self, query):
         l_bracket_stack = []
@@ -30,17 +26,21 @@ def search(self, query):
                 query.pop(l_bracket_index)
                 # pop expression in between
                 # replace right bracket with procssed single inverted index
-                to_process = [query.pop(l_bracket_index) for i in range(0, index-l_bracket_index-1)]
+                to_process = [query.pop(l_bracket_index)
+                              for i in range(0, index-l_bracket_index-1)]
                 query[l_bracket_index] = Bool_Search.process(to_process)
                 index = l_bracket_index + 1
             # pass bool keyword to process to handle, ignore
             elif query[index] == 'and' or query[index] == 'not' or query[index] == 'or':
                 index += 1
             # replace word with corresponding inverted index
             else:
-                query[index] = self._get_ii_(query[index])
+                if query[index] in self.dict:
+                    query[index] = self.ii[self.dict[query[index]]]
+                else:
+                    query[index] = None
                 index += 1
-
+
         if index > 1:
             query[0] = Bool_Search.process(query)
         return query[0]
@@ -83,7 +83,7 @@ def strip(iia, iib):
                 res.pop(i)
                 j += 1
         return res
-
+
     @staticmethod
     def complement(iia, iib):
         # handle circumstances in which one or more word is not found
@@ -116,15 +116,16 @@ def complement(iia, iib):
     # generate key word list from query string
     def _preprocess_(query):
         # add space before & after bracket for split
-        query = sub('\(', ' ( ',query)
-        query = sub('\)', ' ) ',query)
+        query = sub('\(', ' ( ', query)
+        query = sub('\)', ' ) ', query)
         # remove continuous spaces
-        query = sub(' {2,}', ' ',query)
+        query = sub(' {2,}', ' ', query)
         query = query.lower()
         query = query.split()
         query = Data.lemma(query)
         return query
     # calculate target inverted index by operator
+
     @staticmethod
     def process(ii_list):
         while len(ii_list) > 1:
@@ -137,18 +138,19 @@ def process(ii_list):
             ii_list[0] = Bool_Search.strip(iia, iib)
         return ii_list[0]
 
+
 def load():
     import zstd
     import pickle
-    with open('output/inverted_index.zstd','rb') as f:
+    with open('output/inverted_index.zstd', 'rb') as f:
         ii = zstd.decompress(f.read())
         ii = pickle.loads(ii)
         f.close()
-    with open('output/dictionary.zstd','rb') as f:
+    with open('output/dictionary.zstd', 'rb') as f:
        dictionary = zstd.decompress(f.read())
        dictionary = pickle.loads(dictionary)
        f.close()
-    with open('output/metadata.zstd','rb') as f:
+    with open('output/metadata.zstd', 'rb') as f:
        metadata = zstd.decompress(f.read())
        metadata = pickle.loads(metadata)
        f.close()
@@ -165,8 +167,8 @@ def load():
 while True:
     query = input("Enter expression for bool search: ")
     res = bs.search(query)
-    if res!= None:
+    if res != None:
         for docid in range(len(res)):
-            print('{}\t{}'.format(docid+1,metadata[res[docid]]['title']))
+            print('{}\t{}'.format(docid+1, metadata[res[docid]]['title']))
     else:
-        print('Not found')
+        print('Not found')
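
The substantive change above swaps the derived reverse-lookup table for the word-to-wordid dictionary that process.py already saves, so unknown query words now resolve to None inline. A toy sketch of the before/after lookup; all values below are invented for illustration, the real structures come from output/inverted_index.zstd and output/dictionary.zstd:

inverted_index = [[0, 2], [1], [0, 1, 2]]          # wordid -> sorted docid list
dictionary = {'market': 0, 'stock': 1, 'bank': 2}  # word -> wordid

# before: invert a wordid -> word list, then look up through the inversion
look_up_dict = ['market', 'stock', 'bank']
lookup = {look_up_dict[wid]: wid for wid in range(len(look_up_dict))}
assert inverted_index[lookup['stock']] == [1]

# after: use the saved word -> wordid dictionary directly; unknown words give None
word = 'stock'
postings = inverted_index[dictionary[word]] if word in dictionary else None
assert postings == [1]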

output/dictionary.zstd

-46.5 KB
Binary file not shown.

output/header_tf_idf_matrix.zstd

-283 KB
Binary file not shown.

output/inverted_index.zstd

-42.5 KB
Binary file not shown.

output/metadata.zstd

-18.4 KB
Binary file not shown.

output/tf_idf_matrix.zstd

-276 KB
Binary file not shown.

process.py
Lines changed: 11 additions & 11 deletions

@@ -9,37 +9,37 @@
 logging.basicConfig(level=logging.DEBUG)
 logging.info("Loading data from files")
 data = Data()
-data.load("data",300)
+data.load("data", 10)
 data.process()
 wordcount = None
 logging.info("Writing data to output")
-with open(f'{save_dir}/inverted_index.zstd','wb') as f:
+with open(f'{save_dir}/inverted_index.zstd', 'wb') as f:
     # include words in title
-    ii = Inverted_Index(data.data,data.headerdata,data.dict)
+    ii = Inverted_Index(data.data, data.headerdata, data.dict)
     ii.procecss()
     wordcount = ii.word_count
     logging.info("Compressing and saving inverted index")
     ii_data = pickle.dumps(ii.inverted_index)
     f.write(zstd.compress(ii_data))
     f.close()
-with open(f'{save_dir}/tf_idf_matrix.zstd','wb') as f:
-    tf_idf = TF_IDF(data.data,data.headerdata,data.dict,wordcount)
+with open(f'{save_dir}/tf_idf_matrix.zstd', 'wb') as f:
+    tf_idf = TF_IDF(data.data, data.headerdata, data.dict, wordcount)
     tf_idf.process()
     logging.info("Compressing and saving tf-idf matrix")
     tf_idf_data = pickle.dumps(tf_idf.tf_idf)
     f.write(zstd.compress(tf_idf_data))
     f.close()
-with open(f'{save_dir}/header_tf_idf_matrix.zstd','wb') as f:
+with open(f'{save_dir}/header_tf_idf_matrix.zstd', 'wb') as f:
     header_tf_idf_data = pickle.dumps(tf_idf.header_tf_idf)
     f.write(zstd.compress(header_tf_idf_data))
     f.close()
-with open(f'{save_dir}/dictionary.zstd','wb') as f:
-    logging.info("Compressing and saving dictionary")
+with open(f'{save_dir}/dictionary.zstd', 'wb') as f:
+    logging.info("Compressing and saving dictionary")
     dict_data = pickle.dumps(data.dict)
     f.write(zstd.compress(dict_data))
     f.close()
-with open(f'{save_dir}/metadata.zstd','wb') as f:
-    logging.info("Compressing and saving metadata")
+with open(f'{save_dir}/metadata.zstd', 'wb') as f:
+    logging.info("Compressing and saving metadata")
     meta_data = pickle.dumps(data.metadata)
     f.write(zstd.compress(meta_data))
-    f.close()
+    f.close()
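
All five artifacts are written with the same pickle-then-zstd pattern and read back in reverse by the load() functions. If that pattern were ever factored out, a helper pair might look like the sketch below; save_compressed and load_compressed are hypothetical names, not part of this commit:

import pickle
import zstd  # same module the repo imports; only compress/decompress are used


def save_compressed(obj, path):
    # serialize with pickle, then zstd-compress, mirroring process.py above
    with open(path, 'wb') as f:
        f.write(zstd.compress(pickle.dumps(obj)))


def load_compressed(path):
    # inverse operation, mirroring load() in bool_search.py
    with open(path, 'rb') as f:
        return pickle.loads(zstd.decompress(f.read()))


# usage in the style of process.py, e.g.:
# save_compressed(data.dict, f'{save_dir}/dictionary.zstd')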

report/report.md
Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+# Web Information Processing and Applications, Lab 1
+
+## Objective
+
+This lab builds a news search engine on top of a given dataset of financial news. For a given
+query, it should return the most relevant news documents via exact or fuzzy semantic matching.
+
+## Requirements
+
+The lab requires implementing both boolean retrieval and semantic retrieval.
+
semantic_search.py
Lines changed: 15 additions & 25 deletions

@@ -9,36 +9,23 @@
 
 class Semantic_Search(object):
     # threshold is used to filter out documents which only include less than threshold * words in query
-    def __init__(self, tf_idf_table, header_tf_idf_table, look_up_dict):
+    def __init__(self, tf_idf_table, header_tf_idf_table, dictionary):
         self.ths = 0.5
         self.res = 10
         self.tf_idf = tf_idf_table
         self.header_tf_idf = header_tf_idf_table
         # dict constructed from database
-        self.lookup = {}
-        for wordid in range(len(look_up_dict)):
-            self.lookup[look_up_dict[wordid]] = wordid
-        # dict constructed from query
-        self.dict = []
-
-    # convert wordid in query to id in database
-    def _convert_(self, word):
-        if word not in self.lookup.keys():
-            return -1
-        else:
-            return self.lookup[word]
+        self.dict = dictionary
 
     def _gen_tf_(self, query):
         query = Data.dump(query)
-        tf = TF_IDF([query], None, None, None)
+        query_list = []
+        for word in query:
+            if word in self.dict:
+                query_list.append(self.dict[word])
+        tf = TF_IDF([query_list], None, None, None)
         tf.gen_tf()
-        query_tf = {}
-        for word in tf.tf[0].keys():
-            # ignore those words which are not in database
-            if self._convert_(word) >= 0:
-                query_tf[self._convert_(word)] = tf.tf[0][word]
-        self.dict = list(set(query))
-        return query_tf
+        return tf.tf[0]
 
     # add tf-idf length into calculation
     def search(self, query, threshold=0.5, return_results=10, len_weight=0.5, header_weight=0.3):
@@ -114,7 +101,10 @@ def load():
 while True:
     query = input("Enter words for semantic search: ")
     res = ss.search(query, 0.5, 10, 0.6)
-    for docid in range(len(res)):
-        if(res[docid][0] > 0):
-            print('{}:\t{}'.format(res[docid][0],
-                  metadata[res[docid][1]]['title']))
+    if res[0][0] == 0:
+        print('Not found')
+    else:
+        for docid in range(len(res)):
+            if(res[docid][0] > 0):
+                print('{}:\t{}'.format(res[docid][0],
+                      metadata[res[docid][1]]['title']))
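
The rewritten _gen_tf_ filters unknown query words through the corpus dictionary up front, so the term-frequency table it returns is keyed by wordid and the old _convert_ post-filtering step disappears. A toy walk-through, assuming TF_IDF.gen_tf simply counts occurrences (collections.Counter stands in for it here, and the dictionary values are invented):

from collections import Counter

dictionary = {'stock': 0, 'market': 1}           # word -> wordid from the corpus
query = ['stock', 'market', 'stock', 'bitcoin']  # already lemmatized

# words absent from the corpus dictionary are dropped before counting
query_list = [dictionary[w] for w in query if w in dictionary]
tf = Counter(query_list)
print(tf)  # prints Counter({0: 2, 1: 1}): term frequency keyed by wordid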
