fix a hardcoded absolute address in check solving time and provide a top-level interface for odds ratio
luosichengx committed May 17, 2021
1 parent 78f9891 commit 8d44106
Showing 9 changed files with 46 additions and 32 deletions.
2 changes: 1 addition & 1 deletion KNN_training_data/gnucore.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions check_time.py
@@ -39,6 +39,7 @@ def parse_arg():


def smtfun(data, filename, timeout=300, solver_name="z3"):
base_dir = os.getcwd()
# to debug the parsing and solving error, you may use the code below which is a copy of solve.py
#
# signal.signal(signal.SIGALRM, handler)
@@ -59,8 +60,8 @@ def smtfun(data, filename, timeout=300, solver_name="z3"):
s = time.time()
try:
result = EasyProcess(
'/home/lsc/lsc/query-solvability/bin/python {} {} {}'.format(
"/home/lsc/treelstm.pytorch/solve.py",
'python {} {} {}'.format(
os.path.join(base_dir, "solve.py"),
filename,
solver_name
)
@@ -76,8 +77,9 @@ def smtfun(data, filename, timeout=300, solver_name="z3"):
else:
res = result
except:
# traceback.print_exc()
e = time.time()
res = "unknown"
res = "error"
t = str(e - s)
st = filename + ', solver: ' + solver_name + ', result: ' + str(res) + ', time:' + t
logger.info(st)
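The change above drops the machine-specific interpreter and script paths under /home/lsc/ in favor of resolving solve.py against the current working directory. A minimal sketch of the resulting invocation, assuming the easyprocess package and a solve.py next to the caller:

```python
import os
from easyprocess import EasyProcess

# Resolve solve.py relative to the working directory instead of a
# hardcoded /home/lsc/... absolute path, so the check runs on any machine.
base_dir = os.getcwd()
cmd = 'python {} {} {}'.format(os.path.join(base_dir, "solve.py"),
                               "example.smt2",  # hypothetical SMT2 file
                               "z3")
result = EasyProcess(cmd).call(timeout=300).stdout
print(result)
```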
5 changes: 3 additions & 2 deletions dgl_treelstm/dgl_dataset.py
@@ -9,13 +9,14 @@
SMTBatch = namedtuple('SMTBatch', ['graph', 'wordid', 'label'])

class dgl_dataset(object):
def __init__(self, data, embedding, vocab=None, task="regression", time_selection="original"):
def __init__(self, data, embedding, vocab=None, task="regression", time_selection="original", time_threshold=200):
self.trees = []
self.task = task
self.time_selection = time_selection
self.filename_list = []
self.vocab = vocab
self.num_classes = 2
self.time_threshold = time_threshold
# self.embedding = nn.Embedding(133, 150)
# self.embedding.weight.data.copy_(embedding)
self._load(data)
@@ -52,7 +53,7 @@ def _rec_build(nid, root):
if isinstance(solving_time, bool):
result = 1 if solving_time else 0
else:
result = 1 if solving_time > 200 else 0
result = 1 if solving_time > self.time_threshold else 0
else:
result = solving_time
if not result:
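The new time_threshold argument replaces the hardcoded 200-second cutoff used to label a query as a timeout. A standalone sketch of the labeling rule it parameterizes (function name hypothetical):

```python
def timeout_label(solving_time, time_threshold=200):
    # Classification label: 1 if the query is already flagged as a
    # timeout (boolean True) or its time exceeds the threshold, else 0.
    if isinstance(solving_time, bool):
        return 1 if solving_time else 0
    return 1 if solving_time > time_threshold else 0

assert timeout_label(250.0) == 1
assert timeout_label(250.0, time_threshold=300) == 0
```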
7 changes: 5 additions & 2 deletions merge_check_time.py
@@ -6,14 +6,14 @@

parser = argparse.ArgumentParser()
parser.add_argument('--script_input', default='./data/gnucore/query2')
parser.add_argument('--check_input', default='adjustment.log')
parser.add_argument('--adjust_log_path', default='adjustment.log')
parser.add_argument('--prefix', default='')
parser.add_argument("--output", default=None)
parser.add_argument("--solver", default="z3")
args = parser.parse_args()

time_dict = defaultdict(dict)
with open(args.check_input, "r") as f:
with open(args.adjust_log_path, "r") as f:
data = f.read()
time_list = data.split('\n')
solver = args.solver
@@ -83,6 +83,9 @@ def modify_time_representation(data):
else:
if key in data["solving_time_dic"].keys():
data["solving_time_dic"][key].extend(item)
check_valid = data["solving_time_dic"][key]
while(-1 in check_valid and len(check_valid) > 1):
check_valid.remove(-1)
else:
data["solving_time_dic"][key] = item
# data["solving_time_dic"] = time_dict[file]
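The loop added above prunes -1 entries (used as a failed-measurement placeholder) once real timings are merged in, while keeping at least one element per key. The same cleanup in isolation (a sketch):

```python
def drop_placeholders(times, sentinel=-1):
    # Remove sentinel values but never empty the list entirely, so a
    # key with no valid measurement still records the failure.
    while sentinel in times and len(times) > 1:
        times.remove(sentinel)
    return times

print(drop_placeholders([-1, 3.2, -1]))  # [3.2]
print(drop_placeholders([-1]))           # [-1]
```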
12 changes: 9 additions & 3 deletions preprocessing/feature_extraction.py
@@ -108,9 +108,15 @@ def cal_training_label(self):
else:
time_list = self.script_info.solving_time_dic
if time_list:
time_list = [float(x) for x in time_list]
# self.adjust_time = max(time_list)
self.adjust_time = sum(time_list) / len(time_list)
valid_time_list = []
for x in time_list:
if float(x) > 0:
valid_time_list.append(float(x))
# self.adjust_time = max(time_list)
if len(valid_time_list) == 0:
self.adjust_time = 0
else:
self.adjust_time = sum(valid_time_list) / len(valid_time_list)
else:
self.adjust_time = 0
if self.script_info.solving_time:
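The rewritten block averages only positive solving times, so -1 placeholders no longer drag the training label down; when no valid measurement remains, the label falls back to 0. Equivalent standalone logic (a sketch):

```python
def average_valid_time(time_list):
    # Keep only positive measurements (dropping e.g. the -1 failure
    # sentinel) and average them; default to 0 when none are valid.
    valid = [float(x) for x in time_list if float(x) > 0]
    return sum(valid) / len(valid) if valid else 0

print(average_valid_time(["-1", "2.0", "4.0"]))  # 3.0
print(average_valid_time(["-1"]))                # 0
```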
8 changes: 4 additions & 4 deletions simulation.py
@@ -215,7 +215,7 @@ def script_to_feature(self, data):
smt_vocab_file = './data/gnucore/smt.vocab'
smt_vocab = Vocab(filename=smt_vocab_file,
data=[Constants.UNK_WORD])
data = dgl_dataset([data], None, vocab=smt_vocab,time_selection=self.time_selection)
data = dgl_dataset([data], None, vocab=smt_vocab, time_selection=self.time_selection, time_threshold=self.threshold)
dataloader = DataLoader(dataset=data, batch_size=1, collate_fn=batcher("cpu"),
shuffle=False, num_workers=0)
iterator = iter(dataloader)
@@ -394,7 +394,7 @@ def make_PCC_output(input, output_result):
"total_time": final_time, "input": input, "pos_num":sum(truth), "tp": sum(truth)*rec,
"fn": sum(truth)*(1 - rec), "fp": sum(truth)*rec/pre - sum(truth)*rec}
print(print_output)
output = {"train_dataset": "gnucore", "test_dataset": "gnucore", "predicted_solving_time": pred,
output = {"train_dataset": "gnucore", "test_dataset": "gnucore", "predicted_result": pred,
"acutal_solving_time": truth, "original_time": original_time, "total_time": final_time,
"metrics": {"acc": acc, "pre": pre, "rec": rec, "f1": f1, "pos_num":sum(truth), "tp": sum(truth)*rec,
"fn": sum(truth)*(1 - rec), "fp": sum(truth)*rec/pre - sum(truth)*rec},
@@ -443,7 +443,7 @@ def make_output(dsn1, dsn2, input, simulation, result, time_section, output_resu
# "fn": sum(truth)*(1 - rec), "fp": sum(truth)*rec/pre - sum(truth)*rec}
print_output = {"timeout_query_num":sum(truth), "true-positive number": confusion_matrix[1][1],
"false-negative number": confusion_matrix[1][0], "false-positive number": confusion_matrix[0][1]}
output = {"train_dataset": dsn1, "test_dataset": dsn2, "predicted_solving_time": result.pred.tolist(),
output = {"train_dataset": dsn1, "test_dataset": dsn2, "predicted_result": result.pred.tolist(),
"acutal_solving_time": result.truth.tolist(), "original_time": time_section.original_time, "predict_time":
time_section.predict_time + time_section.preprocessing_time, "total_time": time_section.final_time,
"metrics":{"acc": acc, "pre": pre, "rec": rec, "f1": f1}, "time_out_setting": simulation.time_out_setting,
@@ -572,7 +572,7 @@ def simulation_for_single_program(test_directory, args):
if model_name != "KNN" and not regression:
pred = th.argmax(F.log_softmax(predict_result), 1)
skip = pred == 1
predict_result = 300 if skip else 0
predict_result = 1 if skip else 0
time_section.update(skip, solve_time)
result.add(predict_result, solve_time, skip)
aindex += 1
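With the rename from predicted_solving_time to predicted_result, the output now records a binary skip decision (1 = predicted timeout) instead of a stand-in solving time of 300 seconds. A sketch of the decision step, assuming a two-class classifier output:

```python
import torch as th
import torch.nn.functional as F

# Hypothetical 1x2 logits: column 0 = "fast", column 1 = "timeout".
logits = th.tensor([[0.2, 1.3]])
pred = th.argmax(F.log_softmax(logits, dim=1), 1)
skip = bool(pred == 1)       # predicted to time out, so skip solving
predict_result = 1 if skip else 0
print(predict_result)        # 1
```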
5 changes: 0 additions & 5 deletions solve.py
@@ -54,11 +54,6 @@ def process_data(data):
data = data.replace("bvudiv_i", "bvudiv")
data = data.replace("bvsdiv_i", "bvsdiv")
script = parser.get_script(cStringIO(data))
# i = 0
# for ind, asse in enumerate(script.commands):
# if asse.name == "assert":
# i = ind
# script.commands.remove(script.commands[i])
s = time.time()
log = script.evaluate(solver)
e = time.time()
6 changes: 3 additions & 3 deletions train.py
@@ -209,8 +209,8 @@ def main(args):
if args.model == "tree-lstm":
trainer = Trainer(args, model, criterion, optimizer, device, metric, metric_name)
random.shuffle(train_dataset)
train_dataset = dgl_dataset(train_dataset, pretrained_emb, smt_vocab, task, args.time_selection)
test_dataset = dgl_dataset(test_dataset, pretrained_emb, smt_vocab, task, args.time_selection)
train_dataset = dgl_dataset(train_dataset, pretrained_emb, smt_vocab, task, args.time_selection, args.threshold)
test_dataset = dgl_dataset(test_dataset, pretrained_emb, smt_vocab, task, args.time_selection, args.threshold)

train_loader = DataLoader(dataset=train_dataset,
batch_size=args.batch_size,
@@ -308,7 +308,7 @@ def evaluate_once(args, device, metric_list, metric_name, pretrained_emb, smt_vo
collate_fn=pad_feature_batcher(device, args.time_selection, task, args.threshold),
shuffle=False, num_workers=0)
elif args.model == "tree-lstm":
dataset = dgl_dataset(test_dataset, pretrained_emb, smt_vocab, task, args.time_selection)
dataset = dgl_dataset(test_dataset, pretrained_emb, smt_vocab, task, args.time_selection, args.threshold)
test_loader = DataLoader(dataset=dataset,
batch_size=100, collate_fn=batcher(device), shuffle=False, num_workers=0)
else:
25 changes: 16 additions & 9 deletions train_KNN.py
@@ -57,8 +57,9 @@ def main(args):

# some data analyse for the data
# cor(data)
# odds_ratio_test(data)
# return
if args.odds_ratio:
odds_ratio_test(data)
# return

for fn in test_filename:
if "smt-comp" in args.input:
@@ -134,23 +135,25 @@ def odds_ratio_test(train_dataset):
index = np.argwhere(x_with == True).reshape(-1)
y_wop, y_wo = sum(y[index]), len(index)
try:
# print(op[i])
# print(y_wp, y_w)
# print(y_wop, y_wo)
if y_w == 0:
print("absent")
if i < len(op):
print(i, op[i], "absent of operator")
continue
if y_wo < 10:
if i < len(op):
print(i, op[i], "little without")
print(i, op[i], "too little without scripts")
elif i >= 111:
print(i, "var", "unsuitable")
continue
# print(i, "var", "unsuitable")
continue
if y_w < 10:
if i < len(op):
print(i, op[i], "little with")
print(i, op[i], "too little with scripts")
elif i >= 111:
print(i, "var", "unsuitable")
continue
# print(i, "var", "unsuitable")
continue
if i < len(op):
print(i, op[i], (y_wp / y_w) / (y_wop / y_wo))
@@ -225,7 +228,10 @@ def load_dataset(args):
output_dir = os.path.join( args.input)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
train_file = os.path.join(output_dir, 'train')
if os.path.isdir(output_dir):
train_file = os.path.join(output_dir, 'train')
else:
train_file = output_dir
if os.path.isfile(train_file):
train_dataset = construct_data_from_json(train_file)
# train_dataset = th.load(train_file)
@@ -249,6 +255,7 @@
def parse_arg():
# global args
parser = argparse.ArgumentParser()
parser.add_argument('--odds_ratio', action='store_true', help="print odds_ratio for all features")
parser.add_argument('--data_source', default='gnucore/fv2', help="scripts saving directory")
parser.add_argument('--input', default='gnucore/training', help="saving directory of feature after "
"extraction, avoid duplicate preprocess")
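The ratio printed in odds_ratio_test, (y_wp / y_w) / (y_wop / y_wo), compares the timeout rate among scripts containing an operator against the rate among scripts without it; values above 1 flag operators associated with slow queries. A sketch of that computation in isolation (names hypothetical), now reachable via the new flag, e.g. `python train_KNN.py --odds_ratio`, instead of uncommenting the call in main:

```python
def feature_odds_ratio(timeouts_with, total_with, timeouts_without, total_without):
    # Ratio of timeout rates: > 1 means scripts containing the feature
    # time out more often than scripts without it.
    return (timeouts_with / total_with) / (timeouts_without / total_without)

print(feature_odds_ratio(30, 100, 10, 100))  # 3.0
```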
