fix a hardcoded absolute address in check solving time and provide a top-level interface for odds ratio
luosichengx committed May 17, 2021
1 parent 78f9891 commit 8d44106
Showing 9 changed files with 46 additions and 32 deletions.
2 changes: 1 addition & 1 deletion KNN_training_data/gnucore.json

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions check_time.py
@@ -39,6 +39,7 @@ def parse_arg():


def smtfun(data, filename, timeout=300, solver_name="z3"):
base_dir = os.getcwd()
# to debug the parsing and solving error, you may use the code below which is a copy of solve.py
#
# signal.signal(signal.SIGALRM, handler)
@@ -59,8 +60,8 @@ def smtfun(data, filename, timeout=300, solver_name="z3"):
s = time.time()
try:
result = EasyProcess(
'/home/lsc/lsc/query-solvability/bin/python {} {} {}'.format(
"/home/lsc/treelstm.pytorch/solve.py",
'python {} {} {}'.format(
os.path.join(base_dir, "solve.py"),
filename,
solver_name
)
@@ -76,8 +77,9 @@ def smtfun(data, filename, timeout=300, solver_name="z3"):
else:
res = result
except:
# traceback.print_exc()
e = time.time()
res = "unknown"
res = "error"
t = str(e - s)
st = filename + ', solver: ' + solver_name + ', result: ' + str(res) + ', time:' + t
logger.info(st)
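The change above drops the machine-specific interpreter and script paths under /home/lsc/ in favor of resolving solve.py against the current working directory. A minimal sketch of the resulting invocation, assuming the easyprocess package and a solve.py next to the caller:

```python
import os
from easyprocess import EasyProcess

# Resolve solve.py relative to the working directory instead of a
# hardcoded /home/lsc/... absolute path, so the check runs on any machine.
base_dir = os.getcwd()
cmd = 'python {} {} {}'.format(os.path.join(base_dir, "solve.py"),
                               "example.smt2",  # hypothetical SMT2 file
                               "z3")
result = EasyProcess(cmd).call(timeout=300).stdout
print(result)
```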
5 changes: 3 additions & 2 deletions dgl_treelstm/dgl_dataset.py
@@ -9,13 +9,14 @@
SMTBatch = namedtuple('SMTBatch', ['graph', 'wordid', 'label'])

class dgl_dataset(object):
def __init__(self, data, embedding, vocab=None, task="regression", time_selection="original"):
def __init__(self, data, embedding, vocab=None, task="regression", time_selection="original", time_threshold=200):
self.trees = []
self.task = task
self.time_selection = time_selection
self.filename_list = []
self.vocab = vocab
self.num_classes = 2
self.time_threshold = time_threshold
# self.embedding = nn.Embedding(133, 150)
# self.embedding.weight.data.copy_(embedding)
self._load(data)
@@ -52,7 +53,7 @@ def _rec_build(nid, root):
if isinstance(solving_time, bool):
result = 1 if solving_time else 0
else:
result = 1 if solving_time > 200 else 0
result = 1 if solving_time > self.time_threshold else 0
else:
result = solving_time
if not result:
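The new time_threshold argument replaces the hardcoded 200-second cutoff used to label a query as a timeout. A standalone sketch of the labeling rule it parameterizes (function name hypothetical):

```python
def timeout_label(solving_time, time_threshold=200):
    # Classification label: 1 if the query is already flagged as a
    # timeout (boolean True) or its time exceeds the threshold, else 0.
    if isinstance(solving_time, bool):
        return 1 if solving_time else 0
    return 1 if solving_time > time_threshold else 0

assert timeout_label(250.0) == 1
assert timeout_label(250.0, time_threshold=300) == 0
```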
7 changes: 5 additions & 2 deletions merge_check_time.py
@@ -6,14 +6,14 @@

parser = argparse.ArgumentParser()
parser.add_argument('--script_input', default='./data/gnucore/query2')
parser.add_argument('--check_input', default='adjustment.log')
parser.add_argument('--adjust_log_path', default='adjustment.log')
parser.add_argument('--prefix', default='')
parser.add_argument("--output", default=None)
parser.add_argument("--solver", default="z3")
args = parser.parse_args()

time_dict = defaultdict(dict)
with open(args.check_input, "r") as f:
with open(args.adjust_log_path, "r") as f:
data = f.read()
time_list = data.split('\n')
solver = args.solver
@@ -83,6 +83,9 @@ def modify_time_representation(data):
else:
if key in data["solving_time_dic"].keys():
data["solving_time_dic"][key].extend(item)
check_valid = data["solving_time_dic"][key]
while(-1 in check_valid and len(check_valid) > 1):
check_valid.remove(-1)
else:
data["solving_time_dic"][key] = item
# data["solving_time_dic"] = time_dict[file]
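The loop added above prunes -1 entries (used as a failed-measurement placeholder) once real timings are merged in, while keeping at least one element per key. The same cleanup in isolation (a sketch):

```python
def drop_placeholders(times, sentinel=-1):
    # Remove sentinel values but never empty the list entirely, so a
    # key with no valid measurement still records the failure.
    while sentinel in times and len(times) > 1:
        times.remove(sentinel)
    return times

print(drop_placeholders([-1, 3.2, -1]))  # [3.2]
print(drop_placeholders([-1]))           # [-1]
```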
12 changes: 9 additions & 3 deletions preprocessing/feature_extraction.py
@@ -108,9 +108,15 @@ def cal_training_label(self):
else:
time_list = self.script_info.solving_time_dic
if time_list:
time_list = [float(x) for x in time_list]
# self.adjust_time = max(time_list)
self.adjust_time = sum(time_list) / len(time_list)
valid_time_list = []
for x in time_list:
if float(x) > 0:
valid_time_list.append(float(x))
# self.adjust_time = max(time_list)
if len(valid_time_list) == 0:
self.adjust_time = 0
else:
self.adjust_time = sum(valid_time_list) / len(valid_time_list)
else:
self.adjust_time = 0
if self.script_info.solving_time:
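The rewritten block averages only positive solving times, so -1 placeholders no longer drag the training label down; when no valid measurement remains, the label falls back to 0. Equivalent standalone logic (a sketch):

```python
def average_valid_time(time_list):
    # Keep only positive measurements (dropping e.g. the -1 failure
    # sentinel) and average them; default to 0 when none are valid.
    valid = [float(x) for x in time_list if float(x) > 0]
    return sum(valid) / len(valid) if valid else 0

print(average_valid_time(["-1", "2.0", "4.0"]))  # 3.0
print(average_valid_time(["-1"]))                # 0
```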
8 changes: 4 additions & 4 deletions simulation.py
@@ -215,7 +215,7 @@ def script_to_feature(self, data):
smt_vocab_file = './data/gnucore/smt.vocab'
smt_vocab = Vocab(filename=smt_vocab_file,
data=[Constants.UNK_WORD])
data = dgl_dataset([data], None, vocab=smt_vocab,time_selection=self.time_selection)
data = dgl_dataset([data], None, vocab=smt_vocab, time_selection=self.time_selection, time_threshold=self.threshold)
dataloader = DataLoader(dataset=data, batch_size=1, collate_fn=batcher("cpu"),
shuffle=False, num_workers=0)
iterator = iter(dataloader)
@@ -394,7 +394,7 @@ def make_PCC_output(input, output_result):
"total_time": final_time, "input": input, "pos_num":sum(truth), "tp": sum(truth)*rec,
"fn": sum(truth)*(1 - rec), "fp": sum(truth)*rec/pre - sum(truth)*rec}
print(print_output)
output = {"train_dataset": "gnucore", "test_dataset": "gnucore", "predicted_solving_time": pred,
output = {"train_dataset": "gnucore", "test_dataset": "gnucore", "predicted_result": pred,
"acutal_solving_time": truth, "original_time": original_time, "total_time": final_time,
"metrics": {"acc": acc, "pre": pre, "rec": rec, "f1": f1, "pos_num":sum(truth), "tp": sum(truth)*rec,
"fn": sum(truth)*(1 - rec), "fp": sum(truth)*rec/pre - sum(truth)*rec},
@@ -443,7 +443,7 @@ def make_output(dsn1, dsn2, input, simulation, result, time_section, output_resu
# "fn": sum(truth)*(1 - rec), "fp": sum(truth)*rec/pre - sum(truth)*rec}
print_output = {"timeout_query_num":sum(truth), "true-positive number": confusion_matrix[1][1],
"false-negative number": confusion_matrix[1][0], "false-positive number": confusion_matrix[0][1]}
output = {"train_dataset": dsn1, "test_dataset": dsn2, "predicted_solving_time": result.pred.tolist(),
output = {"train_dataset": dsn1, "test_dataset": dsn2, "predicted_result": result.pred.tolist(),
"acutal_solving_time": result.truth.tolist(), "original_time": time_section.original_time, "predict_time":
time_section.predict_time + time_section.preprocessing_time, "total_time": time_section.final_time,
"metrics":{"acc": acc, "pre": pre, "rec": rec, "f1": f1}, "time_out_setting": simulation.time_out_setting,
@@ -572,7 +572,7 @@ def simulation_for_single_program(test_directory, args):
if model_name != "KNN" and not regression:
pred = th.argmax(F.log_softmax(predict_result), 1)
skip = pred == 1
predict_result = 300 if skip else 0
predict_result = 1 if skip else 0
time_section.update(skip, solve_time)
result.add(predict_result, solve_time, skip)
aindex += 1
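With the rename from predicted_solving_time to predicted_result, the output now records a binary skip decision (1 = predicted timeout) instead of a stand-in solving time of 300 seconds. A sketch of the decision step, assuming a two-class classifier output:

```python
import torch as th
import torch.nn.functional as F

# Hypothetical 1x2 logits: column 0 = "fast", column 1 = "timeout".
logits = th.tensor([[0.2, 1.3]])
pred = th.argmax(F.log_softmax(logits, dim=1), 1)
skip = bool(pred == 1)       # predicted to time out, so skip solving
predict_result = 1 if skip else 0
print(predict_result)        # 1
```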
5 changes: 0 additions & 5 deletions solve.py
@@ -54,11 +54,6 @@ def process_data(data):
data = data.replace("bvudiv_i", "bvudiv")
data = data.replace("bvsdiv_i", "bvsdiv")
script = parser.get_script(cStringIO(data))
# i = 0
# for ind, asse in enumerate(script.commands):
# if asse.name == "assert":
# i = ind
# script.commands.remove(script.commands[i])
s = time.time()
log = script.evaluate(solver)
e = time.time()
6 changes: 3 additions & 3 deletions train.py
@@ -209,8 +209,8 @@ def main(args):
if args.model == "tree-lstm":
trainer = Trainer(args, model, criterion, optimizer, device, metric, metric_name)
random.shuffle(train_dataset)
train_dataset = dgl_dataset(train_dataset, pretrained_emb, smt_vocab, task, args.time_selection)
test_dataset = dgl_dataset(test_dataset, pretrained_emb, smt_vocab, task, args.time_selection)
train_dataset = dgl_dataset(train_dataset, pretrained_emb, smt_vocab, task, args.time_selection, args.threshold)
test_dataset = dgl_dataset(test_dataset, pretrained_emb, smt_vocab, task, args.time_selection, args.threshold)

train_loader = DataLoader(dataset=train_dataset,
batch_size=args.batch_size,
@@ -308,7 +308,7 @@ def evaluate_once(args, device, metric_list, metric_name, pretrained_emb, smt_vo
collate_fn=pad_feature_batcher(device, args.time_selection, task, args.threshold),
shuffle=False, num_workers=0)
elif args.model == "tree-lstm":
dataset = dgl_dataset(test_dataset, pretrained_emb, smt_vocab, task, args.time_selection)
dataset = dgl_dataset(test_dataset, pretrained_emb, smt_vocab, task, args.time_selection, args.threshold)
test_loader = DataLoader(dataset=dataset,
batch_size=100, collate_fn=batcher(device), shuffle=False, num_workers=0)
else:
25 changes: 16 additions & 9 deletions train_KNN.py
@@ -57,8 +57,9 @@ def main(args):

# some data analyse for the data
# cor(data)
# odds_ratio_test(data)
# return
if args.odds_ratio:
odds_ratio_test(data)
# return

for fn in test_filename:
if "smt-comp" in args.input:
@@ -134,23 +135,25 @@ def odds_ratio_test(train_dataset):
index = np.argwhere(x_with == True).reshape(-1)
y_wop, y_wo = sum(y[index]), len(index)
try:
# print(op[i])
# print(y_wp, y_w)
# print(y_wop, y_wo)
if y_w == 0:
print("absent")
if i < len(op):
print(i, op[i], "absent of operator")
continue
if y_wo < 10:
if i < len(op):
print(i, op[i], "little without")
print(i, op[i], "too little without scripts")
elif i >= 111:
print(i, "var", "unsuitable")
continue
# print(i, "var", "unsuitable")
continue
if y_w < 10:
if i < len(op):
print(i, op[i], "little with")
print(i, op[i], "too little with scripts")
elif i >= 111:
print(i, "var", "unsuitable")
continue
# print(i, "var", "unsuitable")
continue
if i < len(op):
print(i, op[i], (y_wp / y_w) / (y_wop / y_wo))
@@ -225,7 +228,10 @@ def load_dataset(args):
output_dir = os.path.join( args.input)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
train_file = os.path.join(output_dir, 'train')
if os.path.isdir(output_dir):
train_file = os.path.join(output_dir, 'train')
else:
train_file = output_dir
if os.path.isfile(train_file):
train_dataset = construct_data_from_json(train_file)
# train_dataset = th.load(train_file)
@@ -249,6 +255,7 @@
def parse_arg():
# global args
parser = argparse.ArgumentParser()
parser.add_argument('--odds_ratio', action='store_true', help="print odds_ratio for all features")
parser.add_argument('--data_source', default='gnucore/fv2', help="scripts saving directory")
parser.add_argument('--input', default='gnucore/training', help="saving directory of feature after "
"extraction, avoid duplicate preprocess")
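The ratio printed in odds_ratio_test, (y_wp / y_w) / (y_wop / y_wo), compares the timeout rate among scripts containing an operator against the rate among scripts without it; values above 1 flag operators associated with slow queries. A sketch of that computation in isolation (names hypothetical), now reachable via the new flag, e.g. `python train_KNN.py --odds_ratio`, instead of uncommenting the call in main:

```python
def feature_odds_ratio(timeouts_with, total_with, timeouts_without, total_without):
    # Ratio of timeout rates: > 1 means scripts containing the feature
    # time out more often than scripts without it.
    return (timeouts_with / total_with) / (timeouts_without / total_without)

print(feature_odds_ratio(30, 100, 10, 100))  # 3.0
```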
