Commit ae45032: update

anuprulez committed May 31, 2023
1 parent c494289 commit ae45032
Showing 4 changed files with 86 additions and 35 deletions.
72 changes: 52 additions & 20 deletions scripts/evaluate_model.py
@@ -45,7 +45,7 @@
plt.rc('font', **font)

batch_size = 10
-test_batches = 10
+test_batches = 0
n_topk = 1
max_seq_len = 25

@@ -63,13 +63,13 @@
#"log_19_09_22_GPU_RNN_full_data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/tool_prediction_datasets/computed_results/aug_22 data/rnn/run2/" #"log_19_09_22_GPU_RNN_full_data/" #"log_22_08_22_rnn/" #"log_08_08_22_rnn/"
elif model_type == "cnn":
base_path = "log_cnn/"
base_path = "/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/cnn_full_data/" #"log_cnn/"

elif model_type == "transformer":
base_path = "log/"
base_path = "/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/log_19_09_22_GPU_transformer_full_data/"

elif model_type == "dnn":
base_path = "log_dnn/"
base_path = "/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/dnn_full_data/"

#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/tool_prediction_datasets/computed_results/aug_22 data/transformer/run2/"
#"log_19_09_22_GPU_transformer_full_data/" #"log_12_09_22_GPU/" #"log_19_09_22_GPU_transformer_full_data/"
@@ -92,12 +92,21 @@

## Transformer
## GPU: 40,000 steps, batch size: 512 - 132505.20160245895 seconds
-## CPU: 40 steps, batch  size: 512 - 158683 seconds
+## CPU: 40,000 steps, batch size: 512 - 158683 seconds

## New CPU: Saving model at training step 400/400
## 400 steps: Program finished in 1742.0802121162415 seconds


## RNN
## GPU: 40,000 steps, batch size: 512 - 129000 seconds
## CPU: 40,000 steps, batch size: 512 - 193863 seconds


## New CPU: 400 steps: Saving model at training step 400/400

### 400 steps: Program finished in 2656.933450937271 seconds

#"log_03_08_22_1/" Balanced data with really selection of low freq tools - random choice
# RNN: log_01_08_22_3_rnn
# Transformer: log_01_08_22_0
@@ -109,16 +118,34 @@

#CPU: Program finished in 211880.3916196823 seconds

### CNN 2

# CPU: Program finished in 206890.36703562737 seconds

### CNN Full data

# CPU: Program finished in 230368.01805138588 seconds

#tr_pos_plot = [1000, 5000, 10000, 20000, 30000, 40000]

# DNN
# Saving model at training step 40000/40000

# CPU: Program finished in 177232.39850640297 seconds

## DNN 2

# Program finished in 181519.5743496418 seconds

-model_number = 40
### DNN full

# Saving model at training step 40000/40000

# CPU: Program finished in 209876.5223982334 seconds



+model_number = 40000
model_path = base_path + "saved_model/" + str(model_number) + "/tf_model/"
model_path_h5 = base_path + "saved_model/" + str(model_number) + "/tf_model_h5/"
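For context, a minimal sketch of how a checkpoint under these paths can be restored; the actual loading call sits outside the lines shown in this hunk, so the loader choice below is an assumption rather than the script's confirmed code:

import tensorflow as tf

# Hypothetical stand-ins for the base_path/model_number values built above.
model_path = "log/saved_model/40000/tf_model/"

# tf.keras.models.load_model accepts a SavedModel directory; compile=False
# skips restoring optimizer state, which suffices for inference-only
# evaluation (assumed here, not shown in the diff).
tf_loaded_model = tf.keras.models.load_model(model_path, compile=False)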

@@ -596,7 +623,7 @@ def plot_TSNE(embed, labels):

def predict_seq():

-visualize_loss_acc()
+#visualize_loss_acc()

#sys.exit()

@@ -768,7 +795,7 @@ def predict_seq():
#import sys
#sys.exit()

-te_lowest_t_ids = utils.read_saved_file(base_path + "data/te_lowest_t_ids.txt")
+'''te_lowest_t_ids = utils.read_saved_file(base_path + "data/te_lowest_t_ids.txt")
lowest_t_ids = [int(item) for item in te_lowest_t_ids.split(",")]
print(lowest_t_ids)
lowest_t_ids = lowest_t_ids[:5]
@@ -797,7 +824,7 @@ def predict_seq():
print("Time taken to predict tools: {} seconds".format(low_diff_pred_t))
for i, (low_inp, low_tar) in enumerate(zip(low_te_data, low_te_labels)):
-'''pred_s_time = time.time()
+pred_s_time = time.time()
if predict_rnn is True:
low_prediction = tf_loaded_model([low_inp], training=False)
else:
@@ -806,7 +833,7 @@
pred_e_time = time.time()
low_diff_pred_t = pred_e_time - pred_s_time
low_te_pred_time.append(low_diff_pred_t)
print("Time taken to predict tools: {} seconds".format(low_diff_pred_t))'''
print("Time taken to predict tools: {} seconds".format(low_diff_pred_t))
low_prediction = bat_low_prediction[i]
low_tar = low_te_labels[i]
low_label_pos = np.where(low_tar > 0)[0]
@@ -833,9 +860,9 @@
print("-----------------")
print()
-'''if predict_rnn is False:
+if predict_rnn is False:
i_names = ",".join([r_dict[str(int(item))] for item in low_inp[low_inp_pos]])
-generated_attention(att_weights[i], i_names, f_dict, r_dict)'''
+generated_attention(att_weights[i], i_names, f_dict, r_dict)
if test_batches > 0:
print("Batch Precision@{}: {}".format(n_topk, np.mean(precision)))
@@ -851,13 +878,13 @@
print("Low: test average prediction time: {}".format(np.mean(low_te_pred_time)))
print()
-sys.exit()
+#sys.exit()
print("----------------------------")
print()
print("Predicting for individual sequences...")
print()
#print("Low precision on labels: {}".format(error_label_tools))
#print("Low precision on labels: {}, # tools: {}".format(list(set(error_label_tools)), len(list(set(error_label_tools)))))
#print("Low precision on labels: {}, # tools: {}".format(list(set(error_label_tools)), len(list(set(error_label_tools)))))'''
# individual tools or seq prediction
'''print()
n_topk_ind = 20
@@ -875,9 +902,9 @@
t_ip[2] = int(f_dict["hicexplorer_hicfindtads"])
t_ip[3] = int(f_dict["hicexplorer_hicpca"])'''

t_ip[0, 0] = int(f_dict["mass_spectrometry_imaging_filtering"]) #ivar_covid_aries_consensus
t_ip[0, 1] = int(f_dict["cardinal_preprocessing"])
t_ip[0, 2] = int(f_dict["cardinal_segmentations"])
t_ip[0, 0] = int(f_dict["keras_train_and_eval"]) #ivar_covid_aries_consensus
#t_ip[0, 1] = int(f_dict["cardinal_preprocessing"])
#t_ip[0, 2] = int(f_dict["cardinal_segmentations"])
#t_ip[0, 3] = int(f_dict["heinz"])

#t_ip[4] = int(f_dict["prokka"])
@@ -887,7 +914,12 @@
#t_ip[8] = int(f_dict["anndata_manipulate"])
# 'snpEff_build_gb', 'bwa_mem', 'samtools_view', snpeff_sars_cov_2

last_tool_name = "cardinal_segmentations"
# 1. snpeff_sars_cov_2
# 2. anndata_import
# 3. keras_train_and_eval
# 4. cardinal_preprocessing, cardinal_segmentations, mass_spectrometry_imaging_filtering

last_tool_name = "keras_train_and_eval"

t_ip = tf.convert_to_tensor(t_ip, dtype=tf.int64)
t_ip = tf.cast(t_ip, dtype=tf.float32)
@@ -899,8 +931,8 @@
#t_ip_mask = utils.create_padding_mask(t_ip)
#t_ip_mask = tf.cast(t_ip_mask, dtype=tf.float32)
#prediction, att_weights = tf_loaded_model([t_ip, t_ip_mask], training=False)
-prediction, att_weights = tf_loaded_model(t_ip, training=False)
-print(prediction.shape, att_weights.shape)
+embed, prediction, att_weights = tf_loaded_model(t_ip, training=False)
+print(embed.shape, prediction.shape, att_weights.shape)
pred_e_time = time.time()
print("Time taken to predict tools: {} seconds".format(pred_e_time - pred_s_time))
prediction_cwts = tf.math.multiply(c_weights, prediction)
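Taken together, the hunks above implement single-workflow prediction: tool names are encoded to ids with f_dict, zero-padded to max_seq_len, fed to the loaded model (which, after this commit, also returns the sequence embedding), and the top-scoring output ids are mapped back to names via r_dict. A condensed sketch under those assumptions — the exact top-k code is collapsed in this diff, and the c_weights re-weighting applied later in the script is skipped here:

import numpy as np
import tensorflow as tf

def predict_next_tools(tool_names, tf_loaded_model, f_dict, r_dict,
                       max_seq_len=25, n_topk=5):
    # Zero-padded id sequence; positions 0..len(tool_names)-1 hold the workflow.
    t_ip = np.zeros((1, max_seq_len))
    for pos, name in enumerate(tool_names):
        t_ip[0, pos] = int(f_dict[name])
    t_ip = tf.cast(tf.convert_to_tensor(t_ip, dtype=tf.int64), dtype=tf.float32)
    # Transformer models return (embedding, scores, attention weights) after this commit.
    embed, prediction, att_weights = tf_loaded_model(t_ip, training=False)
    # Rank the vocabulary scores and map the best ids back to tool names.
    top_ids = np.argsort(prediction.numpy()[0])[-n_topk:][::-1]
    return [r_dict[str(i)] for i in top_ids]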
2 changes: 1 addition & 1 deletion scripts/train_cnn.py
@@ -35,7 +35,7 @@ def create_model(vocab_size, config):
dropout = config["dropout"]

model = Sequential()
-model.add(Embedding(vocab_size+1, cnn_units, input_length=config["maximum_path_length"]))
+model.add(Embedding(vocab_size, cnn_units, input_length=config["maximum_path_length"], mask_zero=True))
model.add(Lambda(lambda x: tf.expand_dims(x, 3)))
model.add(Conv2D(cnn_units, kernel_size=(16, 3), activation = 'relu', kernel_initializer='he_normal', padding = 'VALID'))
model.add(Dropout(dropout))
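The surrounding model treats the embedded sequence as a one-channel 2D image: the Lambda(expand_dims) appends a channel axis so Conv2D can slide its (16, 3) kernel over the (sequence, embedding) plane. A shape walk-through with assumed sizes (seq_len=25, cnn_units=32, a vocabulary of 500); these numbers are illustrative, not taken from the config:

import tensorflow as tf

x = tf.zeros((1, 25))                                     # (batch, seq_len)
e = tf.keras.layers.Embedding(500, 32)(x)                 # (1, 25, 32)
e4 = tf.expand_dims(e, 3)                                 # (1, 25, 32, 1): one-channel "image"
c = tf.keras.layers.Conv2D(32, kernel_size=(16, 3), padding="valid")(e4)
print(c.shape)                                            # (1, 10, 30, 32): 25-16+1 by 32-3+1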
2 changes: 1 addition & 1 deletion scripts/train_dnn.py
@@ -34,7 +34,7 @@ def create_model(vocab_size, config):
seq_len = config["maximum_path_length"]

model = Sequential()
-model.add(Embedding(vocab_size+1, dnn_units, input_length=seq_len))
+model.add(Embedding(vocab_size, dnn_units, input_length=seq_len, mask_zero=True))
model.add(SpatialDropout1D(dropout))
model.add(Flatten())
model.add(Dense(dnn_units, input_shape=(seq_len,), activation="elu"))
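Both train_cnn.py and train_dnn.py make the same swap: input_dim drops from vocab_size+1 to vocab_size while mask_zero=True marks id 0 as padding. This only balances out if vocab_size already counts the reserved 0 index, which the diff does not show, so treat that as an assumption. A minimal demonstration of what the flag does:

import tensorflow as tf

emb = tf.keras.layers.Embedding(input_dim=10, output_dim=4, mask_zero=True)
x = tf.constant([[5, 3, 0, 0]])      # trailing zeros are padding positions
print(emb.compute_mask(x))           # tf.Tensor([[ True  True False False]])

Note that a mask only helps if downstream layers propagate it; convolution and Flatten layers generally do not, so whether these models benefit beyond the smaller embedding table depends on code outside this diff.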
45 changes: 32 additions & 13 deletions scripts/transformer_paper_plots.py
@@ -43,10 +43,12 @@


base_path = "/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/"
#"/media/anupkumar/6c9b94c9-2316-4ae1-887a-5047a02bc3d7/home/kumara/tool_prediction_compute_results/backup_tool_pred_transformer_computed_results/aug_22_data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/tool_prediction_datasets/computed_results/aug_22 data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/tool_prediction_datasets/computed_results/aug_22 data/"
-n_runs = 3
+n_runs = 5


def read_file_cnn_dnn(file_path):
@@ -66,7 +68,7 @@ def collect_loss_prec_data(m_type):
print(m_path)
runs_indices = list()
runs_te_loss = list()
-model_numbers = [1, 100, 200, 500, 1000, 1200, 1500, 2000, 2500, 3000, 3500] #, 1500, 2000, 2500, 3000, 3500
+model_numbers = [1, 100, 200, 500, 1000, 1200, 1500, 2000, 2500, 3000, 3500, 4000] #, 1500, 2000, 2500, 3000, 3500
fig = plt.figure(figsize=fig_size)
## Transformer: For test loss
for i in range(n_runs):
@@ -164,10 +166,9 @@
dnn_low_te_precision = dnn_low_te_precision[model_numbers]
dnn_runs_te_prec_low.extend(dnn_low_te_precision)


-df_tr_rnn_cnn_dnn_runs_te_prec = pd.DataFrame(zip(runs_indices, transformer_runs_te_prec, rnn_runs_te_prec, cnn_runs_te_prec, dnn_runs_te_prec, transformer_runs_te_prec_low, rnn_runs_te_prec_low, cnn_runs_te_prec_low, dnn_runs_te_prec_low), columns=["indices", "tran_prec", "rnn_prec", "cnn_prec", "dnn_prec", "transformer_runs_te_prec_low", "rnn_runs_te_prec_low", "cnn_runs_te_prec_low", "dnn_runs_te_prec_low"])
-print(df_tr_rnn_cnn_dnn_runs_te_prec)
+# precision
+df_tr_rnn_cnn_dnn_runs_te_prec = pd.DataFrame(zip(runs_indices, transformer_runs_te_prec, rnn_runs_te_prec, cnn_runs_te_prec, dnn_runs_te_prec), columns=["indices", "tran_prec", "rnn_prec", "cnn_prec", "dnn_prec"])


df_tr_rnn_cnn_dnn_runs_te_prec.to_csv("plots/df_tr_rnn_cnn_dnn_runs_te_prec.csv", index=None, sep="\t")

@@ -176,17 +177,35 @@ def collect_loss_prec_data(m_type):
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="cnn_prec", label="CNN: test tools", color="blue", linestyle="-")
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="dnn_prec", label="DNN: test tools", color="black", linestyle="-")

sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="transformer_runs_te_prec_low", label="Transformer: lowest 25% test tools", color="green", linestyle=":")
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="rnn_runs_te_prec_low", label="RNN (GRU): lowest 25% test tools", color="red", linestyle=":")
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="cnn_runs_te_prec_low", label="CNN: lowest 25% test tools", color="blue", linestyle=":")
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="dnn_runs_te_prec_low", label="DNN: lowest 25% test tools", color="black", linestyle=":")
plt.grid(True)
plt.xlabel("Training iteration")
plt.ylabel("Precision@k")
plt.title("Test: precision@k")

plt.savefig("plots/df_tr_rnn_cnn_dnn_runs_te_prec.pdf", dpi=150, bbox_inches='tight')
plt.show()

+# with low precision
+df_tr_rnn_cnn_dnn_runs_te_prec_low_prec = pd.DataFrame(zip(runs_indices, transformer_runs_te_prec, rnn_runs_te_prec, cnn_runs_te_prec, dnn_runs_te_prec, transformer_runs_te_prec_low, rnn_runs_te_prec_low, cnn_runs_te_prec_low, dnn_runs_te_prec_low), columns=["indices", "tran_prec", "rnn_prec", "cnn_prec", "dnn_prec", "transformer_runs_te_prec_low", "rnn_runs_te_prec_low", "cnn_runs_te_prec_low", "dnn_runs_te_prec_low"])
+
+df_tr_rnn_cnn_dnn_runs_te_prec_low_prec.to_csv("plots/df_tr_rnn_cnn_dnn_runs_te_prec_low_prec.csv", index=None, sep="\t")
+
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="tran_prec", label="Transformer: test tools", color="green", linestyle="-")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="rnn_prec", label="RNN (GRU): test tools", color="red", linestyle="-")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="cnn_prec", label="CNN: test tools", color="blue", linestyle="-")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="dnn_prec", label="DNN: test tools", color="black", linestyle="-")
+
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="transformer_runs_te_prec_low", label="Transformer: lowest 25% test tools", color="green", linestyle=":")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="rnn_runs_te_prec_low", label="RNN (GRU): lowest 25% test tools", color="red", linestyle=":")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="cnn_runs_te_prec_low", label="CNN: lowest 25% test tools", color="blue", linestyle=":")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="dnn_runs_te_prec_low", label="DNN: lowest 25% test tools", color="black", linestyle=":")
+
+plt.grid(True)
+plt.xlabel("Training iteration")
+plt.ylabel("Precision@k")
+plt.title("Test: precision@k")

plt.savefig("plots/transformer_rnn_cnn_dnn_runs_te_low_prec.pdf", dpi=150)
plt.savefig("plots/df_tr_rnn_cnn_dnn_runs_te_prec_low_prec.pdf", dpi=150, bbox_inches='tight')
plt.show()
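The y-axis in these figures is Precision@k; the metric itself is computed in evaluate_model.py (compare np.where(low_tar > 0)[0] in the first file), not here, so the following is a sketch of the usual definition rather than the project's exact code:

import numpy as np

def precision_at_k(scores, true_label_ids, k):
    # Top-k predicted tool ids by score.
    topk_ids = np.argsort(scores)[-k:]
    # Fraction of the k predictions that are true next tools.
    return len(set(topk_ids) & set(true_label_ids)) / float(k)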


@@ -344,7 +363,7 @@ def plot_model_vs_load_time():
plt.xlabel("Training step")
plt.ylabel("Model load time (seconds)")
plt.title("Transformer, RNN (GRU), CNN and DNN models loading time")
plt.savefig("plots/transformer_rnn_runs_model_load_time.pdf", dpi=150)
plt.savefig("plots/transformer_rnn_runs_model_load_time.pdf", dpi=150, bbox_inches='tight')
plt.show()


@@ -497,7 +516,7 @@ def plot_usage_time_vs_seq_len():

############ Call methods ###########################

-collect_loss_prec_data(["transformer", "rnn", "cnn", "dnn"])
+#collect_loss_prec_data(["transformer", "rnn", "cnn", "dnn"])
plot_model_vs_load_time()
#plot_usage_time_vs_topk()
#plot_usage_time_vs_seq_len()
