Commit ae45032: update

anuprulez committed May 31, 2023
1 parent c494289 commit ae45032
Showing 4 changed files with 86 additions and 35 deletions.
72 changes: 52 additions & 20 deletions scripts/evaluate_model.py
@@ -45,7 +45,7 @@
plt.rc('font', **font)

batch_size = 10
-test_batches = 10
+test_batches = 0
n_topk = 1
max_seq_len = 25

@@ -63,13 +63,13 @@
#"log_19_09_22_GPU_RNN_full_data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/tool_prediction_datasets/computed_results/aug_22 data/rnn/run2/" #"log_19_09_22_GPU_RNN_full_data/" #"log_22_08_22_rnn/" #"log_08_08_22_rnn/"
elif model_type == "cnn":
base_path = "log_cnn/"
base_path = "/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/cnn_full_data/" #"log_cnn/"

elif model_type == "transformer":
base_path = "log/"
base_path = "/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/log_19_09_22_GPU_transformer_full_data/"

elif model_type == "dnn":
base_path = "log_dnn/"
base_path = "/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/dnn_full_data/"

#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/tool_prediction_datasets/computed_results/aug_22 data/transformer/run2/"
#"log_19_09_22_GPU_transformer_full_data/" #"log_12_09_22_GPU/" #"log_19_09_22_GPU_transformer_full_data/"
@@ -92,12 +92,21 @@

## Transformer
## GPU: 40,000 steps, batch size: 512 - 132505.20160245895 seconds
-## CPU: 40 steps, batch  size: 512 - 158683 seconds
+## CPU: 40,000 steps, batch size: 512 - 158683 seconds

## New CPU: Saving model at training step 400/400
## 400 steps: Program finished in 1742.0802121162415 seconds


## RNN
## GPU: 40,000 steps, batch size: 512 - 129000 seconds
## CPU: 40,000 steps, batch size: 512 - 193863 seconds


## New CPU: 400 steps: Saving model at training step 400/400

### 400 steps: Program finished in 2656.933450937271 seconds

#"log_03_08_22_1/" Balanced data with really selection of low freq tools - random choice
# RNN: log_01_08_22_3_rnn
# Transformer: log_01_08_22_0
@@ -109,16 +118,34 @@

#CPU: Program finished in 211880.3916196823 seconds

### CNN 2

# CPU: Program finished in 206890.36703562737 seconds

### CNN Full data

# CPU: Program finished in 230368.01805138588 seconds

#tr_pos_plot = [1000, 5000, 10000, 20000, 30000, 40000]

# DNN
# Saving model at training step 40000/40000

# CPU: Program finished in 177232.39850640297 seconds

## DNN 2

# Program finished in 181519.5743496418 seconds

-model_number = 40
### DNN full

# Saving model at training step 40000/40000

# CPU: Program finished in 209876.5223982334 seconds



+model_number = 40000
model_path = base_path + "saved_model/" + str(model_number) + "/tf_model/"
model_path_h5 = base_path + "saved_model/" + str(model_number) + "/tf_model_h5/"
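For context, a minimal sketch of how a checkpoint under these paths can be restored; the actual loading call sits outside the lines shown in this hunk, so the loader choice below is an assumption rather than the script's confirmed code:

import tensorflow as tf

# Hypothetical stand-ins for the base_path/model_number values built above.
model_path = "log/saved_model/40000/tf_model/"

# tf.keras.models.load_model accepts a SavedModel directory; compile=False
# skips restoring optimizer state, which suffices for inference-only
# evaluation (assumed here, not shown in the diff).
tf_loaded_model = tf.keras.models.load_model(model_path, compile=False)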

@@ -596,7 +623,7 @@ def plot_TSNE(embed, labels):

def predict_seq():

-visualize_loss_acc()
+#visualize_loss_acc()

#sys.exit()

@@ -768,7 +795,7 @@ def predict_seq():
#import sys
#sys.exit()

-te_lowest_t_ids = utils.read_saved_file(base_path + "data/te_lowest_t_ids.txt")
+'''te_lowest_t_ids = utils.read_saved_file(base_path + "data/te_lowest_t_ids.txt")
lowest_t_ids = [int(item) for item in te_lowest_t_ids.split(",")]
print(lowest_t_ids)
lowest_t_ids = lowest_t_ids[:5]
@@ -797,7 +824,7 @@ def predict_seq():
print("Time taken to predict tools: {} seconds".format(low_diff_pred_t))
for i, (low_inp, low_tar) in enumerate(zip(low_te_data, low_te_labels)):
-'''pred_s_time = time.time()
+pred_s_time = time.time()
if predict_rnn is True:
low_prediction = tf_loaded_model([low_inp], training=False)
else:
@@ -806,7 +833,7 @@
pred_e_time = time.time()
low_diff_pred_t = pred_e_time - pred_s_time
low_te_pred_time.append(low_diff_pred_t)
print("Time taken to predict tools: {} seconds".format(low_diff_pred_t))'''
print("Time taken to predict tools: {} seconds".format(low_diff_pred_t))
low_prediction = bat_low_prediction[i]
low_tar = low_te_labels[i]
low_label_pos = np.where(low_tar > 0)[0]
@@ -833,9 +860,9 @@
print("-----------------")
print()
-'''if predict_rnn is False:
+if predict_rnn is False:
i_names = ",".join([r_dict[str(int(item))] for item in low_inp[low_inp_pos]])
-generated_attention(att_weights[i], i_names, f_dict, r_dict)'''
+generated_attention(att_weights[i], i_names, f_dict, r_dict)
if test_batches > 0:
print("Batch Precision@{}: {}".format(n_topk, np.mean(precision)))
@@ -851,13 +878,13 @@
print("Low: test average prediction time: {}".format(np.mean(low_te_pred_time)))
print()
-sys.exit()
+#sys.exit()
print("----------------------------")
print()
print("Predicting for individual sequences...")
print()
#print("Low precision on labels: {}".format(error_label_tools))
#print("Low precision on labels: {}, # tools: {}".format(list(set(error_label_tools)), len(list(set(error_label_tools)))))
#print("Low precision on labels: {}, # tools: {}".format(list(set(error_label_tools)), len(list(set(error_label_tools)))))'''
# individual tools or seq prediction
'''print()
n_topk_ind = 20
@@ -875,9 +902,9 @@
t_ip[2] = int(f_dict["hicexplorer_hicfindtads"])
t_ip[3] = int(f_dict["hicexplorer_hicpca"])'''

t_ip[0, 0] = int(f_dict["mass_spectrometry_imaging_filtering"]) #ivar_covid_aries_consensus
t_ip[0, 1] = int(f_dict["cardinal_preprocessing"])
t_ip[0, 2] = int(f_dict["cardinal_segmentations"])
t_ip[0, 0] = int(f_dict["keras_train_and_eval"]) #ivar_covid_aries_consensus
#t_ip[0, 1] = int(f_dict["cardinal_preprocessing"])
#t_ip[0, 2] = int(f_dict["cardinal_segmentations"])
#t_ip[0, 3] = int(f_dict["heinz"])

#t_ip[4] = int(f_dict["prokka"])
@@ -887,7 +914,12 @@
#t_ip[8] = int(f_dict["anndata_manipulate"])
# 'snpEff_build_gb', 'bwa_mem', 'samtools_view', snpeff_sars_cov_2

last_tool_name = "cardinal_segmentations"
# 1. snpeff_sars_cov_2
# 2. anndata_import
# 3. keras_train_and_eval
# 4. cardinal_preprocessing, cardinal_segmentations, mass_spectrometry_imaging_filtering

last_tool_name = "keras_train_and_eval"

t_ip = tf.convert_to_tensor(t_ip, dtype=tf.int64)
t_ip = tf.cast(t_ip, dtype=tf.float32)
@@ -899,8 +931,8 @@
#t_ip_mask = utils.create_padding_mask(t_ip)
#t_ip_mask = tf.cast(t_ip_mask, dtype=tf.float32)
#prediction, att_weights = tf_loaded_model([t_ip, t_ip_mask], training=False)
-prediction, att_weights = tf_loaded_model(t_ip, training=False)
-print(prediction.shape, att_weights.shape)
+embed, prediction, att_weights = tf_loaded_model(t_ip, training=False)
+print(embed.shape, prediction.shape, att_weights.shape)
pred_e_time = time.time()
print("Time taken to predict tools: {} seconds".format(pred_e_time - pred_s_time))
prediction_cwts = tf.math.multiply(c_weights, prediction)
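Taken together, the hunks above implement single-workflow prediction: tool names are encoded to ids with f_dict, zero-padded to max_seq_len, fed to the loaded model (which, after this commit, also returns the sequence embedding), and the top-scoring output ids are mapped back to names via r_dict. A condensed sketch under those assumptions — the exact top-k code is collapsed in this diff, and the c_weights re-weighting applied later in the script is skipped here:

import numpy as np
import tensorflow as tf

def predict_next_tools(tool_names, tf_loaded_model, f_dict, r_dict,
                       max_seq_len=25, n_topk=5):
    # Zero-padded id sequence; positions 0..len(tool_names)-1 hold the workflow.
    t_ip = np.zeros((1, max_seq_len))
    for pos, name in enumerate(tool_names):
        t_ip[0, pos] = int(f_dict[name])
    t_ip = tf.cast(tf.convert_to_tensor(t_ip, dtype=tf.int64), dtype=tf.float32)
    # Transformer models return (embedding, scores, attention weights) after this commit.
    embed, prediction, att_weights = tf_loaded_model(t_ip, training=False)
    # Rank the vocabulary scores and map the best ids back to tool names.
    top_ids = np.argsort(prediction.numpy()[0])[-n_topk:][::-1]
    return [r_dict[str(i)] for i in top_ids]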
2 changes: 1 addition & 1 deletion scripts/train_cnn.py
@@ -35,7 +35,7 @@ def create_model(vocab_size, config):
dropout = config["dropout"]

model = Sequential()
-model.add(Embedding(vocab_size+1, cnn_units, input_length=config["maximum_path_length"]))
+model.add(Embedding(vocab_size, cnn_units, input_length=config["maximum_path_length"], mask_zero=True))
model.add(Lambda(lambda x: tf.expand_dims(x, 3)))
model.add(Conv2D(cnn_units, kernel_size=(16, 3), activation = 'relu', kernel_initializer='he_normal', padding = 'VALID'))
model.add(Dropout(dropout))
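The surrounding model treats the embedded sequence as a one-channel 2D image: the Lambda(expand_dims) appends a channel axis so Conv2D can slide its (16, 3) kernel over the (sequence, embedding) plane. A shape walk-through with assumed sizes (seq_len=25, cnn_units=32, a vocabulary of 500); these numbers are illustrative, not taken from the config:

import tensorflow as tf

x = tf.zeros((1, 25))                                     # (batch, seq_len)
e = tf.keras.layers.Embedding(500, 32)(x)                 # (1, 25, 32)
e4 = tf.expand_dims(e, 3)                                 # (1, 25, 32, 1): one-channel "image"
c = tf.keras.layers.Conv2D(32, kernel_size=(16, 3), padding="valid")(e4)
print(c.shape)                                            # (1, 10, 30, 32): 25-16+1 by 32-3+1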
2 changes: 1 addition & 1 deletion scripts/train_dnn.py
@@ -34,7 +34,7 @@ def create_model(vocab_size, config):
seq_len = config["maximum_path_length"]

model = Sequential()
-model.add(Embedding(vocab_size+1, dnn_units, input_length=seq_len))
+model.add(Embedding(vocab_size, dnn_units, input_length=seq_len, mask_zero=True))
model.add(SpatialDropout1D(dropout))
model.add(Flatten())
model.add(Dense(dnn_units, input_shape=(seq_len,), activation="elu"))
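Both train_cnn.py and train_dnn.py make the same swap: input_dim drops from vocab_size+1 to vocab_size while mask_zero=True marks id 0 as padding. This only balances out if vocab_size already counts the reserved 0 index, which the diff does not show, so treat that as an assumption. A minimal demonstration of what the flag does:

import tensorflow as tf

emb = tf.keras.layers.Embedding(input_dim=10, output_dim=4, mask_zero=True)
x = tf.constant([[5, 3, 0, 0]])      # trailing zeros are padding positions
print(emb.compute_mask(x))           # tf.Tensor([[ True  True False False]])

Note that a mask only helps if downstream layers propagate it; convolution and Flatten layers generally do not, so whether these models benefit beyond the smaller embedding table depends on code outside this diff.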
45 changes: 32 additions & 13 deletions scripts/transformer_paper_plots.py
@@ -43,10 +43,12 @@


base_path = "/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/"
#"/media/anupkumar/6c9b94c9-2316-4ae1-887a-5047a02bc3d7/home/kumara/tool_prediction_compute_results/backup_tool_pred_transformer_computed_results/aug_22_data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/tool_prediction_datasets/computed_results/aug_22 data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/backup_tool_pred_transformer_computed_results/aug_22_data/"
#"/media/anupkumar/b1ea0d39-97af-4ba5-983f-cd3ff76cf7a6/tool_prediction_datasets/computed_results/aug_22 data/"
-n_runs = 3
+n_runs = 5


def read_file_cnn_dnn(file_path):
@@ -66,7 +68,7 @@ def collect_loss_prec_data(m_type):
print(m_path)
runs_indices = list()
runs_te_loss = list()
-model_numbers = [1, 100, 200, 500, 1000, 1200, 1500, 2000, 2500, 3000, 3500] #, 1500, 2000, 2500, 3000, 3500
+model_numbers = [1, 100, 200, 500, 1000, 1200, 1500, 2000, 2500, 3000, 3500, 4000] #, 1500, 2000, 2500, 3000, 3500
fig = plt.figure(figsize=fig_size)
## Transformer: For test loss
for i in range(n_runs):
@@ -164,10 +166,9 @@
dnn_low_te_precision = dnn_low_te_precision[model_numbers]
dnn_runs_te_prec_low.extend(dnn_low_te_precision)


-df_tr_rnn_cnn_dnn_runs_te_prec = pd.DataFrame(zip(runs_indices, transformer_runs_te_prec, rnn_runs_te_prec, cnn_runs_te_prec, dnn_runs_te_prec, transformer_runs_te_prec_low, rnn_runs_te_prec_low, cnn_runs_te_prec_low, dnn_runs_te_prec_low), columns=["indices", "tran_prec", "rnn_prec", "cnn_prec", "dnn_prec", "transformer_runs_te_prec_low", "rnn_runs_te_prec_low", "cnn_runs_te_prec_low", "dnn_runs_te_prec_low"])
-print(df_tr_rnn_cnn_dnn_runs_te_prec)
+# precision
+df_tr_rnn_cnn_dnn_runs_te_prec = pd.DataFrame(zip(runs_indices, transformer_runs_te_prec, rnn_runs_te_prec, cnn_runs_te_prec, dnn_runs_te_prec), columns=["indices", "tran_prec", "rnn_prec", "cnn_prec", "dnn_prec"])


df_tr_rnn_cnn_dnn_runs_te_prec.to_csv("plots/df_tr_rnn_cnn_dnn_runs_te_prec.csv", index=None, sep="\t")

@@ -176,17 +177,35 @@ def collect_loss_prec_data(m_type):
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="cnn_prec", label="CNN: test tools", color="blue", linestyle="-")
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="dnn_prec", label="DNN: test tools", color="black", linestyle="-")

sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="transformer_runs_te_prec_low", label="Transformer: lowest 25% test tools", color="green", linestyle=":")
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="rnn_runs_te_prec_low", label="RNN (GRU): lowest 25% test tools", color="red", linestyle=":")
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="cnn_runs_te_prec_low", label="CNN: lowest 25% test tools", color="blue", linestyle=":")
sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec, x="indices", y="dnn_runs_te_prec_low", label="DNN: lowest 25% test tools", color="black", linestyle=":")
plt.grid(True)
plt.xlabel("Training iteration")
plt.ylabel("Precision@k")
plt.title("Test: precision@k")

plt.savefig("plots/df_tr_rnn_cnn_dnn_runs_te_prec.pdf", dpi=150, bbox_inches='tight')
plt.show()

+# with low precision
+df_tr_rnn_cnn_dnn_runs_te_prec_low_prec = pd.DataFrame(zip(runs_indices, transformer_runs_te_prec, rnn_runs_te_prec, cnn_runs_te_prec, dnn_runs_te_prec, transformer_runs_te_prec_low, rnn_runs_te_prec_low, cnn_runs_te_prec_low, dnn_runs_te_prec_low), columns=["indices", "tran_prec", "rnn_prec", "cnn_prec", "dnn_prec", "transformer_runs_te_prec_low", "rnn_runs_te_prec_low", "cnn_runs_te_prec_low", "dnn_runs_te_prec_low"])
+
+df_tr_rnn_cnn_dnn_runs_te_prec_low_prec.to_csv("plots/df_tr_rnn_cnn_dnn_runs_te_prec_low_prec.csv", index=None, sep="\t")
+
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="tran_prec", label="Transformer: test tools", color="green", linestyle="-")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="rnn_prec", label="RNN (GRU): test tools", color="red", linestyle="-")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="cnn_prec", label="CNN: test tools", color="blue", linestyle="-")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="dnn_prec", label="DNN: test tools", color="black", linestyle="-")
+
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="transformer_runs_te_prec_low", label="Transformer: lowest 25% test tools", color="green", linestyle=":")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="rnn_runs_te_prec_low", label="RNN (GRU): lowest 25% test tools", color="red", linestyle=":")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="cnn_runs_te_prec_low", label="CNN: lowest 25% test tools", color="blue", linestyle=":")
+sns.lineplot(data=df_tr_rnn_cnn_dnn_runs_te_prec_low_prec, x="indices", y="dnn_runs_te_prec_low", label="DNN: lowest 25% test tools", color="black", linestyle=":")
+
+plt.grid(True)
+plt.xlabel("Training iteration")
+plt.ylabel("Precision@k")
+plt.title("Test: precision@k")

plt.savefig("plots/transformer_rnn_cnn_dnn_runs_te_low_prec.pdf", dpi=150)
plt.savefig("plots/df_tr_rnn_cnn_dnn_runs_te_prec_low_prec.pdf", dpi=150, bbox_inches='tight')
plt.show()
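The y-axis in these figures is Precision@k; the metric itself is computed in evaluate_model.py (compare np.where(low_tar > 0)[0] in the first file), not here, so the following is a sketch of the usual definition rather than the project's exact code:

import numpy as np

def precision_at_k(scores, true_label_ids, k):
    # Top-k predicted tool ids by score.
    topk_ids = np.argsort(scores)[-k:]
    # Fraction of the k predictions that are true next tools.
    return len(set(topk_ids) & set(true_label_ids)) / float(k)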


@@ -344,7 +363,7 @@ def plot_model_vs_load_time():
plt.xlabel("Training step")
plt.ylabel("Model load time (seconds)")
plt.title("Transformer, RNN (GRU), CNN and DNN models loading time")
plt.savefig("plots/transformer_rnn_runs_model_load_time.pdf", dpi=150)
plt.savefig("plots/transformer_rnn_runs_model_load_time.pdf", dpi=150, bbox_inches='tight')
plt.show()


@@ -497,7 +516,7 @@ def plot_usage_time_vs_seq_len():

############ Call methods ###########################

-collect_loss_prec_data(["transformer", "rnn", "cnn", "dnn"])
+#collect_loss_prec_data(["transformer", "rnn", "cnn", "dnn"])
plot_model_vs_load_time()
#plot_usage_time_vs_topk()
#plot_usage_time_vs_seq_len()
