Merge branch 'developer' into main

enryH · web-flow · commit fe8c48bde06b · 2024-08-12T13:18:34.000+02:00
diff --git a/src/move/tasks/encode_data.py b/src/move/tasks/encode_data.py
@@ -55,18 +55,21 @@ def encode_data(config: DataConfig):
         # before preprocessing:
         fig = plot_value_distributions(values)
         fig_path = str(
-            output_path / "Value_distribution_{}_unprocessed.png".format(dataset_name)
+            output_path / f"Value_distribution_{dataset_name}_unprocessed.png"
         )
         fig.savefig(fig_path)
 
-        # Plotting the value distribution for all continuous datasets:
-        fig = plot_value_distributions(values)
-        fig_path = str(output_path / f"Value_distribution_{dataset_name}.png")
-        fig.savefig(fig_path)
-
         if scale:
+            logger.debug(
+                f"Scaling dataset: {dataset_name}, log2 transform: {input_config.log2}"
+            )
             values, mask_1d = preprocessing.scale(values, input_config.log2)
             names = names[mask_1d]
             logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}")
+            # Plotting the value distribution after scaling (scaled datasets only):
+            fig = plot_value_distributions(values)
+            fig_path = str(output_path / f"Value_distribution_{dataset_name}.png")
+            fig.savefig(fig_path)
+
         io.dump_names(interim_data_path / f"{dataset_name}.txt", names)
         np.save(interim_data_path / f"{dataset_name}.npy", values)
diff --git a/src/move/training/training_loop.py b/src/move/training/training_loop.py
@@ -72,13 +72,10 @@ def training_loop(
     counter = 0
 
     kld_weight = 0.0
-    kld_rate = 20 / len(kld_warmup_steps)
-    kld_multiplier = 1 + kld_rate
 
     for epoch in range(1, num_epochs + 1):
         if epoch in kld_warmup_steps:
-            kld_weight = 0.05 * kld_multiplier
-            kld_multiplier += kld_rate
+            kld_weight += 1 / len(kld_warmup_steps)
 
         if epoch in batch_dilation_steps:
             train_dataloader = dilate_batch(train_dataloader)
diff --git a/tutorial/config/data/random_continuous.yaml b/tutorial/config/data/random_continuous.yaml
@@ -22,4 +22,4 @@ continuous_inputs: # a list of continuous datasets
   - name: random.continuous.metagenomics # filename in raw_data_path
     log2: true # log2 transform data
     scale: true # scale data
-
+ 
diff --git a/tutorial/config/data/random_small.yaml b/tutorial/config/data/random_small.yaml
@@ -18,8 +18,8 @@ categorical_inputs: # a list of categorical datasets
 
 continuous_inputs: # a list of continuous datasets
   - name: random.small.proteomics # filename in raw_data_path
-    scale: true # scale data
-    log2: true # log2 transform data
+    log2: true # apply log2 before scaling
+    scale: true # scale data (z-score normalize)
   - name: random.small.metagenomics # filename in raw_data_path
     scale: true # scale data
     log2: true # log2 transform data