Fix transformers and an error in the example when CI uses a single GPU (#757)
* Fix ongoing errors in the transformers unit tests

* Add ranges to package versions

* Add linting changes required by the linter

* Update the pre-commit hook action

* Update file to pass the lint action

* Fix a bug in the end-to-end-session-based example that was failing its test on CI when only a single GPU was available, which prevented multi-GPU training (see the sketch after the changed-files summary below)

---------

Co-authored-by: Julio <[email protected]>
Co-authored-by: Julio Perez <[email protected]>
3 people committed Nov 6, 2023
1 parent 5bef974 commit 625897c
Showing 5 changed files with 19 additions and 6 deletions.
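The core of the single-GPU fix is in the example notebook below: the number of worker processes passed to torch.distributed.run is derived from the number of visible GPUs rather than being hard-coded to 2. A minimal standalone sketch of that logic, mirroring the added notebook cell (the launch command is abbreviated; only torch is assumed):

# Detect how many GPUs are visible and cap the number of distributed
# worker processes at 2, dropping to 1 on single-GPU CI runners.
from torch.cuda import device_count

num_gpus = device_count()
NUM_PROCESSES = min(num_gpus, 2)

# The notebook then launches training along the lines of:
#   python -m torch.distributed.run --nproc_per_node {NUM_PROCESSES} {TRAINER_FILE} ...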
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -18,4 +18,4 @@ jobs:
         with:
           cache: 'pip'
           cache-dependency-path: '**/**.txt'
-      - uses: pre-commit/action@v2.0.3
+      - uses: pre-commit/action@v3.0.0
@@ -286,6 +286,19 @@
     "- <b>per device batch size for evaluation</b>: see above"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c9e83d47-380c-4118-bc29-8bc108163fa0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# If only 1 GPU is available, start a single process to use that GPU\n",
+    "from torch.cuda import device_count\n",
+    "num_gpus = device_count()\n",
+    "NUM_PROCESSES = min(num_gpus, 2)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -502,7 +515,7 @@
     "LR = float(os.environ.get(\"LEARNING_RATE\", \"0.0005\"))\n",
     "BATCH_SIZE_TRAIN = int(os.environ.get(\"BATCH_SIZE_TRAIN\", \"256\"))\n",
     "BATCH_SIZE_VALID = int(os.environ.get(\"BATCH_SIZE_VALID\", \"128\"))\n",
-    "!python -m torch.distributed.run --nproc_per_node 2 {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}"
+    "!python -m torch.distributed.run --nproc_per_node {NUM_PROCESSES} {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}"
    ]
   },
   {
@@ -554,7 +567,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
@@ -224,7 +224,7 @@ def mask_last_interaction(x):
     logger.info(f"Recall@10 of manually masked test data = {str(recall_10)}")
     output_file = os.path.join(training_args.output_dir, "eval_results_over_time.txt")
     with open(output_file, "a") as writer:
-        writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n")
+        writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n")
     # Verify that the recall@10 from train.evaluate() matches the recall@10 calculated manually
     if not isinstance(input_module.masking, t4r.masking.PermutationLanguageModeling):
         # TODO fix inference discrepancy for permutation language modeling
2 changes: 1 addition & 1 deletion requirements/base_external.txt
@@ -1,4 +1,4 @@
-transformers[torch]>=4.12,<5
+transformers[torch]>=4.12,<4.31.0
tqdm>=4.27
pyarrow>=1.0
torchmetrics>=0.10.0
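Tightening the transformers upper bound keeps CI on releases that still match the APIs these examples were tested against. The snippet below is only an illustrative sanity check that a local environment satisfies the new pin (it assumes the packaging library is installed; it is not part of the repository):

# Illustrative check that the installed transformers release falls inside
# the pinned range ">=4.12,<4.31.0".
import transformers
from packaging.version import Version

installed = Version(transformers.__version__)
assert Version("4.12") <= installed < Version("4.31.0"), f"unsupported version: {installed}"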
2 changes: 1 addition & 1 deletion transformers4rec/torch/experimental.py
@@ -97,7 +97,7 @@ def forward(self, inputs, training=False, testing=False, **kwargs):
             output = seq_rep + context_rep
         else:
             raise ValueError(
-                f"The aggregation {self.fusion_aggregation} is not supported,"
+                f"The aggregation {self.fusion_aggregation} is not supported, "
                 f"please select one of the following aggregations "
                 f"['concat', 'elementwise-mul', 'elementwise-sum']"
             )
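The trailing space added above matters because adjacent f-string literals are concatenated implicitly; without it the rendered message runs two words together ("supported,please"). A tiny illustration with a made-up aggregation value:

# Implicit concatenation of adjacent string literals: the trailing space
# keeps the error message readable when the fragments are joined.
fusion_aggregation = "mean"  # made-up value, for illustration only
msg = (
    f"The aggregation {fusion_aggregation} is not supported, "
    f"please select one of the following aggregations "
    f"['concat', 'elementwise-mul', 'elementwise-sum']"
)
print(msg)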
