Allow different model parallelism in pretrain/fine-tune or pretrain1/pretrain2 checkpoints. (#276)

* Fix an issue where tensor-parallel (TP) setting changes were only partially applied during checkpoint resumption

* Reference issue #275
jstjohn authored Oct 8, 2024
1 parent fb33522 commit f418059
Showing 1 changed file with 10 additions and 0 deletions.
sub-packages/bionemo-llm/src/bionemo/llm/model/biobert/model.py: 10 additions & 0 deletions
@@ -78,9 +78,19 @@
logger = logging.getLogger(__file__)

# Add some fields specific to the BIOBERT config that we want to override by default
# TODO automatically determine which fields a user is trying to override in the future.
_OVERRIDE_BIOBERT_CONFIG_DEFAULTS: List[str] = OVERRIDE_BIONEMO_CONFIG_DEFAULTS + [
    "return_only_hidden_states",
    "include_hiddens",
    # Model parallelism settings! Important to override these if the user requests different settings from how
    # a model was trained (common). See https://github.com/NVIDIA/bionemo-fw-ea/issues/275
    "tensor_model_parallel_size",
    "pipeline_model_parallel_size",
    "virtual_pipeline_model_parallel_size",
    "sequence_parallel",
    "context_parallel_size",
    "expert_model_parallel_size",
    "moe_extended_tp",
]

# A copy that we do not use internally. Useful for external users who want to
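
Listing the parallelism fields only helps if the resume path actually copies them from the user's current config onto the config restored from the checkpoint, instead of keeping the values the model was originally trained with. Below is a minimal sketch of one way such an override list could be applied; the `ExampleConfig` dataclass and the `apply_config_overrides` helper are hypothetical illustrations for this commit's idea, not part of bionemo-llm's actual resume logic.

# Hypothetical sketch: apply an override list when merging a checkpoint's saved
# config with the user's current config. Names below are illustrative only.
from dataclasses import dataclass, replace
from typing import List


@dataclass(frozen=True)
class ExampleConfig:
    # A tiny stand-in for a model config; the real BioBERT config has many more fields.
    tensor_model_parallel_size: int = 1
    pipeline_model_parallel_size: int = 1
    sequence_parallel: bool = False


def apply_config_overrides(
    checkpoint_config: ExampleConfig,
    user_config: ExampleConfig,
    override_fields: List[str],
) -> ExampleConfig:
    """Return the checkpoint config with the listed fields replaced by the user's current values."""
    overrides = {name: getattr(user_config, name) for name in override_fields}
    return replace(checkpoint_config, **overrides)


# Example: the checkpoint was trained with TP=2, but the user now requests TP=4
# and sequence parallelism for fine-tuning.
trained = ExampleConfig(tensor_model_parallel_size=2)
requested = ExampleConfig(tensor_model_parallel_size=4, sequence_parallel=True)
resumed = apply_config_overrides(
    trained,
    requested,
    ["tensor_model_parallel_size", "pipeline_model_parallel_size", "sequence_parallel"],
)
assert resumed.tensor_model_parallel_size == 4

The key point of the commit is the same as in the sketch: every parallelism-related field must appear in the override list, otherwise only some of the requested settings (e.g. TP size but not sequence parallelism) take effect on resumption.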
