KEP-2170: Add validation to Torch numProcPerNode field (kubeflow/trainer#2409)

astefanutti · web-flow · commit 9f1effb49fc3 · 2025-02-14T20:57:37.000Z
Signed-off-by: Antonin Stefanutti <antonin@stefanutti.fr>
diff --git a/docs/TrainerV1alpha1TorchMLPolicySource.md b/docs/TrainerV1alpha1TorchMLPolicySource.md
@@ -5,7 +5,7 @@ TorchMLPolicySource represents a PyTorch runtime configuration.
 Name | Type | Description | Notes
 ------------ | ------------- | ------------- | -------------
 **elastic_policy** | [**TrainerV1alpha1TorchElasticPolicy**](TrainerV1alpha1TorchElasticPolicy.md) |  | [optional] 
-**num_proc_per_node** | **str** | Number of processes per node. This value is inserted into the &#x60;--nproc-per-node&#x60; argument of the &#x60;torchrun&#x60; CLI. Supported values: &#x60;auto&#x60;, &#x60;cpu&#x60;, &#x60;gpu&#x60;, or int value. Defaults to &#x60;auto&#x60;. | [optional] 
+**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) |  | [optional] 
 
 [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md)
 
diff --git a/docs/TrainerV1alpha1Trainer.md b/docs/TrainerV1alpha1Trainer.md
@@ -9,7 +9,7 @@ Name | Type | Description | Notes
 **env** | [**list[V1EnvVar]**](V1EnvVar.md) | List of environment variables to set in the training container. These values will be merged with the TrainingRuntime&#39;s trainer environments. | [optional] 
 **image** | **str** | Docker image for the training container. | [optional] 
 **num_nodes** | **int** | Number of training nodes. | [optional] 
-**num_proc_per_node** | **str** | Number of processes/workers/slots on every training node. For the Torch runtime: &#x60;auto&#x60;, &#x60;cpu&#x60;, &#x60;gpu&#x60;, or int value can be set. For the MPI runtime only int value can be set. | [optional] 
+**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) |  | [optional] 
 **resources_per_node** | [**V1ResourceRequirements**](V1ResourceRequirements.md) |  | [optional] 
 
 [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md)
diff --git a/kubeflow/trainer/models/trainer_v1alpha1_torch_ml_policy_source.py b/kubeflow/trainer/models/trainer_v1alpha1_torch_ml_policy_source.py
@@ -34,7 +34,7 @@ class TrainerV1alpha1TorchMLPolicySource(object):
     """
     openapi_types = {
         'elastic_policy': 'TrainerV1alpha1TorchElasticPolicy',
-        'num_proc_per_node': 'str'
+        'num_proc_per_node': 'K8sIoApimachineryPkgUtilIntstrIntOrString'
     }
 
     attribute_map = {
@@ -82,21 +82,19 @@ def elastic_policy(self, elastic_policy):
     def num_proc_per_node(self):
         """Gets the num_proc_per_node of this TrainerV1alpha1TorchMLPolicySource.  # noqa: E501
 
-        Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.  # noqa: E501
 
         :return: The num_proc_per_node of this TrainerV1alpha1TorchMLPolicySource.  # noqa: E501
-        :rtype: str
+        :rtype: K8sIoApimachineryPkgUtilIntstrIntOrString
         """
         return self._num_proc_per_node
 
     @num_proc_per_node.setter
     def num_proc_per_node(self, num_proc_per_node):
         """Sets the num_proc_per_node of this TrainerV1alpha1TorchMLPolicySource.
 
-        Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.  # noqa: E501
 
         :param num_proc_per_node: The num_proc_per_node of this TrainerV1alpha1TorchMLPolicySource.  # noqa: E501
-        :type: str
+        :type: K8sIoApimachineryPkgUtilIntstrIntOrString
         """
 
         self._num_proc_per_node = num_proc_per_node
diff --git a/kubeflow/trainer/models/trainer_v1alpha1_trainer.py b/kubeflow/trainer/models/trainer_v1alpha1_trainer.py
@@ -38,7 +38,7 @@ class TrainerV1alpha1Trainer(object):
         'env': 'list[V1EnvVar]',
         'image': 'str',
         'num_nodes': 'int',
-        'num_proc_per_node': 'str',
+        'num_proc_per_node': 'K8sIoApimachineryPkgUtilIntstrIntOrString',
         'resources_per_node': 'V1ResourceRequirements'
     }
 
@@ -201,21 +201,19 @@ def num_nodes(self, num_nodes):
     def num_proc_per_node(self):
         """Gets the num_proc_per_node of this TrainerV1alpha1Trainer.  # noqa: E501
 
-        Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.  # noqa: E501
 
         :return: The num_proc_per_node of this TrainerV1alpha1Trainer.  # noqa: E501
-        :rtype: str
+        :rtype: K8sIoApimachineryPkgUtilIntstrIntOrString
         """
         return self._num_proc_per_node
 
     @num_proc_per_node.setter
     def num_proc_per_node(self, num_proc_per_node):
         """Sets the num_proc_per_node of this TrainerV1alpha1Trainer.
 
-        Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.  # noqa: E501
 
         :param num_proc_per_node: The num_proc_per_node of this TrainerV1alpha1Trainer.  # noqa: E501
-        :type: str
+        :type: K8sIoApimachineryPkgUtilIntstrIntOrString
         """
 
         self._num_proc_per_node = num_proc_per_node