Skip to content

Commit 905fbe7

Browse files
committed
KEP-2170: Add validation to Torch numProcPerNode field
Signed-off-by: Antonin Stefanutti <[email protected]>
1 parent 112dc75 commit 905fbe7

18 files changed

+60
-41
lines changed

api/openapi-spec/swagger.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -517,7 +517,7 @@
517517
},
518518
"numProcPerNode": {
519519
"description": "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.",
520-
"type": "string"
520+
"$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
521521
}
522522
}
523523
},
@@ -716,7 +716,7 @@
716716
},
717717
"numProcPerNode": {
718718
"description": "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.",
719-
"type": "string"
719+
"$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
720720
},
721721
"resourcesPerNode": {
722722
"description": "Compute resources for each training node.",

manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -583,12 +583,17 @@ spec:
583583
type: integer
584584
type: object
585585
numProcPerNode:
586+
anyOf:
587+
- type: integer
588+
- type: string
586589
description: |-
587590
Number of processes per node.
588591
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
589592
Supported values: `auto`, `cpu`, `gpu`, or int value.
590593
Defaults to `auto`.
591-
type: string
594+
x-kubernetes-int-or-string: true
595+
x-kubernetes-validations:
596+
- rule: self > 0 || self in ['auto', 'cpu', 'gpu']
592597
type: object
593598
type: object
594599
podGroupPolicy:

manifests/base/crds/trainer.kubeflow.org_trainingruntimes.yaml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -583,12 +583,17 @@ spec:
583583
type: integer
584584
type: object
585585
numProcPerNode:
586+
anyOf:
587+
- type: integer
588+
- type: string
586589
description: |-
587590
Number of processes per node.
588591
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
589592
Supported values: `auto`, `cpu`, `gpu`, or int value.
590593
Defaults to `auto`.
591-
type: string
594+
x-kubernetes-int-or-string: true
595+
x-kubernetes-validations:
596+
- rule: self > 0 || self in ['auto', 'cpu', 'gpu']
592597
type: object
593598
type: object
594599
podGroupPolicy:

manifests/base/crds/trainer.kubeflow.org_trainjobs.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3138,11 +3138,14 @@ spec:
31383138
format: int32
31393139
type: integer
31403140
numProcPerNode:
3141+
anyOf:
3142+
- type: integer
3143+
- type: string
31413144
description: |-
31423145
Number of processes/workers/slots on every training node.
31433146
For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
31443147
For the MPI runtime only int value can be set.
3145-
type: string
3148+
x-kubernetes-int-or-string: true
31463149
resourcesPerNode:
31473150
description: Compute resources for each training node.
31483151
properties:

pkg/apis/trainer/v1alpha1/trainingruntime_types.go

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ package v1alpha1
1919
import (
2020
autoscalingv2 "k8s.io/api/autoscaling/v2"
2121
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22+
"k8s.io/apimachinery/pkg/util/intstr"
2223
jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
2324
)
2425

@@ -171,9 +172,9 @@ type TorchMLPolicySource struct {
171172
// Number of processes per node.
172173
// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
173174
// Supported values: `auto`, `cpu`, `gpu`, or int value.
174-
// TODO (andreyvelich): Add kubebuilder validation.
175175
// Defaults to `auto`.
176-
NumProcPerNode *string `json:"numProcPerNode,omitempty"`
176+
// +kubebuilder:validation:XValidation:rule="self > 0 || self in ['auto', 'cpu', 'gpu']"
177+
NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
177178

178179
// Elastic policy for the PyTorch training.
179180
ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`

pkg/apis/trainer/v1alpha1/trainjob_types.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ package v1alpha1
1919
import (
2020
corev1 "k8s.io/api/core/v1"
2121
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22+
"k8s.io/apimachinery/pkg/util/intstr"
2223
)
2324

2425
const (
@@ -194,7 +195,7 @@ type Trainer struct {
194195
// Number of processes/workers/slots on every training node.
195196
// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
196197
// For the MPI runtime only int value can be set.
197-
NumProcPerNode *string `json:"numProcPerNode,omitempty"`
198+
NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
198199
}
199200

200201
// DatasetConfig represents the desired dataset configuration.

pkg/apis/trainer/v1alpha1/zz_generated.deepcopy.go

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/trainer/v1alpha1/zz_generated.openapi.go

Lines changed: 4 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/client/applyconfiguration/trainer/v1alpha1/torchmlpolicysource.go

Lines changed: 6 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/client/applyconfiguration/trainer/v1alpha1/trainer.go

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)