Skip to content

Commit

Permalink
Adding cel validation on trainingRuntime CRD
Browse files Browse the repository at this point in the history
Signed-off-by: Akshay Chitneni <[email protected]>
  • Loading branch information
Akshay Chitneni committed Feb 3, 2025
1 parent ee11629 commit fa2e0ef
Show file tree
Hide file tree
Showing 14 changed files with 201 additions and 27 deletions.
2 changes: 1 addition & 1 deletion api.v2/openapi-spec/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@
},
"numProcPerNode": {
"description": "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.",
"type": "string"
"$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
}
}
},
Expand Down
18 changes: 17 additions & 1 deletion manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ spec:
description: Configuration for the MPI Runtime.
properties:
mpiImplementation:
default: OpenMPI
description: |-
Implementation name for the MPI to create the appropriate hostfile.
Defaults to OpenMPI.
Expand All @@ -61,6 +62,7 @@ spec:
format: int32
type: integer
runLauncherAsNode:
default: false
description: |-
Whether to run training process on the launcher Job.
Defaults to false.
Expand Down Expand Up @@ -583,14 +585,27 @@ spec:
type: integer
type: object
numProcPerNode:
anyOf:
- type: integer
- type: string
default: auto
description: |-
Number of processes per node.
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
Supported values: `auto`, `cpu`, `gpu`, or int value.
Defaults to `auto`.
type: string
x-kubernetes-int-or-string: true
x-kubernetes-validations:
- message: NumProcPerNode must be equal to auto, cpu, gpu,
or int value
rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
type: object
type: object
x-kubernetes-validations:
- message: numNodes should not be set if torch.elasticPolicy is configured
rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
- message: Only one of the policy can be configured
rule: '!(has(self.torch) && has(self.mpi))'
podGroupPolicy:
description: Configuration for the PodGroup to enable gang-scheduling
via supported plugins.
Expand All @@ -600,6 +615,7 @@ spec:
for gang-scheduling.
properties:
scheduleTimeoutSeconds:
default: 60
description: |-
Time threshold to schedule PodGroup for gang-scheduling.
If the scheduling timeout is equal to 0, the default value is used.
Expand Down
18 changes: 17 additions & 1 deletion manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ spec:
description: Configuration for the MPI Runtime.
properties:
mpiImplementation:
default: OpenMPI
description: |-
Implementation name for the MPI to create the appropriate hostfile.
Defaults to OpenMPI.
Expand All @@ -61,6 +62,7 @@ spec:
format: int32
type: integer
runLauncherAsNode:
default: false
description: |-
Whether to run training process on the launcher Job.
Defaults to false.
Expand Down Expand Up @@ -583,14 +585,27 @@ spec:
type: integer
type: object
numProcPerNode:
anyOf:
- type: integer
- type: string
default: auto
description: |-
Number of processes per node.
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
Supported values: `auto`, `cpu`, `gpu`, or int value.
Defaults to `auto`.
type: string
x-kubernetes-int-or-string: true
x-kubernetes-validations:
- message: NumProcPerNode must be equal to auto, cpu, gpu,
or int value
rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
type: object
type: object
x-kubernetes-validations:
- message: numNodes should not be set if torch.elasticPolicy is configured
rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
- message: Only one of the policy can be configured
rule: '!(has(self.torch) && has(self.mpi))'
podGroupPolicy:
description: Configuration for the PodGroup to enable gang-scheduling
via supported plugins.
Expand All @@ -600,6 +615,7 @@ spec:
for gang-scheduling.
properties:
scheduleTimeoutSeconds:
default: 60
description: |-
Time threshold to schedule PodGroup for gang-scheduling.
If the scheduling timeout is equal to 0, the default value is used.
Expand Down
5 changes: 4 additions & 1 deletion manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3138,11 +3138,14 @@ spec:
format: int32
type: integer
numProcPerNode:
anyOf:
- type: integer
- type: string
description: |-
Number of processes/workers/slots on every training node.
For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
For the MPI runtime only int value can be set.
type: string
x-kubernetes-int-or-string: true
resourcesPerNode:
description: Compute resources for each training node.
properties:
Expand Down
10 changes: 9 additions & 1 deletion pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package v2alpha1
import (
autoscalingv2 "k8s.io/api/autoscaling/v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
)

Expand Down Expand Up @@ -142,10 +143,13 @@ type CoschedulingPodGroupPolicySource struct {
// Time threshold to schedule PodGroup for gang-scheduling.
// If the scheduling timeout is equal to 0, the default value is used.
// Defaults to 60 seconds.
// +kubebuilder:default=60
ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
}

// MLPolicy represents configuration for the model trining with ML-specific parameters.
// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured"
// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured"
type MLPolicy struct {
// Number of training nodes.
// Defaults to 1.
Expand Down Expand Up @@ -173,7 +177,9 @@ type TorchMLPolicySource struct {
// Supported values: `auto`, `cpu`, `gpu`, or int value.
// TODO (andreyvelich): Add kubebuilder validation.
// Defaults to `auto`.
NumProcPerNode *string `json:"numProcPerNode,omitempty"`
// +kubebuilder:default="auto"
// +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be equal to auto, cpu, gpu, or int value"
NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`

// Elastic policy for the PyTorch training.
ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`
Expand Down Expand Up @@ -210,13 +216,15 @@ type MPIMLPolicySource struct {

// Implementation name for the MPI to create the appropriate hostfile.
// Defaults to OpenMPI.
// +kubebuilder:default="OpenMPI"
MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"`

// Directory where SSH keys are mounted.
SSHAuthMountPath *string `json:"sshAuthMountPath,omitempty"`

// Whether to run training process on the launcher Job.
// Defaults to false.
// +kubebuilder:default=false
RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"`
}

Expand Down
3 changes: 2 additions & 1 deletion pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package v2alpha1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

const (
Expand Down Expand Up @@ -194,7 +195,7 @@ type Trainer struct {
// Number of processes/workers/slots on every training node.
// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
// For the MPI runtime only int value can be set.
NumProcPerNode *string `json:"numProcPerNode,omitempty"`
NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
}

// DatasetConfig represents the desired dataset configuration.
Expand Down
5 changes: 3 additions & 2 deletions pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 4 additions & 6 deletions pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion pkg/runtime.v2/framework/plugins/torch/torch.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package torch
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/util/intstr"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
Expand Down Expand Up @@ -71,14 +72,20 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *kubeflowv2.TrainJo
// TODO (andreyvelich): Add validation to check that TrainJob doesn't have "PET_" envs.
// TODO (andreyvelich): We should validate that envs from different plugins don't conflict with each other.
// Ref: https://github.com/kubeflow/training-operator/pull/2308#discussion_r1823229940
var numProcPerNodeVal string
if numProcPerNode.Type == intstr.Int {
numProcPerNodeVal = string(numProcPerNode.IntVal)
} else {
numProcPerNodeVal = numProcPerNode.StrVal
}
infoEnvs := []corev1.EnvVar{
{
Name: constants.TorchEnvNumNodes,
Value: fmt.Sprintf("%d", ptr.Deref(numNodes, 1)),
},
{
Name: constants.TorchEnvNumProcPerNode,
Value: ptr.Deref(numProcPerNode, "auto"),
Value: numProcPerNodeVal,
},
{
Name: constants.TorchEnvNodeRank,
Expand Down
5 changes: 3 additions & 2 deletions pkg/util.v2/testing/wrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
Expand Down Expand Up @@ -392,7 +393,7 @@ func (t *TrainJobTrainerWrapper) NumNodes(numNodes int32) *TrainJobTrainerWrappe
return t
}

func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode string) *TrainJobTrainerWrapper {
func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode intstr.IntOrString) *TrainJobTrainerWrapper {
t.Trainer.NumProcPerNode = &numProcPerNode
return t
}
Expand Down Expand Up @@ -689,7 +690,7 @@ func (s *TrainingRuntimeSpecWrapper) NumNodes(numNodes int32) *TrainingRuntimeSp
return s
}

func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode string) *TrainingRuntimeSpecWrapper {
func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode intstr.IntOrString) *TrainingRuntimeSpecWrapper {
s.MLPolicy = &kubeflowv2.MLPolicy{
NumNodes: &numNodes,
MLPolicySource: kubeflowv2.MLPolicySource{
Expand Down
2 changes: 1 addition & 1 deletion sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ TorchMLPolicySource represents a PyTorch runtime configuration.
Name | Type | Description | Notes
------------ | ------------- | ------------- | -------------
**elastic_policy** | [**KubeflowOrgV2alpha1TorchElasticPolicy**](KubeflowOrgV2alpha1TorchElasticPolicy.md) | | [optional]
**num_proc_per_node** | **str** | Number of processes per node. This value is inserted into the &#x60;--nproc-per-node&#x60; argument of the &#x60;torchrun&#x60; CLI. Supported values: &#x60;auto&#x60;, &#x60;cpu&#x60;, &#x60;gpu&#x60;, or int value. Defaults to &#x60;auto&#x60;. | [optional]
**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) | | [optional]

[[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class KubeflowOrgV2alpha1TorchMLPolicySource(object):
"""
openapi_types = {
'elastic_policy': 'KubeflowOrgV2alpha1TorchElasticPolicy',
'num_proc_per_node': 'str'
'num_proc_per_node': 'K8sIoApimachineryPkgUtilIntstrIntOrString'
}

attribute_map = {
Expand Down Expand Up @@ -82,21 +82,19 @@ def elastic_policy(self, elastic_policy):
def num_proc_per_node(self):
"""Gets the num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. # noqa: E501
Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. # noqa: E501
:return: The num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. # noqa: E501
:rtype: str
:rtype: K8sIoApimachineryPkgUtilIntstrIntOrString
"""
return self._num_proc_per_node

@num_proc_per_node.setter
def num_proc_per_node(self, num_proc_per_node):
"""Sets the num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource.
Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. # noqa: E501
:param num_proc_per_node: The num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. # noqa: E501
:type: str
:type: K8sIoApimachineryPkgUtilIntstrIntOrString
"""

self._num_proc_per_node = num_proc_per_node
Expand Down
7 changes: 3 additions & 4 deletions test/integration/webhook.v2/clustertrainingruntime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,13 @@ limitations under the License.
package webhookv2

import (
kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1"
testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing"
"github.com/kubeflow/training-operator/test/integration/framework"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1"
testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing"
"github.com/kubeflow/training-operator/test/integration/framework"
)

const clTrainingRuntimeName = "test-clustertrainingruntime"
Expand Down
Loading

0 comments on commit fa2e0ef

Please sign in to comment.