Skip to content

Commit 0e9654d

Browse files
author
Akshay Chitneni
committed
Adding cel validation on trainingRuntime CRD
Signed-off-by: Akshay Chitneni <[email protected]>
1 parent 3f7ec16 commit 0e9654d

File tree

5 files changed

+167
-0
lines changed

5 files changed

+167
-0
lines changed

manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ spec:
5353
description: Directory where SSH keys are mounted.
5454
type: string
5555
mpiImplementation:
56+
default: OpenMPI
5657
description: |-
5758
Implementation name for the MPI to create the appropriate hostfile.
5859
Defaults to OpenMPI.
@@ -64,6 +65,7 @@ spec:
6465
format: int32
6566
type: integer
6667
runLauncherAsNode:
68+
default: false
6769
description: |-
6870
Whether to run training process on the launcher Job.
6971
Defaults to false.
@@ -576,15 +578,25 @@ spec:
576578
type: integer
577579
type: object
578580
numProcPerNode:
581+
default: auto
579582
description: |-
580583
Number of processes per node.
581584
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
582585
Supported values: `auto`, `cpu`, `gpu`, or int value.
583586
TODO (andreyvelich): Add kubebuilder validation.
584587
Defaults to `auto`.
585588
type: string
589+
x-kubernetes-validations:
590+
- message: NumProcPerNode must be auto,cpu,gpu strings or
591+
int value
592+
rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
586593
type: object
587594
type: object
595+
x-kubernetes-validations:
596+
- message: numNodes should not be set if torch.elasticPolicy is configured
597+
rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
598+
- message: Only one of the policy can be configured
599+
rule: '!(has(self.torch) && has(self.mpi))'
588600
podGroupPolicy:
589601
description: Configuration for the PodGroup to enable gang-scheduling
590602
via supported plugins.
@@ -594,6 +606,7 @@ spec:
594606
for gang-scheduling.
595607
properties:
596608
scheduleTimeoutSeconds:
609+
default: 60
597610
description: |-
598611
Time threshold to schedule PodGroup for gang-scheduling.
599612
If the scheduling timeout is equal to 0, the default value is used.

manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ spec:
5353
description: Directory where SSH keys are mounted.
5454
type: string
5555
mpiImplementation:
56+
default: OpenMPI
5657
description: |-
5758
Implementation name for the MPI to create the appropriate hostfile.
5859
Defaults to OpenMPI.
@@ -64,6 +65,7 @@ spec:
6465
format: int32
6566
type: integer
6667
runLauncherAsNode:
68+
default: false
6769
description: |-
6870
Whether to run training process on the launcher Job.
6971
Defaults to false.
@@ -576,15 +578,25 @@ spec:
576578
type: integer
577579
type: object
578580
numProcPerNode:
581+
default: auto
579582
description: |-
580583
Number of processes per node.
581584
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
582585
Supported values: `auto`, `cpu`, `gpu`, or int value.
583586
TODO (andreyvelich): Add kubebuilder validation.
584587
Defaults to `auto`.
585588
type: string
589+
x-kubernetes-validations:
590+
- message: NumProcPerNode must be auto,cpu,gpu strings or
591+
int value
592+
rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
586593
type: object
587594
type: object
595+
x-kubernetes-validations:
596+
- message: numNodes should not be set if torch.elasticPolicy is configured
597+
rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
598+
- message: Only one of the policy can be configured
599+
rule: '!(has(self.torch) && has(self.mpi))'
588600
podGroupPolicy:
589601
description: Configuration for the PodGroup to enable gang-scheduling
590602
via supported plugins.
@@ -594,6 +606,7 @@ spec:
594606
for gang-scheduling.
595607
properties:
596608
scheduleTimeoutSeconds:
609+
default: 60
597610
description: |-
598611
Time threshold to schedule PodGroup for gang-scheduling.
599612
If the scheduling timeout is equal to 0, the default value is used.

pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,13 @@ type CoschedulingPodGroupPolicySource struct {
142142
// Time threshold to schedule PodGroup for gang-scheduling.
143143
// If the scheduling timeout is equal to 0, the default value is used.
144144
// Defaults to 60 seconds.
145+
// +kubebuilder:default=60
145146
ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
146147
}
147148

148149
// MLPolicy represents configuration for the model training with ML-specific parameters.
150+
// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured"
151+
// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured"
149152
type MLPolicy struct {
150153
// Number of training nodes.
151154
// Defaults to 1.
@@ -173,6 +176,8 @@ type TorchMLPolicySource struct {
173176
// Supported values: `auto`, `cpu`, `gpu`, or int value.
174177
// TODO (andreyvelich): Add kubebuilder validation.
175178
// Defaults to `auto`.
179+
// +kubebuilder:default="auto"
180+
// +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be auto,cpu,gpu strings or int value"
176181
NumProcPerNode *string `json:"numProcPerNode,omitempty"`
177182

178183
// Elastic policy for the PyTorch training.
@@ -209,13 +214,15 @@ type MPIMLPolicySource struct {
209214

210215
// Implementation name for the MPI to create the appropriate hostfile.
211216
// Defaults to OpenMPI.
217+
// +kubebuilder:default="OpenMPI"
212218
MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"`
213219

214220
// Directory where SSH keys are mounted.
215221
SSHAuthMountPath *string `json:"SSHAuthMountPath,omitempty"`
216222

217223
// Whether to run training process on the launcher Job.
218224
// Defaults to false.
225+
// +kubebuilder:default=false
219226
RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"`
220227
}
221228

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
/*
2+
Copyright 2024 The Kubeflow Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controllerv2
18+
19+
import (
20+
kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1"
21+
testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing"
22+
"github.com/kubeflow/training-operator/test/integration/framework"
23+
"github.com/kubeflow/training-operator/test/util"
24+
"github.com/onsi/ginkgo/v2"
25+
"github.com/onsi/gomega"
26+
corev1 "k8s.io/api/core/v1"
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
"k8s.io/utils/ptr"
29+
"sigs.k8s.io/controller-runtime/pkg/client"
30+
)
31+
32+
var _ = ginkgo.Describe("TrainingRuntime marker validations and defaulting", ginkgo.Ordered, func() {
33+
var ns *corev1.Namespace
34+
35+
ginkgo.BeforeAll(func() {
36+
fwk = &framework.Framework{}
37+
cfg = fwk.Init()
38+
ctx, k8sClient = fwk.RunManager(cfg)
39+
})
40+
ginkgo.AfterAll(func() {
41+
fwk.Teardown()
42+
})
43+
44+
ginkgo.BeforeEach(func() {
45+
ns = &corev1.Namespace{
46+
TypeMeta: metav1.TypeMeta{
47+
APIVersion: corev1.SchemeGroupVersion.String(),
48+
Kind: "Namespace",
49+
},
50+
ObjectMeta: metav1.ObjectMeta{
51+
GenerateName: "training-runtime-marker-",
52+
},
53+
}
54+
gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
55+
})
56+
ginkgo.AfterEach(func() {
57+
gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.TrainingRuntime{}, client.InNamespace(ns.Name))).Should(gomega.Succeed())
58+
gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.ClusterTrainingRuntime{})).Should(gomega.Succeed())
59+
})
60+
61+
ginkgo.When("Creating TrainingRuntime", func() {
62+
ginkgo.DescribeTable("Validate TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, errorMatcher gomega.OmegaMatcher) {
63+
gomega.Expect(k8sClient.Create(ctx, trainingRuntime())).Should(errorMatcher)
64+
},
65+
ginkgo.Entry("Should succeed to create trainingRuntime",
66+
func() *kubeflowv2.TrainingRuntime {
67+
return testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").
68+
Obj()
69+
},
70+
gomega.Succeed()),
71+
ginkgo.Entry("Should succeed to create clusterTrainingRuntime",
72+
func() *kubeflowv2.TrainingRuntime {
73+
return testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").
74+
Obj()
75+
},
76+
gomega.Succeed()),
77+
ginkgo.Entry("Should fail to create trainingRuntime with both MPI and Torch runtimes",
78+
func() *kubeflowv2.TrainingRuntime {
79+
runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj()
80+
runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{
81+
MLPolicySource: kubeflowv2.MLPolicySource{
82+
Torch: &kubeflowv2.TorchMLPolicySource{},
83+
MPI: &kubeflowv2.MPIMLPolicySource{},
84+
},
85+
}
86+
return runtime
87+
},
88+
testingutil.BeInvalidError()),
89+
ginkgo.Entry("Should fail to create trainingRuntime with numNodes and torch.elasticPolicy",
90+
func() *kubeflowv2.TrainingRuntime {
91+
runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj()
92+
runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{
93+
NumNodes: ptr.To(int32(2)),
94+
MLPolicySource: kubeflowv2.MLPolicySource{
95+
Torch: &kubeflowv2.TorchMLPolicySource{
96+
ElasticPolicy: &kubeflowv2.TorchElasticPolicy{},
97+
},
98+
},
99+
}
100+
return runtime
101+
},
102+
testingutil.BeInvalidError()),
103+
)
104+
ginkgo.DescribeTable("Defaulting TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, wantTrainingRuntime func() *kubeflowv2.TrainingRuntime) {
105+
created := trainingRuntime()
106+
gomega.Expect(k8sClient.Create(ctx, created)).Should(gomega.Succeed())
107+
gomega.Expect(created).Should(gomega.BeComparableTo(wantTrainingRuntime(), util.IgnoreObjectMetadata))
108+
},
109+
ginkgo.Entry("Should succeed to default TorchMLPolicySource.NumProcPerNode=auto",
110+
func() *kubeflowv2.TrainingRuntime {
111+
runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj()
112+
runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{
113+
MLPolicySource: kubeflowv2.MLPolicySource{
114+
Torch: &kubeflowv2.TorchMLPolicySource{},
115+
},
116+
}
117+
return runtime
118+
},
119+
func() *kubeflowv2.TrainingRuntime {
120+
runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj()
121+
runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{
122+
MLPolicySource: kubeflowv2.MLPolicySource{
123+
Torch: &kubeflowv2.TorchMLPolicySource{
124+
NumProcPerNode: ptr.To("auto"),
125+
},
126+
},
127+
}
128+
return runtime
129+
}),
130+
)
131+
})
132+
})

test/integration/controller.v2/trainjob_controller_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() {
134134
MinResources(corev1.ResourceList{
135135
corev1.ResourceCPU: resource.MustParse("1500"),
136136
}).
137+
SchedulingTimeout(60).
137138
ControllerReference(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainJobKind), trainJobKey.Name, string(trainJob.UID)).
138139
Obj(),
139140
util.IgnoreObjectMetadata))
@@ -189,6 +190,7 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() {
189190
MinResources(corev1.ResourceList{
190191
corev1.ResourceCPU: resource.MustParse("1500"),
191192
}).
193+
SchedulingTimeout(60).
192194
ControllerReference(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainJobKind), trainJobKey.Name, string(trainJob.UID)).
193195
Obj(),
194196
util.IgnoreObjectMetadata))

0 commit comments

Comments
 (0)