diff --git a/pkg/operators/pytorch-operator/apis/pytorch/v1/defaults.go b/pkg/operators/pytorch-operator/apis/pytorch/v1/defaults.go
index a5cfe76bb..8080cd060 100644
--- a/pkg/operators/pytorch-operator/apis/pytorch/v1/defaults.go
+++ b/pkg/operators/pytorch-operator/apis/pytorch/v1/defaults.go
@@ -87,9 +87,15 @@ func setTypeNameToCamelCase(job *PyTorchJob, typ PyTorchReplicaType) {
 // SetDefaults_PyTorchJob sets any unspecified values to defaults.
 func SetDefaults_PyTorchJob(job *PyTorchJob) {
+	// RunPolicy is an optional pointer field: allocate it before reading any
+	// of its members, otherwise a spec without runPolicy panics on nil.
+	if job.Spec.RunPolicy == nil {
+		job.Spec.RunPolicy = &common.RunPolicy{}
+	}
+
 	// Set default cleanpod policy to None.
-	if job.Spec.CleanPodPolicy == nil {
+	if job.Spec.RunPolicy.CleanPodPolicy == nil {
 		policy := common.CleanPodPolicyNone
-		job.Spec.CleanPodPolicy = &policy
+		job.Spec.RunPolicy.CleanPodPolicy = &policy
 	}
 
 	// Update the key of PyTorchReplicaSpecs to camel case.
diff --git a/pkg/operators/pytorch-operator/apis/pytorch/v1/types.go b/pkg/operators/pytorch-operator/apis/pytorch/v1/types.go
index d76e18fef..4711ca956 100644
--- a/pkg/operators/pytorch-operator/apis/pytorch/v1/types.go
+++ b/pkg/operators/pytorch-operator/apis/pytorch/v1/types.go
@@ -51,9 +51,10 @@ type PyTorchJobSpec struct {
 	// +optional
 	BackoffLimit *int32 `json:"backoffLimit,omitempty"`
 
-	// Defines the policy for cleaning up pods after the PyTorchJob completes.
-	// Defaults to None.
-	CleanPodPolicy *common.CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
+	// RunPolicy groups the runtime policies of the PyTorchJob. Its
+	// CleanPodPolicy member defines how pods are cleaned up after the
+	// PyTorchJob completes and defaults to None.
+	RunPolicy *common.RunPolicy `json:"runPolicy,omitempty"`
 
 	// Defines the TTL for cleaning up finished PyTorchJobs (temporary
 	// before Kubernetes adds the cleanup controller).
diff --git a/pkg/operators/pytorch-operator/apis/pytorch/v1/zz_generated.deepcopy.go b/pkg/operators/pytorch-operator/apis/pytorch/v1/zz_generated.deepcopy.go
index 566b80c6d..157a6b0ee 100644
--- a/pkg/operators/pytorch-operator/apis/pytorch/v1/zz_generated.deepcopy.go
+++ b/pkg/operators/pytorch-operator/apis/pytorch/v1/zz_generated.deepcopy.go
@@ -97,9 +97,27 @@ func (in *PyTorchJobSpec) DeepCopyInto(out *PyTorchJobSpec) {
 		*out = new(int32)
 		**out = **in
 	}
-	if in.CleanPodPolicy != nil {
-		in, out := &in.CleanPodPolicy, &out.CleanPodPolicy
-		*out = new(apiv1.CleanPodPolicy)
+	if in.RunPolicy != nil {
+		in, out := &in.RunPolicy, &out.RunPolicy
+		*out = new(apiv1.RunPolicy)
 		**out = **in
+		// The struct assignment above copies RunPolicy's pointer fields by
+		// address; re-allocate each one so the copy shares no memory with in.
+		if (*in).CleanPodPolicy != nil {
+			policy := *(*in).CleanPodPolicy
+			(*out).CleanPodPolicy = &policy
+		}
+		if (*in).TTLSecondsAfterFinished != nil {
+			ttl := *(*in).TTLSecondsAfterFinished
+			(*out).TTLSecondsAfterFinished = &ttl
+		}
+		if (*in).ActiveDeadlineSeconds != nil {
+			deadline := *(*in).ActiveDeadlineSeconds
+			(*out).ActiveDeadlineSeconds = &deadline
+		}
+		if (*in).BackoffLimit != nil {
+			limit := *(*in).BackoffLimit
+			(*out).BackoffLimit = &limit
+		}
 	}
 	if in.TTLSecondsAfterFinished != nil {
diff --git a/pkg/operators/tf-operator/apis/common/v1/types.go b/pkg/operators/tf-operator/apis/common/v1/types.go
index bfe2bfd65..d136be915 100644
--- a/pkg/operators/tf-operator/apis/common/v1/types.go
+++ b/pkg/operators/tf-operator/apis/common/v1/types.go
@@ -141,6 +141,32 @@ const (
 	CleanPodPolicyNone CleanPodPolicy = "None"
 )
 
+// +k8s:deepcopy-gen=true
+// RunPolicy encapsulates various runtime policies of the distributed training
+// job, for example how to clean up resources and how long the job can stay
+// active.
+type RunPolicy struct {
+	// CleanPodPolicy defines the policy to kill pods after the job completes.
+	// The default is set by each job kind's defaulting function (Running for
+	// TFJob, None for PyTorchJob).
+	CleanPodPolicy *CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
+
+	// TTLSecondsAfterFinished is the TTL to clean up jobs.
+	// It may take extra ReconcilePeriod seconds for the cleanup, since
+	// reconcile gets called periodically.
+	// Default to infinite.
+	TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
+
+	// Specifies the duration in seconds relative to the startTime that the job may be active
+	// before the system tries to terminate it; value must be positive integer.
+	// +optional
+	ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
+
+	// Optional number of retries before marking this job failed.
+	// +optional
+	BackoffLimit *int32 `json:"backoffLimit,omitempty"`
+}
+
 // RestartPolicy describes how the replicas should be restarted.
 // Can be one of: Always, OnFailure, Never, or ExitCode.
 // If none of the following policies is specified, the default one
diff --git a/pkg/operators/tf-operator/apis/common/v1beta2/types.go b/pkg/operators/tf-operator/apis/common/v1beta2/types.go
index 760bf71f2..a4e630c3e 100644
--- a/pkg/operators/tf-operator/apis/common/v1beta2/types.go
+++ b/pkg/operators/tf-operator/apis/common/v1beta2/types.go
@@ -136,6 +136,31 @@ const (
 	CleanPodPolicyNone CleanPodPolicy = "None"
 )
 
+// +k8s:deepcopy-gen=true
+// RunPolicy encapsulates various runtime policies of the distributed training
+// job, for example how to clean up resources and how long the job can stay
+// active.
+type RunPolicy struct {
+	// CleanPodPolicy defines the policy to kill pods after the job completes.
+	// Default to Running.
+	CleanPodPolicy *CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
+
+	// TTLSecondsAfterFinished is the TTL to clean up jobs.
+	// It may take extra ReconcilePeriod seconds for the cleanup, since
+	// reconcile gets called periodically.
+	// Default to infinite.
+	TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
+
+	// Specifies the duration in seconds relative to the startTime that the job may be active
+	// before the system tries to terminate it; value must be positive integer.
+	// +optional
+	ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
+
+	// Optional number of retries before marking this job failed.
+	// +optional
+	BackoffLimit *int32 `json:"backoffLimit,omitempty"`
+}
+
 // RestartPolicy describes how the replicas should be restarted.
 // Only one of the following restart policies may be specified.
 // If none of the following policies is specified, the default one
diff --git a/pkg/operators/tf-operator/apis/tensorflow/v1/defaults.go b/pkg/operators/tf-operator/apis/tensorflow/v1/defaults.go
index 2ce8d4575..4938e81b5 100644
--- a/pkg/operators/tf-operator/apis/tensorflow/v1/defaults.go
+++ b/pkg/operators/tf-operator/apis/tensorflow/v1/defaults.go
@@ -91,9 +91,15 @@ func setTypeNameToCamelCase(tfJob *TFJob, typ TFReplicaType) {
 // SetDefaults_TFJob sets any unspecified values to defaults.
 func SetDefaults_TFJob(tfjob *TFJob) {
+	// RunPolicy is an optional pointer field: allocate it before reading any
+	// of its members, otherwise a spec without runPolicy panics on nil.
+	if tfjob.Spec.RunPolicy == nil {
+		tfjob.Spec.RunPolicy = &common.RunPolicy{}
+	}
+
 	// Set default cleanpod policy to Running.
-	if tfjob.Spec.CleanPodPolicy == nil {
+	if tfjob.Spec.RunPolicy.CleanPodPolicy == nil {
 		running := common.CleanPodPolicyRunning
-		tfjob.Spec.CleanPodPolicy = &running
+		tfjob.Spec.RunPolicy.CleanPodPolicy = &running
 	}
 
 	// Update the key of TFReplicaSpecs to camel case.
diff --git a/pkg/operators/tf-operator/apis/tensorflow/v1/types.go b/pkg/operators/tf-operator/apis/tensorflow/v1/types.go
index c081f878b..84571d12d 100644
--- a/pkg/operators/tf-operator/apis/tensorflow/v1/types.go
+++ b/pkg/operators/tf-operator/apis/tensorflow/v1/types.go
@@ -53,7 +53,8 @@ type TFJobSpec struct {
 
-	// Defines the policy for cleaning up pods after the TFJob completes.
-	// Defaults to Running.
-	CleanPodPolicy *common.CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
+	// RunPolicy groups the runtime policies of the TFJob. Its CleanPodPolicy
+	// member defines how pods are cleaned up after the TFJob completes and
+	// defaults to Running.
+	RunPolicy *common.RunPolicy `json:"runPolicy,omitempty"`
 
 	// Defines the TTL for cleaning up finished TFJobs (temporary
 	// before kubernetes adds the cleanup controller).
diff --git a/pkg/operators/tf-operator/apis/tensorflow/v1/zz_generated.deepcopy.go b/pkg/operators/tf-operator/apis/tensorflow/v1/zz_generated.deepcopy.go
index 21281de60..5ba3492af 100644
--- a/pkg/operators/tf-operator/apis/tensorflow/v1/zz_generated.deepcopy.go
+++ b/pkg/operators/tf-operator/apis/tensorflow/v1/zz_generated.deepcopy.go
@@ -97,9 +97,27 @@ func (in *TFJobSpec) DeepCopyInto(out *TFJobSpec) {
 		*out = new(int32)
 		**out = **in
 	}
-	if in.CleanPodPolicy != nil {
-		in, out := &in.CleanPodPolicy, &out.CleanPodPolicy
-		*out = new(commonv1.CleanPodPolicy)
+	if in.RunPolicy != nil {
+		in, out := &in.RunPolicy, &out.RunPolicy
+		*out = new(commonv1.RunPolicy)
 		**out = **in
+		// The struct assignment above copies RunPolicy's pointer fields by
+		// address; re-allocate each one so the copy shares no memory with in.
+		if (*in).CleanPodPolicy != nil {
+			policy := *(*in).CleanPodPolicy
+			(*out).CleanPodPolicy = &policy
+		}
+		if (*in).TTLSecondsAfterFinished != nil {
+			ttl := *(*in).TTLSecondsAfterFinished
+			(*out).TTLSecondsAfterFinished = &ttl
+		}
+		if (*in).ActiveDeadlineSeconds != nil {
+			deadline := *(*in).ActiveDeadlineSeconds
+			(*out).ActiveDeadlineSeconds = &deadline
+		}
+		if (*in).BackoffLimit != nil {
+			limit := *(*in).BackoffLimit
+			(*out).BackoffLimit = &limit
+		}
 	}
 	if in.TTLSecondsAfterFinished != nil {
diff --git a/pkg/operators/tf-operator/apis/tensorflow/v1beta2/defaults.go b/pkg/operators/tf-operator/apis/tensorflow/v1beta2/defaults.go
index a527c49c4..a3c789b42 100644
--- a/pkg/operators/tf-operator/apis/tensorflow/v1beta2/defaults.go
+++ b/pkg/operators/tf-operator/apis/tensorflow/v1beta2/defaults.go
@@ -91,9 +91,15 @@ func setTypeNameToCamelCase(tfJob *TFJob, typ TFReplicaType) {
 // SetDefaults_TFJob sets any unspecified values to defaults.
 func SetDefaults_TFJob(tfjob *TFJob) {
+	// RunPolicy is an optional pointer field: allocate it before reading any
+	// of its members, otherwise a spec without runPolicy panics on nil.
+	if tfjob.Spec.RunPolicy == nil {
+		tfjob.Spec.RunPolicy = &common.RunPolicy{}
+	}
+
 	// Set default cleanpod policy to Running.
-	if tfjob.Spec.CleanPodPolicy == nil {
+	if tfjob.Spec.RunPolicy.CleanPodPolicy == nil {
 		running := common.CleanPodPolicyRunning
-		tfjob.Spec.CleanPodPolicy = &running
+		tfjob.Spec.RunPolicy.CleanPodPolicy = &running
 	}
 
 	// Update the key of TFReplicaSpecs to camel case.
diff --git a/pkg/operators/tf-operator/apis/tensorflow/v1beta2/types.go b/pkg/operators/tf-operator/apis/tensorflow/v1beta2/types.go
index b735b825f..a6b18cad5 100644
--- a/pkg/operators/tf-operator/apis/tensorflow/v1beta2/types.go
+++ b/pkg/operators/tf-operator/apis/tensorflow/v1beta2/types.go
@@ -55,7 +55,7 @@ type TFJobSpec struct {
-	// CleanPodPolicy defines the policy to kill pods after TFJob is
-	// succeeded.
-	// Default to Running.
-	CleanPodPolicy *common.CleanPodPolicy `json:"cleanPodPolicy,omitempty"`
+	// RunPolicy groups the runtime policies of the TFJob. Its CleanPodPolicy
+	// member defines the policy to kill pods after the TFJob has succeeded
+	// and defaults to Running.
+	RunPolicy *common.RunPolicy `json:"runPolicy,omitempty"`
 
 	// TTLSecondsAfterFinished is the TTL to clean up tf-jobs (temporary
 	// before kubernetes adds the cleanup controller).
diff --git a/pkg/operators/tf-operator/apis/tensorflow/v1beta2/zz_generated.deepcopy.go b/pkg/operators/tf-operator/apis/tensorflow/v1beta2/zz_generated.deepcopy.go
index d93b58eb9..3c94a8029 100644
--- a/pkg/operators/tf-operator/apis/tensorflow/v1beta2/zz_generated.deepcopy.go
+++ b/pkg/operators/tf-operator/apis/tensorflow/v1beta2/zz_generated.deepcopy.go
@@ -97,9 +97,27 @@ func (in *TFJobSpec) DeepCopyInto(out *TFJobSpec) {
 		*out = new(int32)
 		**out = **in
 	}
-	if in.CleanPodPolicy != nil {
-		in, out := &in.CleanPodPolicy, &out.CleanPodPolicy
-		*out = new(commonv1beta2.CleanPodPolicy)
+	if in.RunPolicy != nil {
+		in, out := &in.RunPolicy, &out.RunPolicy
+		*out = new(commonv1beta2.RunPolicy)
 		**out = **in
+		// The struct assignment above copies RunPolicy's pointer fields by
+		// address; re-allocate each one so the copy shares no memory with in.
+		if (*in).CleanPodPolicy != nil {
+			policy := *(*in).CleanPodPolicy
+			(*out).CleanPodPolicy = &policy
+		}
+		if (*in).TTLSecondsAfterFinished != nil {
+			ttl := *(*in).TTLSecondsAfterFinished
+			(*out).TTLSecondsAfterFinished = &ttl
+		}
+		if (*in).ActiveDeadlineSeconds != nil {
+			deadline := *(*in).ActiveDeadlineSeconds
+			(*out).ActiveDeadlineSeconds = &deadline
+		}
+		if (*in).BackoffLimit != nil {
+			limit := *(*in).BackoffLimit
+			(*out).BackoffLimit = &limit
+		}
 	}
 	if in.TTLSecondsAfterFinished != nil {