Skip to content

Commit 1736070

Browse files
author
Akshay Chitneni
committed
Adding cel validation on trainingRuntime CRD
Signed-off-by: Akshay Chitneni <[email protected]>
1 parent ee11629 commit 1736070

File tree

19 files changed

+220
-41
lines changed

19 files changed

+220
-41
lines changed

api.v2/openapi-spec/swagger.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,7 @@
517517
},
518518
"numProcPerNode": {
519519
"description": "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.",
520-
"type": "string"
520+
"$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
521521
}
522522
}
523523
},
@@ -716,7 +716,7 @@
716716
},
717717
"numProcPerNode": {
718718
"description": "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.",
719-
"type": "string"
719+
"$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
720720
},
721721
"resourcesPerNode": {
722722
"description": "Compute resources for each training node.",

manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ spec:
5050
description: Configuration for the MPI Runtime.
5151
properties:
5252
mpiImplementation:
53+
default: OpenMPI
5354
description: |-
5455
Implementation name for the MPI to create the appropriate hostfile.
5556
Defaults to OpenMPI.
@@ -61,6 +62,7 @@ spec:
6162
format: int32
6263
type: integer
6364
runLauncherAsNode:
65+
default: false
6466
description: |-
6567
Whether to run training process on the launcher Job.
6668
Defaults to false.
@@ -583,14 +585,27 @@ spec:
583585
type: integer
584586
type: object
585587
numProcPerNode:
588+
anyOf:
589+
- type: integer
590+
- type: string
591+
default: auto
586592
description: |-
587593
Number of processes per node.
588594
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
589595
Supported values: `auto`, `cpu`, `gpu`, or int value.
590596
Defaults to `auto`.
591-
type: string
597+
x-kubernetes-int-or-string: true
598+
x-kubernetes-validations:
599+
- message: NumProcPerNode must be equal to auto, cpu, gpu,
600+
or int value
601+
rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
592602
type: object
593603
type: object
604+
x-kubernetes-validations:
605+
- message: numNodes should not be set if torch.elasticPolicy is configured
606+
rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
607+
- message: Only one of the policy can be configured
608+
rule: '!(has(self.torch) && has(self.mpi))'
594609
podGroupPolicy:
595610
description: Configuration for the PodGroup to enable gang-scheduling
596611
via supported plugins.
@@ -600,6 +615,7 @@ spec:
600615
for gang-scheduling.
601616
properties:
602617
scheduleTimeoutSeconds:
618+
default: 60
603619
description: |-
604620
Time threshold to schedule PodGroup for gang-scheduling.
605621
If the scheduling timeout is equal to 0, the default value is used.

manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ spec:
5050
description: Configuration for the MPI Runtime.
5151
properties:
5252
mpiImplementation:
53+
default: OpenMPI
5354
description: |-
5455
Implementation name for the MPI to create the appropriate hostfile.
5556
Defaults to OpenMPI.
@@ -61,6 +62,7 @@ spec:
6162
format: int32
6263
type: integer
6364
runLauncherAsNode:
65+
default: false
6466
description: |-
6567
Whether to run training process on the launcher Job.
6668
Defaults to false.
@@ -583,14 +585,27 @@ spec:
583585
type: integer
584586
type: object
585587
numProcPerNode:
588+
anyOf:
589+
- type: integer
590+
- type: string
591+
default: auto
586592
description: |-
587593
Number of processes per node.
588594
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
589595
Supported values: `auto`, `cpu`, `gpu`, or int value.
590596
Defaults to `auto`.
591-
type: string
597+
x-kubernetes-int-or-string: true
598+
x-kubernetes-validations:
599+
- message: NumProcPerNode must be equal to auto, cpu, gpu,
600+
or int value
601+
rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
592602
type: object
593603
type: object
604+
x-kubernetes-validations:
605+
- message: numNodes should not be set if torch.elasticPolicy is configured
606+
rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
607+
- message: Only one of the policy can be configured
608+
rule: '!(has(self.torch) && has(self.mpi))'
594609
podGroupPolicy:
595610
description: Configuration for the PodGroup to enable gang-scheduling
596611
via supported plugins.
@@ -600,6 +615,7 @@ spec:
600615
for gang-scheduling.
601616
properties:
602617
scheduleTimeoutSeconds:
618+
default: 60
603619
description: |-
604620
Time threshold to schedule PodGroup for gang-scheduling.
605621
If the scheduling timeout is equal to 0, the default value is used.

manifests/v2/base/crds/kubeflow.org_trainjobs.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3138,11 +3138,14 @@ spec:
31383138
format: int32
31393139
type: integer
31403140
numProcPerNode:
3141+
anyOf:
3142+
- type: integer
3143+
- type: string
31413144
description: |-
31423145
Number of processes/workers/slots on every training node.
31433146
For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
31443147
For the MPI runtime only int value can be set.
3145-
type: string
3148+
x-kubernetes-int-or-string: true
31463149
resourcesPerNode:
31473150
description: Compute resources for each training node.
31483151
properties:

pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package v2alpha1
1919
import (
2020
autoscalingv2 "k8s.io/api/autoscaling/v2"
2121
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22+
"k8s.io/apimachinery/pkg/util/intstr"
2223
jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
2324
)
2425

@@ -142,10 +143,13 @@ type CoschedulingPodGroupPolicySource struct {
142143
// Time threshold to schedule PodGroup for gang-scheduling.
143144
// If the scheduling timeout is equal to 0, the default value is used.
144145
// Defaults to 60 seconds.
146+
// +kubebuilder:default=60
145147
ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
146148
}
147149

148150
// MLPolicy represents configuration for the model training with ML-specific parameters.
151+
// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured"
152+
// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured"
149153
type MLPolicy struct {
150154
// Number of training nodes.
151155
// Defaults to 1.
@@ -173,7 +177,9 @@ type TorchMLPolicySource struct {
173177
// Supported values: `auto`, `cpu`, `gpu`, or int value.
174178
// TODO (andreyvelich): Add kubebuilder validation.
175179
// Defaults to `auto`.
176-
NumProcPerNode *string `json:"numProcPerNode,omitempty"`
180+
// +kubebuilder:default="auto"
181+
// +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be equal to auto, cpu, gpu, or int value"
182+
NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
177183

178184
// Elastic policy for the PyTorch training.
179185
ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`
@@ -210,13 +216,15 @@ type MPIMLPolicySource struct {
210216

211217
// Implementation name for the MPI to create the appropriate hostfile.
212218
// Defaults to OpenMPI.
219+
// +kubebuilder:default="OpenMPI"
213220
MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"`
214221

215222
// Directory where SSH keys are mounted.
216223
SSHAuthMountPath *string `json:"sshAuthMountPath,omitempty"`
217224

218225
// Whether to run training process on the launcher Job.
219226
// Defaults to false.
227+
// +kubebuilder:default=false
220228
RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"`
221229
}
222230

pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package v2alpha1
1919
import (
2020
corev1 "k8s.io/api/core/v1"
2121
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22+
"k8s.io/apimachinery/pkg/util/intstr"
2223
)
2324

2425
const (
@@ -194,7 +195,7 @@ type Trainer struct {
194195
// Number of processes/workers/slots on every training node.
195196
// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
196197
// For the MPI runtime only int value can be set.
197-
NumProcPerNode *string `json:"numProcPerNode,omitempty"`
198+
NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
198199
}
199200

200201
// DatasetConfig represents the desired dataset configuration.

pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go

Lines changed: 4 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go

Lines changed: 6 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)