kubeflow
diff --git a/‎api.v2/openapi-spec/swagger.json
Lines changed: 2 additions & 2 deletions b/‎api.v2/openapi-spec/swagger.json
Lines changed: 2 additions & 2 deletions
diff --git a/‎manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
Lines changed: 17 additions & 1 deletion b/‎manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
Lines changed: 17 additions & 1 deletion
diff --git a/‎manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
Lines changed: 17 additions & 1 deletion b/‎manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
Lines changed: 17 additions & 1 deletion
diff --git a/‎manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
Lines changed: 4 additions & 1 deletion b/‎manifests/v2/base/crds/kubeflow.org_trainjobs.yaml
Lines changed: 4 additions & 1 deletion
diff --git a/‎pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
Lines changed: 9 additions & 1 deletion b/‎pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
Lines changed: 9 additions & 1 deletion
diff --git a/‎pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
Lines changed: 2 additions & 1 deletion b/‎pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go
Lines changed: 2 additions & 1 deletion
diff --git a/‎pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
Lines changed: 3 additions & 2 deletions b/‎pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go
Lines changed: 3 additions & 2 deletions
diff --git a/‎pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go
Lines changed: 4 additions & 6 deletions b/‎pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go
Lines changed: 4 additions & 6 deletions
diff --git a/‎pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go
Lines changed: 6 additions & 2 deletions b/‎pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go
Lines changed: 6 additions & 2 deletions
diff --git a/‎pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go
Lines changed: 3 additions & 2 deletions b/‎pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go
Lines changed: 3 additions & 2 deletions
@@ -517,7 +517,7 @@
         },
         "numProcPerNode": {
           "description": "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.",
-          "type": "string"
+          "$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
         }
       }
     },
@@ -716,7 +716,7 @@
         },
         "numProcPerNode": {
           "description": "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.",
-          "type": "string"
+          "$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
         },
         "resourcesPerNode": {
           "description": "Compute resources for each training node.",
 
@@ -50,6 +50,7 @@ spec:
                     description: Configuration for the MPI Runtime.
                     properties:
                       mpiImplementation:
+                        default: OpenMPI
                         description: |-
                           Implementation name for the MPI to create the appropriate hostfile.
                           Defaults to OpenMPI.
@@ -61,6 +62,7 @@ spec:
                         format: int32
                         type: integer
                       runLauncherAsNode:
+                        default: false
                         description: |-
                           Whether to run training process on the launcher Job.
                           Defaults to false.
@@ -583,14 +585,27 @@ spec:
                             type: integer
                         type: object
                       numProcPerNode:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        default: auto
                         description: |-
                           Number of processes per node.
                           This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
                           Supported values: `auto`, `cpu`, `gpu`, or int value.
                           Defaults to `auto`.
-                        type: string
+                        x-kubernetes-int-or-string: true
+                        x-kubernetes-validations:
+                        - message: NumProcPerNode must be equal to auto, cpu, gpu,
+                            or int value
+                          rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
                     type: object
                 type: object
+                x-kubernetes-validations:
+                - message: numNodes should not be set if torch.elasticPolicy is configured
+                  rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
+                - message: Only one of the policy can be configured
+                  rule: '!(has(self.torch) && has(self.mpi))'
               podGroupPolicy:
                 description: Configuration for the PodGroup to enable gang-scheduling
                   via supported plugins.
@@ -600,6 +615,7 @@ spec:
                       for gang-scheduling.
                     properties:
                       scheduleTimeoutSeconds:
+                        default: 60
                         description: |-
                           Time threshold to schedule PodGroup for gang-scheduling.
                           If the scheduling timeout is equal to 0, the default value is used.
 
@@ -50,6 +50,7 @@ spec:
                     description: Configuration for the MPI Runtime.
                     properties:
                       mpiImplementation:
+                        default: OpenMPI
                         description: |-
                           Implementation name for the MPI to create the appropriate hostfile.
                           Defaults to OpenMPI.
@@ -61,6 +62,7 @@ spec:
                         format: int32
                         type: integer
                       runLauncherAsNode:
+                        default: false
                         description: |-
                           Whether to run training process on the launcher Job.
                           Defaults to false.
@@ -583,14 +585,27 @@ spec:
                             type: integer
                         type: object
                       numProcPerNode:
+                        anyOf:
+                        - type: integer
+                        - type: string
+                        default: auto
                         description: |-
                           Number of processes per node.
                           This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
                           Supported values: `auto`, `cpu`, `gpu`, or int value.
                           Defaults to `auto`.
-                        type: string
+                        x-kubernetes-int-or-string: true
+                        x-kubernetes-validations:
+                        - message: NumProcPerNode must be equal to auto, cpu, gpu,
+                            or int value
+                          rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
                     type: object
                 type: object
+                x-kubernetes-validations:
+                - message: numNodes should not be set if torch.elasticPolicy is configured
+                  rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
+                - message: Only one of the policy can be configured
+                  rule: '!(has(self.torch) && has(self.mpi))'
               podGroupPolicy:
                 description: Configuration for the PodGroup to enable gang-scheduling
                   via supported plugins.
@@ -600,6 +615,7 @@ spec:
                       for gang-scheduling.
                     properties:
                       scheduleTimeoutSeconds:
+                        default: 60
                         description: |-
                           Time threshold to schedule PodGroup for gang-scheduling.
                           If the scheduling timeout is equal to 0, the default value is used.
 
@@ -3138,11 +3138,14 @@ spec:
                     format: int32
                     type: integer
                   numProcPerNode:
+                    anyOf:
+                    - type: integer
+                    - type: string
                     description: |-
                       Number of processes/workers/slots on every training node.
                       For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
                       For the MPI runtime only int value can be set.
-                    type: string
+                    x-kubernetes-int-or-string: true
                   resourcesPerNode:
                     description: Compute resources for each training node.
                     properties:
 
@@ -19,6 +19,7 @@ package v2alpha1
 import (
 	autoscalingv2 "k8s.io/api/autoscaling/v2"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 )
 
@@ -142,10 +143,13 @@ type CoschedulingPodGroupPolicySource struct {
 	// Time threshold to schedule PodGroup for gang-scheduling.
 	// If the scheduling timeout is equal to 0, the default value is used.
 	// Defaults to 60 seconds.
+	// +kubebuilder:default=60
 	ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
 }
 
 // MLPolicy represents configuration for the model training with ML-specific parameters.
+// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured"
+// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured"
 type MLPolicy struct {
 	// Number of training nodes.
 	// Defaults to 1.
@@ -173,7 +177,9 @@ type TorchMLPolicySource struct {
 	// Supported values: `auto`, `cpu`, `gpu`, or int value.
 	// TODO (andreyvelich): Add kubebuilder validation.
 	// Defaults to `auto`.
-	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
+	// +kubebuilder:default="auto"
+	// +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be equal to auto, cpu, gpu, or int value"
+	NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
 
 	// Elastic policy for the PyTorch training.
 	ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`
@@ -210,13 +216,15 @@ type MPIMLPolicySource struct {
 
 	// Implementation name for the MPI to create the appropriate hostfile.
 	// Defaults to OpenMPI.
+	// +kubebuilder:default="OpenMPI"
 	MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"`
 
 	// Directory where SSH keys are mounted.
 	SSHAuthMountPath *string `json:"sshAuthMountPath,omitempty"`
 
 	// Whether to run training process on the launcher Job.
 	// Defaults to false.
+	// +kubebuilder:default=false
 	RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"`
 }
 
 
@@ -19,6 +19,7 @@ package v2alpha1
 import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
 const (
@@ -194,7 +195,7 @@ type Trainer struct {
 	// Number of processes/workers/slots on every training node.
 	// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
 	// For the MPI runtime only int value can be set.
-	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
+	NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
 }
 
 // DatasetConfig represents the desired dataset configuration.
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ package v2alpha1`
`19`	`19`	`import (`
`20`	`20`	`corev1 "k8s.io/api/core/v1"`
`21`	`21`	`metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"`
	`22`	`+ "k8s.io/apimachinery/pkg/util/intstr"`
`22`	`23`	`)`
`23`	`24`
`24`	`25`	`const (`
`@@ -194,7 +195,7 @@ type Trainer struct {`
`194`	`195`	`// Number of processes/workers/slots on every training node.`
`195`	`196`	// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
`196`	`197`	`// For the MPI runtime only int value can be set.`
`197`		- NumProcPerNode *string `json:"numProcPerNode,omitempty"`
	`198`	+ NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
`198`	`199`	`}`
`199`	`200`
`200`	`201`	`// DatasetConfig represents the desired dataset configuration.`