kubeflow
diff --git a/api/openapi-spec/swagger.json b/api/openapi-spec/swagger.json
Lines changed: 2 additions & 2 deletions
diff --git a/manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml b/manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml
Lines changed: 5 additions & 2 deletions
diff --git a/manifests/base/crds/trainer.kubeflow.org_trainingruntimes.yaml b/manifests/base/crds/trainer.kubeflow.org_trainingruntimes.yaml
Lines changed: 5 additions & 2 deletions
diff --git a/manifests/base/crds/trainer.kubeflow.org_trainjobs.yaml b/manifests/base/crds/trainer.kubeflow.org_trainjobs.yaml
Lines changed: 4 additions & 1 deletion
diff --git a/pkg/apis/trainer/v1alpha1/trainingruntime_types.go b/pkg/apis/trainer/v1alpha1/trainingruntime_types.go
Lines changed: 3 additions & 3 deletions
diff --git a/pkg/apis/trainer/v1alpha1/trainjob_types.go b/pkg/apis/trainer/v1alpha1/trainjob_types.go
Lines changed: 2 additions & 1 deletion
diff --git a/pkg/apis/trainer/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/trainer/v1alpha1/zz_generated.deepcopy.go
Lines changed: 3 additions & 2 deletions
diff --git a/pkg/apis/trainer/v1alpha1/zz_generated.openapi.go b/pkg/apis/trainer/v1alpha1/zz_generated.openapi.go
Lines changed: 4 additions & 6 deletions
diff --git a/pkg/client/applyconfiguration/trainer/v1alpha1/torchmlpolicysource.go b/pkg/client/applyconfiguration/trainer/v1alpha1/torchmlpolicysource.go
Lines changed: 6 additions & 2 deletions
diff --git a/pkg/client/applyconfiguration/trainer/v1alpha1/trainer.go b/pkg/client/applyconfiguration/trainer/v1alpha1/trainer.go
Lines changed: 3 additions & 2 deletions
diff --git a/pkg/runtime/core/trainingruntime_test.go b/pkg/runtime/core/trainingruntime_test.go
Lines changed: 4 additions & 4 deletions
diff --git a/pkg/runtime/framework/plugins/mpi/mpi.go b/pkg/runtime/framework/plugins/mpi/mpi.go
Lines changed: 1 addition & 1 deletion
diff --git a/pkg/runtime/framework/plugins/torch/torch.go b/pkg/runtime/framework/plugins/torch/torch.go
Lines changed: 4 additions & 3 deletions
diff --git a/pkg/util/testing/wrapper.go b/pkg/util/testing/wrapper.go
Lines changed: 5 additions & 4 deletions
diff --git a/sdk/docs/TrainerV1alpha1TorchMLPolicySource.md b/sdk/docs/TrainerV1alpha1TorchMLPolicySource.md
Lines changed: 1 addition & 1 deletion
diff --git a/sdk/docs/TrainerV1alpha1Trainer.md b/sdk/docs/TrainerV1alpha1Trainer.md
Lines changed: 1 addition & 1 deletion
diff --git a/sdk/kubeflow/trainer/models/trainer_v1alpha1_torch_ml_policy_source.py b/sdk/kubeflow/trainer/models/trainer_v1alpha1_torch_ml_policy_source.py
Lines changed: 3 additions & 5 deletions
@@ -517,7 +517,7 @@
         },
         "numProcPerNode": {
           "description": "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.",
-          "type": "string"
+          "$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
         }
       }
     },
@@ -716,7 +716,7 @@
         },
         "numProcPerNode": {
           "description": "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.",
-          "type": "string"
+          "$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString"
         },
         "resourcesPerNode": {
           "description": "Compute resources for each training node.",
 
@@ -587,17 +587,20 @@ spec:
                             type: integer
                         type: object
                       numProcPerNode:
+                        anyOf:
+                        - type: integer
+                        - type: string
                         default: auto
                         description: |-
                           Number of processes per node.
                           This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
                           Supported values: `auto`, `cpu`, `gpu`, or int value.
                           Defaults to `auto`.
-                        type: string
+                        x-kubernetes-int-or-string: true
                         x-kubernetes-validations:
                         - message: NumProcPerNode must be equal to auto, cpu, gpu,
                             or int value
-                          rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
+                          rule: self > 0 || self in ['auto', 'cpu', 'gpu']
                     type: object
                 type: object
                 x-kubernetes-validations:
 
@@ -587,17 +587,20 @@ spec:
                             type: integer
                         type: object
                       numProcPerNode:
+                        anyOf:
+                        - type: integer
+                        - type: string
                         default: auto
                         description: |-
                           Number of processes per node.
                           This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
                           Supported values: `auto`, `cpu`, `gpu`, or int value.
                           Defaults to `auto`.
-                        type: string
+                        x-kubernetes-int-or-string: true
                         x-kubernetes-validations:
                         - message: NumProcPerNode must be equal to auto, cpu, gpu,
                             or int value
-                          rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
+                          rule: self > 0 || self in ['auto', 'cpu', 'gpu']
                     type: object
                 type: object
                 x-kubernetes-validations:
 
@@ -3138,11 +3138,14 @@ spec:
                     format: int32
                     type: integer
                   numProcPerNode:
+                    anyOf:
+                    - type: integer
+                    - type: string
                     description: |-
                       Number of processes/workers/slots on every training node.
                       For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
                       For the MPI runtime only int value can be set.
-                    type: string
+                    x-kubernetes-int-or-string: true
                   resourcesPerNode:
                     description: Compute resources for each training node.
                     properties:
 
@@ -19,6 +19,7 @@ package v1alpha1
 import (
 	autoscalingv2 "k8s.io/api/autoscaling/v2"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 )
 
@@ -174,11 +175,10 @@ type TorchMLPolicySource struct {
 	// Number of processes per node.
 	// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
 	// Supported values: `auto`, `cpu`, `gpu`, or int value.
-	// TODO (andreyvelich): Add kubebuilder validation.
 	// Defaults to `auto`.
 	// +kubebuilder:default="auto"
-	// +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be equal to auto, cpu, gpu, or int value"
-	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
+	// +kubebuilder:validation:XValidation:rule="self > 0 || self in ['auto', 'cpu', 'gpu']", message="NumProcPerNode must be equal to auto, cpu, gpu, or int value"
+	NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
 
 	// Elastic policy for the PyTorch training.
 	ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"`
 
@@ -19,6 +19,7 @@ package v1alpha1
 import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
 const (
@@ -194,7 +195,7 @@ type Trainer struct {
 	// Number of processes/workers/slots on every training node.
 	// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.
 	// For the MPI runtime only int value can be set.
-	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
+	NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"`
 }
 
 // DatasetConfig represents the desired dataset configuration.
 
@@ -19,14 +19,14 @@ package core
 import (
 	"context"
 	"fmt"
-	"k8s.io/utils/ptr"
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
 
@@ -264,7 +264,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) {
 		"succeeded to build JobSet with Torch values from the TrainJob": {
 			trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec(
 				testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec).
-					TorchPolicy(100, ptr.To("auto")).
+					TorchPolicy(100, intstr.FromString("auto")).
 					ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
 					Obj(),
 			).Obj(),
@@ -274,7 +274,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) {
 				Trainer(
 					testingutil.MakeTrainJobTrainerWrapper().
 						NumNodes(30).
-						NumProcPerNode(ptr.To("3")).
+						NumProcPerNode(intstr.FromInt32(3)).
 						Obj(),
 				).
 				Obj(),
@@ -318,7 +318,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) {
 		"succeeded to build JobSet with Torch values from the Runtime and envs.": {
 			trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec(
 				testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec).
-					TorchPolicy(100, ptr.To("auto")).
+					TorchPolicy(100, intstr.FromString("auto")).
 					ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
 					ContainerTrainerEnv(
 						[]corev1.EnvVar{
 
@@ -94,7 +94,7 @@ func (m *MPI) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob) er
 
 	numProcPerNode := strconv.Itoa(int(*info.RuntimePolicy.MLPolicy.MPI.NumProcPerNode))
 	if trainJob.Spec.Trainer != nil && trainJob.Spec.Trainer.NumProcPerNode != nil {
-		numProcPerNode = *trainJob.Spec.Trainer.NumProcPerNode
+		numProcPerNode = (*trainJob.Spec.Trainer.NumProcPerNode).String()
 	}
 	info.Trainer.NumProcPerNode = numProcPerNode
 
 
@@ -21,6 +21,7 @@ import (
 	"fmt"
 
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/validation/field"
 	"k8s.io/utils/ptr"
@@ -66,9 +67,9 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob)
 	}
 	info.Trainer.NumNodes = numNodes
 
-	numProcPerNode := info.RuntimePolicy.MLPolicy.Torch.NumProcPerNode
+	numProcPerNode := ptr.Deref(info.RuntimePolicy.MLPolicy.Torch.NumProcPerNode, intstr.FromString("auto"))
 	if trainJob.Spec.Trainer != nil && trainJob.Spec.Trainer.NumProcPerNode != nil {
-		numProcPerNode = trainJob.Spec.Trainer.NumProcPerNode
+		numProcPerNode = ptr.Deref(trainJob.Spec.Trainer.NumProcPerNode, intstr.FromString("auto"))
 	}
 
 	// Update envs for Info object.
@@ -84,7 +85,7 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob)
 		},
 		{
 			Name:  constants.TorchEnvNumProcPerNode,
-			Value: ptr.Deref(numProcPerNode, "auto"),
+			Value: numProcPerNode.String(),
 		},
 		{
 			Name: constants.TorchEnvNodeRank,
 
@@ -22,6 +22,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/utils/ptr"
 	jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2"
 	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
@@ -392,8 +393,8 @@ func (t *TrainJobTrainerWrapper) NumNodes(numNodes int32) *TrainJobTrainerWrappe
 	return t
 }
 
-func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode *string) *TrainJobTrainerWrapper {
-	t.Trainer.NumProcPerNode = numProcPerNode
+func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode intstr.IntOrString) *TrainJobTrainerWrapper {
+	t.Trainer.NumProcPerNode = &numProcPerNode
 	return t
 }
 
@@ -689,12 +690,12 @@ func (s *TrainingRuntimeSpecWrapper) NumNodes(numNodes int32) *TrainingRuntimeSp
 	return s
 }
 
-func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode *string) *TrainingRuntimeSpecWrapper {
+func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode intstr.IntOrString) *TrainingRuntimeSpecWrapper {
 	s.MLPolicy = &trainer.MLPolicy{
 		NumNodes: &numNodes,
 		MLPolicySource: trainer.MLPolicySource{
 			Torch: &trainer.TorchMLPolicySource{
-				NumProcPerNode: numProcPerNode,
+				NumProcPerNode: &numProcPerNode,
 			},
 		},
 	}
 
@@ -5,7 +5,7 @@ TorchMLPolicySource represents a PyTorch runtime configuration.
 Name | Type | Description | Notes
 ------------ | ------------- | ------------- | -------------
 **elastic_policy** | [**TrainerV1alpha1TorchElasticPolicy**](TrainerV1alpha1TorchElasticPolicy.md) |  | [optional] 
-**num_proc_per_node** | **str** | Number of processes per node. This value is inserted into the &#x60;--nproc-per-node&#x60; argument of the &#x60;torchrun&#x60; CLI. Supported values: &#x60;auto&#x60;, &#x60;cpu&#x60;, &#x60;gpu&#x60;, or int value. Defaults to &#x60;auto&#x60;. | [optional] 
+**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) |  | [optional] 
 
 [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md)
 
 
@@ -9,7 +9,7 @@ Name | Type | Description | Notes
 **env** | [**list[V1EnvVar]**](V1EnvVar.md) | List of environment variables to set in the training container. These values will be merged with the TrainingRuntime&#39;s trainer environments. | [optional] 
 **image** | **str** | Docker image for the training container. | [optional] 
 **num_nodes** | **int** | Number of training nodes. | [optional] 
-**num_proc_per_node** | **str** | Number of processes/workers/slots on every training node. For the Torch runtime: &#x60;auto&#x60;, &#x60;cpu&#x60;, &#x60;gpu&#x60;, or int value can be set. For the MPI runtime only int value can be set. | [optional] 
+**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) |  | [optional] 
 **resources_per_node** | [**V1ResourceRequirements**](V1ResourceRequirements.md) |  | [optional] 
 
 [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md)
 
@@ -34,7 +34,7 @@ class TrainerV1alpha1TorchMLPolicySource(object):
     """
     openapi_types = {
         'elastic_policy': 'TrainerV1alpha1TorchElasticPolicy',
-        'num_proc_per_node': 'str'
+        'num_proc_per_node': 'K8sIoApimachineryPkgUtilIntstrIntOrString'
     }
 
     attribute_map = {
@@ -82,21 +82,19 @@ def elastic_policy(self, elastic_policy):
     def num_proc_per_node(self):
         """Gets the num_proc_per_node of this TrainerV1alpha1TorchMLPolicySource.  # noqa: E501
 
-        Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.  # noqa: E501
 
         :return: The num_proc_per_node of this TrainerV1alpha1TorchMLPolicySource.  # noqa: E501
-        :rtype: str
+        :rtype: K8sIoApimachineryPkgUtilIntstrIntOrString
         """
         return self._num_proc_per_node
 
     @num_proc_per_node.setter
     def num_proc_per_node(self, num_proc_per_node):
         """Sets the num_proc_per_node of this TrainerV1alpha1TorchMLPolicySource.
 
-        Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.  # noqa: E501
 
         :param num_proc_per_node: The num_proc_per_node of this TrainerV1alpha1TorchMLPolicySource.  # noqa: E501
-        :type: str
+        :type: K8sIoApimachineryPkgUtilIntstrIntOrString
         """
 
         self._num_proc_per_node = num_proc_per_node
Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ package v1alpha1`
`19`	`19`	`import (`
`20`	`20`	`corev1 "k8s.io/api/core/v1"`
`21`	`21`	`metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"`
	`22`	`+ "k8s.io/apimachinery/pkg/util/intstr"`
`22`	`23`	`)`
`23`	`24`
`24`	`25`	`const (`
`@@ -194,7 +195,7 @@ type Trainer struct {`
`194`	`195`	`// Number of processes/workers/slots on every training node.`
`195`	`196`	``// For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set.``
`196`	`197`	`// For the MPI runtime only int value can be set.`
`197`		`` - NumProcPerNode *string `json:"numProcPerNode,omitempty"` ``
	`198`	`` + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` ``
`198`	`199`	`}`
`199`	`200`
`200`	`201`	`// DatasetConfig represents the desired dataset configuration.`
Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ func (m *MPI) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob) er`
`94`	`94`
`95`	`95`	`numProcPerNode := strconv.Itoa(int(*info.RuntimePolicy.MLPolicy.MPI.NumProcPerNode))`
`96`	`96`	`if trainJob.Spec.Trainer != nil && trainJob.Spec.Trainer.NumProcPerNode != nil {`
`97`		`- numProcPerNode = *trainJob.Spec.Trainer.NumProcPerNode`
	`97`	`+ numProcPerNode = (*trainJob.Spec.Trainer.NumProcPerNode).String()`
`98`	`98`	`}`
`99`	`99`	`info.Trainer.NumProcPerNode = numProcPerNode`
`100`	`100`
Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,7 @@ import (`
`21`	`21`	`"fmt"`
`22`	`22`
`23`	`23`	`corev1 "k8s.io/api/core/v1"`
	`24`	`+ "k8s.io/apimachinery/pkg/util/intstr"`
`24`	`25`	`"k8s.io/apimachinery/pkg/util/sets"`
`25`	`26`	`"k8s.io/apimachinery/pkg/util/validation/field"`
`26`	`27`	`"k8s.io/utils/ptr"`
`@@ -66,9 +67,9 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob)`
`66`	`67`	`}`
`67`	`68`	`info.Trainer.NumNodes = numNodes`
`68`	`69`
`69`		`- numProcPerNode := info.RuntimePolicy.MLPolicy.Torch.NumProcPerNode`
	`70`	`+ numProcPerNode := ptr.Deref(info.RuntimePolicy.MLPolicy.Torch.NumProcPerNode, intstr.FromString("auto"))`
`70`	`71`	`if trainJob.Spec.Trainer != nil && trainJob.Spec.Trainer.NumProcPerNode != nil {`
`71`		`- numProcPerNode = trainJob.Spec.Trainer.NumProcPerNode`
	`72`	`+ numProcPerNode = ptr.Deref(trainJob.Spec.Trainer.NumProcPerNode, intstr.FromString("auto"))`
`72`	`73`	`}`
`73`	`74`
`74`	`75`	`// Update envs for Info object.`
`@@ -84,7 +85,7 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob)`
`84`	`85`	`},`
`85`	`86`	`{`
`86`	`87`	`Name: constants.TorchEnvNumProcPerNode,`
`87`		`- Value: ptr.Deref(numProcPerNode, "auto"),`
	`88`	`+ Value: numProcPerNode.String(),`
`88`	`89`	`},`
`89`	`90`	`{`
`90`	`91`	`Name: constants.TorchEnvNodeRank,`