chore(test): Update torch validate UTs.

Electronic-Waste · Electronic-Waste · commit a29dddc76fcb · 2025-04-09T09:33:24.000Z
Signed-off-by: Electronic-Waste <2690692950@qq.com>
diff --git a/pkg/runtime/framework/plugins/torch/torch.go b/pkg/runtime/framework/plugins/torch/torch.go
@@ -71,9 +71,8 @@ func (t *Torch) Validate(runtimeInfo *runtime.Info, _, newObj *trainer.TrainJob)
 			}
 		}
 
-		// Check reserved envs for torchrun.
-		// TODO(Electronic-Waste): Add validation for torchtune args.
 		if !slices.Equal(newObj.Spec.Trainer.Command, constants.TorchTuneEntrypoint) {
+			// Check reserved envs for torchrun.
 			torchEnvs := sets.New[string]()
 			for _, env := range newObj.Spec.Trainer.Env {
 				if constants.TorchRunReservedEnvNames.Has(env.Name) {
@@ -85,6 +84,17 @@ func (t *Torch) Validate(runtimeInfo *runtime.Info, _, newObj *trainer.TrainJob)
 				trainerEnvsPath := specPath.Child("trainer").Child("env")
 				allErrs = append(allErrs, field.Invalid(trainerEnvsPath, newObj.Spec.Trainer.Env, fmt.Sprintf("must not have reserved envs, invalid envs configured: %v", sets.List(torchEnvs))))
 			}
+		} else {
+			// Check supported pretrained models for torchtune.
+			// TODO(Electronic-Waste): Add more validation for torchtune when we support more arguments.
+			argPath := specPath.Child("trainer").Child("args")
+			model := getModelFromArgs(newObj.Spec.Trainer.Args)
+
+			if model == nil {
+				allErrs = append(allErrs, field.Invalid(argPath, newObj.Spec.Trainer.Args, "must specify a pretrained model"))
+			} else if !constants.TorchTuneSupportedPretrainedModels.Has(*model) {
+				allErrs = append(allErrs, field.Invalid(argPath, newObj.Spec.Trainer.Args, fmt.Sprintf("must have a supported pretrained model, invalid model configured: %v", *model)))
+			}
 		}
 	}
 
@@ -246,15 +256,6 @@ func getRecipeFromArgs(numNodes int32, numProcPerNode intstr.IntOrString, _ []st
 
 // getConfigFromArgs extracts the config from distributed parameters, recipe and command line arguments.
 func getConfigFileFromArgs(numNodes int32, recipe string, args []string) string {
-	// Extract model from command line args.
-	model := constants.MODEL_LLAMA3_2_1B
-	for _, arg := range args {
-		if strings.HasPrefix(arg, "model") {
-			model = strings.Split(arg, "=")[1]
-			break
-		}
-	}
-
 	// Determine the config file name based on the recipe and number of nodes.
 	var suffix string
 	switch recipe {
@@ -268,5 +269,16 @@ func getConfigFileFromArgs(numNodes int32, recipe string, args []string) string
 		suffix = constants.TorchTuneFullFinetuneSingleDeviceConfigSuffix
 	}
 
-	return fmt.Sprintf("%s%s.yaml", model, suffix)
+	return fmt.Sprintf("%s%s.yaml", *getModelFromArgs(args), suffix)
+}
+
+// getModelFromArgs returns the value of the first "model...=<value>" argument, or nil when none is set.
+func getModelFromArgs(args []string) *string {
+	for _, arg := range args {
+		// Cut, unlike Split(arg, "=")[1], cannot panic on an argument that has no "=".
+		if _, value, ok := strings.Cut(arg, "="); ok && strings.HasPrefix(arg, "model") {
+			return &value
+		}
+	}
+	return nil
 }
diff --git a/pkg/runtime/framework/plugins/torch/torch_test.go b/pkg/runtime/framework/plugins/torch/torch_test.go
@@ -1568,7 +1568,8 @@ func TestValidate(t *testing.T) {
 					Container(
 						"ghcr.io/kubeflow/trainer/torchtune-trainer",
 						[]string{"tune", "run"},
-						nil, corev1.ResourceList{},
+						[]string{"model=llama3_2/1B"},
+						corev1.ResourceList{},
 					).
 					Env(
 						[]corev1.EnvVar{
@@ -1586,6 +1587,65 @@ func TestValidate(t *testing.T) {
 				).
 				Obj(),
 		},
+		"missing pretrained model": {
+			info: runtime.NewInfo(
+				runtime.WithMLPolicySource(utiltesting.MakeMLPolicyWrapper().
+					WithMLPolicySource(*utiltesting.MakeMLPolicySourceWrapper().
+						TorchPolicy(ptr.To(intstr.FromString("auto")), nil).
+						Obj(),
+					).
+					Obj(),
+				),
+			),
+			newObj: utiltesting.MakeTrainJobWrapper(metav1.NamespaceDefault, "test").
+				Trainer(utiltesting.MakeTrainJobTrainerWrapper().
+					NumProcPerNode(intstr.FromString("auto")).
+					Container(
+						"ghcr.io/kubeflow/trainer/torchtune-trainer",
+						[]string{"tune", "run"},
+						nil, corev1.ResourceList{},
+					).
+					Obj(),
+				).
+				Obj(),
+			wantError: field.ErrorList{
+				field.Invalid(
+					field.NewPath("spec").Child("trainer").Child("args"),
+					[]string(nil),
+					"must specify a pretrained model",
+				),
+			},
+		},
+		"unsupported pretrained model": {
+			info: runtime.NewInfo(
+				runtime.WithMLPolicySource(utiltesting.MakeMLPolicyWrapper().
+					WithMLPolicySource(*utiltesting.MakeMLPolicySourceWrapper().
+						TorchPolicy(ptr.To(intstr.FromString("auto")), nil).
+						Obj(),
+					).
+					Obj(),
+				),
+			),
+			newObj: utiltesting.MakeTrainJobWrapper(metav1.NamespaceDefault, "test").
+				Trainer(utiltesting.MakeTrainJobTrainerWrapper().
+					NumProcPerNode(intstr.FromString("auto")).
+					Container(
+						"ghcr.io/kubeflow/trainer/torchtune-trainer",
+						[]string{"tune", "run"},
+						[]string{"model=llama3_1/70B"},
+						corev1.ResourceList{},
+					).
+					Obj(),
+				).
+				Obj(),
+			wantError: field.ErrorList{
+				field.Invalid(
+					field.NewPath("spec").Child("trainer").Child("args"),
+					[]string{"model=llama3_1/70B"},
+					fmt.Sprintf("must have a supported pretrained model, invalid model configured: %s", "llama3_1/70B"),
+				),
+			},
+		},
 	}
 	for name, tc := range cases {
 		t.Run(name, func(t *testing.T) {