Skip to content

Commit 5a5f92d

Browse files
johnugeorgetenzen-ySyulin7yshalabi
authored
Cherry pick commits (#1757)
* Add validation for verifying that the CustomJob (e.g., TFJob) name meets DNS1035 (#1748) Signed-off-by: Yuki Iwai <[email protected]> * Fix the success condition of the job in PyTorchJob's Elastic mode. (#1752) Signed-off-by: Syulin7 <[email protected]> * Set the default value of CleanPodPolicy to None (#1754) Signed-off-by: Syulin7 <[email protected]> * Update mpijob_controller.go (#1755) fix typo TFJob, should be MPIJob --------- Signed-off-by: Yuki Iwai <[email protected]> Signed-off-by: Syulin7 <[email protected]> Co-authored-by: Yuki Iwai <[email protected]> Co-authored-by: yu lin <[email protected]> Co-authored-by: Yasser Shalabi <[email protected]>
1 parent b8004ae commit 5a5f92d

23 files changed

+840
-269
lines changed

pkg/apis/kubeflow.org/v1/defaulting_utils.go

+4
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,7 @@ func setTypeNameToCamelCase(replicaSpecs map[commonv1.ReplicaType]*commonv1.Repl
5858
}
5959
}
6060
}
61+
62+
func cleanPodPolicyPointer(cleanPodPolicy commonv1.CleanPodPolicy) *commonv1.CleanPodPolicy {
63+
return &cleanPodPolicy
64+
}

pkg/apis/kubeflow.org/v1/mxnet_defaults.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@ func setMXNetTypeNamesToCamelCase(mxJob *MXJob) {
4646

4747
// SetDefaults_MXJob sets any unspecified values to defaults.
4848
func SetDefaults_MXJob(mxjob *MXJob) {
49-
// Set default cleanpod policy to All.
49+
// Set default cleanpod policy to None.
5050
if mxjob.Spec.RunPolicy.CleanPodPolicy == nil {
51-
all := commonv1.CleanPodPolicyAll
51+
all := commonv1.CleanPodPolicyNone
5252
mxjob.Spec.RunPolicy.CleanPodPolicy = &all
5353
}
5454

pkg/apis/kubeflow.org/v1/mxnet_defaults_test.go

+30-5
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func TestSetDefaults_MXJob(t *testing.T) {
9696
},
9797
},
9898
},
99-
expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort),
99+
expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort),
100100
},
101101
"Set spec with restart policy": {
102102
original: &MXJob{
@@ -118,7 +118,7 @@ func TestSetDefaults_MXJob(t *testing.T) {
118118
},
119119
},
120120
},
121-
expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, commonv1.RestartPolicyOnFailure, 1, MXJobDefaultPortName, MXJobDefaultPort),
121+
expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, commonv1.RestartPolicyOnFailure, 1, MXJobDefaultPortName, MXJobDefaultPort),
122122
},
123123
"Set spec with replicas": {
124124
original: &MXJob{
@@ -140,7 +140,7 @@ func TestSetDefaults_MXJob(t *testing.T) {
140140
},
141141
},
142142
},
143-
expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 3, MXJobDefaultPortName, MXJobDefaultPort),
143+
expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, MXJobDefaultRestartPolicy, 3, MXJobDefaultPortName, MXJobDefaultPort),
144144
},
145145

146146
"Set spec with default node port name and port": {
@@ -168,7 +168,7 @@ func TestSetDefaults_MXJob(t *testing.T) {
168168
},
169169
},
170170
},
171-
expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort),
171+
expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort),
172172
},
173173

174174
"Set spec with node port": {
@@ -196,7 +196,32 @@ func TestSetDefaults_MXJob(t *testing.T) {
196196
},
197197
},
198198
},
199-
expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, 9999),
199+
expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, 9999),
200+
},
201+
202+
"set spec with cleanpod policy": {
203+
original: &MXJob{
204+
Spec: MXJobSpec{
205+
RunPolicy: commonv1.RunPolicy{
206+
CleanPodPolicy: cleanPodPolicyPointer(commonv1.CleanPodPolicyAll),
207+
},
208+
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
209+
MXJobReplicaTypeWorker: &commonv1.ReplicaSpec{
210+
Template: corev1.PodTemplateSpec{
211+
Spec: corev1.PodSpec{
212+
Containers: []corev1.Container{
213+
corev1.Container{
214+
Name: MXJobDefaultContainerName,
215+
Image: testImage,
216+
},
217+
},
218+
},
219+
},
220+
},
221+
},
222+
},
223+
},
224+
expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort),
200225
},
201226
}
202227

pkg/apis/kubeflow.org/v1/mxnet_validation.go

+12-5
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,29 @@ package v1
1616

1717
import (
1818
"fmt"
19-
commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"
2019

20+
commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"
2121
log "github.com/sirupsen/logrus"
22+
apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation"
2223
)
2324

24-
// ValidateV1MXJobSpec checks that the kubeflowv1.MXJobSpec is valid.
25-
func ValidateV1MXJobSpec(c *MXJobSpec) error {
26-
return validateMXNetReplicaSpecs(c.MXReplicaSpecs)
25+
// ValidateV1MXJob checks that the kubeflowv1.MXJobSpec is valid.
26+
func ValidateV1MXJob(mxJob *MXJob) error {
27+
if errors := apimachineryvalidation.NameIsDNS1035Label(mxJob.ObjectMeta.Name, false); errors != nil {
28+
return fmt.Errorf("MXJob name is invalid: %v", errors)
29+
}
30+
if err := validateMXReplicaSpecs(mxJob.Spec.MXReplicaSpecs); err != nil {
31+
return err
32+
}
33+
return nil
2734
}
2835

2936
// IsScheduler returns true if the type is Scheduler.
3037
func IsScheduler(typ commonv1.ReplicaType) bool {
3138
return typ == MXJobReplicaTypeScheduler
3239
}
3340

34-
func validateMXNetReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error {
41+
func validateMXReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error {
3542
if specs == nil {
3643
return fmt.Errorf("MXJobSpec is not valid")
3744
}

pkg/apis/kubeflow.org/v1/mxnet_validation_test.go

+144-40
Original file line numberDiff line numberDiff line change
@@ -18,72 +18,176 @@ import (
1818
"testing"
1919

2020
commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"
21-
v1 "k8s.io/api/core/v1"
21+
corev1 "k8s.io/api/core/v1"
22+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23+
"k8s.io/utils/pointer"
2224
)
2325

24-
func TestValidateV1MXJobSpec(t *testing.T) {
25-
testCases := []MXJobSpec{
26-
{
27-
MXReplicaSpecs: nil,
26+
func TestValidateV1MXJob(t *testing.T) {
27+
validMXReplicaSpecs := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
28+
MXJobReplicaTypeScheduler: {
29+
Replicas: pointer.Int32(1),
30+
RestartPolicy: commonv1.RestartPolicyNever,
31+
Template: corev1.PodTemplateSpec{
32+
Spec: corev1.PodSpec{
33+
Containers: []corev1.Container{{
34+
Name: "mxnet",
35+
Image: "mxjob/mxnet",
36+
}},
37+
},
38+
},
39+
},
40+
MXJobReplicaTypeServer: {
41+
Replicas: pointer.Int32(1),
42+
RestartPolicy: commonv1.RestartPolicyNever,
43+
Template: corev1.PodTemplateSpec{
44+
Spec: corev1.PodSpec{
45+
Containers: []corev1.Container{{
46+
Name: "mxnet",
47+
Image: "mxjob/mxnet",
48+
}},
49+
},
50+
},
2851
},
29-
{
30-
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
31-
MXJobReplicaTypeWorker: &commonv1.ReplicaSpec{
32-
Template: v1.PodTemplateSpec{
33-
Spec: v1.PodSpec{
34-
Containers: []v1.Container{},
52+
MXJobReplicaTypeWorker: {
53+
Replicas: pointer.Int32(1),
54+
RestartPolicy: commonv1.RestartPolicyNever,
55+
Template: corev1.PodTemplateSpec{
56+
Spec: corev1.PodSpec{
57+
Containers: []corev1.Container{{
58+
Name: "mxnet",
59+
Image: "mxjob/mxnet",
60+
Command: []string{"python"},
61+
Args: []string{
62+
"/incubator-mxnet/example/image-classification/train_mnist.py",
63+
"--num-epochs=10",
64+
"--num-layers=2",
65+
"--kv-store=dist_device_sync",
3566
},
36-
},
67+
}},
68+
},
69+
},
70+
},
71+
}
72+
73+
testCases := map[string]struct {
74+
MXJob *MXJob
75+
wantErr bool
76+
}{
77+
"valid mxJob": {
78+
MXJob: &MXJob{
79+
ObjectMeta: metav1.ObjectMeta{
80+
Name: "test",
81+
},
82+
Spec: MXJobSpec{
83+
MXReplicaSpecs: validMXReplicaSpecs,
84+
},
85+
},
86+
wantErr: false,
87+
},
88+
"mxReplicaSpecs is nil": {
89+
MXJob: &MXJob{
90+
ObjectMeta: metav1.ObjectMeta{
91+
Name: "test",
3792
},
3893
},
94+
wantErr: true,
3995
},
40-
{
41-
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
42-
MXJobReplicaTypeWorker: &commonv1.ReplicaSpec{
43-
Template: v1.PodTemplateSpec{
44-
Spec: v1.PodSpec{
45-
Containers: []v1.Container{
46-
v1.Container{
47-
Image: "",
96+
"mxJob name does not meet DNS1035": {
97+
MXJob: &MXJob{
98+
ObjectMeta: metav1.ObjectMeta{
99+
Name: "10test",
100+
},
101+
Spec: MXJobSpec{
102+
MXReplicaSpecs: validMXReplicaSpecs,
103+
},
104+
},
105+
wantErr: true,
106+
},
107+
"no containers": {
108+
MXJob: &MXJob{
109+
ObjectMeta: metav1.ObjectMeta{
110+
Name: "test",
111+
},
112+
Spec: MXJobSpec{
113+
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
114+
MXJobReplicaTypeWorker: {
115+
Template: corev1.PodTemplateSpec{
116+
Spec: corev1.PodSpec{
117+
Containers: []corev1.Container{},
48118
},
49119
},
50120
},
51121
},
52122
},
53123
},
124+
wantErr: true,
54125
},
55-
{
56-
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
57-
MXJobReplicaTypeWorker: &commonv1.ReplicaSpec{
58-
Template: v1.PodTemplateSpec{
59-
Spec: v1.PodSpec{
60-
Containers: []v1.Container{
61-
v1.Container{
62-
Name: "",
63-
Image: "mxjob/mxnet:gpu",
126+
"image is empty": {
127+
MXJob: &MXJob{
128+
ObjectMeta: metav1.ObjectMeta{
129+
Name: "test",
130+
},
131+
Spec: MXJobSpec{
132+
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
133+
MXJobReplicaTypeWorker: {
134+
Template: corev1.PodTemplateSpec{
135+
Spec: corev1.PodSpec{
136+
Containers: []corev1.Container{{
137+
Name: "mxnet",
138+
Image: "",
139+
}},
64140
},
65141
},
66142
},
67143
},
68144
},
69145
},
146+
wantErr: true,
70147
},
71-
{
72-
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
73-
MXJobReplicaTypeScheduler: &commonv1.ReplicaSpec{
74-
Template: v1.PodTemplateSpec{
75-
Spec: v1.PodSpec{
76-
Containers: []v1.Container{},
148+
"mxnet default container name doesn't find": {
149+
MXJob: &MXJob{
150+
ObjectMeta: metav1.ObjectMeta{
151+
Name: "test",
152+
},
153+
Spec: MXJobSpec{
154+
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
155+
MXJobReplicaTypeWorker: {
156+
Template: corev1.PodTemplateSpec{
157+
Spec: corev1.PodSpec{
158+
Containers: []corev1.Container{{
159+
Name: "",
160+
Image: "mxjob/mxnet:gpu",
161+
}},
162+
},
163+
},
77164
},
78165
},
79166
},
80167
},
168+
wantErr: true,
169+
},
170+
"replicaSpec is nil": {
171+
MXJob: &MXJob{
172+
ObjectMeta: metav1.ObjectMeta{
173+
Name: "test",
174+
},
175+
Spec: MXJobSpec{
176+
MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{
177+
MXJobReplicaTypeScheduler: nil,
178+
},
179+
},
180+
},
181+
wantErr: true,
81182
},
82183
}
83-
for _, c := range testCases {
84-
err := ValidateV1MXJobSpec(&c)
85-
if err.Error() != "MXJobSpec is not valid" {
86-
t.Error("Failed validate the alpha2.MXJobSpec")
87-
}
184+
185+
for name, tc := range testCases {
186+
t.Run(name, func(t *testing.T) {
187+
got := ValidateV1MXJob(tc.MXJob)
188+
if (got != nil) != tc.wantErr {
189+
t.Fatalf("ValidateV1MXJob() error = %v, wantErr %v", got, tc.wantErr)
190+
}
191+
})
88192
}
89193
}

pkg/apis/kubeflow.org/v1/paddlepaddle_validation.go

+14-4
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,24 @@ import (
1818
"fmt"
1919

2020
commonv1 "github.com/kubeflow/common/pkg/apis/common/v1"
21+
apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation"
2122
)
2223

23-
func ValidateV1PaddleJobSpec(c *PaddleJobSpec) error {
24-
if c.PaddleReplicaSpecs == nil {
24+
func ValidateV1PaddleJob(paddleJob *PaddleJob) error {
25+
if errors := apimachineryvalidation.NameIsDNS1035Label(paddleJob.ObjectMeta.Name, false); errors != nil {
26+
return fmt.Errorf("PaddleJob name is invalid: %v", errors)
27+
}
28+
if err := validatePaddleReplicaSpecs(paddleJob.Spec.PaddleReplicaSpecs); err != nil {
29+
return err
30+
}
31+
return nil
32+
}
33+
34+
func validatePaddleReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error {
35+
if specs == nil {
2536
return fmt.Errorf("PaddleJobSpec is not valid")
2637
}
27-
for rType, value := range c.PaddleReplicaSpecs {
38+
for rType, value := range specs {
2839
if value == nil || len(value.Template.Spec.Containers) == 0 {
2940
return fmt.Errorf("PaddleJobSpec is not valid: containers definition expected in %v", rType)
3041
}
@@ -63,5 +74,4 @@ func ValidateV1PaddleJobSpec(c *PaddleJobSpec) error {
6374
}
6475

6576
return nil
66-
6777
}

0 commit comments

Comments
 (0)