Skip to content

Commit 30089eb

Browse files
committed
Enable job suspend for Kueue
Signed-off-by: ted chang <[email protected]>
1 parent ab6bc98 commit 30089eb

File tree

7 files changed

+1353
-1403
lines changed

7 files changed

+1353
-1403
lines changed

api/lmes/v1alpha1/lmevaljob_types.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import (
2828
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
2929

3030
// Represent a job's status
31-
// +kubebuilder:validation:Enum=New;Scheduled;Running;Complete;Cancelled
31+
// +kubebuilder:validation:Enum=New;Scheduled;Running;Complete;Cancelled;Suspended
3232
type JobState string
3333

3434
const (
@@ -42,6 +42,8 @@ const (
4242
CompleteJobState JobState = "Complete"
4343
// The job is cancelled
4444
CancelledJobState JobState = "Cancelled"
45+
// The job is suspended
46+
SuspendedJobState JobState = "Suspended"
4547
)
4648

4749
// +kubebuilder:validation:Enum=NoReason;Succeeded;Failed;Cancelled
@@ -236,6 +238,8 @@ type LMEvalJobSpec struct {
236238
// Specify extra information for the lm-eval job's pod
237239
// +optional
238240
Pod *LMEvalPodSpec `json:"pod,omitempty"`
241+
// Suspend, when true, keeps the LMEvalJob resource but runs no pod for it. It is intended for use by the Kueue integration.
242+
Suspend bool `json:"suspend,omitempty"`
239243
}
240244

241245
// LMEvalJobStatus defines the observed state of LMEvalJob

api/lmes/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/tas/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml

Lines changed: 1285 additions & 1317 deletions
Large diffs are not rendered by default.

config/crd/bases/trustyai.opendatahub.io_trustyaiservices.yaml

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
33
kind: CustomResourceDefinition
44
metadata:
55
annotations:
6-
controller-gen.kubebuilder.io/version: v0.11.1
7-
creationTimestamp: null
6+
controller-gen.kubebuilder.io/version: v0.16.3
87
name: trustyaiservices.trustyai.opendatahub.io
98
spec:
109
group: trustyai.opendatahub.io
@@ -21,14 +20,19 @@ spec:
2120
description: TrustyAIService is the Schema for the trustyaiservices API
2221
properties:
2322
apiVersion:
24-
description: 'APIVersion defines the versioned schema of this representation
25-
of an object. Servers should convert recognized schemas to the latest
26-
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
23+
description: |-
24+
APIVersion defines the versioned schema of this representation of an object.
25+
Servers should convert recognized schemas to the latest internal value, and
26+
may reject unrecognized values.
27+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
2728
type: string
2829
kind:
29-
description: 'Kind is a string value representing the REST resource this
30-
object represents. Servers may infer this from the endpoint the client
31-
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
30+
description: |-
31+
Kind is a string value representing the REST resource this object represents.
32+
Servers may infer this from the endpoint the client submits requests to.
33+
Cannot be updated.
34+
In CamelCase.
35+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
3236
type: string
3337
metadata:
3438
type: object

config/rbac/role.yaml

Lines changed: 7 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
apiVersion: rbac.authorization.k8s.io/v1
33
kind: ClusterRole
44
metadata:
5-
creationTimestamp: null
65
name: manager-role
76
rules:
87
- apiGroups:
98
- ""
109
resources:
1110
- configmaps
11+
- persistentvolumeclaims
12+
- pods
13+
- secrets
14+
- services
1215
verbs:
1316
- create
1417
- delete
@@ -28,14 +31,10 @@ rules:
2831
- apiGroups:
2932
- ""
3033
resources:
31-
- pods
34+
- persistentvolumes
3235
verbs:
33-
- create
34-
- delete
3536
- get
3637
- list
37-
- patch
38-
- update
3938
- watch
4039
- apiGroups:
4140
- ""
@@ -47,18 +46,6 @@ rules:
4746
- get
4847
- list
4948
- watch
50-
- apiGroups:
51-
- ""
52-
resources:
53-
- secrets
54-
verbs:
55-
- create
56-
- delete
57-
- get
58-
- list
59-
- patch
60-
- update
61-
- watch
6249
- apiGroups:
6350
- ""
6451
resources:
@@ -104,38 +91,6 @@ rules:
10491
- create
10592
- get
10693
- update
107-
- apiGroups:
108-
- ""
109-
resources:
110-
- persistentvolumeclaims
111-
verbs:
112-
- create
113-
- delete
114-
- get
115-
- list
116-
- patch
117-
- update
118-
- watch
119-
- apiGroups:
120-
- ""
121-
resources:
122-
- persistentvolumes
123-
verbs:
124-
- get
125-
- list
126-
- watch
127-
- apiGroups:
128-
- ""
129-
resources:
130-
- services
131-
verbs:
132-
- create
133-
- delete
134-
- get
135-
- list
136-
- patch
137-
- update
138-
- watch
13994
- apiGroups:
14095
- monitoring.coreos.com
14196
resources:
@@ -212,31 +167,6 @@ rules:
212167
- trustyai.opendatahub.io
213168
resources:
214169
- lmevaljobs
215-
verbs:
216-
- create
217-
- delete
218-
- get
219-
- list
220-
- patch
221-
- update
222-
- watch
223-
- apiGroups:
224-
- trustyai.opendatahub.io
225-
resources:
226-
- lmevaljobs/finalizers
227-
verbs:
228-
- update
229-
- apiGroups:
230-
- trustyai.opendatahub.io
231-
resources:
232-
- lmevaljobs/status
233-
verbs:
234-
- get
235-
- patch
236-
- update
237-
- apiGroups:
238-
- trustyai.opendatahub.io
239-
resources:
240170
- trustyaiservices
241171
verbs:
242172
- create
@@ -249,12 +179,14 @@ rules:
249179
- apiGroups:
250180
- trustyai.opendatahub.io
251181
resources:
182+
- lmevaljobs/finalizers
252183
- trustyaiservices/finalizers
253184
verbs:
254185
- update
255186
- apiGroups:
256187
- trustyai.opendatahub.io
257188
resources:
189+
- lmevaljobs/status
258190
- trustyaiservices/status
259191
verbs:
260192
- get

controllers/lmes/lmevaljob_controller.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ func (r *LMEvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
181181
job.Status.State = lmesv1alpha1.NewJobState
182182
}
183183

184+
if job.Spec.Suspend {
185+
r.handleSuspend(ctx, log, job)
186+
}
187+
184188
// Handle the job based on its state
185189
switch job.Status.State {
186190
case lmesv1alpha1.NewJobState:
@@ -198,6 +202,11 @@ func (r *LMEvalJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
198202
return r.handleComplete(ctx, log, job)
199203
case lmesv1alpha1.CancelledJobState:
200204
return r.handleCancel(ctx, log, job)
205+
case lmesv1alpha1.SuspendedJobState:
206+
if !job.Spec.Suspend {
207+
return r.handleResume(ctx, log, job)
208+
}
209+
return ctrl.Result{}, nil
201210
}
202211

203212
return ctrl.Result{}, nil
@@ -634,6 +643,41 @@ func (r *LMEvalJobReconciler) handleCancel(ctx context.Context, log logr.Logger,
634643
return ctrl.Result{}, err
635644
}
636645

646+
func (r *LMEvalJobReconciler) handleSuspend(ctx context.Context, log logr.Logger, job *lmesv1alpha1.LMEvalJob) (ctrl.Result, error) {
647+
defer r.pullingJobs.remove(string(job.GetUID()))
648+
if job.Status.State != lmesv1alpha1.NewJobState {
649+
log.Info("Suspend job")
650+
if err := r.deleteJobPod(ctx, job); err != nil && client.IgnoreNotFound(err) != nil {
651+
log.Error(err, "failed to delete pod for suspended job")
652+
return ctrl.Result{Requeue: true, RequeueAfter: r.options.PodCheckingInterval}, err
653+
}
654+
} else {
655+
log.Info("Create job in suspend state.")
656+
}
657+
job.Status.State = lmesv1alpha1.SuspendedJobState
658+
err := r.Status().Update(ctx, job)
659+
if err != nil {
660+
log.Error(err, "failed to update job status to suspended")
661+
}
662+
663+
return ctrl.Result{}, err
664+
}
665+
666+
func (r *LMEvalJobReconciler) handleResume(ctx context.Context, log logr.Logger, job *lmesv1alpha1.LMEvalJob) (ctrl.Result, error) {
667+
log.Info("Resume job")
668+
pod := r.createPod(job, log)
669+
if err := r.Create(ctx, pod); err != nil {
670+
log.Error(err, "failed to create pod to resume job")
671+
return ctrl.Result{Requeue: true, RequeueAfter: r.options.PodCheckingInterval}, err
672+
}
673+
job.Status.State = lmesv1alpha1.ScheduledJobState
674+
err := r.Status().Update(ctx, job)
675+
if err != nil {
676+
log.Error(err, "failed to update job status to scheduled")
677+
}
678+
return ctrl.Result{}, err
679+
}
680+
637681
func (r *LMEvalJobReconciler) validateCustomCard(job *lmesv1alpha1.LMEvalJob, log logr.Logger) error {
638682
if job.Spec.TaskList.TaskRecipes == nil {
639683
return nil

0 commit comments

Comments
 (0)