Skip to content

Commit 24ae784

Browse files
committed
Add initial Kueue integration
Signed-off-by: ted chang <[email protected]>
1 parent b2bec12 commit 24ae784

15 files changed

+1793
-1689
lines changed

api/lmes/v1alpha1/lmevaljob_types.go

+2
Original file line number | Diff line number | Diff line change
@@ -236,6 +236,8 @@ type LMEvalJobSpec struct {
236236
// Specify extra information for the lm-eval job's pod
237237
// +optional
238238
Pod *LMEvalPodSpec `json:"pod,omitempty"`
239+
// This is for Kueue integration. Kueue requires this value to be false initially.
240+
Suspend bool `json:"suspend,omitempty"`
239241
}
240242

241243
// LMEvalJobStatus defines the observed state of LMEvalJob

api/lmes/v1alpha1/zz_generated.deepcopy.go

-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/tas/v1alpha1/zz_generated.deepcopy.go

-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/operator/main.go

+2-3
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ import (
3636
ctrl "sigs.k8s.io/controller-runtime"
3737
"sigs.k8s.io/controller-runtime/pkg/healthz"
3838
"sigs.k8s.io/controller-runtime/pkg/log/zap"
39+
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
3940

4041
lmesv1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/lmes/v1alpha1"
4142
tasv1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/tas/v1alpha1"
@@ -89,12 +90,10 @@ func main() {
8990

9091
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
9192
Scheme: scheme,
92-
MetricsBindAddress: metricsAddr,
93-
Port: 9443,
93+
Metrics: server.Options{BindAddress: metricsAddr + ":9443"},
9494
HealthProbeBindAddress: probeAddr,
9595
LeaderElection: enableLeaderElection,
9696
LeaderElectionID: "b7e9931f.trustyai.opendatahub.io",
97-
Namespace: "", // We are defining a cluster-scoped operator
9897
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
9998
// when the Manager ends. This requires the binary to immediately end when the
10099
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly

config/crd/bases/trustyai.opendatahub.io_lmevaljobs.yaml

+1,451-1,336
Large diffs are not rendered by default.

config/crd/bases/trustyai.opendatahub.io_trustyaiservices.yaml

+12-8
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
33
kind: CustomResourceDefinition
44
metadata:
55
annotations:
6-
controller-gen.kubebuilder.io/version: v0.11.1
7-
creationTimestamp: null
6+
controller-gen.kubebuilder.io/version: v0.16.3
87
name: trustyaiservices.trustyai.opendatahub.io
98
spec:
109
group: trustyai.opendatahub.io
@@ -21,14 +20,19 @@ spec:
2120
description: TrustyAIService is the Schema for the trustyaiservices API
2221
properties:
2322
apiVersion:
24-
description: 'APIVersion defines the versioned schema of this representation
25-
of an object. Servers should convert recognized schemas to the latest
26-
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
23+
description: |-
24+
APIVersion defines the versioned schema of this representation of an object.
25+
Servers should convert recognized schemas to the latest internal value, and
26+
may reject unrecognized values.
27+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
2728
type: string
2829
kind:
29-
description: 'Kind is a string value representing the REST resource this
30-
object represents. Servers may infer this from the endpoint the client
31-
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
30+
description: |-
31+
Kind is a string value representing the REST resource this object represents.
32+
Servers may infer this from the endpoint the client submits requests to.
33+
Cannot be updated.
34+
In CamelCase.
35+
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
3236
type: string
3337
metadata:
3438
type: object

config/rbac/role.yaml

+31-55
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,16 @@
22
apiVersion: rbac.authorization.k8s.io/v1
33
kind: ClusterRole
44
metadata:
5-
creationTimestamp: null
65
name: manager-role
76
rules:
87
- apiGroups:
98
- ""
109
resources:
1110
- configmaps
11+
- persistentvolumeclaims
12+
- pods
13+
- secrets
14+
- services
1215
verbs:
1316
- create
1417
- delete
@@ -25,29 +28,14 @@ rules:
2528
- create
2629
- patch
2730
- update
28-
- apiGroups:
29-
- ""
30-
resources:
31-
- pods
32-
verbs:
33-
- create
34-
- delete
35-
- get
36-
- list
37-
- patch
38-
- update
3931
- watch
4032
- apiGroups:
4133
- ""
4234
resources:
43-
- secrets
35+
- persistentvolumes
4436
verbs:
45-
- create
46-
- delete
4737
- get
4838
- list
49-
- patch
50-
- update
5139
- watch
5240
- apiGroups:
5341
- ""
@@ -95,9 +83,18 @@ rules:
9583
- get
9684
- update
9785
- apiGroups:
98-
- ""
86+
- kueue.x-k8s.io
9987
resources:
100-
- persistentvolumeclaims
88+
- resourceflavors
89+
- workloadpriorityclasses
90+
verbs:
91+
- get
92+
- list
93+
- watch
94+
- apiGroups:
95+
- kueue.x-k8s.io
96+
resources:
97+
- workloads
10198
verbs:
10299
- create
103100
- delete
@@ -107,25 +104,19 @@ rules:
107104
- update
108105
- watch
109106
- apiGroups:
110-
- ""
107+
- kueue.x-k8s.io
111108
resources:
112-
- persistentvolumes
109+
- workloads/finalizers
113110
verbs:
114-
- get
115-
- list
116-
- watch
111+
- update
117112
- apiGroups:
118-
- ""
113+
- kueue.x-k8s.io
119114
resources:
120-
- services
115+
- workloads/status
121116
verbs:
122-
- create
123-
- delete
124117
- get
125-
- list
126118
- patch
127119
- update
128-
- watch
129120
- apiGroups:
130121
- monitoring.coreos.com
131122
resources:
@@ -158,21 +149,18 @@ rules:
158149
- update
159150
- watch
160151
- apiGroups:
161-
- serving.kserve.io
152+
- scheduling.k8s.io
162153
resources:
163-
- inferenceservices
154+
- priorityclasses
164155
verbs:
165156
- get
166157
- list
167-
- patch
168-
- update
169158
- watch
170159
- apiGroups:
171160
- serving.kserve.io
172161
resources:
173-
- inferenceservices/finalizers
162+
- inferenceservices
174163
verbs:
175-
- delete
176164
- get
177165
- list
178166
- patch
@@ -181,9 +169,8 @@ rules:
181169
- apiGroups:
182170
- serving.kserve.io
183171
resources:
184-
- servingruntimes
172+
- inferenceservices/finalizers
185173
verbs:
186-
- create
187174
- delete
188175
- get
189176
- list
@@ -193,15 +180,7 @@ rules:
193180
- apiGroups:
194181
- serving.kserve.io
195182
resources:
196-
- servingruntimes/status
197-
verbs:
198-
- get
199-
- patch
200-
- update
201-
- apiGroups:
202-
- trustyai.opendatahub.io
203-
resources:
204-
- lmevaljobs
183+
- servingruntimes
205184
verbs:
206185
- create
207186
- delete
@@ -211,22 +190,17 @@ rules:
211190
- update
212191
- watch
213192
- apiGroups:
214-
- trustyai.opendatahub.io
215-
resources:
216-
- lmevaljobs/finalizers
217-
verbs:
218-
- update
219-
- apiGroups:
220-
- trustyai.opendatahub.io
193+
- serving.kserve.io
221194
resources:
222-
- lmevaljobs/status
195+
- servingruntimes/status
223196
verbs:
224197
- get
225198
- patch
226199
- update
227200
- apiGroups:
228201
- trustyai.opendatahub.io
229202
resources:
203+
- lmevaljobs
230204
- trustyaiservices
231205
verbs:
232206
- create
@@ -239,12 +213,14 @@ rules:
239213
- apiGroups:
240214
- trustyai.opendatahub.io
241215
resources:
216+
- lmevaljobs/finalizers
242217
- trustyaiservices/finalizers
243218
verbs:
244219
- update
245220
- apiGroups:
246221
- trustyai.opendatahub.io
247222
resources:
223+
- lmevaljobs/status
248224
- trustyaiservices/status
249225
verbs:
250226
- get

0 commit comments

Comments
 (0)