|
1 | 1 | ## Instructions to test the controller on Kind locally
|
2 |
| - |
| 2 | +When Job Manager is an enabled service, an LMEvalJob requires the `kueue.x-k8s.io/queue-name` label. |
3 | 3 | 1. Setup Kind
|
4 | 4 |
|
5 | 5 | Create a kind cluster with 3 nodes
|
|
105 | 105 | apiVersion: v1
|
106 | 106 | kind: ConfigMap
|
107 | 107 | metadata:
|
108 |
| - name: trustyai-service-operator-config |
109 |
| - labels: |
| 108 | + name: trustyai-service-operator-config |
| 109 | + labels: |
110 | 110 | app.kubernetes.io/part-of: trustyai
|
111 |
| - annotations: |
| 111 | + annotations: |
112 | 112 | internal.config.kubernetes.io/generatorBehavior: unspecified
|
113 | 113 | internal.config.kubernetes.io/prefixes: trustyai-service-operator-
|
114 | 114 | internal.config.kubernetes.io/previousKinds: ConfigMap,ConfigMap
|
115 | 115 | internal.config.kubernetes.io/previousNames: config,trustyai-service-operator-config
|
116 | 116 | internal.config.kubernetes.io/previousNamespaces: default,default
|
117 | 117 | data:
|
118 |
| - kServeServerless: disabled |
119 |
| - lmes-default-batch-size: "8" |
120 |
| - lmes-driver-image: quay.io/yhwang/ta-lmes-driver:latest |
121 |
| - lmes-grpc-port: "8082" |
122 |
| - lmes-grpc-service: lmes-grpc |
123 |
| - lmes-image-pull-policy: Always |
124 |
| - lmes-max-batch-size: "24" |
125 |
| - lmes-pod-checking-interval: 10s |
126 |
| - lmes-pod-image: quay.io/tedchang/ta-lmes-job:latest |
127 |
| - oauthProxyImage: quay.io/openshift/origin-oauth-proxy:4.14.0 |
128 |
| - trustyaiOperatorImage: quay.io/tedchang/trustyai-service-operator:latest |
129 |
| - trustyaiServiceImage: quay.io/trustyai/trustyai-service:latest |
| 118 | + kServeServerless: disabled |
| 119 | + lmes-default-batch-size: "8" |
| 120 | + lmes-driver-image: quay.io/yhwang/ta-lmes-driver:latest |
| 121 | + lmes-image-pull-policy: Always |
| 122 | + lmes-max-batch-size: "24" |
| 123 | + lmes-pod-checking-interval: 10s |
| 124 | + lmes-pod-image: quay.io/tedchang/ta-lmes-job:latest |
| 125 | + oauthProxyImage: quay.io/openshift/origin-oauth-proxy:4.14.0 |
| 126 | + trustyaiOperatorImage: quay.io/tedchang/trustyai-service-operator:latest |
| 127 | + trustyaiServiceImage: quay.io/trustyai/trustyai-service:latest |
130 | 128 | EOF
|
131 | 129 | ```
|
132 | 130 | Start the controller locally:
|
|
138 | 136 | INFO Starting workers {"controller": "lmevaljob", "controllerGroup": "trustyai.opendatahub.io", "controllerKind": "LMEvalJob", "worker count": 1}
|
139 | 137 | INFO Starting workers {"controller": "LMEvalJobWorkload", "controllerGroup": "trustyai.opendatahub.io", "controllerKind": "LMEvalJob", "worker count": 1}
|
140 | 138 | ```
|
141 |
| -1. Create 5 jobs. |
| 139 | +1. Quota and Node Affinity example. We will create 5 jobs. |
142 | 140 |
|
143 |
| - Jobs labeled with `user-queue` will be run on `kueue-worker` node. Job labeled with `user-queue-2` will be run on `kueue-worker2` node. |
| 141 | + Jobs labeled with `user-queue` will be run on `kueue-worker` node. |
| 142 | + Jobs labeled with `user-queue-2` will be run on `kueue-worker2` node. |
| 143 | + A job will be Suspended if there is not enough quota. |
144 | 144 |
|
145 | 145 | Run 3 times.
|
146 | 146 | ```bash
|
147 |
| - cat <<EOF | kubectl apply -f - |
| 147 | + cat <<EOF | kubectl create -f - |
| 148 | + apiVersion: trustyai.opendatahub.io/v1alpha1 |
148 | 149 | kind: LMEvalJob
|
149 | 150 | metadata:
|
150 |
| - labels: |
| 151 | + labels: |
151 | 152 | app.kubernetes.io/name: fms-lm-eval-service
|
152 | 153 | app.kubernetes.io/managed-by: kustomize
|
153 | 154 | kueue.x-k8s.io/queue-name: user-queue
|
154 |
| - generateName: evaljob-sample- |
155 |
| - namespace: default |
| 155 | + generateName: evaljob-sample- |
| 156 | + namespace: default |
156 | 157 | spec:
|
157 |
| - pod: |
| 158 | + pod: |
158 | 159 | container:
|
159 |
| - resources: |
| 160 | + resources: |
160 | 161 | requests:
|
161 |
| - cpu: 2 |
162 |
| - suspend: true |
163 |
| - model: hf |
164 |
| - modelArgs: |
165 |
| - - name: pretrained |
| 162 | + cpu: 2 |
| 163 | + suspend: true |
| 164 | + model: hf |
| 165 | + modelArgs: |
| 166 | + - name: pretrained |
166 | 167 | value: EleutherAI/pythia-70m
|
167 |
| - taskList: |
| 168 | + taskList: |
168 | 169 | taskNames:
|
169 | 170 | - unfair_tos
|
170 |
| - logSamples: true |
171 |
| - limit: "5" |
| 171 | + logSamples: true |
| 172 | + limit: "5" |
172 | 173 | EOF
|
173 | 174 | ```
|
174 | 175 |
|
175 | 176 | Run 2 times.
|
176 | 177 | ```bash
|
177 |
| - cat <<EOF | kubectl apply -f - |
| 178 | + cat <<EOF | kubectl create -f - |
| 179 | + apiVersion: trustyai.opendatahub.io/v1alpha1 |
178 | 180 | kind: LMEvalJob
|
179 | 181 | metadata:
|
180 |
| - labels: |
| 182 | + labels: |
181 | 183 | app.kubernetes.io/name: fms-lm-eval-service
|
182 | 184 | app.kubernetes.io/managed-by: kustomize
|
183 | 185 | kueue.x-k8s.io/queue-name: user-queue-2
|
184 |
| - generateName: evaljob-sample- |
185 |
| - namespace: default |
| 186 | + generateName: evaljob-sample- |
| 187 | + namespace: default |
186 | 188 | spec:
|
187 |
| - pod: |
| 189 | + pod: |
188 | 190 | container:
|
189 |
| - resources: |
| 191 | + resources: |
190 | 192 | requests:
|
191 |
| - cpu: 2 |
192 |
| - suspend: true |
193 |
| - model: hf |
194 |
| - modelArgs: |
195 |
| - - name: pretrained |
| 193 | + cpu: 2 |
| 194 | + suspend: true |
| 195 | + model: hf |
| 196 | + modelArgs: |
| 197 | + - name: pretrained |
196 | 198 | value: EleutherAI/pythia-70m
|
197 |
| - taskList: |
| 199 | + taskList: |
198 | 200 | taskNames:
|
199 | 201 | - unfair_tos
|
200 |
| - logSamples: true |
201 |
| - limit: "5" |
| 202 | + logSamples: true |
| 203 | + limit: "5" |
202 | 204 | EOF
|
203 | 205 | ```
|
204 | 206 |
|
|
216 | 218 | lmevaljob.trustyai.opendatahub.io/evaljob-sample-d2jtx Running
|
217 | 219 | lmevaljob.trustyai.opendatahub.io/evaljob-sample-dpr2q Running
|
218 | 220 |
|
219 |
| - # Each lmevaljob is represented by a Kueue Workerload resource. A Workload is only ADMITTED when there is enough quota in a Queue. In our example, user-queue has 4 cpu quota. We created 3 jobs each requests 2 cpu; therefore only 2 jobs can be admitted to user-queue. |
| 221 | + # Each lmevaljob is represented by a Kueue Workload resource. A Workload is only ADMITTED when there is enough quota in a Queue. In our example, user-queue has a 4 cpu quota. We created 3 jobs, each requesting 2 cpu; therefore only 2 jobs can be admitted to user-queue. |
220 | 222 |
|
221 | 223 | NAME QUEUE RESERVED IN ADMITTED FINISHED AGE
|
222 | 224 | workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-2zwb4-74b05 user-queue 71s
|
|
232 | 234 | pod/evaljob-sample-6gh6f 1/1 Running 0 82s 10.244.1.26 kueue-worker <none> <none>
|
233 | 235 | pod/evaljob-sample-d2jtx 1/1 Running 0 13s 10.244.2.38 kueue-worker2 <none> <none>
|
234 | 236 | pod/evaljob-sample-dpr2q 1/1 Running 0 16s 10.244.2.37 kueue-worker2 <none> <none>
|
235 |
| - ``` |
| 237 | + ``` |
| 238 | +
|
| 239 | +1. Preemption example: |
| 240 | + |
| 241 | + Clean up jobs |
| 242 | + ``` |
| 243 | + kubectl delete lmevaljob $(kubectl get lmevaljob|grep evaljob-sample-|cut -d" " -f1) |
| 244 | + ``` |
| 245 | +
|
| 246 | + Create a new ClusterQueue, a LocalQueue, and 2 WorkloadPriorityClasses (low and high). |
| 247 | + |
| 248 | + ```bash |
| 249 | + cat <<EOF | kubectl apply -f - |
| 250 | + apiVersion: kueue.x-k8s.io/v1beta1 |
| 251 | + kind: ClusterQueue |
| 252 | + metadata: |
| 253 | + name: "cluster-queue-3" |
| 254 | + spec: |
| 255 | + namespaceSelector: {} # match all. |
| 256 | + resourceGroups: |
| 257 | + - coveredResources: ["cpu", "memory"] |
| 258 | + flavors: |
| 259 | + - name: "default-flavor" |
| 260 | + resources: |
| 261 | + - name: "cpu" |
| 262 | + nominalQuota: 4 |
| 263 | + - name: "memory" |
| 264 | + nominalQuota: 88Gi |
| 265 | + - name: "default-flavor-2" |
| 266 | + resources: |
| 267 | + - name: "cpu" |
| 268 | + nominalQuota: 4 |
| 269 | + - name: "memory" |
| 270 | + nominalQuota: 88Gi |
| 271 | + preemption: |
| 272 | + withinClusterQueue: LowerPriority |
| 273 | + --- |
| 274 | + apiVersion: kueue.x-k8s.io/v1beta1 |
| 275 | + kind: LocalQueue |
| 276 | + metadata: |
| 277 | + namespace: "default" |
| 278 | + name: "user-queue-3" |
| 279 | + spec: |
| 280 | + clusterQueue: "cluster-queue-3" |
| 281 | + --- |
| 282 | + apiVersion: kueue.x-k8s.io/v1beta1 |
| 283 | + kind: WorkloadPriorityClass |
| 284 | + metadata: |
| 285 | + name: low-priority |
| 286 | + value: 10 |
| 287 | + description: "10 is lower priority" |
| 288 | + --- |
| 289 | + apiVersion: kueue.x-k8s.io/v1beta1 |
| 290 | + kind: WorkloadPriorityClass |
| 291 | + metadata: |
| 292 | + name: high-priority |
| 293 | + value: 10000 |
| 294 | + description: "10000 is higher priority" |
| 295 | + EOF |
| 296 | + ``` |
| 297 | +
|
| 298 | + Create 4 low priority jobs. |
| 299 | + Run 4 times. |
| 300 | + ```bash |
| 301 | + cat << EOF| kubectl create -f - |
| 302 | + apiVersion: trustyai.opendatahub.io/v1alpha1 |
| 303 | + kind: LMEvalJob |
| 304 | + metadata: |
| 305 | + labels: |
| 306 | + app.kubernetes.io/name: fms-lm-eval-service |
| 307 | + app.kubernetes.io/managed-by: kustomize |
| 308 | + kueue.x-k8s.io/queue-name: user-queue-3 |
| 309 | + kueue.x-k8s.io/priority-class: low-priority |
| 310 | + generateName: evaljob-sample- |
| 311 | + namespace: default |
| 312 | + spec: |
| 313 | + pod: |
| 314 | + container: |
| 315 | + resources: |
| 316 | + requests: |
| 317 | + cpu: 2 |
| 318 | + suspend: true |
| 319 | + model: hf |
| 320 | + modelArgs: |
| 321 | + - name: pretrained |
| 322 | + value: EleutherAI/pythia-70m |
| 323 | + taskList: |
| 324 | + taskNames: |
| 325 | + - unfair_tos |
| 326 | + logSamples: true |
| 327 | + limit: "5" |
| 328 | + EOF |
| 329 | + ``` |
| 330 | +
|
| 331 | + Verify they are in running state: |
| 332 | + ``` |
| 333 | + NAME STATE |
| 334 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-8cr8k Running |
| 335 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-n5s9d Running |
| 336 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-wnm2q Running |
| 337 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-xck8c Running |
| 338 | +
|
| 339 | + NAME QUEUE RESERVED IN ADMITTED FINISHED AGE |
| 340 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-8cr8k-34feb user-queue-3 cluster-queue-3 True 22s |
| 341 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-n5s9d-1daba user-queue-3 cluster-queue-3 True 20s |
| 342 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-wnm2q-52093 user-queue-3 cluster-queue-3 True 21s |
| 343 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-xck8c-44e13 user-queue-3 cluster-queue-3 True 23s |
| 344 | +
|
| 345 | + NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES |
| 346 | + pod/evaljob-sample-8cr8k 1/1 Running 0 22s 10.244.1.17 kueue-worker <none> <none> |
| 347 | + pod/evaljob-sample-n5s9d 1/1 Running 0 20s 10.244.2.11 kueue-worker2 <none> <none> |
| 348 | + pod/evaljob-sample-wnm2q 1/1 Running 0 21s 10.244.2.10 kueue-worker2 <none> <none> |
| 349 | + pod/evaljob-sample-xck8c 1/1 Running 0 23s 10.244.1.16 kueue-worker <none> <none> |
| 350 | + ``` |
| 351 | +
|
| 352 | +
|
| 353 | + Create 1 high priority job |
| 354 | + ```bash |
| 355 | + cat << EOF| kubectl create -f - |
| 356 | + apiVersion: trustyai.opendatahub.io/v1alpha1 |
| 357 | + kind: LMEvalJob |
| 358 | + metadata: |
| 359 | + labels: |
| 360 | + app.kubernetes.io/name: fms-lm-eval-service |
| 361 | + app.kubernetes.io/managed-by: kustomize |
| 362 | + kueue.x-k8s.io/queue-name: user-queue-3 |
| 363 | + kueue.x-k8s.io/priority-class: high-priority |
| 364 | + generateName: evaljob-sample- |
| 365 | + namespace: default |
| 366 | + spec: |
| 367 | + pod: |
| 368 | + container: |
| 369 | + resources: |
| 370 | + requests: |
| 371 | + cpu: 2 |
| 372 | + suspend: true |
| 373 | + model: hf |
| 374 | + modelArgs: |
| 375 | + - name: pretrained |
| 376 | + value: EleutherAI/pythia-70m |
| 377 | + taskList: |
| 378 | + taskNames: |
| 379 | + - unfair_tos |
| 380 | + logSamples: true |
| 381 | + limit: "5" |
| 382 | + EOF |
| 383 | + ``` |
| 384 | +
|
| 385 | + A job labeled with low-priority will be preempted/evicted (Suspended) by the new job labeled with high-priority because the nominal cpu quota has been reached. |
| 386 | + ``` |
| 387 | + NAME STATE |
| 388 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-8cr8k Suspended |
| 389 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-mqj8j Running |
| 390 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-n5s9d Running |
| 391 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-wnm2q Running |
| 392 | + lmevaljob.trustyai.opendatahub.io/evaljob-sample-xck8c Running |
| 393 | +
|
| 394 | + NAME QUEUE RESERVED IN ADMITTED FINISHED AGE |
| 395 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-8cr8k-34feb user-queue-3 False 78s |
| 396 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-mqj8j-fdceb user-queue-3 cluster-queue-3 True 16s |
| 397 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-n5s9d-1daba user-queue-3 cluster-queue-3 True 76s |
| 398 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-wnm2q-52093 user-queue-3 cluster-queue-3 True 77s |
| 399 | + workload.kueue.x-k8s.io/lmevaljob-evaljob-sample-xck8c-44e13 user-queue-3 cluster-queue-3 True 79s |
| 400 | +
|
| 401 | + NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES |
| 402 | + pod/evaljob-sample-mqj8j 1/1 Running 0 15s 10.244.1.18 kueue-worker <none> <none> |
| 403 | + pod/evaljob-sample-n5s9d 1/1 Running 0 76s 10.244.2.11 kueue-worker2 <none> <none> |
| 404 | + pod/evaljob-sample-wnm2q 1/1 Running 0 77s 10.244.2.10 kueue-worker2 <none> <none> |
| 405 | + pod/evaljob-sample-xck8c 1/1 Running 0 79s 10.244.1.16 kueue-worker <none> <none> |
| 406 | + ``` |
0 commit comments