25
25
from kubeflow .training .constants import constants
26
26
from kubeflow .training .utils import utils
27
27
from kubeflow .storage_initializer .constants import (
28
- INIT_CONTAINER_MOUNT_PATH ,
29
28
VOLUME_PATH_DATASET ,
30
29
VOLUME_PATH_MODEL ,
31
30
)
32
31
32
+
33
33
logger = logging .getLogger (__name__ )
34
34
35
35
status_logger = utils .StatusLogger (
@@ -139,64 +139,50 @@ def train(
139
139
140
140
namespace = namespace or self .namespace
141
141
142
- if isinstance (resources_per_worker , dict ):
143
- if "gpu" in resources_per_worker :
144
- if resources_per_worker ["gpu" ] is not None and (
145
- num_procs_per_worker > resources_per_worker ["gpu" ]
146
- ):
147
- raise ValueError (
148
- "Insufficient gpu resources allocated to the container."
149
- )
150
- if resources_per_worker ["gpu" ] is not None :
151
- resources_per_worker ["nvidia.com/gpu" ] = resources_per_worker .pop (
152
- "gpu"
153
- )
154
-
155
- if (
156
- "cpu" not in resources_per_worker
157
- or "memory" not in resources_per_worker
158
- ):
159
- raise ValueError ("cpu and memory resources not specified" )
160
-
161
- resources_per_worker = client .V1ResourceRequirements (
162
- requests = resources_per_worker ,
163
- limits = resources_per_worker ,
164
- )
165
-
142
+ # TODO (andreyvelich): PVC Creation should be part of Training Operator Controller.
143
+ # Ref issue: https://github.com/kubeflow/training-operator/issues/1971
166
144
try :
167
145
self .core_api .create_namespaced_persistent_volume_claim (
168
146
namespace = namespace ,
169
147
body = utils .get_pvc_spec (
170
- pvc_name = constants .TRAINER_PVC_NAME ,
148
+ pvc_name = constants .STORAGE_INITIALIZER ,
171
149
namespace = namespace ,
172
- storage_size = storage_config ["size" ],
173
- storage_class = storage_config ["storage_class" ],
150
+ storage_config = storage_config ,
174
151
),
175
152
)
176
153
except Exception as e :
177
154
pvc_list = self .core_api .list_namespaced_persistent_volume_claim (namespace )
178
155
# Check if the PVC with the specified name exists
179
156
for pvc in pvc_list .items :
180
- if pvc .metadata .name == constants .TRAINER_PVC_NAME :
157
+ if pvc .metadata .name == constants .STORAGE_INITIALIZER :
181
158
print (
182
- f"PVC '{ constants .TRAINER_PVC_NAME } ' already exists in namespace '{ namespace } '."
159
+ f"PVC '{ constants .STORAGE_INITIALIZER } ' already exists in namespace "
160
+ f"{ namespace } ."
183
161
)
184
162
break
185
163
else :
186
164
raise RuntimeError ("failed to create pvc" )
187
165
188
166
if isinstance (model_provider_parameters , HuggingFaceModelParams ):
189
167
mp = "hf"
168
+ else :
169
+ raise ValueError (
170
+ f"Invalid model provider parameters { model_provider_parameters } "
171
+ )
190
172
191
173
if isinstance (dataset_provider_parameters , S3DatasetParams ):
192
174
dp = "s3"
193
175
elif isinstance (dataset_provider_parameters , HfDatasetParams ):
194
176
dp = "hf"
177
+ else :
178
+ raise ValueError (
179
+ f"Invalid dataset provider parameters { dataset_provider_parameters } "
180
+ )
195
181
196
182
# create init container spec
197
183
init_container_spec = utils .get_container_spec (
198
- name = constants .STORAGE_CONTAINER ,
199
- image = constants .STORAGE_CONTAINER_IMAGE ,
184
+ name = constants .STORAGE_INITIALIZER ,
185
+ base_image = constants .STORAGE_INITIALIZER_IMAGE ,
200
186
args = [
201
187
"--model_provider" ,
202
188
mp ,
@@ -207,18 +193,13 @@ def train(
207
193
"--dataset_provider_parameters" ,
208
194
json .dumps (dataset_provider_parameters .__dict__ ),
209
195
],
210
- volume_mounts = [
211
- models .V1VolumeMount (
212
- name = constants .TRAINER_PV ,
213
- mount_path = INIT_CONTAINER_MOUNT_PATH ,
214
- )
215
- ],
196
+ volume_mounts = [constants .STORAGE_INITIALIZER_VOLUME_MOUNT ],
216
197
)
217
198
218
199
# create app container spec
219
200
container_spec = utils .get_container_spec (
220
201
name = constants .JOB_PARAMETERS [constants .PYTORCHJOB_KIND ]["container" ],
221
- image = constants .TRAINER_TRANSFORMER_IMAGE ,
202
+ base_image = constants .TRAINER_TRANSFORMER_IMAGE ,
222
203
args = [
223
204
"--model_uri" ,
224
205
model_provider_parameters .model_uri ,
@@ -235,41 +216,22 @@ def train(
235
216
"--training_parameters" ,
236
217
json .dumps (train_parameters .training_parameters .to_dict ()),
237
218
],
238
- volume_mounts = [
239
- models .V1VolumeMount (
240
- name = constants .TRAINER_PV ,
241
- mount_path = INIT_CONTAINER_MOUNT_PATH ,
242
- )
243
- ],
219
+ volume_mounts = [constants .STORAGE_INITIALIZER_VOLUME_MOUNT ],
244
220
resources = resources_per_worker ,
245
221
)
246
222
247
223
# create worker pod spec
248
224
worker_pod_template_spec = utils .get_pod_template_spec (
249
- job_kind = constants .PYTORCHJOB_KIND ,
250
- containers_spec = [container_spec ],
251
- volumes_spec = [
252
- models .V1Volume (
253
- name = constants .TRAINER_PV ,
254
- persistent_volume_claim = models .V1PersistentVolumeClaimVolumeSource (
255
- claim_name = constants .TRAINER_PVC_NAME
256
- ),
257
- )
258
- ],
225
+ containers = [container_spec ],
226
+ init_containers = [init_container_spec ],
227
+ volumes = [constants .STORAGE_INITIALIZER_VOLUME ],
259
228
)
260
229
261
230
# create master pod spec
262
231
master_pod_template_spec = utils .get_pod_template_spec (
263
- job_kind = constants .PYTORCHJOB_KIND ,
264
- containers_spec = [init_container_spec , container_spec ],
265
- volumes_spec = [
266
- models .V1Volume (
267
- name = constants .TRAINER_PV ,
268
- persistent_volume_claim = models .V1PersistentVolumeClaimVolumeSource (
269
- claim_name = constants .TRAINER_PVC_NAME
270
- ),
271
- )
272
- ],
232
+ containers = [container_spec ],
233
+ init_containers = [init_container_spec ],
234
+ volumes = [constants .STORAGE_INITIALIZER_VOLUME ],
273
235
)
274
236
275
237
job = utils .get_pytorchjob_template (
@@ -293,6 +255,7 @@ def create_job(
293
255
train_func : Optional [Callable ] = None ,
294
256
parameters : Optional [Dict [str , Any ]] = None ,
295
257
num_workers : Optional [int ] = None ,
258
+ resources_per_worker : Union [dict , models .V1ResourceRequirements , None ] = None ,
296
259
num_chief_replicas : Optional [int ] = None ,
297
260
num_ps_replicas : Optional [int ] = None ,
298
261
packages_to_install : Optional [List [str ]] = None ,
@@ -324,6 +287,26 @@ def create_job(
324
287
set, Base Image must support `bash` CLI to execute the training script.
325
288
parameters: Dict of input parameters that training function might receive.
326
289
num_workers: Number of Worker replicas for the Job.
290
+ resources_per_worker: A parameter that lets you specify how many
291
+ resources each Worker container should have. You can either specify a
292
+ kubernetes.client.V1ResourceRequirements object (documented here:
293
+ https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
294
+ or a dictionary that includes one or more of the following keys:
295
+ `cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate
296
+ values for these keys are documented here:
297
+ https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
298
+ For example:
299
+ ```
300
+ {
301
+ "cpu": "1",
302
+ "memory": "2Gi",
303
+ "gpu": "1",
304
+ }
305
+ ```
306
+ Please note, `gpu` specifies a resource request with a key of
307
+ `nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type
308
+ of GPU, pass in a V1ResourceRequirements instance instead, since it's
309
+ more flexible. This parameter is optional and defaults to None.
327
310
num_chief_replicas: Number of Chief replicas for the TFJob. Number
328
311
of Chief replicas can't be more than 1.
329
312
num_ps_replicas: Number of Parameter Server replicas for the TFJob.
@@ -353,29 +336,40 @@ def create_job(
353
336
namespace = namespace or self .namespace
354
337
job_kind = job_kind or self .job_kind
355
338
if job is not None :
356
- job_kind = job .kind
339
+ job_kind = str ( job .kind )
357
340
358
341
if job_kind not in constants .JOB_PARAMETERS :
359
342
raise ValueError (
360
343
f"Job kind must be one of these: { constants .JOB_PARAMETERS .keys ()} "
361
344
)
362
345
363
346
# If Training function or base image is set, configure Job template.
364
- if train_func is not None or base_image is not None :
347
+ if job is None and ( train_func is not None or base_image is not None ) :
365
348
# Job name must be set to configure Job template.
366
349
if name is None :
367
350
raise ValueError (
368
351
"Job name must be set to configure Job from function or image"
369
352
)
370
353
371
- # Get Pod template spec from function or image.
372
- pod_template_spec = utils .get_pod_template_spec (
373
- job_kind = job_kind ,
354
+ # Assign the default base image.
355
+ # TODO (andreyvelich): Add base image for other Job kinds.
356
+ if base_image is None :
357
+ base_image = constants .JOB_PARAMETERS [job_kind ]["base_image" ]
358
+
359
+ # Get Training Container template.
360
+ container_spec = utils .get_container_spec (
361
+ name = constants .JOB_PARAMETERS [job_kind ]["container" ],
374
362
base_image = base_image ,
375
363
train_func = train_func ,
376
- parameters = parameters ,
364
+ train_func_parameters = parameters ,
377
365
packages_to_install = packages_to_install ,
378
366
pip_index_url = pip_index_url ,
367
+ resources = resources_per_worker ,
368
+ )
369
+
370
+ # Get Pod template spec using the above container.
371
+ pod_template_spec = utils .get_pod_template_spec (
372
+ containers = [container_spec ],
379
373
)
380
374
381
375
# Configure template for different Jobs.
@@ -403,16 +397,21 @@ def create_job(
403
397
)
404
398
405
399
# Verify Job object type.
406
- if not isinstance (job , constants .JOB_MODELS ):
407
- raise ValueError (f"Job must be one of these types: { constants .JOB_MODELS } " )
400
+ if not isinstance (
401
+ job ,
402
+ getattr (models , constants .JOB_PARAMETERS [job_kind ]["model" ]),
403
+ ):
404
+ raise ValueError (
405
+ f"Job must be one of these types: { constants .JOB_MODELS } , but Job is: { type (job )} "
406
+ )
408
407
409
408
# Create the Training Job.
410
409
try :
411
410
self .custom_api .create_namespaced_custom_object (
412
411
constants .GROUP ,
413
412
constants .VERSION ,
414
413
namespace ,
415
- constants .JOB_PARAMETERS [job . kind ]["plural" ],
414
+ constants .JOB_PARAMETERS [job_kind ]["plural" ],
416
415
job ,
417
416
)
418
417
except multiprocessing .TimeoutError :
@@ -580,7 +579,9 @@ def get_job_conditions(
580
579
f"Job kind must be one of these: { constants .JOB_PARAMETERS .keys ()} "
581
580
)
582
581
583
- if job is not None and not isinstance (job , constants .JOB_MODELS ):
582
+ if job is not None and not isinstance (
583
+ job , getattr (models , constants .JOB_PARAMETERS [job_kind ]["model" ])
584
+ ):
584
585
raise ValueError (f"Job must be one of these types: { constants .JOB_MODELS } " )
585
586
586
587
# If Job is not set, get the Training Job.
@@ -1235,7 +1236,7 @@ def delete_job(
1235
1236
name : str ,
1236
1237
namespace : Optional [str ] = None ,
1237
1238
job_kind : Optional [str ] = None ,
1238
- delete_options : Optional [client .V1DeleteOptions ] = None ,
1239
+ delete_options : Optional [models .V1DeleteOptions ] = None ,
1239
1240
):
1240
1241
"""Delete the Training Job
1241
1242
0 commit comments