Skip to content

Commit 802609b

Browse files
vertex-sdk-bot authored and
copybara-github committed
feat: Add support for reservation affinity in custom training jobs.
PiperOrigin-RevId: 662596199
1 parent 0008735 commit 802609b

File tree

4 files changed

+593
-37
lines changed

4 files changed

+593
-37
lines changed

google/cloud/aiplatform/training_jobs.py

+176-16
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import datetime
1919
import time
20-
from typing import Dict, List, Optional, Sequence, Tuple, Union
20+
from typing import Dict, List, Literal, Optional, Sequence, Tuple, Union
2121
from google.protobuf import json_format
2222

2323
import abc
@@ -1404,6 +1404,11 @@ def _prepare_and_validate_run(
14041404
reduction_server_replica_count: int = 0,
14051405
reduction_server_machine_type: Optional[str] = None,
14061406
tpu_topology: Optional[str] = None,
1407+
reservation_affinity_type: Optional[
1408+
Literal["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"]
1409+
] = None,
1410+
reservation_affinity_key: Optional[str] = None,
1411+
reservation_affinity_values: Optional[List[str]] = None,
14071412
) -> Tuple[worker_spec_utils._DistributedTrainingSpec, Optional[gca_model.Model]]:
14081413
"""Create worker pool specs and managed model as well as validating the
14091414
run.
@@ -1451,6 +1456,23 @@ def _prepare_and_validate_run(
14511456
tpu_topology (str):
14521457
Optional. Only required if the machine type is a TPU
14531458
v5 version.
1459+
reservation_affinity_type (str):
1460+
Optional. The type of reservation affinity. One of:
1461+
* "NO_RESERVATION" : No reservation is used.
1462+
* "ANY_RESERVATION" : Any reservation that matches machine spec
1463+
can be used.
1464+
* "SPECIFIC_RESERVATION" : A specific reservation must be
1465+
used. See reservation_affinity_key and
1466+
reservation_affinity_values for how to specify the reservation.
1467+
reservation_affinity_key (str):
1468+
Optional. Corresponds to the label key of a reservation resource.
1469+
To target a SPECIFIC_RESERVATION by name, use
1470+
`compute.googleapis.com/reservation-name` as the key
1471+
and specify the name of your reservation as its value.
1472+
reservation_affinity_values (List[str]):
1473+
Optional. Corresponds to the label values of a reservation resource.
1474+
This must be the full resource name of the reservation.
1475+
Format: 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}'
14541476
14551477
Returns:
14561478
Worker pools specs and managed model for run.
@@ -1490,6 +1512,9 @@ def _prepare_and_validate_run(
14901512
reduction_server_replica_count=reduction_server_replica_count,
14911513
reduction_server_machine_type=reduction_server_machine_type,
14921514
tpu_topology=tpu_topology,
1515+
reservation_affinity_type=reservation_affinity_type,
1516+
reservation_affinity_key=reservation_affinity_key,
1517+
reservation_affinity_values=reservation_affinity_values,
14931518
).pool_specs
14941519
)
14951520

@@ -3016,6 +3041,11 @@ def run(
30163041
persistent_resource_id: Optional[str] = None,
30173042
tpu_topology: Optional[str] = None,
30183043
scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
3044+
reservation_affinity_type: Optional[
3045+
Literal["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"]
3046+
] = None,
3047+
reservation_affinity_key: Optional[str] = None,
3048+
reservation_affinity_values: Optional[List[str]] = None,
30193049
) -> Optional[models.Model]:
30203050
"""Runs the custom training job.
30213051
@@ -3373,6 +3403,23 @@ def run(
33733403
be a supported value for the TPU machine type.
33743404
scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
33753405
Optional. Indicates the job scheduling strategy.
3406+
reservation_affinity_type (str):
3407+
Optional. The type of reservation affinity. One of:
3408+
* "NO_RESERVATION" : No reservation is used.
3409+
* "ANY_RESERVATION" : Any reservation that matches machine spec
3410+
can be used.
3411+
* "SPECIFIC_RESERVATION" : A specific reservation must be
3412+
used. See reservation_affinity_key and
3413+
reservation_affinity_values for how to specify the reservation.
3414+
reservation_affinity_key (str):
3415+
Optional. Corresponds to the label key of a reservation resource.
3416+
To target a SPECIFIC_RESERVATION by name, use
3417+
`compute.googleapis.com/reservation-name` as the key
3418+
and specify the name of your reservation as its value.
3419+
reservation_affinity_values (List[str]):
3420+
Optional. Corresponds to the label values of a reservation resource.
3421+
This must be the full resource name of the reservation.
3422+
Format: 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}'
33763423
33773424
Returns:
33783425
The trained Vertex AI model resource or None if the training
@@ -3393,6 +3440,9 @@ def run(
33933440
reduction_server_replica_count=reduction_server_replica_count,
33943441
reduction_server_machine_type=reduction_server_machine_type,
33953442
tpu_topology=tpu_topology,
3443+
reservation_affinity_type=reservation_affinity_type,
3444+
reservation_affinity_key=reservation_affinity_key,
3445+
reservation_affinity_values=reservation_affinity_values,
33963446
)
33973447

33983448
# make and copy package
@@ -3430,9 +3480,11 @@ def run(
34303480
enable_web_access=enable_web_access,
34313481
enable_dashboard_access=enable_dashboard_access,
34323482
tensorboard=tensorboard,
3433-
reduction_server_container_uri=reduction_server_container_uri
3434-
if reduction_server_replica_count > 0
3435-
else None,
3483+
reduction_server_container_uri=(
3484+
reduction_server_container_uri
3485+
if reduction_server_replica_count > 0
3486+
else None
3487+
),
34363488
sync=sync,
34373489
create_request_timeout=create_request_timeout,
34383490
disable_retries=disable_retries,
@@ -3492,6 +3544,11 @@ def submit(
34923544
persistent_resource_id: Optional[str] = None,
34933545
tpu_topology: Optional[str] = None,
34943546
scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
3547+
reservation_affinity_type: Optional[
3548+
Literal["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"]
3549+
] = None,
3550+
reservation_affinity_key: Optional[str] = None,
3551+
reservation_affinity_values: Optional[List[str]] = None,
34953552
) -> Optional[models.Model]:
34963553
"""Submits the custom training job without blocking until completion.
34973554
@@ -3794,6 +3851,23 @@ def submit(
37943851
be a supported value for the TPU machine type.
37953852
scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
37963853
Optional. Indicates the job scheduling strategy.
3854+
reservation_affinity_type (str):
3855+
Optional. The type of reservation affinity. One of:
3856+
* "NO_RESERVATION" : No reservation is used.
3857+
* "ANY_RESERVATION" : Any reservation that matches machine spec
3858+
can be used.
3859+
* "SPECIFIC_RESERVATION" : A specific reservation must be
3860+
used. See reservation_affinity_key and
3861+
reservation_affinity_values for how to specify the reservation.
3862+
reservation_affinity_key (str):
3863+
Optional. Corresponds to the label key of a reservation resource.
3864+
To target a SPECIFIC_RESERVATION by name, use
3865+
`compute.googleapis.com/reservation-name` as the key
3866+
and specify the name of your reservation as its value.
3867+
reservation_affinity_values (List[str]):
3868+
Optional. Corresponds to the label values of a reservation resource.
3869+
This must be the full resource name of the reservation.
3870+
Format: 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}'
37973871
37983872
Returns:
37993873
model: The trained Vertex AI Model resource or None if training did not
@@ -3813,6 +3887,9 @@ def submit(
38133887
reduction_server_replica_count=reduction_server_replica_count,
38143888
reduction_server_machine_type=reduction_server_machine_type,
38153889
tpu_topology=tpu_topology,
3890+
reservation_affinity_type=reservation_affinity_type,
3891+
reservation_affinity_key=reservation_affinity_key,
3892+
reservation_affinity_values=reservation_affinity_values,
38163893
)
38173894

38183895
# make and copy package
@@ -3850,9 +3927,11 @@ def submit(
38503927
enable_web_access=enable_web_access,
38513928
enable_dashboard_access=enable_dashboard_access,
38523929
tensorboard=tensorboard,
3853-
reduction_server_container_uri=reduction_server_container_uri
3854-
if reduction_server_replica_count > 0
3855-
else None,
3930+
reduction_server_container_uri=(
3931+
reduction_server_container_uri
3932+
if reduction_server_replica_count > 0
3933+
else None
3934+
),
38563935
sync=sync,
38573936
create_request_timeout=create_request_timeout,
38583937
block=False,
@@ -4485,6 +4564,11 @@ def run(
44854564
persistent_resource_id: Optional[str] = None,
44864565
tpu_topology: Optional[str] = None,
44874566
scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
4567+
reservation_affinity_type: Optional[
4568+
Literal["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"]
4569+
] = None,
4570+
reservation_affinity_key: Optional[str] = None,
4571+
reservation_affinity_values: Optional[List[str]] = None,
44884572
) -> Optional[models.Model]:
44894573
"""Runs the custom training job.
44904574
@@ -4780,6 +4864,23 @@ def run(
47804864
must be a supported value for the TPU machine type.
47814865
scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
47824866
Optional. Indicates the job scheduling strategy.
4867+
reservation_affinity_type (str):
4868+
Optional. The type of reservation affinity. One of:
4869+
* "NO_RESERVATION" : No reservation is used.
4870+
* "ANY_RESERVATION" : Any reservation that matches machine spec
4871+
can be used.
4872+
* "SPECIFIC_RESERVATION" : A specific reservation must be
4873+
used. See reservation_affinity_key and
4874+
reservation_affinity_values for how to specify the reservation.
4875+
reservation_affinity_key (str):
4876+
Optional. Corresponds to the label key of a reservation resource.
4877+
To target a SPECIFIC_RESERVATION by name, use
4878+
`compute.googleapis.com/reservation-name` as the key
4879+
and specify the name of your reservation as its value.
4880+
reservation_affinity_values (List[str]):
4881+
Optional. Corresponds to the label values of a reservation resource.
4882+
This must be the full resource name of the reservation.
4883+
Format: 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}'
47834884
47844885
Returns:
47854886
model: The trained Vertex AI Model resource or None if training did not
@@ -4805,6 +4906,9 @@ def run(
48054906
reduction_server_replica_count=reduction_server_replica_count,
48064907
reduction_server_machine_type=reduction_server_machine_type,
48074908
tpu_topology=tpu_topology,
4909+
reservation_affinity_type=reservation_affinity_type,
4910+
reservation_affinity_key=reservation_affinity_key,
4911+
reservation_affinity_values=reservation_affinity_values,
48084912
)
48094913

48104914
return self._run(
@@ -4836,9 +4940,11 @@ def run(
48364940
enable_web_access=enable_web_access,
48374941
enable_dashboard_access=enable_dashboard_access,
48384942
tensorboard=tensorboard,
4839-
reduction_server_container_uri=reduction_server_container_uri
4840-
if reduction_server_replica_count > 0
4841-
else None,
4943+
reduction_server_container_uri=(
4944+
reduction_server_container_uri
4945+
if reduction_server_replica_count > 0
4946+
else None
4947+
),
48424948
sync=sync,
48434949
create_request_timeout=create_request_timeout,
48444950
disable_retries=disable_retries,
@@ -4898,6 +5004,11 @@ def submit(
48985004
persistent_resource_id: Optional[str] = None,
48995005
tpu_topology: Optional[str] = None,
49005006
scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
5007+
reservation_affinity_type: Optional[
5008+
Literal["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"]
5009+
] = None,
5010+
reservation_affinity_key: Optional[str] = None,
5011+
reservation_affinity_values: Optional[List[str]] = None,
49015012
) -> Optional[models.Model]:
49025013
"""Submits the custom training job without blocking until completion.
49035014
@@ -5193,6 +5304,23 @@ def submit(
51935304
must be a supported value for the TPU machine type.
51945305
scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
51955306
Optional. Indicates the job scheduling strategy.
5307+
reservation_affinity_type (str):
5308+
Optional. The type of reservation affinity. One of:
5309+
* "NO_RESERVATION" : No reservation is used.
5310+
* "ANY_RESERVATION" : Any reservation that matches machine spec
5311+
can be used.
5312+
* "SPECIFIC_RESERVATION" : A specific reservation must be
5313+
used. See reservation_affinity_key and
5314+
reservation_affinity_values for how to specify the reservation.
5315+
reservation_affinity_key (str):
5316+
Optional. Corresponds to the label key of a reservation resource.
5317+
To target a SPECIFIC_RESERVATION by name, use
5318+
`compute.googleapis.com/reservation-name` as the key
5319+
and specify the name of your reservation as its value.
5320+
reservation_affinity_values (List[str]):
5321+
Optional. Corresponds to the label values of a reservation resource.
5322+
This must be the full resource name of the reservation.
5323+
Format: 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}'
51965324
51975325
Returns:
51985326
model: The trained Vertex AI Model resource or None if training did not
@@ -5217,6 +5345,9 @@ def submit(
52175345
reduction_server_replica_count=reduction_server_replica_count,
52185346
reduction_server_machine_type=reduction_server_machine_type,
52195347
tpu_topology=tpu_topology,
5348+
reservation_affinity_type=reservation_affinity_type,
5349+
reservation_affinity_key=reservation_affinity_key,
5350+
reservation_affinity_values=reservation_affinity_values,
52205351
)
52215352

52225353
return self._run(
@@ -5248,9 +5379,11 @@ def submit(
52485379
enable_web_access=enable_web_access,
52495380
enable_dashboard_access=enable_dashboard_access,
52505381
tensorboard=tensorboard,
5251-
reduction_server_container_uri=reduction_server_container_uri
5252-
if reduction_server_replica_count > 0
5253-
else None,
5382+
reduction_server_container_uri=(
5383+
reduction_server_container_uri
5384+
if reduction_server_replica_count > 0
5385+
else None
5386+
),
52545387
sync=sync,
52555388
create_request_timeout=create_request_timeout,
52565389
block=False,
@@ -7572,6 +7705,11 @@ def run(
75727705
persistent_resource_id: Optional[str] = None,
75737706
tpu_topology: Optional[str] = None,
75747707
scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
7708+
reservation_affinity_type: Optional[
7709+
Literal["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"]
7710+
] = None,
7711+
reservation_affinity_key: Optional[str] = None,
7712+
reservation_affinity_values: Optional[List[str]] = None,
75757713
) -> Optional[models.Model]:
75767714
"""Runs the custom training job.
75777715
@@ -7868,6 +8006,23 @@ def run(
78688006
must be a supported value for the TPU machine type.
78698007
scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
78708008
Optional. Indicates the job scheduling strategy.
8009+
reservation_affinity_type (str):
8010+
Optional. The type of reservation affinity. One of:
8011+
* "NO_RESERVATION" : No reservation is used.
8012+
* "ANY_RESERVATION" : Any reservation that matches machine spec
8013+
can be used.
8014+
* "SPECIFIC_RESERVATION" : A specific reservation must be
8015+
used. See reservation_affinity_key and
8016+
reservation_affinity_values for how to specify the reservation.
8017+
reservation_affinity_key (str):
8018+
Optional. Corresponds to the label key of a reservation resource.
8019+
To target a SPECIFIC_RESERVATION by name, use
8020+
`compute.googleapis.com/reservation-name` as the key
8021+
and specify the name of your reservation as its value.
8022+
reservation_affinity_values (List[str]):
8023+
Optional. Corresponds to the label values of a reservation resource.
8024+
This must be the full resource name of the reservation.
8025+
Format: 'projects/{project_id_or_number}/zones/{zone}/reservations/{reservation_name}'
78718026
78728027
Returns:
78738028
model: The trained Vertex AI Model resource or None if training did not
@@ -7888,6 +8043,9 @@ def run(
78888043
reduction_server_replica_count=reduction_server_replica_count,
78898044
reduction_server_machine_type=reduction_server_machine_type,
78908045
tpu_topology=tpu_topology,
8046+
reservation_affinity_type=reservation_affinity_type,
8047+
reservation_affinity_key=reservation_affinity_key,
8048+
reservation_affinity_values=reservation_affinity_values,
78918049
)
78928050

78938051
return self._run(
@@ -7919,9 +8077,11 @@ def run(
79198077
enable_web_access=enable_web_access,
79208078
enable_dashboard_access=enable_dashboard_access,
79218079
tensorboard=tensorboard,
7922-
reduction_server_container_uri=reduction_server_container_uri
7923-
if reduction_server_replica_count > 0
7924-
else None,
8080+
reduction_server_container_uri=(
8081+
reduction_server_container_uri
8082+
if reduction_server_replica_count > 0
8083+
else None
8084+
),
79258085
sync=sync,
79268086
create_request_timeout=create_request_timeout,
79278087
disable_retries=disable_retries,

0 commit comments

Comments
 (0)