
Commit 415912e

vertex-sdk-bot authored and copybara-github committed
feat: Add support for TPU v5 lite pod (v5e) for custom training jobs. Custom training jobs now accept the v5e machine types listed in https://cloud.google.com/tpu/docs/tpus-in-gke#v5e.
PiperOrigin-RevId: 627165889
1 parent 0599ca1 commit 415912e

File tree

6 files changed: +859 −2 lines changed

google/cloud/aiplatform/jobs.py (+8)

```diff
@@ -1924,6 +1924,7 @@ def from_local_script(
         encryption_spec_key_name: Optional[str] = None,
         staging_bucket: Optional[str] = None,
         persistent_resource_id: Optional[str] = None,
+        tpu_topology: Optional[str] = None,
     ) -> "CustomJob":
         """Configures a custom job from a local script.

@@ -2034,6 +2035,12 @@ def from_local_script(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            tpu_topology (str):
+                Optional. Specifies the tpu topology to be used for
+                TPU training job. This field is required for TPU v5 versions. For
+                details on the TPU topology, refer to
+                https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
+                must be a supported value for the TPU machine type.

         Raises:
             RuntimeError: If staging bucket was not set using aiplatform.init
@@ -2063,6 +2070,7 @@ def from_local_script(
                     boot_disk_size_gb=boot_disk_size_gb,
                     reduction_server_replica_count=reduction_server_replica_count,
                     reduction_server_machine_type=reduction_server_machine_type,
+                    tpu_topology=tpu_topology,
                 ).pool_specs
             )
```
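For orientation, here is a minimal sketch of how a caller might pass the new parameter through `CustomJob.from_local_script`. The project, region, bucket, script path, and container image below are placeholders, and `"2x2"` is one of the topologies the v5e machine type `ct5lp-hightpu-4t` supports (see the test constants later in this commit):

```python
from google.cloud import aiplatform

# Hypothetical project, region, and bucket, used only for illustration.
aiplatform.init(
    project="my-project",
    location="us-west4",
    staging_bucket="gs://my-staging-bucket",
)

# Build a CustomJob from a local script on a TPU v5e slice. The topology
# must be a supported value for the chosen TPU machine type.
job = aiplatform.CustomJob.from_local_script(
    display_name="tpu-v5e-custom-job",
    script_path="task.py",                             # hypothetical local script
    container_uri="gcr.io/my-project/trainer:latest",  # hypothetical image
    machine_type="ct5lp-hightpu-4t",                   # a v5e machine type
    replica_count=1,
    tpu_topology="2x2",                                # new in this commit
)
job.run()
```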

google/cloud/aiplatform/training_jobs.py (+46)

```diff
@@ -1373,6 +1373,7 @@ def _prepare_and_validate_run(
         boot_disk_size_gb: int = 100,
         reduction_server_replica_count: int = 0,
         reduction_server_machine_type: Optional[str] = None,
+        tpu_topology: Optional[str] = None,
     ) -> Tuple[worker_spec_utils._DistributedTrainingSpec, Optional[gca_model.Model]]:
         """Create worker pool specs and managed model as well validating the
         run.
@@ -1417,6 +1418,10 @@ def _prepare_and_validate_run(
                 The number of reduction server replicas, default is 0.
             reduction_server_machine_type (str):
                 Optional. The type of machine to use for reduction server.
+            tpu_topology (str):
+                Optional. Only required if the machine type is a TPU
+                v5 version.
+
         Returns:
             Worker pools specs and managed model for run.
@@ -1454,6 +1459,7 @@ def _prepare_and_validate_run(
                     boot_disk_size_gb=boot_disk_size_gb,
                     reduction_server_replica_count=reduction_server_replica_count,
                     reduction_server_machine_type=reduction_server_machine_type,
+                    tpu_topology=tpu_topology,
                 ).pool_specs
             )
@@ -2974,6 +2980,7 @@ def run(
         create_request_timeout: Optional[float] = None,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        tpu_topology: Optional[str] = None,
     ) -> Optional[models.Model]:
         """Runs the custom training job.
@@ -3268,6 +3275,12 @@ def run(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            tpu_topology (str):
+                Optional. Specifies the tpu topology to be used for
+                TPU training job. This field is required for TPU v5 versions. For
+                details on the TPU topology, refer to
+                https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology must
+                be a supported value for the TPU machine type.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -3287,6 +3300,7 @@ def run(
             boot_disk_size_gb=boot_disk_size_gb,
             reduction_server_replica_count=reduction_server_replica_count,
             reduction_server_machine_type=reduction_server_machine_type,
+            tpu_topology=tpu_topology,
         )

         # make and copy package
@@ -3383,6 +3397,7 @@ def submit(
         create_request_timeout: Optional[float] = None,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        tpu_topology: Optional[str] = None,
     ) -> Optional[models.Model]:
         """Submits the custom training job without blocking until completion.
@@ -3677,6 +3692,12 @@ def submit(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            tpu_topology (str):
+                Optional. Specifies the tpu topology to be used for
+                TPU training job. This field is required for TPU v5 versions. For
+                details on the TPU topology, refer to
+                https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology must
+                be a supported value for the TPU machine type.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -3695,6 +3716,7 @@ def submit(
             boot_disk_size_gb=boot_disk_size_gb,
             reduction_server_replica_count=reduction_server_replica_count,
             reduction_server_machine_type=reduction_server_machine_type,
+            tpu_topology=tpu_topology,
         )

         # make and copy package
@@ -4360,6 +4382,7 @@ def run(
         create_request_timeout: Optional[float] = None,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        tpu_topology: Optional[str] = None,
     ) -> Optional[models.Model]:
         """Runs the custom training job.
@@ -4647,6 +4670,12 @@ def run(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            tpu_topology (str):
+                Optional. Specifies the tpu topology to be used for
+                TPU training job. This field is required for TPU v5 versions. For
+                details on the TPU topology, refer to
+                https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
+                must be a supported value for the TPU machine type.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -4671,6 +4700,7 @@ def run(
             boot_disk_size_gb=boot_disk_size_gb,
             reduction_server_replica_count=reduction_server_replica_count,
             reduction_server_machine_type=reduction_server_machine_type,
+            tpu_topology=tpu_topology,
         )

         return self._run(
@@ -4761,6 +4791,7 @@ def submit(
         create_request_timeout: Optional[float] = None,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        tpu_topology: Optional[str] = None,
     ) -> Optional[models.Model]:
         """Submits the custom training job without blocking until completion.
@@ -5048,6 +5079,12 @@ def submit(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            tpu_topology (str):
+                Optional. Specifies the tpu topology to be used for
+                TPU training job. This field is required for TPU v5 versions. For
+                details on the TPU topology, refer to
+                https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
+                must be a supported value for the TPU machine type.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -5071,6 +5108,7 @@ def submit(
             boot_disk_size_gb=boot_disk_size_gb,
             reduction_server_replica_count=reduction_server_replica_count,
             reduction_server_machine_type=reduction_server_machine_type,
+            tpu_topology=tpu_topology,
         )

         return self._run(
@@ -7315,6 +7353,7 @@ def run(
         create_request_timeout: Optional[float] = None,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        tpu_topology: Optional[str] = None,
     ) -> Optional[models.Model]:
         """Runs the custom training job.
@@ -7603,6 +7642,12 @@ def run(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            tpu_topology (str):
+                Optional. Specifies the tpu topology to be used for
+                TPU training job. This field is required for TPU v5 versions. For
+                details on the TPU topology, refer to
+                https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
+                must be a supported value for the TPU machine type.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -7622,6 +7667,7 @@ def run(
             boot_disk_size_gb=boot_disk_size_gb,
             reduction_server_replica_count=reduction_server_replica_count,
             reduction_server_machine_type=reduction_server_machine_type,
+            tpu_topology=tpu_topology,
         )

         return self._run(
```
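The same parameter is threaded through `run()` and `submit()` on the high-level training job classes. A hedged sketch with `CustomContainerTrainingJob` follows; the display name and image URI are placeholders, and `run()` without managed-model arguments returns `None`:

```python
from google.cloud import aiplatform

aiplatform.init(
    project="my-project",  # hypothetical
    location="us-west4",
    staging_bucket="gs://my-staging-bucket",
)

job = aiplatform.CustomContainerTrainingJob(
    display_name="tpu-v5e-container-training",
    container_uri="gcr.io/my-project/trainer:latest",  # hypothetical image
)

# run()/submit() now forward tpu_topology into the chief worker pool spec.
job.run(
    machine_type="ct5lp-hightpu-4t",
    replica_count=1,  # TPU jobs only allow a single replica
    tpu_topology="2x2",
)
```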

google/cloud/aiplatform/utils/worker_spec_utils.py (+10)

```diff
@@ -56,6 +56,7 @@ class _WorkerPoolSpec(NamedTuple):
     accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED"
     boot_disk_type: str = "pd-ssd"
     boot_disk_size_gb: int = 100
+    tpu_topology: Optional[str] = None

     def _get_accelerator_type(self) -> Optional[str]:
         """Validates accelerator_type and returns the name of the accelerator.
@@ -97,6 +98,9 @@ def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]:
             spec["machine_spec"]["accelerator_type"] = accelerator_type
             spec["machine_spec"]["accelerator_count"] = self.accelerator_count

+        if self.tpu_topology:
+            spec["machine_spec"]["tpu_topology"] = self.tpu_topology
+
         return spec

     @property
@@ -185,6 +189,7 @@ def chief_worker_pool(
         boot_disk_size_gb: int = 100,
         reduction_server_replica_count: int = 0,
         reduction_server_machine_type: str = None,
+        tpu_topology: str = None,
     ) -> "_DistributedTrainingSpec":
         """Parametrizes Config to support only chief with worker replicas.
@@ -214,6 +219,10 @@ def chief_worker_pool(
                 The number of reduction server replicas, default is 0.
             reduction_server_machine_type (str):
                 The type of machine to use for reduction server, default is `n1-highcpu-16`.
+            tpu_topology (str):
+                TPU topology for the TPU type. This field is
+                required for the TPU v5 versions. This field is only passed to the
+                chief replica as TPU jobs only allow 1 replica.

         Returns:
             _DistributedTrainingSpec representing one chief and n workers all of
@@ -230,6 +239,7 @@ def chief_worker_pool(
             accelerator_type=accelerator_type,
             boot_disk_type=boot_disk_type,
             boot_disk_size_gb=boot_disk_size_gb,
+            tpu_topology=tpu_topology,
         )

         worker_spec = _WorkerPoolSpec(
```
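To make the `spec_dict` change concrete, here is a standalone re-implementation of just the machine-spec assembly. It is a simplified sketch, not the SDK's actual `_WorkerPoolSpec` class:

```python
from typing import Dict, Optional, Union


def build_machine_spec(
    machine_type: str, tpu_topology: Optional[str] = None
) -> Dict[str, Union[int, str]]:
    """Mirrors the tpu_topology handling added to _WorkerPoolSpec.spec_dict."""
    machine_spec: Dict[str, Union[int, str]] = {"machine_type": machine_type}
    # The field is only emitted when set, matching `if self.tpu_topology:` above.
    if tpu_topology:
        machine_spec["tpu_topology"] = tpu_topology
    return machine_spec


assert build_machine_spec("ct5lp-hightpu-4t", "2x2") == {
    "machine_type": "ct5lp-hightpu-4t",
    "tpu_topology": "2x2",
}
assert build_machine_spec("n1-standard-4") == {"machine_type": "n1-standard-4"}
```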

tests/unit/aiplatform/constants.py (+54, −1)

```diff
@@ -76,6 +76,9 @@ class TrainingJobConstants:
     }
     _TEST_REPLICA_COUNT = 1
     _TEST_MACHINE_TYPE = "n1-standard-4"
+    _TEST_MACHINE_TYPE_TPU = "cloud-tpu"
+    _TEST_MACHINE_TYPE_TPU_V5E = "ct5lp-hightpu-4t"
+    _TEST_ACCELERATOR_TPU_TYPE = "TPU_V3"
     _TEST_ACCELERATOR_TYPE = "NVIDIA_TESLA_K80"
     _TEST_ACCELERATOR_COUNT = 1
     _TEST_BOOT_DISK_TYPE = "pd-standard"
@@ -120,12 +123,40 @@ class TrainingJobConstants:
             },
         }
     ]
+    _TEST_TPU_V5E_WORKER_POOL_SPEC = [
+        {
+            "machine_spec": {
+                "machine_type": _TEST_MACHINE_TYPE_TPU_V5E,
+                "tpu_topology": "2x2",
+            },
+            "replica_count": 1,
+            "disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 100},
+            "container_spec": {
+                "image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
+            },
+        }
+    ]
+    _TEST_TPU_V3_WORKER_POOL_SPEC = [
+        {
+            "machine_spec": {
+                "machine_type": _TEST_MACHINE_TYPE_TPU,
+                "accelerator_type": _TEST_ACCELERATOR_TPU_TYPE,
+                "accelerator_count": 32,
+            },
+            "replica_count": 1,
+            "disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 100},
+            "container_spec": {
+                "image_uri": _TEST_TRAINING_CONTAINER_IMAGE,
+            },
+        }
+    ]
     _TEST_ID = "1028944691210842416"
     _TEST_NETWORK = (
         f"projects/{ProjectConstants._TEST_PROJECT}/global/networks/{_TEST_ID}"
     )
     _TEST_RESERVED_IP_RANGES = ["example_ip_range"]
     _TEST_TIMEOUT = 8000
+    _TEST_TIMEOUT_SECONDS = duration_pb2.Duration(seconds=_TEST_TIMEOUT)
     _TEST_RESTART_JOB_ON_WORKER_RESTART = True
     _TEST_DISABLE_RETRIES = True
@@ -137,7 +168,7 @@ class TrainingJobConstants:
             output_uri_prefix=_TEST_BASE_OUTPUT_DIR
         ),
         scheduling=custom_job.Scheduling(
-            timeout=duration_pb2.Duration(seconds=_TEST_TIMEOUT),
+            timeout=_TEST_TIMEOUT_SECONDS,
             restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
             disable_retries=_TEST_DISABLE_RETRIES,
         ),
@@ -167,6 +198,28 @@ class TrainingJobConstants:
     )
     _TEST_DEFAULT_ENCRYPTION_KEY_NAME = "key_default"

+    def create_tpu_job_proto(tpu_version):
+        worker_pool_spec = (
+            TrainingJobConstants._TEST_TPU_V5E_WORKER_POOL_SPEC
+            if tpu_version == "v5e"
+            else TrainingJobConstants._TEST_TPU_V3_WORKER_POOL_SPEC
+        )
+        return custom_job.CustomJob(
+            display_name=TrainingJobConstants._TEST_DISPLAY_NAME,
+            job_spec=custom_job.CustomJobSpec(
+                worker_pool_specs=worker_pool_spec,
+                base_output_directory=io.GcsDestination(
+                    output_uri_prefix=TrainingJobConstants._TEST_BASE_OUTPUT_DIR
+                ),
+                scheduling=custom_job.Scheduling(
+                    timeout=TrainingJobConstants._TEST_TIMEOUT_SECONDS,
+                    restart_job_on_worker_restart=TrainingJobConstants._TEST_RESTART_JOB_ON_WORKER_RESTART,
+                ),
+                service_account=ProjectConstants._TEST_SERVICE_ACCOUNT,
+                network=TrainingJobConstants._TEST_NETWORK,
+            ),
+        )
+

 @dataclasses.dataclass(frozen=True)
 class ModelConstants:
```
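A sketch of how a unit test might use the new helper; the import path matches the file above, and the assertions assume the dict worker pool specs are coerced into the `CustomJob` proto as usual:

```python
from tests.unit.aiplatform import constants

# v5e selects the topology-based worker pool spec; any other version
# falls back to the accelerator-based v3 spec.
expected_v5e = constants.TrainingJobConstants.create_tpu_job_proto("v5e")
expected_v3 = constants.TrainingJobConstants.create_tpu_job_proto("v3")

v5e_machine = expected_v5e.job_spec.worker_pool_specs[0].machine_spec
assert v5e_machine.machine_type == "ct5lp-hightpu-4t"
assert v5e_machine.tpu_topology == "2x2"

v3_machine = expected_v3.job_spec.worker_pool_specs[0].machine_spec
assert v3_machine.accelerator_count == 32
```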

0 commit comments