@@ -1373,6 +1373,7 @@ def _prepare_and_validate_run(
1373
1373
boot_disk_size_gb : int = 100 ,
1374
1374
reduction_server_replica_count : int = 0 ,
1375
1375
reduction_server_machine_type : Optional [str ] = None ,
1376
+ tpu_topology : Optional [str ] = None ,
1376
1377
) -> Tuple [worker_spec_utils ._DistributedTrainingSpec , Optional [gca_model .Model ]]:
1377
1378
"""Create worker pool specs and managed model as well as validating the
1378
1379
run.
@@ -1417,6 +1418,10 @@ def _prepare_and_validate_run(
1417
1418
The number of reduction server replicas, default is 0.
1418
1419
reduction_server_machine_type (str):
1419
1420
Optional. The type of machine to use for reduction server.
1421
+ tpu_topology (str):
1422
+ Optional. Only required if the machine type is a TPU
1423
+ v5 version.
1424
+
1420
1425
Returns:
1421
1426
Worker pools specs and managed model for run.
1422
1427
@@ -1454,6 +1459,7 @@ def _prepare_and_validate_run(
1454
1459
boot_disk_size_gb = boot_disk_size_gb ,
1455
1460
reduction_server_replica_count = reduction_server_replica_count ,
1456
1461
reduction_server_machine_type = reduction_server_machine_type ,
1462
+ tpu_topology = tpu_topology ,
1457
1463
).pool_specs
1458
1464
)
1459
1465
@@ -2974,6 +2980,7 @@ def run(
2974
2980
create_request_timeout : Optional [float ] = None ,
2975
2981
disable_retries : bool = False ,
2976
2982
persistent_resource_id : Optional [str ] = None ,
2983
+ tpu_topology : Optional [str ] = None ,
2977
2984
) -> Optional [models .Model ]:
2978
2985
"""Runs the custom training job.
2979
2986
@@ -3268,6 +3275,12 @@ def run(
3268
3275
on-demand short-lived machines. The network, CMEK, and node pool
3269
3276
configs on the job should be consistent with those on the
3270
3277
PersistentResource, otherwise, the job will be rejected.
3278
+ tpu_topology (str):
3279
+ Optional. Specifies the TPU topology to be used for a
3280
+ TPU training job. This field is required for TPU v5 versions. For
3281
+ details on the TPU topology, refer to
3282
+ https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology must
3283
+ be a supported value for the TPU machine type.
3271
3284
3272
3285
Returns:
3273
3286
model: The trained Vertex AI Model resource or None if training did not
@@ -3287,6 +3300,7 @@ def run(
3287
3300
boot_disk_size_gb = boot_disk_size_gb ,
3288
3301
reduction_server_replica_count = reduction_server_replica_count ,
3289
3302
reduction_server_machine_type = reduction_server_machine_type ,
3303
+ tpu_topology = tpu_topology ,
3290
3304
)
3291
3305
3292
3306
# make and copy package
@@ -3383,6 +3397,7 @@ def submit(
3383
3397
create_request_timeout : Optional [float ] = None ,
3384
3398
disable_retries : bool = False ,
3385
3399
persistent_resource_id : Optional [str ] = None ,
3400
+ tpu_topology : Optional [str ] = None ,
3386
3401
) -> Optional [models .Model ]:
3387
3402
"""Submits the custom training job without blocking until completion.
3388
3403
@@ -3677,6 +3692,12 @@ def submit(
3677
3692
on-demand short-lived machines. The network, CMEK, and node pool
3678
3693
configs on the job should be consistent with those on the
3679
3694
PersistentResource, otherwise, the job will be rejected.
3695
+ tpu_topology (str):
3696
+ Optional. Specifies the TPU topology to be used for a
3697
+ TPU training job. This field is required for TPU v5 versions. For
3698
+ details on the TPU topology, refer to
3699
+ https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology must
3700
+ be a supported value for the TPU machine type.
3680
3701
3681
3702
Returns:
3682
3703
model: The trained Vertex AI Model resource or None if training did not
@@ -3695,6 +3716,7 @@ def submit(
3695
3716
boot_disk_size_gb = boot_disk_size_gb ,
3696
3717
reduction_server_replica_count = reduction_server_replica_count ,
3697
3718
reduction_server_machine_type = reduction_server_machine_type ,
3719
+ tpu_topology = tpu_topology ,
3698
3720
)
3699
3721
3700
3722
# make and copy package
@@ -4360,6 +4382,7 @@ def run(
4360
4382
create_request_timeout : Optional [float ] = None ,
4361
4383
disable_retries : bool = False ,
4362
4384
persistent_resource_id : Optional [str ] = None ,
4385
+ tpu_topology : Optional [str ] = None ,
4363
4386
) -> Optional [models .Model ]:
4364
4387
"""Runs the custom training job.
4365
4388
@@ -4647,6 +4670,12 @@ def run(
4647
4670
on-demand short-lived machines. The network, CMEK, and node pool
4648
4671
configs on the job should be consistent with those on the
4649
4672
PersistentResource, otherwise, the job will be rejected.
4673
+ tpu_topology (str):
4674
+ Optional. Specifies the TPU topology to be used for a
4675
+ TPU training job. This field is required for TPU v5 versions. For
4676
+ details on the TPU topology, refer to
4677
+ https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
4678
+ must be a supported value for the TPU machine type.
4650
4679
4651
4680
Returns:
4652
4681
model: The trained Vertex AI Model resource or None if training did not
@@ -4671,6 +4700,7 @@ def run(
4671
4700
boot_disk_size_gb = boot_disk_size_gb ,
4672
4701
reduction_server_replica_count = reduction_server_replica_count ,
4673
4702
reduction_server_machine_type = reduction_server_machine_type ,
4703
+ tpu_topology = tpu_topology ,
4674
4704
)
4675
4705
4676
4706
return self ._run (
@@ -4761,6 +4791,7 @@ def submit(
4761
4791
create_request_timeout : Optional [float ] = None ,
4762
4792
disable_retries : bool = False ,
4763
4793
persistent_resource_id : Optional [str ] = None ,
4794
+ tpu_topology : Optional [str ] = None ,
4764
4795
) -> Optional [models .Model ]:
4765
4796
"""Submits the custom training job without blocking until completion.
4766
4797
@@ -5048,6 +5079,12 @@ def submit(
5048
5079
on-demand short-lived machines. The network, CMEK, and node pool
5049
5080
configs on the job should be consistent with those on the
5050
5081
PersistentResource, otherwise, the job will be rejected.
5082
+ tpu_topology (str):
5083
+ Optional. Specifies the TPU topology to be used for a
5084
+ TPU training job. This field is required for TPU v5 versions. For
5085
+ details on the TPU topology, refer to
5086
+ https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
5087
+ must be a supported value for the TPU machine type.
5051
5088
5052
5089
Returns:
5053
5090
model: The trained Vertex AI Model resource or None if training did not
@@ -5071,6 +5108,7 @@ def submit(
5071
5108
boot_disk_size_gb = boot_disk_size_gb ,
5072
5109
reduction_server_replica_count = reduction_server_replica_count ,
5073
5110
reduction_server_machine_type = reduction_server_machine_type ,
5111
+ tpu_topology = tpu_topology ,
5074
5112
)
5075
5113
5076
5114
return self ._run (
@@ -7315,6 +7353,7 @@ def run(
7315
7353
create_request_timeout : Optional [float ] = None ,
7316
7354
disable_retries : bool = False ,
7317
7355
persistent_resource_id : Optional [str ] = None ,
7356
+ tpu_topology : Optional [str ] = None ,
7318
7357
) -> Optional [models .Model ]:
7319
7358
"""Runs the custom training job.
7320
7359
@@ -7603,6 +7642,12 @@ def run(
7603
7642
on-demand short-lived machines. The network, CMEK, and node pool
7604
7643
configs on the job should be consistent with those on the
7605
7644
PersistentResource, otherwise, the job will be rejected.
7645
+ tpu_topology (str):
7646
+ Optional. Specifies the TPU topology to be used for a
7647
+ TPU training job. This field is required for TPU v5 versions. For
7648
+ details on the TPU topology, refer to
7649
+ https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
7650
+ must be a supported value for the TPU machine type.
7606
7651
7607
7652
Returns:
7608
7653
model: The trained Vertex AI Model resource or None if training did not
@@ -7622,6 +7667,7 @@ def run(
7622
7667
boot_disk_size_gb = boot_disk_size_gb ,
7623
7668
reduction_server_replica_count = reduction_server_replica_count ,
7624
7669
reduction_server_machine_type = reduction_server_machine_type ,
7670
+ tpu_topology = tpu_topology ,
7625
7671
)
7626
7672
7627
7673
return self ._run (
0 commit comments