Skip to content

Commit f428006

Browse files
vertex-sdk-bot authored and
copybara-github committed
feat: Add Persistent Resource Id parameter to Custom Training Job run and submit methods.
PiperOrigin-RevId: 622311949
1 parent f5be0b5 commit f428006

File tree

2 files changed

+322
-0
lines changed

2 files changed

+322
-0
lines changed

google/cloud/aiplatform/training_jobs.py

+83
Original file line numberDiff line numberDiff line change
@@ -1489,6 +1489,7 @@ def _prepare_training_task_inputs_and_output_dir(
14891489
enable_dashboard_access: bool = False,
14901490
tensorboard: Optional[str] = None,
14911491
disable_retries: bool = False,
1492+
persistent_resource_id: Optional[str] = None,
14921493
) -> Tuple[Dict, str]:
14931494
"""Prepares training task inputs and output directory for custom job.
14941495
@@ -1539,6 +1540,14 @@ def _prepare_training_task_inputs_and_output_dir(
15391540
Indicates if the job should retry for internal errors after the
15401541
job starts running. If True, overrides
15411542
`restart_job_on_worker_restart` to False.
1543+
persistent_resource_id (str):
1544+
Optional. The ID of the PersistentResource in the same Project
1545+
and Location. If this is specified, the job will be run on
1546+
existing machines held by the PersistentResource instead of
1547+
on-demand short-lived machines. The network, CMEK, and node pool
1548+
configs on the job should be consistent with those on the
1549+
PersistentResource, otherwise, the job will be rejected.
1550+
15421551
Returns:
15431552
Training task inputs and Output directory for custom job.
15441553
"""
@@ -1565,6 +1574,8 @@ def _prepare_training_task_inputs_and_output_dir(
15651574
training_task_inputs["enable_web_access"] = enable_web_access
15661575
if enable_dashboard_access:
15671576
training_task_inputs["enable_dashboard_access"] = enable_dashboard_access
1577+
if persistent_resource_id:
1578+
training_task_inputs["persistent_resource_id"] = persistent_resource_id
15681579

15691580
if timeout or restart_job_on_worker_restart or disable_retries:
15701581
timeout = f"{timeout}s" if timeout else None
@@ -2962,6 +2973,7 @@ def run(
29622973
sync=True,
29632974
create_request_timeout: Optional[float] = None,
29642975
disable_retries: bool = False,
2976+
persistent_resource_id: Optional[str] = None,
29652977
) -> Optional[models.Model]:
29662978
"""Runs the custom training job.
29672979
@@ -3249,6 +3261,13 @@ def run(
32493261
Indicates if the job should retry for internal errors after the
32503262
job starts running. If True, overrides
32513263
`restart_job_on_worker_restart` to False.
3264+
persistent_resource_id (str):
3265+
Optional. The ID of the PersistentResource in the same Project
3266+
and Location. If this is specified, the job will be run on
3267+
existing machines held by the PersistentResource instead of
3268+
on-demand short-lived machines. The network, CMEK, and node pool
3269+
configs on the job should be consistent with those on the
3270+
PersistentResource, otherwise, the job will be rejected.
32523271
32533272
Returns:
32543273
model: The trained Vertex AI Model resource or None if training did not
@@ -3311,6 +3330,7 @@ def run(
33113330
sync=sync,
33123331
create_request_timeout=create_request_timeout,
33133332
disable_retries=disable_retries,
3333+
persistent_resource_id=persistent_resource_id,
33143334
)
33153335

33163336
def submit(
@@ -3362,6 +3382,7 @@ def submit(
33623382
sync=True,
33633383
create_request_timeout: Optional[float] = None,
33643384
disable_retries: bool = False,
3385+
persistent_resource_id: Optional[str] = None,
33653386
) -> Optional[models.Model]:
33663387
"""Submits the custom training job without blocking until completion.
33673388
@@ -3649,6 +3670,13 @@ def submit(
36493670
Indicates if the job should retry for internal errors after the
36503671
job starts running. If True, overrides
36513672
`restart_job_on_worker_restart` to False.
3673+
persistent_resource_id (str):
3674+
Optional. The ID of the PersistentResource in the same Project
3675+
and Location. If this is specified, the job will be run on
3676+
existing machines held by the PersistentResource instead of
3677+
on-demand short-lived machines. The network, CMEK, and node pool
3678+
configs on the job should be consistent with those on the
3679+
PersistentResource, otherwise, the job will be rejected.
36523680
36533681
Returns:
36543682
model: The trained Vertex AI Model resource or None if training did not
@@ -3711,6 +3739,7 @@ def submit(
37113739
create_request_timeout=create_request_timeout,
37123740
block=False,
37133741
disable_retries=disable_retries,
3742+
persistent_resource_id=persistent_resource_id,
37143743
)
37153744

37163745
@base.optional_sync(construct_object_on_arg="managed_model")
@@ -3757,6 +3786,7 @@ def _run(
37573786
create_request_timeout: Optional[float] = None,
37583787
block: Optional[bool] = True,
37593788
disable_retries: bool = False,
3789+
persistent_resource_id: Optional[str] = None,
37603790
) -> Optional[models.Model]:
37613791
"""Packages local script and launches training_job.
37623792
@@ -3946,6 +3976,13 @@ def _run(
39463976
Indicates if the job should retry for internal errors after the
39473977
job starts running. If True, overrides
39483978
`restart_job_on_worker_restart` to False.
3979+
persistent_resource_id (str):
3980+
Optional. The ID of the PersistentResource in the same Project
3981+
and Location. If this is specified, the job will be run on
3982+
existing machines held by the PersistentResource instead of
3983+
on-demand short-lived machines. The network, CMEK, and node pool
3984+
configs on the job should be consistent with those on the
3985+
PersistentResource, otherwise, the job will be rejected.
39493986
39503987
Returns:
39513988
model: The trained Vertex AI Model resource or None if training did not
@@ -3999,6 +4036,7 @@ def _run(
39994036
enable_dashboard_access=enable_dashboard_access,
40004037
tensorboard=tensorboard,
40014038
disable_retries=disable_retries,
4039+
persistent_resource_id=persistent_resource_id,
40024040
)
40034041

40044042
model = self._run_job(
@@ -4321,6 +4359,7 @@ def run(
43214359
sync=True,
43224360
create_request_timeout: Optional[float] = None,
43234361
disable_retries: bool = False,
4362+
persistent_resource_id: Optional[str] = None,
43244363
) -> Optional[models.Model]:
43254364
"""Runs the custom training job.
43264365
@@ -4601,6 +4640,13 @@ def run(
46014640
Indicates if the job should retry for internal errors after the
46024641
job starts running. If True, overrides
46034642
`restart_job_on_worker_restart` to False.
4643+
persistent_resource_id (str):
4644+
Optional. The ID of the PersistentResource in the same Project
4645+
and Location. If this is specified, the job will be run on
4646+
existing machines held by the PersistentResource instead of
4647+
on-demand short-lived machines. The network, CMEK, and node pool
4648+
configs on the job should be consistent with those on the
4649+
PersistentResource, otherwise, the job will be rejected.
46044650
46054651
Returns:
46064652
model: The trained Vertex AI Model resource or None if training did not
@@ -4662,6 +4708,7 @@ def run(
46624708
sync=sync,
46634709
create_request_timeout=create_request_timeout,
46644710
disable_retries=disable_retries,
4711+
persistent_resource_id=persistent_resource_id,
46654712
)
46664713

46674714
def submit(
@@ -4713,6 +4760,7 @@ def submit(
47134760
sync=True,
47144761
create_request_timeout: Optional[float] = None,
47154762
disable_retries: bool = False,
4763+
persistent_resource_id: Optional[str] = None,
47164764
) -> Optional[models.Model]:
47174765
"""Submits the custom training job without blocking until completion.
47184766
@@ -4993,6 +5041,13 @@ def submit(
49935041
Indicates if the job should retry for internal errors after the
49945042
job starts running. If True, overrides
49955043
`restart_job_on_worker_restart` to False.
5044+
persistent_resource_id (str):
5045+
Optional. The ID of the PersistentResource in the same Project
5046+
and Location. If this is specified, the job will be run on
5047+
existing machines held by the PersistentResource instead of
5048+
on-demand short-lived machines. The network, CMEK, and node pool
5049+
configs on the job should be consistent with those on the
5050+
PersistentResource, otherwise, the job will be rejected.
49965051
49975052
Returns:
49985053
model: The trained Vertex AI Model resource or None if training did not
@@ -5054,6 +5109,7 @@ def submit(
50545109
create_request_timeout=create_request_timeout,
50555110
block=False,
50565111
disable_retries=disable_retries,
5112+
persistent_resource_id=persistent_resource_id,
50575113
)
50585114

50595115
@base.optional_sync(construct_object_on_arg="managed_model")
@@ -5099,6 +5155,7 @@ def _run(
50995155
create_request_timeout: Optional[float] = None,
51005156
block: Optional[bool] = True,
51015157
disable_retries: bool = False,
5158+
persistent_resource_id: Optional[str] = None,
51025159
) -> Optional[models.Model]:
51035160
"""Packages local script and launches training_job.
51045161
Args:
@@ -5284,6 +5341,13 @@ def _run(
52845341
Indicates if the job should retry for internal errors after the
52855342
job starts running. If True, overrides
52865343
`restart_job_on_worker_restart` to False.
5344+
persistent_resource_id (str):
5345+
Optional. The ID of the PersistentResource in the same Project
5346+
and Location. If this is specified, the job will be run on
5347+
existing machines held by the PersistentResource instead of
5348+
on-demand short-lived machines. The network, CMEK, and node pool
5349+
configs on the job should be consistent with those on the
5350+
PersistentResource, otherwise, the job will be rejected.
52875351
52885352
Returns:
52895353
model: The trained Vertex AI Model resource or None if training did not
@@ -5331,6 +5395,7 @@ def _run(
53315395
enable_dashboard_access=enable_dashboard_access,
53325396
tensorboard=tensorboard,
53335397
disable_retries=disable_retries,
5398+
persistent_resource_id=persistent_resource_id,
53345399
)
53355400

53365401
model = self._run_job(
@@ -7249,6 +7314,7 @@ def run(
72497314
sync=True,
72507315
create_request_timeout: Optional[float] = None,
72517316
disable_retries: bool = False,
7317+
persistent_resource_id: Optional[str] = None,
72527318
) -> Optional[models.Model]:
72537319
"""Runs the custom training job.
72547320
@@ -7530,6 +7596,13 @@ def run(
75307596
Indicates if the job should retry for internal errors after the
75317597
job starts running. If True, overrides
75327598
`restart_job_on_worker_restart` to False.
7599+
persistent_resource_id (str):
7600+
Optional. The ID of the PersistentResource in the same Project
7601+
and Location. If this is specified, the job will be run on
7602+
existing machines held by the PersistentResource instead of
7603+
on-demand short-lived machines. The network, CMEK, and node pool
7604+
configs on the job should be consistent with those on the
7605+
PersistentResource, otherwise, the job will be rejected.
75337606
75347607
Returns:
75357608
model: The trained Vertex AI Model resource or None if training did not
@@ -7586,6 +7659,7 @@ def run(
75867659
sync=sync,
75877660
create_request_timeout=create_request_timeout,
75887661
disable_retries=disable_retries,
7662+
persistent_resource_id=persistent_resource_id,
75897663
)
75907664

75917665
@base.optional_sync(construct_object_on_arg="managed_model")
@@ -7630,6 +7704,7 @@ def _run(
76307704
sync=True,
76317705
create_request_timeout: Optional[float] = None,
76327706
disable_retries: bool = False,
7707+
persistent_resource_id: Optional[str] = None,
76337708
) -> Optional[models.Model]:
76347709
"""Packages local script and launches training_job.
76357710
@@ -7800,6 +7875,13 @@ def _run(
78007875
Indicates if the job should retry for internal errors after the
78017876
job starts running. If True, overrides
78027877
`restart_job_on_worker_restart` to False.
7878+
persistent_resource_id (str):
7879+
Optional. The ID of the PersistentResource in the same Project
7880+
and Location. If this is specified, the job will be run on
7881+
existing machines held by the PersistentResource instead of
7882+
on-demand short-lived machines. The network, CMEK, and node pool
7883+
configs on the job should be consistent with those on the
7884+
PersistentResource, otherwise, the job will be rejected.
78037885
78047886
Returns:
78057887
model: The trained Vertex AI Model resource or None if training did not
@@ -7847,6 +7929,7 @@ def _run(
78477929
enable_dashboard_access=enable_dashboard_access,
78487930
tensorboard=tensorboard,
78497931
disable_retries=disable_retries,
7932+
persistent_resource_id=persistent_resource_id,
78507933
)
78517934

78527935
model = self._run_job(

0 commit comments

Comments
 (0)