Skip to content

Commit db518b0

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Add disable_retries option to custom jobs.
PiperOrigin-RevId: 557870565
1 parent 4e76a6e commit db518b0

File tree

9 files changed

+137
-4
lines changed

9 files changed

+137
-4
lines changed

google/cloud/aiplatform/jobs.py

+32 -2
Original file line number | Diff line number | Diff line change
@@ -1629,6 +1629,7 @@ def run(
16291629
tensorboard: Optional[str] = None,
16301630
sync: bool = True,
16311631
create_request_timeout: Optional[float] = None,
1632+
disable_retries: bool = False,
16321633
) -> None:
16331634
"""Run this configured CustomJob.
16341635
@@ -1686,6 +1687,10 @@ def run(
16861687
will unblock and it will be executed in a concurrent Future.
16871688
create_request_timeout (float):
16881689
Optional. The timeout for the create request in seconds.
1690+
disable_retries (bool):
1691+
Indicates if the job should retry for internal errors after the
1692+
job starts running. If True, overrides
1693+
`restart_job_on_worker_restart` to False.
16891694
"""
16901695
network = network or initializer.global_config.network
16911696

@@ -1700,6 +1705,7 @@ def run(
17001705
tensorboard=tensorboard,
17011706
sync=sync,
17021707
create_request_timeout=create_request_timeout,
1708+
disable_retries=disable_retries,
17031709
)
17041710

17051711
@base.optional_sync()
@@ -1715,6 +1721,7 @@ def _run(
17151721
tensorboard: Optional[str] = None,
17161722
sync: bool = True,
17171723
create_request_timeout: Optional[float] = None,
1724+
disable_retries: bool = False,
17181725
) -> None:
17191726
"""Helper method to ensure network synchronization and to run the configured CustomJob.
17201727
@@ -1770,6 +1777,10 @@ def _run(
17701777
will unblock and it will be executed in a concurrent Future.
17711778
create_request_timeout (float):
17721779
Optional. The timeout for the create request in seconds.
1780+
disable_retries (bool):
1781+
Indicates if the job should retry for internal errors after the
1782+
job starts running. If True, overrides
1783+
`restart_job_on_worker_restart` to False.
17731784
"""
17741785
self.submit(
17751786
service_account=service_account,
@@ -1781,6 +1792,7 @@ def _run(
17811792
experiment_run=experiment_run,
17821793
tensorboard=tensorboard,
17831794
create_request_timeout=create_request_timeout,
1795+
disable_retries=disable_retries,
17841796
)
17851797

17861798
self._block_until_complete()
@@ -1797,6 +1809,7 @@ def submit(
17971809
experiment_run: Optional[Union["aiplatform.ExperimentRun", str]] = None,
17981810
tensorboard: Optional[str] = None,
17991811
create_request_timeout: Optional[float] = None,
1812+
disable_retries: bool = False,
18001813
) -> None:
18011814
"""Submit the configured CustomJob.
18021815
@@ -1849,6 +1862,10 @@ def submit(
18491862
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
18501863
create_request_timeout (float):
18511864
Optional. The timeout for the create request in seconds.
1865+
disable_retries (bool):
1866+
Indicates if the job should retry for internal errors after the
1867+
job starts running. If True, overrides
1868+
`restart_job_on_worker_restart` to False.
18521869
18531870
Raises:
18541871
ValueError:
@@ -1869,11 +1886,12 @@ def submit(
18691886
if network:
18701887
self._gca_resource.job_spec.network = network
18711888

1872-
if timeout or restart_job_on_worker_restart:
1889+
if timeout or restart_job_on_worker_restart or disable_retries:
18731890
timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
18741891
self._gca_resource.job_spec.scheduling = gca_custom_job_compat.Scheduling(
18751892
timeout=timeout,
18761893
restart_job_on_worker_restart=restart_job_on_worker_restart,
1894+
disable_retries=disable_retries,
18771895
)
18781896

18791897
if enable_web_access:
@@ -2287,6 +2305,7 @@ def run(
22872305
tensorboard: Optional[str] = None,
22882306
sync: bool = True,
22892307
create_request_timeout: Optional[float] = None,
2308+
disable_retries: bool = False,
22902309
) -> None:
22912310
"""Run this configured CustomJob.
22922311
@@ -2331,6 +2350,10 @@ def run(
23312350
will unblock and it will be executed in a concurrent Future.
23322351
create_request_timeout (float):
23332352
Optional. The timeout for the create request in seconds.
2353+
disable_retries (bool):
2354+
Indicates if the job should retry for internal errors after the
2355+
job starts running. If True, overrides
2356+
`restart_job_on_worker_restart` to False.
23342357
"""
23352358
network = network or initializer.global_config.network
23362359

@@ -2343,6 +2366,7 @@ def run(
23432366
tensorboard=tensorboard,
23442367
sync=sync,
23452368
create_request_timeout=create_request_timeout,
2369+
disable_retries=disable_retries,
23462370
)
23472371

23482372
@base.optional_sync()
@@ -2356,6 +2380,7 @@ def _run(
23562380
tensorboard: Optional[str] = None,
23572381
sync: bool = True,
23582382
create_request_timeout: Optional[float] = None,
2383+
disable_retries: bool = False,
23592384
) -> None:
23602385
"""Helper method to ensure network synchronization and to run the configured CustomJob.
23612386
@@ -2398,19 +2423,24 @@ def _run(
23982423
will unblock and it will be executed in a concurrent Future.
23992424
create_request_timeout (float):
24002425
Optional. The timeout for the create request in seconds.
2426+
disable_retries (bool):
2427+
Indicates if the job should retry for internal errors after the
2428+
job starts running. If True, overrides
2429+
`restart_job_on_worker_restart` to False.
24012430
"""
24022431
if service_account:
24032432
self._gca_resource.trial_job_spec.service_account = service_account
24042433

24052434
if network:
24062435
self._gca_resource.trial_job_spec.network = network
24072436

2408-
if timeout or restart_job_on_worker_restart:
2437+
if timeout or restart_job_on_worker_restart or disable_retries:
24092438
duration = duration_pb2.Duration(seconds=timeout) if timeout else None
24102439
self._gca_resource.trial_job_spec.scheduling = (
24112440
gca_custom_job_compat.Scheduling(
24122441
timeout=duration,
24132442
restart_job_on_worker_restart=restart_job_on_worker_restart,
2443+
disable_retries=disable_retries,
24142444
)
24152445
)
24162446

google/cloud/aiplatform/preview/jobs.py

+7 -1
Original file line number | Diff line number | Diff line change
@@ -238,6 +238,7 @@ def submit(
238238
experiment_run: Optional[Union["aiplatform.ExperimentRun", str]] = None,
239239
tensorboard: Optional[str] = None,
240240
create_request_timeout: Optional[float] = None,
241+
disable_retries: bool = False,
241242
) -> None:
242243
"""Submit the configured CustomJob.
243244
@@ -290,6 +291,10 @@ def submit(
290291
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
291292
create_request_timeout (float):
292293
Optional. The timeout for the create request in seconds.
294+
disable_retries (bool):
295+
Indicates if the job should retry for internal errors after the
296+
job starts running. If True, overrides
297+
`restart_job_on_worker_restart` to False.
293298
294299
Raises:
295300
ValueError:
@@ -310,11 +315,12 @@ def submit(
310315
if network:
311316
self._gca_resource.job_spec.network = network
312317

313-
if timeout or restart_job_on_worker_restart:
318+
if timeout or restart_job_on_worker_restart or disable_retries:
314319
timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
315320
self._gca_resource.job_spec.scheduling = gca_custom_job_compat.Scheduling(
316321
timeout=timeout,
317322
restart_job_on_worker_restart=restart_job_on_worker_restart,
323+
disable_retries=disable_retries,
318324
)
319325

320326
if enable_web_access:

0 commit comments

Comments
 (0)