@@ -1629,6 +1629,7 @@ def run(
1629
1629
tensorboard : Optional [str ] = None ,
1630
1630
sync : bool = True ,
1631
1631
create_request_timeout : Optional [float ] = None ,
1632
+ disable_retries : bool = False ,
1632
1633
) -> None :
1633
1634
"""Run this configured CustomJob.
1634
1635
@@ -1686,6 +1687,10 @@ def run(
1686
1687
will unblock and it will be executed in a concurrent Future.
1687
1688
create_request_timeout (float):
1688
1689
Optional. The timeout for the create request in seconds.
1690
+ disable_retries (bool):
1691
+ Optional. Whether to disable retries for internal errors after the
1692
+ job starts running. If True, overrides
1693
+ `restart_job_on_worker_restart` to False.
1689
1694
"""
1690
1695
network = network or initializer .global_config .network
1691
1696
@@ -1700,6 +1705,7 @@ def run(
1700
1705
tensorboard = tensorboard ,
1701
1706
sync = sync ,
1702
1707
create_request_timeout = create_request_timeout ,
1708
+ disable_retries = disable_retries ,
1703
1709
)
1704
1710
1705
1711
@base .optional_sync ()
@@ -1715,6 +1721,7 @@ def _run(
1715
1721
tensorboard : Optional [str ] = None ,
1716
1722
sync : bool = True ,
1717
1723
create_request_timeout : Optional [float ] = None ,
1724
+ disable_retries : bool = False ,
1718
1725
) -> None :
1719
1726
"""Helper method to ensure network synchronization and to run the configured CustomJob.
1720
1727
@@ -1770,6 +1777,10 @@ def _run(
1770
1777
will unblock and it will be executed in a concurrent Future.
1771
1778
create_request_timeout (float):
1772
1779
Optional. The timeout for the create request in seconds.
1780
+ disable_retries (bool):
1781
+ Optional. Whether to disable retries for internal errors after the
1782
+ job starts running. If True, overrides
1783
+ `restart_job_on_worker_restart` to False.
1773
1784
"""
1774
1785
self .submit (
1775
1786
service_account = service_account ,
@@ -1781,6 +1792,7 @@ def _run(
1781
1792
experiment_run = experiment_run ,
1782
1793
tensorboard = tensorboard ,
1783
1794
create_request_timeout = create_request_timeout ,
1795
+ disable_retries = disable_retries ,
1784
1796
)
1785
1797
1786
1798
self ._block_until_complete ()
@@ -1797,6 +1809,7 @@ def submit(
1797
1809
experiment_run : Optional [Union ["aiplatform.ExperimentRun" , str ]] = None ,
1798
1810
tensorboard : Optional [str ] = None ,
1799
1811
create_request_timeout : Optional [float ] = None ,
1812
+ disable_retries : bool = False ,
1800
1813
) -> None :
1801
1814
"""Submit the configured CustomJob.
1802
1815
@@ -1849,6 +1862,10 @@ def submit(
1849
1862
https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
1850
1863
create_request_timeout (float):
1851
1864
Optional. The timeout for the create request in seconds.
1865
+ disable_retries (bool):
1866
+ Optional. Whether to disable retries for internal errors after the
1867
+ job starts running. If True, overrides
1868
+ `restart_job_on_worker_restart` to False.
1852
1869
1853
1870
Raises:
1854
1871
ValueError:
@@ -1869,11 +1886,12 @@ def submit(
1869
1886
if network :
1870
1887
self ._gca_resource .job_spec .network = network
1871
1888
1872
- if timeout or restart_job_on_worker_restart :
1889
+ if timeout or restart_job_on_worker_restart or disable_retries :
1873
1890
timeout = duration_pb2 .Duration (seconds = timeout ) if timeout else None
1874
1891
self ._gca_resource .job_spec .scheduling = gca_custom_job_compat .Scheduling (
1875
1892
timeout = timeout ,
1876
1893
restart_job_on_worker_restart = restart_job_on_worker_restart ,
1894
+ disable_retries = disable_retries ,
1877
1895
)
1878
1896
1879
1897
if enable_web_access :
@@ -2287,6 +2305,7 @@ def run(
2287
2305
tensorboard : Optional [str ] = None ,
2288
2306
sync : bool = True ,
2289
2307
create_request_timeout : Optional [float ] = None ,
2308
+ disable_retries : bool = False ,
2290
2309
) -> None :
2291
2310
"""Run this configured CustomJob.
2292
2311
@@ -2331,6 +2350,10 @@ def run(
2331
2350
will unblock and it will be executed in a concurrent Future.
2332
2351
create_request_timeout (float):
2333
2352
Optional. The timeout for the create request in seconds.
2353
+ disable_retries (bool):
2354
+ Optional. Whether to disable retries for internal errors after the
2355
+ job starts running. If True, overrides
2356
+ `restart_job_on_worker_restart` to False.
2334
2357
"""
2335
2358
network = network or initializer .global_config .network
2336
2359
@@ -2343,6 +2366,7 @@ def run(
2343
2366
tensorboard = tensorboard ,
2344
2367
sync = sync ,
2345
2368
create_request_timeout = create_request_timeout ,
2369
+ disable_retries = disable_retries ,
2346
2370
)
2347
2371
2348
2372
@base .optional_sync ()
@@ -2356,6 +2380,7 @@ def _run(
2356
2380
tensorboard : Optional [str ] = None ,
2357
2381
sync : bool = True ,
2358
2382
create_request_timeout : Optional [float ] = None ,
2383
+ disable_retries : bool = False ,
2359
2384
) -> None :
2360
2385
"""Helper method to ensure network synchronization and to run the configured CustomJob.
2361
2386
@@ -2398,19 +2423,24 @@ def _run(
2398
2423
will unblock and it will be executed in a concurrent Future.
2399
2424
create_request_timeout (float):
2400
2425
Optional. The timeout for the create request in seconds.
2426
+ disable_retries (bool):
2427
+ Optional. Whether to disable retries for internal errors after the
2428
+ job starts running. If True, overrides
2429
+ `restart_job_on_worker_restart` to False.
2401
2430
"""
2402
2431
if service_account :
2403
2432
self ._gca_resource .trial_job_spec .service_account = service_account
2404
2433
2405
2434
if network :
2406
2435
self ._gca_resource .trial_job_spec .network = network
2407
2436
2408
- if timeout or restart_job_on_worker_restart :
2437
+ if timeout or restart_job_on_worker_restart or disable_retries :
2409
2438
duration = duration_pb2 .Duration (seconds = timeout ) if timeout else None
2410
2439
self ._gca_resource .trial_job_spec .scheduling = (
2411
2440
gca_custom_job_compat .Scheduling (
2412
2441
timeout = duration ,
2413
2442
restart_job_on_worker_restart = restart_job_on_worker_restart ,
2443
+ disable_retries = disable_retries ,
2414
2444
)
2415
2445
)
2416
2446
0 commit comments