@@ -1379,6 +1379,8 @@ def _prepare_training_task_inputs_and_output_dir(
1379
1379
base_output_dir : Optional [str ] = None ,
1380
1380
service_account : Optional [str ] = None ,
1381
1381
network : Optional [str ] = None ,
1382
+ timeout : Optional [int ] = None ,
1383
+ restart_job_on_worker_restart : bool = False ,
1382
1384
enable_web_access : bool = False ,
1383
1385
tensorboard : Optional [str ] = None ,
1384
1386
) -> Tuple [Dict , str ]:
@@ -1398,6 +1400,13 @@ def _prepare_training_task_inputs_and_output_dir(
1398
1400
should be peered. For example, projects/12345/global/networks/myVPC.
1399
1401
Private services access must already be configured for the network.
1400
1402
If left unspecified, the job is not peered with any network.
1403
+ timeout (int):
1404
+ The maximum job running time in seconds. If left unspecified, the default of 7 days applies.
1405
+ restart_job_on_worker_restart (bool):
1406
+ Restarts the entire CustomJob if a worker
1407
+ gets restarted. This feature can be used by
1408
+ distributed training jobs that are not resilient
1409
+ to workers leaving and joining a job.
1401
1410
enable_web_access (bool):
1402
1411
Whether you want Vertex AI to enable interactive shell access
1403
1412
to training containers.
@@ -1442,6 +1451,14 @@ def _prepare_training_task_inputs_and_output_dir(
1442
1451
if enable_web_access :
1443
1452
training_task_inputs ["enable_web_access" ] = enable_web_access
1444
1453
1454
+ if timeout or restart_job_on_worker_restart :
1455
+ timeout = f"{ timeout } s" if timeout else None
1456
+ scheduling = {
1457
+ "timeout" : timeout ,
1458
+ "restart_job_on_worker_restart" : restart_job_on_worker_restart ,
1459
+ }
1460
+ training_task_inputs ["scheduling" ] = scheduling
1461
+
1445
1462
return training_task_inputs , base_output_dir
1446
1463
1447
1464
@property
@@ -1794,6 +1811,8 @@ def run(
1794
1811
test_filter_split : Optional [str ] = None ,
1795
1812
predefined_split_column_name : Optional [str ] = None ,
1796
1813
timestamp_split_column_name : Optional [str ] = None ,
1814
+ timeout : Optional [int ] = None ,
1815
+ restart_job_on_worker_restart : bool = False ,
1797
1816
enable_web_access : bool = False ,
1798
1817
tensorboard : Optional [str ] = None ,
1799
1818
sync = True ,
@@ -2014,6 +2033,13 @@ def run(
2014
2033
that piece is ignored by the pipeline.
2015
2034
2016
2035
Supported only for tabular and time series Datasets.
2036
+ timeout (int):
2037
+ The maximum job running time in seconds. If left unspecified, the default of 7 days applies.
2038
+ restart_job_on_worker_restart (bool):
2039
+ Restarts the entire CustomJob if a worker
2040
+ gets restarted. This feature can be used by
2041
+ distributed training jobs that are not resilient
2042
+ to workers leaving and joining a job.
2017
2043
enable_web_access (bool):
2018
2044
Whether you want Vertex AI to enable interactive shell access
2019
2045
to training containers.
@@ -2080,6 +2106,8 @@ def run(
2080
2106
test_filter_split = test_filter_split ,
2081
2107
predefined_split_column_name = predefined_split_column_name ,
2082
2108
timestamp_split_column_name = timestamp_split_column_name ,
2109
+ timeout = timeout ,
2110
+ restart_job_on_worker_restart = restart_job_on_worker_restart ,
2083
2111
enable_web_access = enable_web_access ,
2084
2112
tensorboard = tensorboard ,
2085
2113
reduction_server_container_uri = reduction_server_container_uri
@@ -2117,6 +2145,8 @@ def _run(
2117
2145
test_filter_split : Optional [str ] = None ,
2118
2146
predefined_split_column_name : Optional [str ] = None ,
2119
2147
timestamp_split_column_name : Optional [str ] = None ,
2148
+ timeout : Optional [int ] = None ,
2149
+ restart_job_on_worker_restart : bool = False ,
2120
2150
enable_web_access : bool = False ,
2121
2151
tensorboard : Optional [str ] = None ,
2122
2152
reduction_server_container_uri : Optional [str ] = None ,
@@ -2237,6 +2267,13 @@ def _run(
2237
2267
that piece is ignored by the pipeline.
2238
2268
2239
2269
Supported only for tabular and time series Datasets.
2270
+ timeout (int):
2271
+ The maximum job running time in seconds. If left unspecified, the default of 7 days applies.
2272
+ restart_job_on_worker_restart (bool):
2273
+ Restarts the entire CustomJob if a worker
2274
+ gets restarted. This feature can be used by
2275
+ distributed training jobs that are not resilient
2276
+ to workers leaving and joining a job.
2240
2277
enable_web_access (bool):
2241
2278
Whether you want Vertex AI to enable interactive shell access
2242
2279
to training containers.
@@ -2309,6 +2346,8 @@ def _run(
2309
2346
base_output_dir = base_output_dir ,
2310
2347
service_account = service_account ,
2311
2348
network = network ,
2349
+ timeout = timeout ,
2350
+ restart_job_on_worker_restart = restart_job_on_worker_restart ,
2312
2351
enable_web_access = enable_web_access ,
2313
2352
tensorboard = tensorboard ,
2314
2353
)
@@ -2598,6 +2637,8 @@ def run(
2598
2637
test_filter_split : Optional [str ] = None ,
2599
2638
predefined_split_column_name : Optional [str ] = None ,
2600
2639
timestamp_split_column_name : Optional [str ] = None ,
2640
+ timeout : Optional [int ] = None ,
2641
+ restart_job_on_worker_restart : bool = False ,
2601
2642
enable_web_access : bool = False ,
2602
2643
tensorboard : Optional [str ] = None ,
2603
2644
sync = True ,
@@ -2811,6 +2852,13 @@ def run(
2811
2852
that piece is ignored by the pipeline.
2812
2853
2813
2854
Supported only for tabular and time series Datasets.
2855
+ timeout (int):
2856
+ The maximum job running time in seconds. If left unspecified, the default of 7 days applies.
2857
+ restart_job_on_worker_restart (bool):
2858
+ Restarts the entire CustomJob if a worker
2859
+ gets restarted. This feature can be used by
2860
+ distributed training jobs that are not resilient
2861
+ to workers leaving and joining a job.
2814
2862
enable_web_access (bool):
2815
2863
Whether you want Vertex AI to enable interactive shell access
2816
2864
to training containers.
@@ -2876,6 +2924,8 @@ def run(
2876
2924
test_filter_split = test_filter_split ,
2877
2925
predefined_split_column_name = predefined_split_column_name ,
2878
2926
timestamp_split_column_name = timestamp_split_column_name ,
2927
+ timeout = timeout ,
2928
+ restart_job_on_worker_restart = restart_job_on_worker_restart ,
2879
2929
enable_web_access = enable_web_access ,
2880
2930
tensorboard = tensorboard ,
2881
2931
reduction_server_container_uri = reduction_server_container_uri
@@ -2912,6 +2962,8 @@ def _run(
2912
2962
test_filter_split : Optional [str ] = None ,
2913
2963
predefined_split_column_name : Optional [str ] = None ,
2914
2964
timestamp_split_column_name : Optional [str ] = None ,
2965
+ timeout : Optional [int ] = None ,
2966
+ restart_job_on_worker_restart : bool = False ,
2915
2967
enable_web_access : bool = False ,
2916
2968
tensorboard : Optional [str ] = None ,
2917
2969
reduction_server_container_uri : Optional [str ] = None ,
@@ -2965,6 +3017,13 @@ def _run(
2965
3017
should be peered. For example, projects/12345/global/networks/myVPC.
2966
3018
Private services access must already be configured for the network.
2967
3019
If left unspecified, the job is not peered with any network.
3020
+ timeout (int):
3021
+ The maximum job running time in seconds. If left unspecified, the default of 7 days applies.
3022
+ restart_job_on_worker_restart (bool):
3023
+ Restarts the entire CustomJob if a worker
3024
+ gets restarted. This feature can be used by
3025
+ distributed training jobs that are not resilient
3026
+ to workers leaving and joining a job.
2968
3027
bigquery_destination (str):
2969
3028
The BigQuery project location where the training data is to
2970
3029
be written to. In the given project a new dataset is created
@@ -3094,6 +3153,8 @@ def _run(
3094
3153
base_output_dir = base_output_dir ,
3095
3154
service_account = service_account ,
3096
3155
network = network ,
3156
+ timeout = timeout ,
3157
+ restart_job_on_worker_restart = restart_job_on_worker_restart ,
3097
3158
enable_web_access = enable_web_access ,
3098
3159
tensorboard = tensorboard ,
3099
3160
)
@@ -5373,6 +5434,8 @@ def run(
5373
5434
test_filter_split : Optional [str ] = None ,
5374
5435
predefined_split_column_name : Optional [str ] = None ,
5375
5436
timestamp_split_column_name : Optional [str ] = None ,
5437
+ timeout : Optional [int ] = None ,
5438
+ restart_job_on_worker_restart : bool = False ,
5376
5439
enable_web_access : bool = False ,
5377
5440
tensorboard : Optional [str ] = None ,
5378
5441
sync = True ,
@@ -5586,6 +5649,13 @@ def run(
5586
5649
that piece is ignored by the pipeline.
5587
5650
5588
5651
Supported only for tabular and time series Datasets.
5652
+ timeout (int):
5653
+ The maximum job running time in seconds. If left unspecified, the default of 7 days applies.
5654
+ restart_job_on_worker_restart (bool):
5655
+ Restarts the entire CustomJob if a worker
5656
+ gets restarted. This feature can be used by
5657
+ distributed training jobs that are not resilient
5658
+ to workers leaving and joining a job.
5589
5659
enable_web_access (bool):
5590
5660
Whether you want Vertex AI to enable interactive shell access
5591
5661
to training containers.
@@ -5646,6 +5716,8 @@ def run(
5646
5716
predefined_split_column_name = predefined_split_column_name ,
5647
5717
timestamp_split_column_name = timestamp_split_column_name ,
5648
5718
bigquery_destination = bigquery_destination ,
5719
+ timeout = timeout ,
5720
+ restart_job_on_worker_restart = restart_job_on_worker_restart ,
5649
5721
enable_web_access = enable_web_access ,
5650
5722
tensorboard = tensorboard ,
5651
5723
reduction_server_container_uri = reduction_server_container_uri
@@ -5682,6 +5754,8 @@ def _run(
5682
5754
predefined_split_column_name : Optional [str ] = None ,
5683
5755
timestamp_split_column_name : Optional [str ] = None ,
5684
5756
bigquery_destination : Optional [str ] = None ,
5757
+ timeout : Optional [int ] = None ,
5758
+ restart_job_on_worker_restart : bool = False ,
5685
5759
enable_web_access : bool = False ,
5686
5760
tensorboard : Optional [str ] = None ,
5687
5761
reduction_server_container_uri : Optional [str ] = None ,
@@ -5785,6 +5859,13 @@ def _run(
5785
5859
that piece is ignored by the pipeline.
5786
5860
5787
5861
Supported only for tabular and time series Datasets.
5862
+ timeout (int):
5863
+ The maximum job running time in seconds. If left unspecified, the default of 7 days applies.
5864
+ restart_job_on_worker_restart (bool):
5865
+ Restarts the entire CustomJob if a worker
5866
+ gets restarted. This feature can be used by
5867
+ distributed training jobs that are not resilient
5868
+ to workers leaving and joining a job.
5788
5869
enable_web_access (bool):
5789
5870
Whether you want Vertex AI to enable interactive shell access
5790
5871
to training containers.
@@ -5851,6 +5932,8 @@ def _run(
5851
5932
base_output_dir = base_output_dir ,
5852
5933
service_account = service_account ,
5853
5934
network = network ,
5935
+ timeout = timeout ,
5936
+ restart_job_on_worker_restart = restart_job_on_worker_restart ,
5854
5937
enable_web_access = enable_web_access ,
5855
5938
tensorboard = tensorboard ,
5856
5939
)
0 commit comments