@@ -44,6 +44,7 @@
 from google.cloud.aiplatform.compat.types import (
     training_pipeline as gca_training_pipeline,
     study as gca_study_compat,
+    custom_job as gca_custom_job_compat,
 )

 from google.cloud.aiplatform.utils import _timestamped_gcs_dir
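The alias added above is what the rest of the diff uses to type the new `scheduling_strategy` parameter. A minimal sketch of referencing the enum through that alias; the member name `SPOT` is an assumption about the available `Scheduling.Strategy` values and is not part of this change:

```python
from google.cloud.aiplatform.compat.types import (
    custom_job as gca_custom_job_compat,
)

# Scheduling.Strategy is a proto enum nested under the CustomJob Scheduling
# message; SPOT is assumed here to be one of its members.
strategy = gca_custom_job_compat.Scheduling.Strategy.SPOT
print(strategy.name, strategy.value)
```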
@@ -1525,6 +1526,7 @@ def _prepare_training_task_inputs_and_output_dir(
         tensorboard: Optional[str] = None,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Tuple[Dict, str]:
         """Prepares training task inputs and output directory for custom job.

@@ -1582,6 +1584,8 @@ def _prepare_training_task_inputs_and_output_dir(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             Training task inputs and Output directory for custom job.
@@ -1612,12 +1616,18 @@ def _prepare_training_task_inputs_and_output_dir(
         if persistent_resource_id:
             training_task_inputs["persistent_resource_id"] = persistent_resource_id

-        if timeout or restart_job_on_worker_restart or disable_retries:
+        if (
+            timeout
+            or restart_job_on_worker_restart
+            or disable_retries
+            or scheduling_strategy
+        ):
             timeout = f"{timeout}s" if timeout else None
             scheduling = {
                 "timeout": timeout,
                 "restart_job_on_worker_restart": restart_job_on_worker_restart,
                 "disable_retries": disable_retries,
+                "strategy": scheduling_strategy,
             }
             training_task_inputs["scheduling"] = scheduling

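The hunk above is the core of the change: the `scheduling` block is now built whenever a strategy is supplied, even with no timeout or retry settings, and the value is forwarded under the `"strategy"` key. A rough, self-contained sketch of the resulting `training_task_inputs` payload; the concrete values (3600 seconds, the `"SPOT"` stand-in) are illustrative only:

```python
# Stand-ins for the arguments the helper receives; "SPOT" stands in for
# gca_custom_job_compat.Scheduling.Strategy.SPOT.
timeout = 3600
restart_job_on_worker_restart = False
disable_retries = False
scheduling_strategy = "SPOT"

training_task_inputs = {}
if timeout or restart_job_on_worker_restart or disable_retries or scheduling_strategy:
    timeout = f"{timeout}s" if timeout else None  # durations are serialized as "<seconds>s"
    training_task_inputs["scheduling"] = {
        "timeout": timeout,
        "restart_job_on_worker_restart": restart_job_on_worker_restart,
        "disable_retries": disable_retries,
        "strategy": scheduling_strategy,
    }

print(training_task_inputs)
# {'scheduling': {'timeout': '3600s', 'restart_job_on_worker_restart': False,
#                 'disable_retries': False, 'strategy': 'SPOT'}}
```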
@@ -3005,6 +3015,7 @@ def run(
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
         tpu_topology: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Optional[models.Model]:
         """Runs the custom training job.

@@ -3360,6 +3371,8 @@ def run(
                 details on the TPU topology, refer to
                 https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology must
                 be a supported value for the TPU machine type.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             The trained Vertex AI model resource or None if the training
@@ -3424,6 +3437,7 @@ def run(
             create_request_timeout=create_request_timeout,
             disable_retries=disable_retries,
             persistent_resource_id=persistent_resource_id,
+            scheduling_strategy=scheduling_strategy,
         )

     def submit(
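At the public-API level, `run()` simply threads the new keyword through to the internal helpers. A hedged usage sketch; the project, bucket, script, container image, and machine settings are placeholders rather than anything prescribed by this diff, the `SPOT` member is assumed as noted above, and `submit()` (next hunks) accepts the same keyword:

```python
from google.cloud import aiplatform
from google.cloud.aiplatform.compat.types import custom_job as gca_custom_job_compat

aiplatform.init(
    project="my-project",
    location="us-central1",
    staging_bucket="gs://my-bucket",
)

job = aiplatform.CustomTrainingJob(
    display_name="example-training",
    script_path="task.py",
    container_uri="us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13:latest",
)

# The new keyword ends up in training_task_inputs["scheduling"]["strategy"].
job.run(
    replica_count=1,
    machine_type="n1-standard-4",
    scheduling_strategy=gca_custom_job_compat.Scheduling.Strategy.SPOT,
)
```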
@@ -3477,6 +3491,7 @@ def submit(
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
         tpu_topology: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Optional[models.Model]:
         """Submits the custom training job without blocking until completion.

@@ -3777,6 +3792,8 @@ def submit(
                 details on the TPU topology, refer to
                 https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology must
                 be a supported value for the TPU machine type.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -3841,6 +3858,7 @@ def submit(
             block=False,
             disable_retries=disable_retries,
             persistent_resource_id=persistent_resource_id,
+            scheduling_strategy=scheduling_strategy,
         )

     @base.optional_sync(construct_object_on_arg="managed_model")
@@ -3888,6 +3906,7 @@ def _run(
         block: Optional[bool] = True,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Optional[models.Model]:
         """Packages local script and launches training_job.

@@ -4084,6 +4103,8 @@ def _run(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -4138,6 +4159,7 @@ def _run(
             tensorboard=tensorboard,
             disable_retries=disable_retries,
             persistent_resource_id=persistent_resource_id,
+            scheduling_strategy=scheduling_strategy,
         )

         model = self._run_job(
@@ -4462,6 +4484,7 @@ def run(
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
         tpu_topology: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Optional[models.Model]:
         """Runs the custom training job.

@@ -4755,6 +4778,8 @@ def run(
                 details on the TPU topology, refer to
                 https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
                 must be a supported value for the TPU machine type.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -4818,6 +4843,7 @@ def run(
             create_request_timeout=create_request_timeout,
             disable_retries=disable_retries,
             persistent_resource_id=persistent_resource_id,
+            scheduling_strategy=scheduling_strategy,
         )

     def submit(
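The same keyword is added to `CustomContainerTrainingJob` here, and to `CustomPythonPackageTrainingJob` in the hunks further down; apart from the constructor, usage mirrors the earlier `run()` example. A brief sketch with placeholder constructor arguments, assuming `aiplatform.init(...)` has already been called as in that example:

```python
from google.cloud import aiplatform
from google.cloud.aiplatform.compat.types import custom_job as gca_custom_job_compat

job = aiplatform.CustomContainerTrainingJob(
    display_name="example-container-training",
    container_uri="us-central1-docker.pkg.dev/my-project/my-repo/trainer:latest",
    command=["python", "-m", "trainer.task"],
)

# run() and submit() both accept the new keyword on this class as well;
# submit() returns without blocking on job completion.
job.submit(
    replica_count=1,
    machine_type="n1-standard-4",
    scheduling_strategy=gca_custom_job_compat.Scheduling.Strategy.SPOT,
)
```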
@@ -4871,6 +4897,7 @@ def submit(
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
         tpu_topology: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Optional[models.Model]:
         """Submits the custom training job without blocking until completion.

@@ -5164,6 +5191,8 @@ def submit(
                 details on the TPU topology, refer to
                 https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
                 must be a supported value for the TPU machine type.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -5227,6 +5256,7 @@ def submit(
             block=False,
             disable_retries=disable_retries,
             persistent_resource_id=persistent_resource_id,
+            scheduling_strategy=scheduling_strategy,
         )

     @base.optional_sync(construct_object_on_arg="managed_model")
@@ -5273,6 +5303,7 @@ def _run(
         block: Optional[bool] = True,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Optional[models.Model]:
         """Packages local script and launches training_job.
         Args:
@@ -5465,6 +5496,8 @@ def _run(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -5513,6 +5546,7 @@ def _run(
             tensorboard=tensorboard,
             disable_retries=disable_retries,
             persistent_resource_id=persistent_resource_id,
+            scheduling_strategy=scheduling_strategy,
         )

         model = self._run_job(
@@ -7537,6 +7571,7 @@ def run(
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
         tpu_topology: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Optional[models.Model]:
         """Runs the custom training job.

@@ -7831,6 +7866,8 @@ def run(
                 details on the TPU topology, refer to
                 https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config. The topology
                 must be a supported value for the TPU machine type.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -7889,6 +7926,7 @@ def run(
             create_request_timeout=create_request_timeout,
             disable_retries=disable_retries,
             persistent_resource_id=persistent_resource_id,
+            scheduling_strategy=scheduling_strategy,
         )

     @base.optional_sync(construct_object_on_arg="managed_model")
@@ -7934,6 +7972,7 @@ def _run(
         create_request_timeout: Optional[float] = None,
         disable_retries: bool = False,
         persistent_resource_id: Optional[str] = None,
+        scheduling_strategy: Optional[gca_custom_job_compat.Scheduling.Strategy] = None,
     ) -> Optional[models.Model]:
         """Packages local script and launches training_job.

@@ -8111,6 +8150,8 @@ def _run(
                 on-demand short-live machines. The network, CMEK, and node pool
                 configs on the job should be consistent with those on the
                 PersistentResource, otherwise, the job will be rejected.
+            scheduling_strategy (gca_custom_job_compat.Scheduling.Strategy):
+                Optional. Indicates the job scheduling strategy.

         Returns:
             model: The trained Vertex AI Model resource or None if training did not
@@ -8159,6 +8200,7 @@ def _run(
             tensorboard=tensorboard,
             disable_retries=disable_retries,
             persistent_resource_id=persistent_resource_id,
+            scheduling_strategy=scheduling_strategy,
         )

         model = self._run_job(