Skip to content

Commit 961da42

Browse files
yinghsienwu authored and copybara-github committed
feat: support autoscaling in Ray on Vertex
PiperOrigin-RevId: 668047841
1 parent f334321 commit 961da42

File tree

5 files changed

+79
-19
lines changed

5 files changed

+79
-19
lines changed

google/cloud/aiplatform/vertex_ray/__init__.py

+2
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@
3636
from google.cloud.aiplatform.vertex_ray import data
3737

3838
from google.cloud.aiplatform.vertex_ray.util.resources import (
39+
AutoscalingSpec,
3940
Resources,
4041
NodeImages,
4142
PscIConfig,
@@ -60,6 +61,7 @@
6061
"get_ray_cluster",
6162
"list_ray_clusters",
6263
"update_ray_cluster",
64+
"AutoscalingSpec",
6365
"Resources",
6466
"NodeImages",
6567
"PscIConfig",

google/cloud/aiplatform/vertex_ray/cluster_init.py

+31 -6
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,11 @@ def create_ray_cluster(
176176
"[Ray on Vertex AI]: For head_node_type, "
177177
+ "Resources.node_count must be 1."
178178
)
179+
if head_node_type.autoscaling_spec is not None:
180+
raise ValueError(
181+
"[Ray on Vertex AI]: For head_node_type, "
182+
+ "Resources.autoscaling_spec must be None."
183+
)
179184
if (
180185
head_node_type.accelerator_type is None
181186
and head_node_type.accelerator_count > 0
@@ -225,18 +230,38 @@ def create_ray_cluster(
225230
"[Ray on Vertex]: accelerator_type must be specified when"
226231
+ " accelerator_count is set to a value other than 0."
227232
)
228-
# Worker and head share the same MachineSpec, merge them into the
229-
# same ResourcePool
230233
additional_replica_count = resources._check_machine_spec_identical(
231234
head_node_type, worker_node_type
232235
)
233-
resource_pool_0.replica_count = (
234-
resource_pool_0.replica_count + additional_replica_count
235-
)
236+
if worker_node_type.autoscaling_spec is None:
237+
# Worker and head share the same MachineSpec, merge them into the
238+
# same ResourcePool
239+
resource_pool_0.replica_count = (
240+
resource_pool_0.replica_count + additional_replica_count
241+
)
242+
else:
243+
if additional_replica_count > 0:
244+
# Autoscaling for single ResourcePool (homogeneous cluster).
245+
resource_pool_0.replica_count = None
246+
resource_pool_0.autoscaling_spec.min_replica_count = (
247+
worker_node_type.autoscaling_spec.min_replica_count
248+
)
249+
resource_pool_0.autoscaling_spec.max_replica_count = (
250+
worker_node_type.autoscaling_spec.max_replica_count
251+
)
236252
if additional_replica_count == 0:
237253
resource_pool = ResourcePool()
238254
resource_pool.id = f"worker-pool{i+1}"
239-
resource_pool.replica_count = worker_node_type.node_count
255+
if worker_node_type.autoscaling_spec is None:
256+
resource_pool.replica_count = worker_node_type.node_count
257+
else:
258+
# Autoscaling for worker ResourcePool.
259+
resource_pool.autoscaling_spec.min_replica_count = (
260+
worker_node_type.autoscaling_spec.min_replica_count
261+
)
262+
resource_pool.autoscaling_spec.max_replica_count = (
263+
worker_node_type.autoscaling_spec.max_replica_count
264+
)
240265
resource_pool.machine_spec.machine_type = worker_node_type.machine_type
241266
resource_pool.machine_spec.accelerator_count = (
242267
worker_node_type.accelerator_count

google/cloud/aiplatform/vertex_ray/util/_gapic_utils.py

+21 -10
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
)
2828
from google.cloud.aiplatform.vertex_ray.util import _validation_utils
2929
from google.cloud.aiplatform.vertex_ray.util.resources import (
30+
AutoscalingSpec,
3031
Cluster,
3132
PscIConfig,
3233
Resources,
@@ -253,17 +254,27 @@ def persistent_resource_to_cluster(
253254
if _OFFICIAL_IMAGE in worker_image_uri:
254255
# Official training image is not custom
255256
worker_image_uri = None
256-
worker_node_types.append(
257-
Resources(
258-
machine_type=resource_pools[i + 1].machine_spec.machine_type,
259-
accelerator_type=accelerator_type,
260-
accelerator_count=resource_pools[i + 1].machine_spec.accelerator_count,
261-
boot_disk_type=resource_pools[i + 1].disk_spec.boot_disk_type,
262-
boot_disk_size_gb=resource_pools[i + 1].disk_spec.boot_disk_size_gb,
263-
node_count=resource_pools[i + 1].replica_count,
264-
custom_image=worker_image_uri,
265-
)
257+
258+
resource = Resources(
259+
machine_type=resource_pools[i + 1].machine_spec.machine_type,
260+
accelerator_type=accelerator_type,
261+
accelerator_count=resource_pools[i + 1].machine_spec.accelerator_count,
262+
boot_disk_type=resource_pools[i + 1].disk_spec.boot_disk_type,
263+
boot_disk_size_gb=resource_pools[i + 1].disk_spec.boot_disk_size_gb,
264+
node_count=resource_pools[i + 1].replica_count,
265+
custom_image=worker_image_uri,
266266
)
267+
if resource_pools[i + 1].autoscaling_spec:
268+
resource.autoscaling_spec = AutoscalingSpec(
269+
min_replica_count=resource_pools[
270+
i + 1
271+
].autoscaling_spec.min_replica_count,
272+
max_replica_count=resource_pools[
273+
i + 1
274+
].autoscaling_spec.max_replica_count,
275+
)
276+
277+
worker_node_types.append(resource)
267278

268279
cluster.head_node_type = head_node_type
269280
cluster.worker_node_types = worker_node_types

google/cloud/aiplatform/vertex_ray/util/resources.py

+15
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,19 @@
1919
from google.cloud.aiplatform_v1beta1.types import PersistentResource
2020

2121

22+
@dataclasses.dataclass
class AutoscalingSpec:
    """Autoscaling spec for a ray cluster node.

    Attributes:
        min_replica_count: The minimum number of replicas in the cluster.
            Defaults to 1.
        max_replica_count: The maximum number of replicas in the cluster.
            Defaults to 2.
    """

    min_replica_count: int = 1
    max_replica_count: int = 2
33+
34+
2235
@dataclasses.dataclass
2336
class Resources:
2437
"""Resources for a ray cluster node.
@@ -39,6 +52,7 @@ class Resources:
3952
be either unspecified or within the range of [100, 64000].
4053
custom_image: Custom image for this resource (e.g.
4154
us-docker.pkg.dev/my-project/ray-gpu.2-9.py310-tf:latest).
55+
autoscaling_spec: Autoscaling spec for this resource.
4256
"""
4357

4458
machine_type: Optional[str] = "n1-standard-16"
@@ -48,6 +62,7 @@ class Resources:
4862
boot_disk_type: Optional[str] = "pd-ssd"
4963
boot_disk_size_gb: Optional[int] = 100
5064
custom_image: Optional[str] = None
65+
autoscaling_spec: Optional[AutoscalingSpec] = None
5166

5267

5368
@dataclasses.dataclass

tests/unit/vertex_ray/test_constants.py

+10 -3
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020

2121
from google.cloud.aiplatform.vertex_ray.util.resources import Cluster
2222
from google.cloud.aiplatform.vertex_ray.util.resources import (
23+
AutoscalingSpec,
2324
PscIConfig,
2425
Resources,
2526
)
@@ -274,7 +275,7 @@ class ClusterConstants:
274275
TEST_WORKER_NODE_TYPES_2_POOLS = [
275276
Resources(
276277
machine_type="n1-standard-16",
277-
node_count=4,
278+
autoscaling_spec=AutoscalingSpec(min_replica_count=1, max_replica_count=4),
278279
accelerator_type="NVIDIA_TESLA_P100",
279280
accelerator_count=1,
280281
)
@@ -283,7 +284,7 @@ class ClusterConstants:
283284
TEST_WORKER_NODE_TYPES_2_POOLS_CUSTOM_IMAGE = [
284285
Resources(
285286
machine_type="n1-standard-16",
286-
node_count=4,
287+
autoscaling_spec=AutoscalingSpec(min_replica_count=1, max_replica_count=4),
287288
accelerator_type="NVIDIA_TESLA_P100",
288289
accelerator_count=1,
289290
custom_image=TEST_CUSTOM_IMAGE,
@@ -311,7 +312,10 @@ class ClusterConstants:
311312
boot_disk_type="pd-ssd",
312313
boot_disk_size_gb=100,
313314
),
314-
replica_count=4,
315+
autoscaling_spec=ResourcePool.AutoscalingSpec(
316+
min_replica_count=1,
317+
max_replica_count=4,
318+
),
315319
)
316320
TEST_REQUEST_RUNNING_2_POOLS = PersistentResource(
317321
resource_pools=[TEST_RESOURCE_POOL_1, TEST_RESOURCE_POOL_2],
@@ -344,6 +348,8 @@ class ClusterConstants:
344348
psc_interface_config=None,
345349
network=ProjectConstants.TEST_VPC_NETWORK,
346350
)
351+
# Responses
352+
TEST_RESOURCE_POOL_2.replica_count = 1
347353
TEST_RESPONSE_RUNNING_2_POOLS = PersistentResource(
348354
name=TEST_VERTEX_RAY_PR_ADDRESS,
349355
resource_pools=[TEST_RESOURCE_POOL_1, TEST_RESOURCE_POOL_2],
@@ -425,6 +431,7 @@ class ClusterConstants:
425431
dashboard_address=TEST_VERTEX_RAY_DASHBOARD_ADDRESS,
426432
ray_metric_enabled=True,
427433
ray_logs_enabled=True,
434+
labels={},
428435
)
429436
TEST_CLUSTER_BYOSA = Cluster(
430437
cluster_resource_name=TEST_VERTEX_RAY_PR_ADDRESS,

0 commit comments

Comments
 (0)