feat: support custom image for Ray cluster creation

yinghsienwu · copybara-github · commit d7271899a538 · 2024-02-13T11:26:04.000-08:00
PiperOrigin-RevId: 606689613
diff --git a/google/cloud/aiplatform/preview/vertex_ray/__init__.py b/google/cloud/aiplatform/preview/vertex_ray/__init__.py
@@ -34,6 +34,7 @@
 )
 from google.cloud.aiplatform.preview.vertex_ray.util.resources import (
     Resources,
+    NodeImages,
 )
 
 from google.cloud.aiplatform.preview.vertex_ray.dashboard_sdk import (
@@ -55,4 +56,5 @@
     "list_ray_clusters",
     "update_ray_cluster",
     "Resources",
+    "NodeImages",
 )
diff --git a/google/cloud/aiplatform/preview/vertex_ray/cluster_init.py b/google/cloud/aiplatform/preview/vertex_ray/cluster_init.py
@@ -17,6 +17,7 @@
 
 import copy
 import logging
+import time
 from typing import Dict, List, Optional
 
 from google.cloud.aiplatform import initializer
@@ -47,6 +48,7 @@ def create_ray_cluster(
     network: Optional[str] = None,
     cluster_name: Optional[str] = None,
     worker_node_types: Optional[List[resources.Resources]] = None,
+    custom_images: Optional[resources.NodeImages] = None,
     labels: Optional[Dict[str, str]] = None,
 ) -> str:
     """Create a ray cluster on the Vertex AI.
@@ -97,6 +99,8 @@ def create_ray_cluster(
             or hyphen.
         worker_node_types: The list of Resources of the worker nodes. The same
             Resources object should not appear multiple times in the list.
+        custom_images: The NodeImages specifying the head node and worker node
+            images. Allowlist only.
         labels:
             The labels with user-defined metadata to organize Ray cluster.
 
@@ -157,6 +161,9 @@ def create_ray_cluster(
     image_uri = _validation_utils.get_image_uri(
         ray_version, python_version, enable_cuda
     )
+    if custom_images is not None:
+        if not (custom_images.head is None or custom_images.worker is None):
+            image_uri = custom_images.head
     resource_pool_images[resource_pool_0.id] = image_uri
 
     worker_pools = []
@@ -199,6 +206,9 @@ def create_ray_cluster(
                 image_uri = _validation_utils.get_image_uri(
                     ray_version, python_version, enable_cuda
                 )
+                if custom_images is not None:
+                    if not (custom_images.head is None or custom_images.worker is None):
+                        image_uri = custom_images.worker
                 resource_pool_images[resource_pool.id] = image_uri
 
             i += 1
@@ -425,6 +435,12 @@ def update_ray_cluster(
         ) from e
 
     # block before returning
+    start_time = time.time()
     response = operation_future.result()
-    print("[Ray on Vertex AI]: Successfully updated the cluster.")
+    duration = (time.time() - start_time) // 60
+    print(
+        "[Ray on Vertex AI]: Successfully updated the cluster ({} minutes elapsed).".format(
+            duration
+        )
+    )
     return response.name
diff --git a/google/cloud/aiplatform/preview/vertex_ray/util/_gapic_utils.py b/google/cloud/aiplatform/preview/vertex_ray/util/_gapic_utils.py
@@ -28,6 +28,7 @@
 from google.cloud.aiplatform.preview.vertex_ray.util import _validation_utils
 from google.cloud.aiplatform.preview.vertex_ray.util.resources import (
     Cluster,
+    NodeImages,
     Resources,
 )
 from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
@@ -156,14 +157,24 @@ def persistent_resource_to_cluster(
         )
         return
 
-    image_uri = persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-        "head-node"
-    ]
-    if not image_uri:
-        image_uri = persistent_resource.resource_runtime_spec.ray_spec.image_uri
+    head_image_uri = (
+        persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
+            "head-node"
+        ]
+    )
+    worker_image_uri = (
+        persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images.get(
+            "worker-pool1", None
+        )
+    )
+    if worker_image_uri is None:
+        worker_image_uri = head_image_uri
+
+    if not head_image_uri:
+        head_image_uri = persistent_resource.resource_runtime_spec.ray_spec.image_uri
     try:
         python_version, ray_version = _validation_utils.get_versions_from_image_uri(
-            image_uri
+            head_image_uri
         )
     except IndexError:
         logging.info(
@@ -173,6 +184,7 @@ def persistent_resource_to_cluster(
         return
     cluster.python_version = python_version
     cluster.ray_version = ray_version
+    cluster.node_images = NodeImages(head=head_image_uri, worker=worker_image_uri)
 
     resource_pools = persistent_resource.resource_pools
 
diff --git a/google/cloud/aiplatform/preview/vertex_ray/util/resources.py b/google/cloud/aiplatform/preview/vertex_ray/util/resources.py
@@ -47,6 +47,24 @@ class Resources:
     boot_disk_size_gb: Optional[int] = 100
 
 
+@dataclasses.dataclass
+class NodeImages:
+    """Custom images for a Ray cluster.
+
+    Ray v2.4 with Python v3.10 is currently supported. Custom images must extend
+    "{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.2-4.py310:latest" or
+    "{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.2-4.py310:latest".
+    Both the head and the worker image must be specified to use custom images.
+
+    Attributes:
+        head: head node image (e.g. us-docker.pkg.dev/my-project/ray-cpu.2-4.py310-tf:latest).
+        worker: worker node image (e.g. us-docker.pkg.dev/my-project/ray-gpu.2-4.py310-tf:latest).
+    """
+
+    head: Optional[str] = None
+    worker: Optional[str] = None
+
+
 @dataclasses.dataclass
 class Cluster:
     """Ray cluster (output only).
@@ -69,6 +87,7 @@ class Cluster:
             duplicate the elements in the list.
         dashboard_address: For Ray Job API (JobSubmissionClient), with this
            cluster connection doesn't require VPC peering.
+        node_images: The NodeImages for a ray cluster.
         labels:
             The labels with user-defined metadata to organize Ray cluster.
 
@@ -87,6 +106,7 @@ class Cluster:
     head_node_type: Resources = None
     worker_node_types: List[Resources] = None
     dashboard_address: str = None
+    node_images: NodeImages = None
     labels: Dict[str, str] = None
 
 
diff --git a/tests/unit/vertex_ray/test_cluster_init.py b/tests/unit/vertex_ray/test_cluster_init.py
@@ -20,6 +20,7 @@
 from google.cloud.aiplatform.preview import vertex_ray
 from google.cloud.aiplatform.preview.vertex_ray.util.resources import (
     Resources,
+    NodeImages,
 )
 from google.cloud.aiplatform_v1beta1.services.persistent_resource_service import (
     PersistentResourceServiceClient,
@@ -80,6 +81,18 @@ def get_persistent_resource_1_pool_mock():
         yield get_persistent_resource_1_pool_mock
 
 
+@pytest.fixture
+def get_persistent_resource_1_pool_custom_image_mock():
+    with mock.patch.object(
+        PersistentResourceServiceClient,
+        "get_persistent_resource",
+    ) as get_persistent_resource_1_pool_custom_image_mock:
+        get_persistent_resource_1_pool_custom_image_mock.return_value = (
+            tc.ClusterConstants._TEST_RESPONSE_RUNNING_1_POOL_CUSTOM_IMAGES
+        )
+        yield get_persistent_resource_1_pool_custom_image_mock
+
+
 @pytest.fixture
 def create_persistent_resource_2_pools_mock():
     with mock.patch.object(
@@ -234,6 +247,35 @@ def test_create_ray_cluster_1_pool_gpu_success(
             request,
         )
 
+    @pytest.mark.usefixtures("get_persistent_resource_1_pool_custom_image_mock")
+    def test_create_ray_cluster_1_pool_custom_image_success(
+        self, create_persistent_resource_1_pool_mock
+    ):
+        """If head and worker nodes are duplicate, merge to head pool."""
+        custom_images = NodeImages(
+            head=tc.ClusterConstants._TEST_CUSTOM_IMAGE,
+            worker=tc.ClusterConstants._TEST_CUSTOM_IMAGE,
+        )
+        cluster_name = vertex_ray.create_ray_cluster(
+            head_node_type=tc.ClusterConstants._TEST_HEAD_NODE_TYPE_1_POOL,
+            worker_node_types=tc.ClusterConstants._TEST_WORKER_NODE_TYPES_1_POOL,
+            network=tc.ProjectConstants._TEST_VPC_NETWORK,
+            cluster_name=tc.ClusterConstants._TEST_VERTEX_RAY_PR_ID,
+            custom_images=custom_images,
+        )
+
+        assert tc.ClusterConstants._TEST_VERTEX_RAY_PR_ADDRESS == cluster_name
+
+        request = persistent_resource_service.CreatePersistentResourceRequest(
+            parent=tc.ProjectConstants._TEST_PARENT,
+            persistent_resource=tc.ClusterConstants._TEST_REQUEST_RUNNING_1_POOL_CUSTOM_IMAGES,
+            persistent_resource_id=tc.ClusterConstants._TEST_VERTEX_RAY_PR_ID,
+        )
+
+        create_persistent_resource_1_pool_mock.assert_called_with(
+            request,
+        )
+
     @pytest.mark.usefixtures("get_persistent_resource_1_pool_mock")
     def test_create_ray_cluster_1_pool_gpu_with_labels_success(
         self, create_persistent_resource_1_pool_mock
diff --git a/tests/unit/vertex_ray/test_constants.py b/tests/unit/vertex_ray/test_constants.py
@@ -20,6 +20,7 @@
 from google.cloud.aiplatform.preview.vertex_ray.util.resources import Cluster
 from google.cloud.aiplatform.preview.vertex_ray.util.resources import (
     Resources,
+    NodeImages,
 )
 from google.cloud.aiplatform_v1beta1.types.machine_resources import DiskSpec
 from google.cloud.aiplatform_v1beta1.types.machine_resources import (
@@ -82,6 +83,7 @@ class ClusterConstants:
     )
     _TEST_CPU_IMAGE = "us-docker.pkg.dev/vertex-ai/training/ray-cpu.2-4.py310:latest"
     _TEST_GPU_IMAGE = "us-docker.pkg.dev/vertex-ai/training/ray-gpu.2-4.py310:latest"
+    _TEST_CUSTOM_IMAGE = "us-docker.pkg.dev/my-project/ray-custom.2-4.py310:latest"
     # RUNNING Persistent Cluster w/o Ray
     _TEST_RESPONSE_NO_RAY_RUNNING = PersistentResource(
         name=_TEST_VERTEX_RAY_PR_ADDRESS,
@@ -127,6 +129,13 @@ class ClusterConstants:
         network=ProjectConstants._TEST_VPC_NETWORK,
         labels=_TEST_LABELS,
     )
+    _TEST_REQUEST_RUNNING_1_POOL_CUSTOM_IMAGES = PersistentResource(
+        resource_pools=[_TEST_RESOURCE_POOL_0],
+        resource_runtime_spec=ResourceRuntimeSpec(
+            ray_spec=RaySpec(resource_pool_images={"head-node": _TEST_CUSTOM_IMAGE}),
+        ),
+        network=ProjectConstants._TEST_VPC_NETWORK,
+    )
     # Get response has generated name, and URIs
     _TEST_RESPONSE_RUNNING_1_POOL = PersistentResource(
         name=_TEST_VERTEX_RAY_PR_ADDRESS,
@@ -143,6 +152,22 @@ class ClusterConstants:
         ),
         state="RUNNING",
     )
+    # Get response has generated name, and URIs
+    _TEST_RESPONSE_RUNNING_1_POOL_CUSTOM_IMAGES = PersistentResource(
+        name=_TEST_VERTEX_RAY_PR_ADDRESS,
+        resource_pools=[_TEST_RESOURCE_POOL_0],
+        resource_runtime_spec=ResourceRuntimeSpec(
+            ray_spec=RaySpec(resource_pool_images={"head-node": _TEST_CUSTOM_IMAGE}),
+        ),
+        network=ProjectConstants._TEST_VPC_NETWORK,
+        resource_runtime=ResourceRuntime(
+            access_uris={
+                "RAY_DASHBOARD_URI": _TEST_VERTEX_RAY_DASHBOARD_ADDRESS,
+                "RAY_HEAD_NODE_INTERNAL_IP": _TEST_VERTEX_RAY_HEAD_NODE_IP,
+            }
+        ),
+        state="RUNNING",
+    )
     # 2_POOL: worker_node_types and head_node_type have different MachineSpecs
     _TEST_HEAD_NODE_TYPE_2_POOLS = Resources()
     _TEST_WORKER_NODE_TYPES_2_POOLS = [
@@ -213,6 +238,7 @@ class ClusterConstants:
         head_node_type=_TEST_HEAD_NODE_TYPE_1_POOL,
         worker_node_types=_TEST_WORKER_NODE_TYPES_1_POOL,
         dashboard_address=_TEST_VERTEX_RAY_DASHBOARD_ADDRESS,
+        node_images=NodeImages(head=_TEST_CPU_IMAGE, worker=_TEST_CPU_IMAGE),
     )
     _TEST_CLUSTER_2 = Cluster(
         cluster_resource_name=_TEST_VERTEX_RAY_PR_ADDRESS,
@@ -223,6 +249,7 @@ class ClusterConstants:
         head_node_type=_TEST_HEAD_NODE_TYPE_2_POOLS,
         worker_node_types=_TEST_WORKER_NODE_TYPES_2_POOLS,
         dashboard_address=_TEST_VERTEX_RAY_DASHBOARD_ADDRESS,
+        node_images=NodeImages(head=_TEST_CPU_IMAGE, worker=_TEST_GPU_IMAGE),
     )
     _TEST_BEARER_TOKEN = "test-bearer-token"
     _TEST_HEADERS = {

Original file line number	Diff line number	Diff line change
`@@ -34,6 +34,7 @@`
`34`	`34`	`)`
`35`	`35`	`from google.cloud.aiplatform.preview.vertex_ray.util.resources import (`
`36`	`36`	`Resources,`
	`37`	`+ NodeImages,`
`37`	`38`	`)`
`38`	`39`
`39`	`40`	`from google.cloud.aiplatform.preview.vertex_ray.dashboard_sdk import (`
`@@ -55,4 +56,5 @@`
`55`	`56`	`"list_ray_clusters",`
`56`	`57`	`"update_ray_cluster",`
`57`	`58`	`"Resources",`
	`59`	`+ "NodeImages",`
`58`	`60`	`)`