
Commit f90c9cc

yinghsienwu authored and copybara-github committed
feat: Enable Ray cluster creation with custom_image for each Resource
PiperOrigin-RevId: 614291640
1 parent d6490ff commit f90c9cc

14 files changed: +535 −338
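
For context, a minimal usage sketch based on the updated create_ray_cluster docstring in this commit. Project, image, and network names are placeholders, and the import paths assume the preview package layout shown in the diffs below; treat this as an illustration, not the authoritative API reference.

    from google.cloud import aiplatform
    from google.cloud.aiplatform.preview import vertex_ray
    from google.cloud.aiplatform.preview.vertex_ray.util.resources import Resources

    # Placeholder project/region; initialize the SDK before creating the cluster.
    aiplatform.init(project="my-project", location="us-central1")

    # Each Resources object can now carry its own custom_image.
    head_node_type = Resources(
        machine_type="n1-standard-16",
        node_count=1,
        custom_image="us-docker.pkg.dev/my-project/ray-cpu-image.2.9:latest",  # Optional
    )
    worker_node_types = [
        Resources(
            machine_type="n1-standard-8",
            node_count=2,
            custom_image="us-docker.pkg.dev/my-project/ray-gpu-image.2.9:latest",  # Optional
        )
    ]

    cluster_resource_name = vertex_ray.create_ray_cluster(
        head_node_type=head_node_type,
        network="projects/my-project-number/global/networks/my-vpc-name",
        worker_node_types=worker_node_types,
        ray_version="2.9",
    )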

google/cloud/aiplatform/preview/vertex_ray/client_builder.py

+20 −3

@@ -120,9 +120,26 @@ def __init__(self, address: Optional[str]) -> None:
             )
         local_ray_verion = _validation_utils.get_local_ray_version()
         if cluster.ray_version != local_ray_verion:
-            raise ValueError(
-                f"[Ray on Vertex AI]: Local runtime has Ray version {local_ray_verion}, but the cluster runtime has {cluster.ray_version}. Please ensure that the Ray versions match."
-            )
+            if cluster.head_node_type.custom_image is None:
+                install_ray_version = _validation_utils.SUPPORTED_RAY_VERSIONS.get(
+                    cluster.ray_version
+                )
+                logging.info(
+                    "[Ray on Vertex]: Local runtime has Ray version %s"
+                    ", but the requested cluster runtime has %s. Please "
+                    "ensure that the Ray versions match for client connectivity. You may "
+                    '"pip install --user --force-reinstall ray[default]==%s"'
+                    " and restart runtime before cluster connection.",
+                    local_ray_verion,
+                    cluster.ray_version,
+                    install_ray_version,
+                )
+            else:
+                logging.info(
+                    "[Ray on Vertex]: Local runtime has Ray version %s."
+                    "Please ensure that the Ray versions match for client connectivity.",
+                    local_ray_verion,
+                )
         super().__init__(address)

     def connect(self) -> _VertexRayClientContext:
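
As an aside, a small illustrative sketch of how the pip hint in the log message above can be derived from the SUPPORTED_RAY_VERSIONS mapping introduced in _validation_utils.py further down in this commit. The pip_hint helper is hypothetical, not part of the SDK.

    from typing import Optional
    from immutabledict import immutabledict

    # Mirrors the mapping added in _validation_utils.py in this commit.
    SUPPORTED_RAY_VERSIONS = immutabledict({"2.4": "2.4.0", "2.9": "2.9.3"})

    def pip_hint(cluster_ray_version: str) -> Optional[str]:
        # .get() returns None for versions the SDK does not ship images for,
        # e.g. when the cluster was built from a custom image.
        wheel = SUPPORTED_RAY_VERSIONS.get(cluster_ray_version)
        if wheel is None:
            return None
        return f"pip install --user --force-reinstall ray[default]=={wheel}"

    print(pip_hint("2.9"))  # pip install --user --force-reinstall ray[default]==2.9.3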

google/cloud/aiplatform/preview/vertex_ray/cluster_init.py

+51 −25

@@ -43,8 +43,8 @@

 def create_ray_cluster(
     head_node_type: Optional[resources.Resources] = resources.Resources(),
-    python_version: Optional[str] = "3_10",
-    ray_version: Optional[str] = "2_4",
+    python_version: Optional[str] = "3.10",
+    ray_version: Optional[str] = "2.9",
     network: Optional[str] = None,
     cluster_name: Optional[str] = None,
     worker_node_types: Optional[List[resources.Resources]] = None,
@@ -62,19 +62,22 @@ def create_ray_cluster(
             node_count=1,
             accelerator_type="NVIDIA_TESLA_K80",
             accelerator_count=1,
+            custom_image="us-docker.pkg.dev/my-project/ray-cpu-image.2.9:latest",  # Optional
         )

         worker_node_types = [Resources(
             machine_type="n1-standard-8",
             node_count=2,
             accelerator_type="NVIDIA_TESLA_K80",
             accelerator_count=1,
+            custom_image="us-docker.pkg.dev/my-project/ray-gpu-image.2.9:latest",  # Optional
         )]

         cluster_resource_name = vertex_ray.create_ray_cluster(
             head_node_type=head_node_type,
             network="projects/my-project-number/global/networks/my-vpc-name",
             worker_node_types=worker_node_types,
+            ray_version="2.9",
         )

     After a ray cluster is set up, you can call
@@ -100,7 +103,10 @@ def create_ray_cluster(
         worker_node_types: The list of Resources of the worker nodes. The same
             Resources object should not appear multiple times in the list.
         custom_images: The NodeImages which specifies head node and worker nodes
-            images. Allowlist only.
+            images. All the workers will share the same image. If each Resource
+            has a specific custom image, use `Resources.custom_image` for
+            head/worker_node_type(s). Note that configuring `Resources.custom_image`
+            will override `custom_images` here. Allowlist only.
         labels:
             The labels with user-defined metadata to organize Ray cluster.

@@ -121,14 +127,24 @@ def create_ray_cluster(

     local_ray_verion = _validation_utils.get_local_ray_version()
     if ray_version != local_ray_verion:
-        install_ray_version = ".".join(ray_version.split("_"))
-        logging.info(
-            f"[Ray on Vertex]: Local runtime has Ray version {local_ray_verion}"
-            + f", but the requested cluster runtime has {ray_version}. Please "
-            + "ensure that the Ray versions match for client connectivity. You may "
-            + f'"pip install --user --force-reinstall ray[default]=={install_ray_version}"'
-            + " and restart runtime before cluster connection."
-        )
+        if custom_images is None and head_node_type.custom_image is None:
+            install_ray_version = "2.9.3" if ray_version == "2.9" else "2.4.0"
+            logging.info(
+                "[Ray on Vertex]: Local runtime has Ray version %s"
+                ", but the requested cluster runtime has %s. Please "
+                "ensure that the Ray versions match for client connectivity. You may "
+                '"pip install --user --force-reinstall ray[default]==%s"'
+                " and restart runtime before cluster connection.",
+                local_ray_verion,
+                ray_version,
+                install_ray_version,
+            )
+        else:
+            logging.info(
+                "[Ray on Vertex]: Local runtime has Ray version %s."
+                "Please ensure that the Ray versions match for client connectivity.",
+                local_ray_verion,
+            )

     if cluster_name is None:
         cluster_name = "ray-cluster-" + utils.timestamped_unique_name()
@@ -161,15 +177,18 @@ def create_ray_cluster(
     resource_pool_0.disk_spec.boot_disk_size_gb = head_node_type.boot_disk_size_gb

     enable_cuda = True if head_node_type.accelerator_count > 0 else False
-    image_uri = _validation_utils.get_image_uri(
-        ray_version, python_version, enable_cuda
-    )
-    if custom_images is not None:
-        if custom_images.head is None or custom_images.worker is None:
-            raise ValueError(
-                "[Ray on Vertex AI]: custom_images.head and custom_images.worker must be specified when custom_images is set."
-            )
+    if head_node_type.custom_image is not None:
+        image_uri = head_node_type.custom_image
+    elif custom_images is None:
+        image_uri = _validation_utils.get_image_uri(
+            ray_version, python_version, enable_cuda
+        )
+    elif custom_images.head is not None and custom_images.worker is not None:
         image_uri = custom_images.head
+    else:
+        raise ValueError(
+            "[Ray on Vertex AI]: custom_images.head and custom_images.worker must be specified when custom_images is set."
+        )

     resource_pool_images[resource_pool_0.id] = image_uri

@@ -210,11 +229,16 @@ def create_ray_cluster(
         )
         worker_pools.append(resource_pool)
         enable_cuda = True if worker_node_type.accelerator_count > 0 else False
-        image_uri = _validation_utils.get_image_uri(
-            ray_version, python_version, enable_cuda
-        )
-        if custom_images is not None:
+
+        if worker_node_type.custom_image is not None:
+            image_uri = worker_node_type.custom_image
+        elif custom_images is None:
+            image_uri = _validation_utils.get_image_uri(
+                ray_version, python_version, enable_cuda
+            )
+        else:
             image_uri = custom_images.worker
+
         resource_pool_images[resource_pool.id] = image_uri

         i += 1
@@ -395,8 +419,10 @@ def update_ray_cluster(
     if len(worker_node_types) != len(previous_worker_node_types):
         raise ValueError(
             "[Ray on Vertex AI]: Desired number of worker_node_types "
-            + f"({len(worker_node_types)}) does not match the number of the "
-            + f"existing worker_node_type({len(previous_worker_node_types)})."
+            + "(%i) does not match the number of the "
+            + "existing worker_node_type(%i).",
+            len(worker_node_types),
+            len(previous_worker_node_types),
         )

     # Merge worker_node_type and head_node_type if they share
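
To summarize the image selection above: the precedence is Resources.custom_image first, then the shared custom_images NodeImages, then the prebuilt image returned by get_image_uri. A simplified sketch of that precedence follows; resolve_image_uri is an illustrative helper, not an SDK function, and it omits the head/worker validation error shown in the diff.

    from typing import Optional

    def resolve_image_uri(
        resource_custom_image: Optional[str],
        shared_custom_image: Optional[str],
        default_image: str,
    ) -> str:
        # Per-Resource custom_image wins over the shared NodeImages entry,
        # which in turn wins over the prebuilt ray-<version> image.
        if resource_custom_image is not None:
            return resource_custom_image
        if shared_custom_image is not None:
            return shared_custom_image
        return default_image

    # e.g. a head node with its own image keeps it even if custom_images is also set:
    resolve_image_uri(
        "us-docker.pkg.dev/my-project/ray-cpu-image.2.9:latest",
        "us-docker.pkg.dev/my-project/shared-image:latest",
        "us-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest",
    )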

google/cloud/aiplatform/preview/vertex_ray/util/_gapic_utils.py

+35 −21

@@ -28,7 +28,6 @@
 from google.cloud.aiplatform.preview.vertex_ray.util import _validation_utils
 from google.cloud.aiplatform.preview.vertex_ray.util.resources import (
     Cluster,
-    NodeImages,
     Resources,
 )
 from google.cloud.aiplatform_v1beta1.types.persistent_resource import (
@@ -39,6 +38,10 @@
 )


+_PRIVATE_PREVIEW_IMAGE = "-docker.pkg.dev/vertex-ai/training/tf-"
+_OFFICIAL_IMAGE = "-docker.pkg.dev/vertex-ai/training/ray-"
+
+
 def create_persistent_resource_client():
     # location is inhereted from the global configuration at aiplatform.init().
     return initializer.global_config.create_client(
@@ -131,7 +134,7 @@ def get_persistent_resource(

 def persistent_resource_to_cluster(
     persistent_resource: PersistentResource,
-) -> Cluster:
+) -> Optional[Cluster]:
     """Format a PersistentResource to a dictionary.

     Args:
@@ -156,51 +159,52 @@ def persistent_resource_to_cluster(
             persistent_resource.name,
         )
         return
+    resource_pools = persistent_resource.resource_pools

+    head_resource_pool = resource_pools[0]
+    head_id = head_resource_pool.id
     head_image_uri = (
-        persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
-            "head-node"
-        ]
+        persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[head_id]
     )
-    worker_image_uri = (
-        persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images.get(
-            "worker-pool1", None
-        )
-    )
-    if worker_image_uri is None:
-        worker_image_uri = head_image_uri

     if not head_image_uri:
         head_image_uri = persistent_resource.resource_runtime_spec.ray_spec.image_uri
+
     try:
         python_version, ray_version = _validation_utils.get_versions_from_image_uri(
             head_image_uri
         )
     except IndexError:
-        logging.info(
-            "[Ray on Vertex AI]: The image of cluster %s is outdated. It is recommended to delete and recreate the cluster to obtain the latest image.",
-            persistent_resource.name,
-        )
-        return
+        if _PRIVATE_PREVIEW_IMAGE in head_image_uri:
+            # If using outdated images
+            logging.info(
+                "[Ray on Vertex AI]: The image of cluster %s is outdated. It is recommended to delete and recreate the cluster to obtain the latest image.",
+                persistent_resource.name,
+            )
+            return None
+        else:
+            # Custom image might also cause IndexError
+            python_version = None
+            ray_version = None
     cluster.python_version = python_version
     cluster.ray_version = ray_version
-    cluster.node_images = NodeImages(head=head_image_uri, worker=worker_image_uri)

-    resource_pools = persistent_resource.resource_pools
-
-    head_resource_pool = resource_pools[0]
     accelerator_type = head_resource_pool.machine_spec.accelerator_type
     if accelerator_type.value != 0:
         accelerator_type = accelerator_type.name
     else:
         accelerator_type = None
+    if _OFFICIAL_IMAGE in head_image_uri:
+        # Official training image is not custom
+        head_image_uri = None
     head_node_type = Resources(
         machine_type=head_resource_pool.machine_spec.machine_type,
         accelerator_type=accelerator_type,
         accelerator_count=head_resource_pool.machine_spec.accelerator_count,
         boot_disk_type=head_resource_pool.disk_spec.boot_disk_type,
         boot_disk_size_gb=head_resource_pool.disk_spec.boot_disk_size_gb,
         node_count=1,
+        custom_image=head_image_uri,
     )
     worker_node_types = []
     if head_resource_pool.replica_count > 1:
@@ -215,6 +219,7 @@ def persistent_resource_to_cluster(
                 boot_disk_type=head_resource_pool.disk_spec.boot_disk_type,
                 boot_disk_size_gb=head_resource_pool.disk_spec.boot_disk_size_gb,
                 node_count=worker_node_count,
+                custom_image=head_image_uri,
             )
         )
     for i in range(len(resource_pools) - 1):
@@ -225,6 +230,14 @@ def persistent_resource_to_cluster(
             accelerator_type = accelerator_type.name
         else:
             accelerator_type = None
+        worker_image_uri = (
+            persistent_resource.resource_runtime_spec.ray_spec.resource_pool_images[
+                resource_pools[i + 1].id
+            ]
+        )
+        if _OFFICIAL_IMAGE in worker_image_uri:
+            # Official training image is not custom
+            worker_image_uri = None
         worker_node_types.append(
             Resources(
                 machine_type=resource_pools[i + 1].machine_spec.machine_type,
@@ -233,6 +246,7 @@ def persistent_resource_to_cluster(
                 accelerator_type=accelerator_type,
                 boot_disk_type=resource_pools[i + 1].disk_spec.boot_disk_type,
                 boot_disk_size_gb=resource_pools[i + 1].disk_spec.boot_disk_size_gb,
                 node_count=resource_pools[i + 1].replica_count,
+                custom_image=worker_image_uri,
             )
         )

google/cloud/aiplatform/preview/vertex_ray/util/_validation_utils.py

+20 −10

@@ -20,10 +20,13 @@
 import logging
 import ray
 import re
+from immutabledict import immutabledict

 from google.cloud.aiplatform import initializer
 from google.cloud.aiplatform.utils import resource_manager_utils

+SUPPORTED_RAY_VERSIONS = immutabledict({"2.4": "2.4.0", "2.9": "2.9.3"})
+SUPPORTED_PY_VERSION = ["3.10"]

 # Artifact Repository available regions.
 _AVAILABLE_REGIONS = ["us", "europe", "asia"]
@@ -73,25 +76,28 @@ def get_local_ray_version():
     ray_version = ray.__version__.split(".")
     if len(ray_version) == 3:
         ray_version = ray_version[:2]
-    return "_".join(ray_version)
+    return ".".join(ray_version)


 def get_image_uri(ray_version, python_version, enable_cuda):
     """Image uri for a given ray version and python version."""
-    if ray_version not in ["2_4", "2_9"]:
+    if ray_version not in SUPPORTED_RAY_VERSIONS:
         raise ValueError(
-            "[Ray on Vertex AI]: The supported Ray versions are 2_4 (2.4.0) and 2_9 (2.9.3)."
+            "[Ray on Vertex AI]: The supported Ray versions are %s (%s) and %s (%s).",
+            list(SUPPORTED_RAY_VERSIONS.keys())[0],
+            list(SUPPORTED_RAY_VERSIONS.values())[0],
+            list(SUPPORTED_RAY_VERSIONS.keys())[1],
+            list(SUPPORTED_RAY_VERSIONS.values())[1],
         )
-    if python_version not in ["3_10"]:
-        raise ValueError("[Ray on Vertex AI]: The supported Python version is 3_10.")
+    if python_version not in SUPPORTED_PY_VERSION:
+        raise ValueError("[Ray on Vertex AI]: The supported Python version is 3.10.")

     location = initializer.global_config.location
     region = location.split("-")[0]
     if region not in _AVAILABLE_REGIONS:
         region = _DEFAULT_REGION
-    ray_version = ray_version.replace("_", "-")
+    ray_version = ray_version.replace(".", "-")
     if enable_cuda:
-        # TODO(b/292003337) update eligible image uris
         return f"{region}-docker.pkg.dev/vertex-ai/training/ray-gpu.{ray_version}.py310:latest"
     else:
         return f"{region}-docker.pkg.dev/vertex-ai/training/ray-cpu.{ray_version}.py310:latest"
@@ -101,9 +107,13 @@ def get_versions_from_image_uri(image_uri):
     """Get ray version and python version from image uri."""
     logging.info(f"[Ray on Vertex AI]: Getting versions from image uri: {image_uri}")
     image_label = image_uri.split("/")[-1].split(":")[0]
-    py_version = image_label[-3] + "_" + image_label[-2:]
-    ray_version = image_label.split(".")[1].replace("-", "_")
-    return py_version, ray_version
+    py_version = image_label[-3] + "." + image_label[-2:]
+    ray_version = image_label.split(".")[1].replace("-", ".")
+    if ray_version in SUPPORTED_RAY_VERSIONS and py_version in SUPPORTED_PY_VERSION:
+        return py_version, ray_version
+    else:
+        # May not parse custom image and get the versions correctly
+        return None, None


 def valid_dashboard_address(address):
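
For illustration, here is roughly how get_versions_from_image_uri above parses a prebuilt image URI, and why a custom image can fall back to (None, None). The example URI follows the format built by get_image_uri; it is a sketch, not SDK code.

    # Prebuilt image label "ray-cpu.2-9.py310" -> python "3.10", ray "2.9".
    uri = "us-docker.pkg.dev/vertex-ai/training/ray-cpu.2-9.py310:latest"
    image_label = uri.split("/")[-1].split(":")[0]             # "ray-cpu.2-9.py310"
    py_version = image_label[-3] + "." + image_label[-2:]      # "3.10"
    ray_version = image_label.split(".")[1].replace("-", ".")  # "2.9"

    # A custom image label that does not follow this pattern may raise IndexError
    # (handled by the caller in _gapic_utils.py) or yield versions outside
    # SUPPORTED_RAY_VERSIONS / SUPPORTED_PY_VERSION, in which case the function
    # returns (None, None).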
