Upgrade ray version; shrink worker resource allocation

artemvmin · artemvmin · commit d4985d49c36d · 2024-03-06T20:00:59.000Z
diff --git a/applications/rag/README.md b/applications/rag/README.md
@@ -1,6 +1,6 @@
 # RAG-on-GKE Application
 
-**NOTE:** This solution is in beta/a work in progress - please expect friction while using it.
+**NOTE:** This solution is in beta. Please expect friction while using it.
 
 This is a sample to deploy a RAG application on GKE. Retrieval Augmented Generation (RAG) is a popular approach for boosting the accuracy of LLM responses, particularly for domain specific or private data sets. The basic idea is to have a semantically searchable knowledge base (often using vector search), which is used to retrieve relevant snippets for a given prompt to provide additional context to the LLM. Augmenting the knowledge base with additional data is typically cheaper than fine tuning and is more scalable when incorporating current events and other rapidly changing data spaces.
 
@@ -32,7 +32,7 @@ CLUSTER_REGION=us-central1
 ```
 2. Use the following instructions to create a GKE cluster. We recommend using Autopilot for a simpler setup.
 
-##### Autopilot
+##### Autopilot (recommended)
 
 RAG requires the latest Autopilot features, available on GKE cluster version `1.29.1-gke.1575000`+
 ```
@@ -46,7 +46,7 @@ gcloud container clusters create-auto ${CLUSTER_NAME:?} \
   --cluster-version ${CLUSTER_VERSION:?}
 ```
 
-##### Standard (recommended)
+##### Standard
 
 1. To create a GKE Standard cluster using Terraform, follow the [instructions here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/infrastructure/README.md). Use the preconfigured node pools in `/infrastructure/platform.tfvars` as this solution requires T4s and L4s.
 
@@ -105,6 +105,7 @@ gcloud container clusters get-credentials ${CLUSTER_NAME:?} --location ${CLUSTER
 ```
 kubectl port-forward -n ${NAMESPACE:?} deployment/mistral-7b-instruct 8080:8080
 ```
+
     * In a new terminal, try a few prompts:
 ```
 export USER_PROMPT="How to deploy a container on K8s?"
@@ -119,6 +120,7 @@ curl 127.0.0.1:8080/generate -X POST \
 }
 EOF
 ```
+
     * At the end of the smoke test with the TGI server, stop port forwarding by using Ctrl-C on the original terminal.
 
 5. Verify the frontend chat interface is setup:
@@ -167,8 +169,8 @@ This step generates the vector embeddings for your input dataset. Currently, the
     * `os.environ['KAGGLE_KEY']`
 
 9. Run all the cells in the notebook. This will generate vector embeddings for the input dataset (`denizbilginn/google-maps-restaurant-reviews`) and store them in the `pgvector-instance` via a Ray job.
-    * Once submitted, Ray will take several minutes to create the runtime environment and optionally scale up Ray worker nodes. During this time, the job status will remain PENDING.
-    * When the job status is SUCCEEDED, the vector embeddings have been generated and we are ready to launch the frontend chat interface.
+    * If the Ray job has FAILED, re-run the cell.
+    * When the Ray job has SUCCEEDED, the vector embeddings have been generated and we are ready to launch the frontend chat interface.
 
 ### Launch the Frontend Chat Interface
 
diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb
@@ -252,7 +252,7 @@
    "id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df",
    "metadata": {},
    "outputs": [],
-   "source": [
+  "source": [
     "job_id = client.submit_job(\n",
     "    entrypoint=\"python test.py\",\n",
     "    # Path to the local directory that contains the entrypoint file.\n",
@@ -278,10 +278,9 @@
     "    status = client.get_job_status(job_id)\n",
     "    if status != prev_status:\n",
     "        print(\"Job status:\", status)\n",
+    "        print(\"Job info:\", client.get_job_info(job_id).message)\n",
     "        prev_status = status\n",
     "    if status.is_terminal():\n",
-    "        if status == 'FAILED':\n",
-    "            print(\"Job info:\", client.get_job_info(job_id))\n",
     "        break\n",
     "    time.sleep(5)\n"
    ]
diff --git a/modules/kuberay-cluster/kuberay-autopilot-values.yaml b/modules/kuberay-cluster/kuberay-autopilot-values.yaml
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
 image:
   # Replace this with your own image if needed.
   repository: rayproject/ray
-  tag: 2.6.1-py310-gpu
+  tag: 2.7.1-py310-gpu
   pullPolicy: IfNotPresent
 
 nameOverride: "kuberay"
@@ -64,8 +64,6 @@ head:
   # containerEnv specifies environment variables for the Ray container,
   # Follows standard K8s container env schema.
   containerEnv:
-  # - name: EXAMPLE_ENV
-  #   value: "1"
     - name: RAY_memory_monitor_refresh_ms
       value: "0"
     - name: RAY_GRAFANA_IFRAME_HOST
@@ -90,18 +88,18 @@ head:
   # for further guidance.
   resources:
     limits:
-      cpu: "8"
+      cpu: "1"
       # To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head.
-      memory: "20G"
+      memory: "8G"
       ephemeral-storage: 20Gi
     requests:
-      cpu: "8"
-      memory: "20G"
+      cpu: "1"
+      memory: "8G"
       ephemeral-storage: 20Gi
   annotations:
     gke-gcsfuse/volumes: "true"
-    gke-gcsfuse/cpu-limit: "2"
-    gke-gcsfuse/memory-limit: 20Gi
+    gke-gcsfuse/cpu-limit: "1"
+    gke-gcsfuse/memory-limit: 4Gi
     gke-gcsfuse/ephemeral-storage-limit: 20Gi
   nodeSelector:
     cloud.google.com/compute-class: "Performance"
@@ -158,8 +156,6 @@ worker:
   disabled: true
 
 # The map's key is used as the groupName.
-# For example, key:small-group in the map below
-# will be used as the groupName
 additionalWorkerGroups:
   cpuGroup:
     # Disabled by default
@@ -194,16 +190,16 @@ additionalWorkerGroups:
     resources:
       limits:
         cpu: 4
-        memory: "20G"
+        memory: "16G"
         ephemeral-storage: 20Gi
       requests:
         cpu: 4
-        memory: "20G"
+        memory: "16G"
         ephemeral-storage: 20Gi
     annotations:
       gke-gcsfuse/volumes: "true"
       gke-gcsfuse/cpu-limit: "2"
-      gke-gcsfuse/memory-limit: 20Gi
+      gke-gcsfuse/memory-limit: 8Gi
       gke-gcsfuse/ephemeral-storage-limit: 20Gi
     nodeSelector:
       cloud.google.com/compute-class: "Performance"
@@ -287,19 +283,19 @@ additionalWorkerGroups:
   # for further guidance.
     resources:
       limits:
-        cpu: "8"
+        cpu: "4"
         nvidia.com/gpu: "2"
-        memory: "40G"
+        memory: "16G"
         ephemeral-storage: 20Gi
       requests:
-        cpu: "8"
+        cpu: "4"
         nvidia.com/gpu: "2"
-        memory: "40G"
+        memory: "16G"
         ephemeral-storage: 20Gi
     annotations:
       gke-gcsfuse/volumes: "true"
       gke-gcsfuse/cpu-limit: "2"
-      gke-gcsfuse/memory-limit: 20Gi
+      gke-gcsfuse/memory-limit: 8Gi
       gke-gcsfuse/ephemeral-storage-limit: 20Gi
     nodeSelector:
       cloud.google.com/compute-class: "Accelerator"