File tree: 7 files changed, +23 −9 lines

modules/inference-service
tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus
tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus
tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus
modules/inference-service (Terraform)

@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
         }
       }
       container {
-        image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
+        image = "ghcr.io/huggingface/text-generation-inference:1.4.3"
         name  = "mistral-7b-instruct"

         port {
@@ -130,7 +130,9 @@ resource "kubernetes_deployment" "inference_deployment" {
           mount_path = "/dev/shm"
           name       = "dshm"
         }
-
+        # mount_path is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         volume_mount {
           mount_path = "/data"
           name       = "data"
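The new comments document why the model cache is mounted at /data: the TGI image sets HF_HOME to that path, so downloaded weights land in the mounted volume. A quick sanity check against a running cluster (a sketch; the deploy/mistral-7b-instruct name is assumed from the container name above and may differ in your release):

```bash
# Confirm HF_HOME points at /data inside the TGI container
# (deployment name assumed from this module; adjust as needed).
kubectl exec deploy/mistral-7b-instruct -- printenv HF_HOME

# After the first model download, the Hub cache should appear under /data.
kubectl exec deploy/mistral-7b-instruct -- ls /data
```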
tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus (README)

@@ -104,7 +104,7 @@ Pod Template:
   Labels:  app=mistral-7b
   Containers:
    mistral-7b:
-    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+    Image:      ghcr.io/huggingface/text-generation-inference:1.4.3
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
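The Pod Template excerpt above reflects the new image. To reproduce this output against your own cluster (a sketch, assuming the Deployment is named mistral-7b as its labels suggest):

```bash
# Show the rendered pod template, including the image now pulled from ghcr.io.
kubectl describe deployment mistral-7b | grep -A 5 "Containers:"
```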
tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus (Kubernetes manifest)

@@ -28,7 +28,7 @@
     spec:
       containers:
       - name: mistral-7b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: ghcr.io/huggingface/text-generation-inference:1.4.3
         resources:
           limits:
             nvidia.com/gpu: 1
@@ -47,6 +47,9 @@
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
+        # mountPath is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         - mountPath: /data
           name: data
       volumes:
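With the image pinned to an upstream TGI release, the serving API is unchanged. A minimal smoke test (a sketch: the Service name mistral-7b-service and port 8080 are assumptions, since the Service is not part of this diff):

```bash
# Forward the serving port locally; adjust the Service name to your manifest.
kubectl port-forward svc/mistral-7b-service 8080:8080 &

# TGI's standard /generate endpoint; Mistral-Instruct uses [INST] ... [/INST] turns.
curl -s http://localhost:8080/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "[INST] What is GKE? [/INST]", "parameters": {"max_new_tokens": 64}}'
```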
tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus (README)

@@ -77,7 +77,7 @@ Create a node pool for deploying Mixtral 7B with quadpod deployment L4 GPU {4 x
 ```bash
 gcloud container node-pools create mixtral-moe-gpu-pool \
   --cluster=mixtral8x7-cluster-gke \
-  --project=gke-aishared-dev \
+  --project=${PROJECT_ID} \
   --machine-type=g2-standard-48 \
   --ephemeral-storage-local-ssd=count=4 \
   --accelerator=type=nvidia-l4,count=4 \
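Swapping the hardcoded project for ${PROJECT_ID} makes the command portable across environments; the variable just needs to be set before running it, for example:

```bash
# Take the project from the active gcloud configuration
# (or export PROJECT_ID=<your-project> explicitly).
export PROJECT_ID=$(gcloud config get-value project)
echo "Node pool will be created in project: ${PROJECT_ID}"
```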
@@ -127,7 +127,7 @@ Pod Template:
   Labels:  app=mixtral8x7b
   Containers:
    mixtral8x7b:
-    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+    Image:      ghcr.io/huggingface/text-generation-inference:1.4.3
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus (Kubernetes manifest)

@@ -30,7 +30,7 @@
         cloud.google.com/gke-accelerator: "nvidia-l4"
       containers:
       - name: mixtral8x7b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: ghcr.io/huggingface/text-generation-inference:1.4.3
         ports:
         - name: server-port
           containerPort: 8080
@@ -53,6 +53,9 @@
             memory: "42Gi"
             nvidia.com/gpu: "2"
         volumeMounts:
+        # mountPath is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         - mountPath: /data
           name: ephemeral-volume
         - mountPath: /dev/shm
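Note that in this manifest /data is backed by ephemeral-volume (the local SSDs requested via --ephemeral-storage-local-ssd above) rather than a persistent volume, so the Hub cache is rebuilt if the pod is rescheduled. To verify the L4 nodes expose the GPUs the pod requests (a sketch, using the accelerator label from the manifest):

```bash
# List allocatable GPUs on the L4 nodes created for this workload.
kubectl get nodes -l cloud.google.com/gke-accelerator=nvidia-l4 \
  -o custom-columns='NODE:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
```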
tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus (first Kubernetes manifest)

@@ -76,7 +76,7 @@
     spec:
       containers:
       - name: llama-2-70b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: ghcr.io/huggingface/text-generation-inference:1.4.3
         resources:
           limits:
             nvidia.com/gpu: 2
@@ -97,6 +97,9 @@
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
+        # mountPath is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         - mountPath: /data
           name: data
       volumes:
tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus (second Kubernetes manifest)

@@ -28,7 +28,7 @@
     spec:
       containers:
       - name: llama-2-70b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: ghcr.io/huggingface/text-generation-inference:1.4.3
         resources:
           limits:
             nvidia.com/gpu: 2
@@ -49,6 +49,9 @@
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
+        # mountPath is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         - mountPath: /data
           name: data
       volumes:
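All seven files now reference the same pinned upstream image. Once the changes are applied, a quick way to confirm every workload picked up the 1.4.3 tag (a sketch; scans all namespaces):

```bash
# Each TGI deployment should report ghcr.io/huggingface/text-generation-inference:1.4.3.
kubectl get deployments -A \
  -o custom-columns='NAME:.metadata.name,IMAGE:.spec.template.spec.containers[0].image'
```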