File tree: 7 files changed, +23 −9 lines

modules/inference-service
tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus
tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus
tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus
modules/inference-service (Terraform)

@@ -89,7 +89,7 @@ resource "kubernetes_deployment" "inference_deployment" {
         }
       }
       container {
-        image = "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310"
+        image = "ghcr.io/huggingface/text-generation-inference:1.4.3"
         name  = "mistral-7b-instruct"

         port {
@@ -130,7 +130,9 @@ resource "kubernetes_deployment" "inference_deployment" {
           mount_path = "/dev/shm"
           name       = "dshm"
         }
-
+        # mount_path is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         volume_mount {
           mount_path = "/data"
           name       = "data"
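The new comments document why the model cache is mounted at /data: the TGI image sets HF_HOME to that path, so downloaded weights land in the mounted volume. A quick sanity check against a running cluster (a sketch; the deploy/mistral-7b-instruct name is assumed from the container name above and may differ in your release):

```bash
# Confirm HF_HOME points at /data inside the TGI container
# (deployment name assumed from this module; adjust as needed).
kubectl exec deploy/mistral-7b-instruct -- printenv HF_HOME

# After the first model download, the Hub cache should appear under /data.
kubectl exec deploy/mistral-7b-instruct -- ls /data
```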
tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus (README)

@@ -104,7 +104,7 @@ Pod Template:
   Labels:  app=mistral-7b
   Containers:
    mistral-7b:
-    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+    Image:      ghcr.io/huggingface/text-generation-inference:1.4.3
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
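The Pod Template excerpt above reflects the new image. To reproduce this output against your own cluster (a sketch, assuming the Deployment is named mistral-7b as its labels suggest):

```bash
# Show the rendered pod template, including the image now pulled from ghcr.io.
kubectl describe deployment mistral-7b | grep -A 5 "Containers:"
```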
tutorials-and-examples/genAI-LLM/deploying-mistral-7b-instruct-L4gpus (Kubernetes manifest)

@@ -28,7 +28,7 @@
     spec:
       containers:
       - name: mistral-7b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: ghcr.io/huggingface/text-generation-inference:1.4.3
         resources:
           limits:
             nvidia.com/gpu: 1
@@ -47,6 +47,9 @@
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
+        # mountPath is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         - mountPath: /data
           name: data
       volumes:
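With the image pinned to an upstream TGI release, the serving API is unchanged. A minimal smoke test (a sketch: the Service name mistral-7b-service and port 8080 are assumptions, since the Service is not part of this diff):

```bash
# Forward the serving port locally; adjust the Service name to your manifest.
kubectl port-forward svc/mistral-7b-service 8080:8080 &

# TGI's standard /generate endpoint; Mistral-Instruct uses [INST] ... [/INST] turns.
curl -s http://localhost:8080/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "[INST] What is GKE? [/INST]", "parameters": {"max_new_tokens": 64}}'
```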
tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus (README)

@@ -77,7 +77,7 @@ Create a node pool for deploying Mixtral 7B with quadpod deployment L4 GPU {4 x
 ```bash
 gcloud container node-pools create mixtral-moe-gpu-pool \
   --cluster=mixtral8x7-cluster-gke \
-  --project=gke-aishared-dev \
+  --project=${PROJECT_ID} \
   --machine-type=g2-standard-48 \
   --ephemeral-storage-local-ssd=count=4 \
   --accelerator=type=nvidia-l4,count=4 \
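Swapping the hardcoded project for ${PROJECT_ID} makes the command portable across environments; the variable just needs to be set before running it, for example:

```bash
# Take the project from the active gcloud configuration
# (or export PROJECT_ID=<your-project> explicitly).
export PROJECT_ID=$(gcloud config get-value project)
echo "Node pool will be created in project: ${PROJECT_ID}"
```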
@@ -127,7 +127,7 @@ Pod Template:
   Labels:  app=mixtral8x7b
   Containers:
    mixtral8x7b:
-    Image:      us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+    Image:      ghcr.io/huggingface/text-generation-inference:1.4.3
     Port:       8080/TCP
     Host Port:  0/TCP
     Limits:
tutorials-and-examples/genAI-LLM/deploying-mixtral-8x7b-instruct-L4-gpus (Kubernetes manifest)

@@ -30,7 +30,7 @@
         cloud.google.com/gke-accelerator: "nvidia-l4"
       containers:
       - name: mixtral8x7b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: ghcr.io/huggingface/text-generation-inference:1.4.3
         ports:
         - name: server-port
           containerPort: 8080
@@ -53,6 +53,9 @@
             memory: "42Gi"
             nvidia.com/gpu: "2"
         volumeMounts:
+        # mountPath is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         - mountPath: /data
           name: ephemeral-volume
         - mountPath: /dev/shm
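Note that in this manifest /data is backed by ephemeral-volume (the local SSDs requested via --ephemeral-storage-local-ssd above) rather than a persistent volume, so the Hub cache is rebuilt if the pod is rescheduled. To verify the L4 nodes expose the GPUs the pod requests (a sketch, using the accelerator label from the manifest):

```bash
# List allocatable GPUs on the L4 nodes created for this workload.
kubectl get nodes -l cloud.google.com/gke-accelerator=nvidia-l4 \
  -o custom-columns='NODE:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
```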
tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus (first Kubernetes manifest)

@@ -76,7 +76,7 @@
     spec:
       containers:
       - name: llama-2-70b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: ghcr.io/huggingface/text-generation-inference:1.4.3
         resources:
           limits:
             nvidia.com/gpu: 2
@@ -97,6 +97,9 @@
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
+        # mountPath is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         - mountPath: /data
           name: data
       volumes:
tutorials-and-examples/genAI-LLM/serving-llama2-70b-on-l4-gpus (second Kubernetes manifest)

@@ -28,7 +28,7 @@
     spec:
       containers:
       - name: llama-2-70b
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-2.ubuntu2204.py310
+        image: ghcr.io/huggingface/text-generation-inference:1.4.3
         resources:
           limits:
             nvidia.com/gpu: 2
@@ -49,6 +49,9 @@
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
+        # mountPath is set to /data as it's the path the HF_HOME environment
+        # variable points to in the TGI container image, i.e. where the model
+        # downloaded from the Hub will be stored
         - mountPath: /data
           name: data
       volumes:
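All seven files now reference the same pinned upstream image. Once the changes are applied, a quick way to confirm every workload picked up the 1.4.3 tag (a sketch; scans all namespaces):

```bash
# Each TGI deployment should report ghcr.io/huggingface/text-generation-inference:1.4.3.
kubectl get deployments -A \
  -o custom-columns='NAME:.metadata.name,IMAGE:.spec.template.spec.containers[0].image'
```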