File tree 3 files changed +35
-4
lines changed
components/llm_service/src/config
3 files changed +35
-4
lines changed Original file line number Diff line number Diff line change 153
153
"temperature" : 0.2 ,
154
154
"top_p" : 0.95 ,
155
155
"top_k" : 40 ,
156
- "max_length " : 2048
156
+ "max_tokens" : 2048
157
157
}
158
158
},
159
159
"VertexAI-ModelGarden-LLAMA2-Chat" : {
Original file line number Diff line number Diff line change 1
1
# Deploying Gemma 2B
2
+ Reference: https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-gemma-gpu-vllm
2
3
3
4
## Pre-Requisites
4
5
Kubernetes cluster with L4 GPUs nodepool
6
+ ``` shell
7
+ export CLUSTER_NAME="main-cluster"
8
+ export REGION="us-central1"
9
+ gcloud container node-pools create gpu-node-pool \
10
+ --accelerator type=nvidia-l4,count=2,gpu-driver-version=latest \
11
+ --project=${PROJECT_ID} \
12
+ --location=${REGION} \
13
+ --node-locations=${REGION}-a \
14
+ --cluster=${CLUSTER_NAME} \
15
+ --service-account gke-sa@${PROJECT_ID}.iam.gserviceaccount.com \
16
+ --machine-type=g2-standard-24 \
17
+ --disk-type pd-balanced \
18
+ --disk-size 100 \
19
+ --num-nodes=1
20
+
21
+ gcloud container node-pools list --region=${REGION} --cluster=${CLUSTER_NAME}
22
+ ```
23
+
24
+
25
+ ## HuggingFace API Token
26
+ ``` shell
27
+ export HF_TOKEN=...
28
+ ```
29
+ Create secret:
30
+ ``` shell
31
+ kubectl create secret generic hf-secret \
32
+ --from-literal=hf_api_token=$HF_TOKEN \
33
+ --dry-run=client -o yaml | kubectl apply -f -
34
+ kubectl describe secret hf-secret
35
+ ```
5
36
6
37
## Deployment
7
38
Deploy Gemma 2B LLM using ` kubectl `
Original file line number Diff line number Diff line change @@ -11,13 +11,13 @@ spec:
11
11
metadata :
12
12
labels :
13
13
app : gemma-server
14
- ai.gke.io/model : gemma-2b-it
14
+ ai.gke.io/model : gemma-1.1-2b-it
15
15
ai.gke.io/inference-server : vllm
16
16
examples.ai.gke.io/source : user-guide
17
17
spec :
18
18
containers :
19
19
- name : inference-server
20
- image : us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240220_0936_RC01
20
+ image : us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20240527_0916_RC00
21
21
resources :
22
22
requests :
23
23
cpu : " 2"
61
61
type : ClusterIP
62
62
ports :
63
63
- protocol : TCP
64
- port : 8000
64
+ port : 80
65
65
targetPort : 8000
You can’t perform that action at this time.
0 commit comments