
Commit 2be1d05

ferrarimarco authored and arueth committed
feat: online inferencing with gpus (downloader) (#138)
Implement a Kubernetes Job to download models from Hugging Face to Cloud Storage.
1 parent ddc8e17 commit 2be1d05

28 files changed: +638, -30 lines
@@ -0,0 +1 @@
hugging-face-token.env
@@ -1,3 +1,7 @@
# Inference Platform reference architecture

[Inference Platform reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md)

## Inference

- [Online inference with GPUs](/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/README.md)
@@ -0,0 +1,29 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

configMapGenerator:
  - name: model-download-configmap
    envs:
      - model-download.env
resources:
  - load-model-to-cloud-storage.yaml
  - service-account.yaml
secretGenerator:
  - name: hugging-face-token-secret
    envs:
      - hugging-face-token.env

replacements:
  - source:
      kind: ConfigMap
      name: model-download-configmap
      fieldPath: data.IRA_BUCKET_NAME
    targets:
      - select:
          kind: Job
          name: transfer-model-to-gcs
        fieldPaths:
          - spec.template.spec.volumes.[name=gcsfuse].csi.volumeAttributes.bucketName
        options:
          delimiter: .
          index: 0
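
The `replacements` entry copies `data.IRA_BUCKET_NAME` from the generated ConfigMap into the Job's gcsfuse `bucketName` volume attribute at build time. A minimal sketch of one way to preview the rendered manifests and confirm the substitution, run from the directory that contains this kustomization:

```shell
# Render the kustomization and inspect the bucket name the Job will mount.
kubectl kustomize . | grep --context=2 "bucketName:"
```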
@@ -0,0 +1,112 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: transfer-model-to-gcs
spec:
  backoffLimit: 0
  template:
    metadata:
      labels:
        app: transfer-model-to-gcs
      annotations:
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/cpu-limit: "0"
        gke-gcsfuse/memory-limit: "0"
        gke-gcsfuse/ephemeral-storage-limit: "0"
    spec:
      nodeSelector:
        iam.gke.io/gke-metadata-server-enabled: "true"
      restartPolicy: Never
      terminationGracePeriodSeconds: 0
      serviceAccountName: ira-online-gpu-ksa
      containers:
        - name: transfer-model-to-gcs
          image: gcr.io/google.com/cloudsdktool/cloud-sdk:518.0.0-slim
          command:
            - bash
            - -c
            - |
              set -o errexit
              set -o nounset
              set -o pipefail

              if [ -z "${HUGGING_FACE_TOKEN:-}" ]; then
                echo "Error: HUGGING_FACE_TOKEN is not set."
                exit 1
              fi
              if [ -z "${MODEL_ID:-}" ]; then
                echo "Error: MODEL_ID is not set."
                exit 1
              fi
              if [ -z "${IRA_BUCKET_NAME:-}" ]; then
                echo "Error: IRA_BUCKET_NAME is not set."
                exit 1
              fi

              echo "Downloading ${MODEL_ID} to ${IRA_BUCKET_NAME} Cloud Storage bucket"
              echo "Debug Hugging Face token length: ${#HUGGING_FACE_TOKEN}"

              pip3 install -U "huggingface_hub[cli]==0.30.2" --break-system-packages

              huggingface-cli download --repo-type model ${MODEL_ID} --local-dir /local/temp --token ${HUGGING_FACE_TOKEN}

              rm -rfv /local/temp/.cache
              mkdir -pv "/local/${MODEL_ID}"
              mv -v /local/temp/* "/local/${MODEL_ID}/"
              rm -rfv /local/temp /local/.gcsfuse_tmp
          env:
            - name: MODEL_ID
              valueFrom:
                configMapKeyRef:
                  key: MODEL_ID
                  name: model-download-configmap
            - name: IRA_BUCKET_NAME
              valueFrom:
                configMapKeyRef:
                  key: IRA_BUCKET_NAME
                  name: model-download-configmap
            - name: HUGGING_FACE_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hugging-face-token-secret
                  key: HUGGING_FACE_TOKEN
          # If you want to consume less resources, don't install the hf_net
          # package, at the expense of download speed.
          resources:
            limits:
              cpu: 2000m
              memory: 8Gi
            requests:
              cpu: 2000m
              memory: 8Gi
          volumeMounts:
            - name: gcsfuse
              mountPath: /local
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: gcsfuse
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: cloud-storage-bucket-name
      tolerations:
        - key: "on-demand"
          value: "true"
          operator: "Equal"
          effect: "NoSchedule"
@@ -0,0 +1,2 @@
IRA_BUCKET_NAME=<IRA_BUCKET_NAME>
MODEL_ID=<MODEL_ID>
@@ -0,0 +1,18 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ira-online-gpu-ksa
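
The Job runs as this Kubernetes ServiceAccount and relies on Workload Identity Federation for GKE (note the `iam.gke.io/gke-metadata-server-enabled` node selector in the Job above) to write to the Cloud Storage bucket. The reference implementation's Terraform is expected to create the matching IAM binding; a roughly equivalent manual grant might look like the following sketch, where the project values, namespace, and bucket name are placeholders:

```shell
# Allow the ira-online-gpu-ksa ServiceAccount to read and write objects in the
# model bucket. PROJECT_NUMBER, PROJECT_ID, NAMESPACE, and the bucket name are
# placeholders.
gcloud storage buckets add-iam-policy-binding "gs://my-project-ira-model" \
  --member="principal://iam.googleapis.com/projects/PROJECT_NUMBER/locations/global/workloadIdentityPools/PROJECT_ID.svc.id.goog/subject/ns/NAMESPACE/sa/ira-online-gpu-ksa" \
  --role="roles/storage.objectUser"
```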
@@ -0,0 +1,146 @@
# Online inference with GPUs on Google Cloud

This reference architecture implements online inferencing using GPUs on Google
Cloud, and builds on top of the
[Inference Platform reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md).

## Best practices for online inferencing on Google Cloud

### Accelerator selection

### Storage solution selection

### Model selection

### Observability

### Scalability

### Cost optimization

## Architecture

## Deploy the reference architecture

This reference architecture builds on top of the infrastructure that the
[Inference Platform reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md)
provides, and follows the best practices that the reference implementation
establishes.

Before deploying the reference architecture described in this document, deploy
one instance of the Inference Platform reference implementation. You can deploy
multiple instances of the reference architecture in the same project. To deploy
the reference architecture, do the following:

1. To enable deploying resources for the online inference reference
   architecture, initialize the following configuration variable in
   `platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch.auto.tfvars`:

   ```hcl
   ira_use_case_flavor = "ira-online-gpu"
   ```

1. Deploy an instance of the Inference Platform reference implementation. For
   more information about how to deploy an instance, see
   [Inference Platform reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md).

After you deploy the reference implementation instance, continue following this
document.

52+
## Download the model to Cloud Storage
53+
54+
1. Take note of the name of the Cloud Storage bucket where the model will be
55+
downloaded:
56+
57+
```shell
58+
terraform -chdir="${ACP_PLATFORM_USE_CASE_DIR}/terraform/cloud_storage" init \
59+
&& terraform -chdir="${ACP_PLATFORM_USE_CASE_DIR}/terraform/cloud_storage" output -json ira_google_storage_bucket_names
60+
```
61+
62+
The output might contain multiple bucket names. The name of the bucket where
63+
the model will be downloaded ends with the `ira-model` suffix.
64+
65+
1. Initialize the configuration variables to set the name of the Cloud Storage
66+
bucket where the model will be downloaded in
67+
`platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/model-download.env`:
68+
69+
```shell
70+
IRA_BUCKET_NAME=<IRA_BUCKET_NAME>
71+
MODEL_ID=<MODEL_ID>
72+
```
73+
74+
Where:
75+
76+
- `<IRA_BUCKET_NAME>` is the name of the Cloud Storage bucket where the
77+
model will be downloaded.
78+
- `MODEL_ID>` is the fully qualified model identifier.
79+
80+
- For Gemma, the fully qualified model identifier is:
81+
`google/gemma-3-27b-it`
82+
- For Llama 4, the fully qualified model identifier is:
83+
`meta-llama/Llama-4-Scout-17B-16E-Instruct`
84+
- For Llama 3.3, the fully qualified model identifier is:
85+
`meta-llama/Llama-3.3-70B-Instruct`
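
   For instance, assuming Gemma 3 27B and a placeholder bucket name (substitute
   the actual name from the Terraform output):

   ```shell
   # Example model-download.env; the bucket name is a placeholder.
   IRA_BUCKET_NAME=my-project-ira-model
   MODEL_ID=google/gemma-3-27b-it
   ```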

1. [Generate a Hugging Face token](https://huggingface.co/docs/hub/security-tokens).
   Make sure to grant the
   `Read access to contents of all public gated repos you can access`
   permission to the Hugging Face token.

1. Store the Hugging Face token in
   `platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/hugging-face-token.env`:

   ```shell
   HUGGING_FACE_TOKEN=<HUGGING_FACE_TOKEN>
   ```

   Where:

   - `<HUGGING_FACE_TOKEN>` is the Hugging Face token.

   If the
   `platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/hugging-face-token.env`
   file doesn't exist, create it. One way to do that is sketched below.
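
   A minimal sketch of one way to create the file without leaving the token in
   your shell history (any equivalent method works):

   ```shell
   # Prompt for the token without echoing it, then write the env file.
   read -r -s -p "Hugging Face token: " HUGGING_FACE_TOKEN && echo
   printf 'HUGGING_FACE_TOKEN=%s\n' "${HUGGING_FACE_TOKEN}" \
     > platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/hugging-face-token.env
   ```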
106+
107+
1. Get access to the model by signing the consent agreement:
108+
109+
- For Gemma:
110+
111+
1. Access the
112+
[model consent page on Kaggle.com](https://www.kaggle.com/models/google/gemma).
113+
114+
1. Verify consent using your Hugging Face account.
115+
116+
1. Accept the model terms.
117+
118+
- For Llama:
119+
120+
1. Accept the model terms on Hugging Face
121+
122+
1. Deploy the model downloader in the GKE cluster:
123+
124+
```shell
125+
kubectl apply -k platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download
126+
```
127+
128+
1. Wait for the model downloader to download the model:
129+
130+
```shell
131+
watch --color --interval 5 --no-title \
132+
"kubectl get job/transfer-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete'"
133+
```
134+
135+
The output is similar to the following:
136+
137+
```text
138+
NAME STATUS COMPLETIONS DURATION AGE
139+
transfer-model-to-gcs Complete 1/1 33m 3h30m
140+
```
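
   Optionally, confirm that the model files are now in the bucket. A minimal
   sketch, assuming `IRA_BUCKET_NAME` and `MODEL_ID` are set in your shell to
   the values from `model-download.env`:

   ```shell
   # List the transferred model files in the Cloud Storage bucket.
   gcloud storage ls --recursive "gs://${IRA_BUCKET_NAME}/${MODEL_ID}/"
   ```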

### Roles and permissions

### Next steps

### Destroy the reference architecture

platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md (+4)

@@ -78,6 +78,10 @@ precedence over earlier ones:

## Deploy

To deploy this reference implementation, you need Terraform >= 1.8.0. For more
information about installing Terraform, see
[Install Terraform](https://developer.hashicorp.com/terraform/install).
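
A quick way to check the installed version against this requirement, as a minimal sketch:

```shell
# Print the installed Terraform version; it should report v1.8.0 or later.
terraform version
```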

```
${ACP_PLATFORM_USE_CASE_DIR}/terraform/deploy.sh
```
@@ -0,0 +1,13 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,52 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

variable "ira_cloud_storage_buckets" {
  default     = {}
  description = "Map describing the Cloud Storage buckets to create. Keys are bucket names."
  type = map(object({
    force_destroy      = bool
    versioning_enabled = bool
  }))
}

variable "ira_cloud_storage_buckets_iam_bindings" {
  default     = []
  description = "List of objects to configure Cloud IAM bindings for the Cloud Storage buckets described by the ira_cloud_storage_buckets variable. Use the same bucket names that you use in the ira_cloud_storage_buckets variable."
  type = list(object({
    bucket_name = string
    member      = string
    role        = string
  }))
}

variable "ira_use_case_flavor" {
  default = ""
  type    = string

  validation {
    condition = var.ira_use_case_flavor == "" || contains(
      [
        "ira-online-gpu",
      ],
      var.ira_use_case_flavor
    )
    error_message = "'ira_use_case_flavor' value is invalid."
  }
}

variable "ira_kubernetes_namespace" {
  default = "default"
  type    = string
}
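
These variables can be set alongside `ira_use_case_flavor` in the shared `inference-ref-arch.auto.tfvars` file referenced earlier. A minimal sketch of declaring a bucket explicitly; the bucket name is a placeholder, and whether you need to set this at all depends on the defaults the reference implementation already provides:

```shell
# Append a hypothetical example value for ira_cloud_storage_buckets to the
# shared Terraform configuration. The bucket name is a placeholder.
cat >> platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch.auto.tfvars <<'EOF'
ira_cloud_storage_buckets = {
  "my-project-ira-model" = {
    force_destroy      = false
    versioning_enabled = false
  }
}
EOF
```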
