# Online inference with GPUs on Google Cloud

This reference architecture implements online inferencing using GPUs on Google
Cloud. It builds on top of the
[Inference Platform reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md).

## Best practices for online inferencing on Google Cloud

### Accelerator selection

### Storage solution selection

### Model selection

### Observability

### Scalability

### Cost optimization

## Architecture

## Deploy the reference architecture

This reference architecture builds on top of the infrastructure that the
[Inference Platform reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md)
provides, and follows the best practices that the reference implementation
establishes.

Before you deploy the reference architecture described in this document,
deploy one instance of the Inference Platform reference implementation. You
can deploy multiple instances of the reference architecture in the same
project. To deploy the reference architecture, do the following:

1. To enable deploying resources for the online inference reference
   architecture, initialize the following configuration variables in
   `platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch.auto.tfvars`:

   ```hcl
   ira_use_case_flavor = "ira-online-gpu"
   ```
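
   If you prefer to set the variable from the command line, the following
   sketch appends it to the file from the repository root. It assumes the
   variable isn't already defined in the file:

   ```shell
   # Append the use case flavor to the shared Terraform configuration.
   echo 'ira_use_case_flavor = "ira-online-gpu"' \
     >> platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/inference-ref-arch.auto.tfvars
   ```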

1. Deploy an instance of the Inference Platform reference implementation. For
   more information about how to deploy an instance, see
   [Inference Platform reference implementation](/platforms/gke/base/use-cases/inference-ref-arch/terraform/README.md).

   After you deploy the reference implementation instance, continue following
   this document.

## Download the model to Cloud Storage

1. Take note of the name of the Cloud Storage bucket where the model will be
   downloaded:

   ```shell
   terraform -chdir="${ACP_PLATFORM_USE_CASE_DIR}/terraform/cloud_storage" init \
     && terraform -chdir="${ACP_PLATFORM_USE_CASE_DIR}/terraform/cloud_storage" output -json ira_google_storage_bucket_names
   ```

   The output might contain multiple bucket names. The name of the bucket
   where the model will be downloaded ends with the `ira-model` suffix.
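
   If you have the `jq` CLI installed, you can extract that bucket name
   directly. This sketch assumes the Terraform output is a JSON list of
   bucket name strings:

   ```shell
   # Print only the bucket whose name ends with the ira-model suffix.
   terraform -chdir="${ACP_PLATFORM_USE_CASE_DIR}/terraform/cloud_storage" output -json ira_google_storage_bucket_names \
     | jq -r '.[] | select(endswith("ira-model"))'
   ```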

1. Initialize the configuration variables in
   `platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/model-download.env`
   to set the name of the Cloud Storage bucket where the model will be
   downloaded (a filled-in example follows the variable descriptions):

   ```shell
   IRA_BUCKET_NAME=<IRA_BUCKET_NAME>
   MODEL_ID=<MODEL_ID>
   ```

   Where:

   - `<IRA_BUCKET_NAME>` is the name of the Cloud Storage bucket where the
     model will be downloaded.
   - `<MODEL_ID>` is the fully qualified model identifier:

     - For Gemma, the fully qualified model identifier is:
       `google/gemma-3-27b-it`
     - For Llama 4, the fully qualified model identifier is:
       `meta-llama/Llama-4-Scout-17B-16E-Instruct`
     - For Llama 3.3, the fully qualified model identifier is:
       `meta-llama/Llama-3.3-70B-Instruct`
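
   For example, a completed `model-download.env` for Gemma might look like
   the following. The bucket name `example-project-ira-model` is
   hypothetical; use the name you noted in the previous step:

   ```shell
   IRA_BUCKET_NAME=example-project-ira-model
   MODEL_ID=google/gemma-3-27b-it
   ```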

1. [Generate a Hugging Face token](https://huggingface.co/docs/hub/security-tokens).
   Make sure to grant the
   `Read access to contents of all public gated repos you can access`
   permission to the Hugging Face token.
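
   Optionally, you can check that the token is valid before using it. This
   sketch calls the Hugging Face Hub `whoami-v2` API endpoint and assumes
   the token is in the `HUGGING_FACE_TOKEN` environment variable:

   ```shell
   # A valid token returns a JSON document that describes your account.
   curl --silent --fail \
     --header "Authorization: Bearer ${HUGGING_FACE_TOKEN}" \
     https://huggingface.co/api/whoami-v2
   ```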

1. Store the Hugging Face token in
   `platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/hugging-face-token.env`:

   ```shell
   HUGGING_FACE_TOKEN=<HUGGING_FACE_TOKEN>
   ```

   Where:

   - `<HUGGING_FACE_TOKEN>` is the Hugging Face token.

   If the
   `platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/hugging-face-token.env`
   file doesn't exist, create it.
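
   One way to create the file without leaving the token in your shell
   history is to read it from an interactive prompt. A sketch, run from the
   repository root:

   ```shell
   # Prompt for the token without echoing it, then write the env file.
   read -r -s -p "Hugging Face token: " HUGGING_FACE_TOKEN && echo
   printf 'HUGGING_FACE_TOKEN=%s\n' "${HUGGING_FACE_TOKEN}" \
     > platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download/hugging-face-token.env
   ```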

1. Get access to the model by signing the consent agreement:

   - For Gemma:

     1. Access the
        [model consent page on Kaggle.com](https://www.kaggle.com/models/google/gemma).

     1. Verify consent using your Hugging Face account.

     1. Accept the model terms.

   - For Llama:

     1. Accept the model terms on Hugging Face.

1. Deploy the model downloader in the GKE cluster:

   ```shell
   kubectl apply -k platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/model-download
   ```

1. Wait for the model downloader to download the model:

   ```shell
   watch --color --interval 5 --no-title \
     "kubectl get job/transfer-model-to-gcs | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e 'Complete'"
   ```

   The output is similar to the following:

   ```text
   NAME                    STATUS     COMPLETIONS   DURATION   AGE
   transfer-model-to-gcs   Complete   1/1           33m        3h30m
   ```
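
   If the job doesn't reach the `Complete` status, you can inspect the
   downloader logs:

   ```shell
   kubectl logs job/transfer-model-to-gcs --follow
   ```

   After the job completes, you can optionally confirm that the model files
   are in the bucket. This sketch assumes the downloader stores the model
   under an object path that matches the model identifier; substitute the
   values from `model-download.env`:

   ```shell
   gcloud storage ls --recursive "gs://<IRA_BUCKET_NAME>/<MODEL_ID>"
   ```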

### Roles and permissions

### Next steps

### Destroy the reference architecture