leroyjb
diff --git a/‎best-practices/gke-batch-refarch/02_platform/monitoring/gmp/README.md
+56 b/‎best-practices/gke-batch-refarch/02_platform/monitoring/gmp/README.md
+56
diff --git a/‎best-practices/gke-batch-refarch/02_platform/monitoring/gmp/gmp-kueue-monitoring.yaml
+77 b/‎best-practices/gke-batch-refarch/02_platform/monitoring/gmp/gmp-kueue-monitoring.yaml
+77
diff --git a/‎best-practices/gke-batch-refarch/02_platform/monitoring/gmp/install-gmp.sh
+22 b/‎best-practices/gke-batch-refarch/02_platform/monitoring/gmp/install-gmp.sh
+22
@@ -0,0 +1,56 @@
+# Monitoring Kueue with Google Managed Prometheus and Cloud Monitoring
+
+This document describes how to monitor Kueue metrics using Google Managed Prometheus and Cloud Monitoring.
+
+## Overview
+
+You can configure Google Managed Prometheus to automatically collect Kueue metrics. The collected metrics are then exported and made available in Google Cloud's Monitoring service.
+
+## Viewing the Dashboard
+
+The Kueue dashboard is available in Google Cloud Monitoring. This dashboard provides a visual representation of key Kueue metrics, allowing you to quickly assess the health and performance of your Kueue system.
+
+<img src="../../../images/kueue_cloud_monitoring_1.png" width="800">
+<img src="../../../images/kueue_cloud_monitoring_2.png" width="800">
+
+## Configuring Managed Collection and Creating the Dashboard
+
+Export the `PROJECT_ID` and `REGION` environment variables, then run the following script from this directory to configure managed collection for Kueue and create the dashboard in Cloud Monitoring.
+
+```bash
+./install-gmp.sh
+```
+
+## Querying Metrics
+
+You can also query Kueue metrics directly using the [Google Cloud Monitoring - Metrics explorer](https://console.cloud.google.com/monitoring/metrics-explorer) interface. Both PromQL and MQL are supported for querying.
+
+For more information, refer to the [Cloud Monitoring Documentation](https://cloud.google.com/monitoring/charts/metrics-explorer).
+
+### Example Queries
+
+Here are some sample PromQL queries to help you get started with monitoring your Kueue system:
+
+#### Job Throughput
+
+```promql
+sum(rate(kueue_admitted_workloads_total[5m])) by (cluster_queue)
+```
+
+This query calculates the per-second rate of admitted workloads over 5 minutes for each cluster queue. Breaking throughput down by queue helps pinpoint potential bottlenecks; remove the `by (cluster_queue)` clause to get the overall system throughput.
+
+#### Resource Utilization (requires `metrics.enableClusterQueueResources`)
+
+```promql
+sum(kueue_cluster_queue_resource_usage{resource="cpu"}) by (cluster_queue) / sum(kueue_cluster_queue_nominal_quota{resource="cpu"}) by (cluster_queue)
+```
+
+This query calculates the ratio of current CPU usage to the nominal CPU quota for each queue. A value close to 1 indicates high CPU utilization. You can adapt this for memory or other resources by changing the resource label.
+
+> **Important**: This query requires the `metrics.enableClusterQueueResources` setting to be enabled in your Kueue manager's configuration. To enable this setting, follow the instructions in the [Kueue installation documentation](https://kueue.sigs.k8s.io/docs/installation/#install-a-custom-configured-released-version).
+
+#### Queue Wait Times
+```promql
+histogram_quantile(0.9, sum(rate(kueue_admission_wait_time_seconds_bucket{cluster_queue="QUEUE_NAME"}[5m])) by (le))
+```
+This query provides the 90th percentile wait time, measured over the last 5 minutes, for workloads in a specific queue. You can modify the quantile value (e.g., 0.5 for median, 0.99 for 99th percentile) to understand the wait time distribution. Replace `QUEUE_NAME` with the actual name of the queue you want to monitor.
@@ -0,0 +1,77 @@
+apiVersion: v1
+kind: ServiceAccount  # identity the ClusterRoleBinding in this file grants kueue-metrics-reader to
+automountServiceAccountToken: true
+metadata:
+  namespace: kueue-system
+  name: kueue-metrics-reader
+---
+apiVersion: v1
+kind: Secret
+type: kubernetes.io/service-account-token  # token Secret tied to the annotated ServiceAccount
+metadata:
+  namespace: kueue-system
+  name: kueue-metrics-reader-token  # referenced by the PodMonitoring credentials in this file
+  annotations:
+    kubernetes.io/service-account.name: kueue-metrics-reader
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  namespace: kueue-system
+  name: kueue-secret-reader
+rules:
+# Read-only access, restricted to the single token Secret named in resourceNames.
+- apiGroups: [""]
+  resources: ["secrets"]
+  resourceNames: ["kueue-metrics-reader-token"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: kueue-metrics-reader
+roleRef:  # NOTE(review): this ClusterRole is not defined in this file; presumably installed with Kueue -- confirm
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kueue-metrics-reader
+subjects:
+- kind: ServiceAccount
+  namespace: kueue-system
+  name: kueue-metrics-reader
+---
+apiVersion: monitoring.googleapis.com/v1
+kind: PodMonitoring  # scrapes /metrics over HTTPS from pods labelled control-plane=controller-manager
+metadata:
+  name: kueue
+  namespace: kueue-system
+spec:
+  selector:
+    matchLabels:
+      control-plane: controller-manager
+  endpoints:
+  - port: https
+    interval: 30s
+    path: /metrics
+    scheme: https
+    tls:
+      insecureSkipVerify: true  # NOTE(review): skips server certificate verification -- confirm this is intended
+    authorization:
+      type: Bearer
+      credentials:
+        secret:  # bearer token is read from the "token" key of this Secret
+          name: kueue-metrics-reader-token
+          key: token
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: gmp-system:collector:kueue-secret-reader
+  namespace: kueue-system
+subjects:  # grants the kueue-secret-reader Role to the collector ServiceAccount in gmp-system
+- kind: ServiceAccount
+  namespace: gmp-system
+  name: collector
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: kueue-secret-reader
@@ -0,0 +1,22 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fail fast with a non-zero exit status when a required environment variable is unset or empty.
+[[ -z "${PROJECT_ID}" ]] && echo -e "Please export PROJECT_ID variable (\e[95mexport PROJECT_ID=<YOUR PROJECT ID>\e[0m)\nExiting." && exit 1
+echo -e "\e[95mPROJECT_ID is set to ${PROJECT_ID}\e[0m"
+[[ -z "${REGION}" ]] && echo -e "Please export REGION variable (\e[95mexport REGION=<YOUR REGION, eg: us-central1>\e[0m)\nExiting." && exit 1
+echo -e "\e[95mREGION is set to ${REGION}\e[0m"
+# Apply the monitoring manifests; create the dashboard only if the apply succeeded. Paths are relative to the current directory.
+kubectl apply -f gmp-kueue-monitoring.yaml && \
+gcloud monitoring dashboards create --project="${PROJECT_ID}" --config-from-file=kueue-dashboard.json