Skip to content

Commit 3345790

Browse files
authored
Change TPU Metrics Source for Autoscaling (#770)
first commit
1 parent 54531da commit 3345790

File tree

2 files changed

+7
-3
lines changed

2 files changed

+7
-3
lines changed

modules/jetstream-maxtext-deployment/templates/custom-metrics-stackdriver-adapter/hpa.jetstream.yaml.tftpl

+5 -1
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,11 @@ spec:
24 24
- type: External
25 25
external:
26 26
metric:
27-
name: kubernetes.io|node|accelerator|${rule.target_query}
27+
name: prometheus.googleapis.com|${rule.target_query}|gauge
28+
selector:
29+
matchLabels:
30+
metric.labels.container: jetstream-http
31+
metric.labels.exported_namespace: default
28 32
target:
29 33
type: AverageValue
30 34
averageValue: ${rule.average_value_target}

modules/jetstream-maxtext-deployment/templates/prometheus-adapter/values.yaml.tftpl

+2 -2
Original file line number | Diff line number | Diff line change
@@ -29,10 +29,10 @@ rules:
29 29
matches: ""
30 30
as: "jetstream_slots_used_percentage"
31 31
metricsQuery: avg(<<.Series>>{<<.LabelMatchers>>,cluster="${cluster_name}"})
32-
- seriesQuery: 'kubernetes_io:node_accelerator_memory_used'
32+
- seriesQuery: 'memory_used'
33 33
resources:
34 34
template: <<.Resource>>
35 35
name:
36 36
matches: ""
37 37
as: "memory_used_percentage"
38-
metricsQuery: avg(kubernetes_io:node_accelerator_memory_used{cluster_name="${cluster_name}"}) / avg(kubernetes_io:node_accelerator_memory_total{cluster_name="${cluster_name}"})
38+
metricsQuery: avg(memory_used{cluster="${cluster_name}",exported_namespace="default",container="jetstream-http"}) / avg(memory_total{cluster="${cluster_name}",exported_namespace="default",container="jetstream-http"})

0 commit comments

Comments
 (0)