Skip to content

Commit a176e7d

Browse files
authored
Add ability to scale on tgi custom metrics (#263)
1 parent 92055de commit a176e7d

File tree

12 files changed

+479
-15
lines changed

12 files changed

+479
-15
lines changed

benchmarks/inference-server/text-generation-inference/README.md

+2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ cp sample-terraform.tfvars terraform.tfvars
3030

3131
Fill out your `terraform.tfvars` with the desired model and server configuration, referring to the list of required and optional variables [here](#variables). Variables `credentials_config` are required.
3232

33+
Optionally configure HPA (Horizontal Pod Autoscaling) by setting `hpa_type`. Note: GMP (Google Managed Prometheus) must be enabled on this cluster (which is the default) to scale based on custom metrics. See `autoscaling.md` for more details.
34+
3335
#### Determine number of gpus
3436

3537
`gpu_count` should be configured respective to the size of the model with some overhead for the kv cache. Here's an example on figuring out how many GPUs you need to run a model:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Autoscaling TGI
2+
3+
## tl;dr
4+
5+
Recommendation: TODO
6+
7+
## Autoscaling Options
8+
9+
### CPU
10+
11+
CPU scaling is a poor choice for this workload: the TGI server starts up,
pulls the model weights, and then spends a minute or two of CPU time
crunching numbers. This causes the HPA to add a replica, which then also
spends CPU time during its own startup, causing the HPA to add yet another
replica, and so on. Eventually things settle and the HPA scales the replicas
back down, but this whole process can take up to an hour.
17+
18+
### Custom Metrics
19+
20+
Workload/custom metrics can be viewed in the Metrics Explorer at
https://console.cloud.google.com/monitoring/metrics-explorer: search for
the metric name, e.g. `tgi_batch_current_size`; the full name should be
`prometheus/tgi_batch_current_size/gauge`.
24+
25+
#### `tgi_batch_current_size`
26+
27+
TODO
28+
29+
### External Metrics
30+
31+
TODO
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Custom Metrics Stackdriver Adapter
2+
3+
Adapted from https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml
4+
5+
## Usage
6+
7+
To use this module, include it from your main terraform config, i.e.:
8+
9+
```
10+
module "custom_metrics_stackdriver_adapter" {
11+
source = "./path/to/custom-metrics-stackdriver-adapter"
12+
}
13+
```
14+
15+
For a workload identity enabled cluster, some additional configuration is
16+
needed:
17+
18+
```
19+
module "custom_metrics_stackdriver_adapter" {
20+
source = "./path/to/custom-metrics-stackdriver-adapter"
21+
workload_identity = {
22+
enabled = true
23+
project_id = "<PROJECT_ID>"
24+
}
25+
}
26+
```
27+
28+
# TODO
29+
30+
This module should be moved out of the text-generation-inference subdirectory,
31+
as it should be more broadly applicable.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
resource "kubernetes_namespace_v1" "custom-metrics" {
2+
metadata {
3+
name = "custom-metrics"
4+
}
5+
}
6+
7+
# Plain Kubernetes service account for the adapter, used only when
# workload identity is disabled (no GCP SA annotation needed).
resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-no-wi" {
  count = var.workload_identity.enabled ? 0 : 1
  metadata {
    name      = "custom-metrics-stackdriver-adapter"
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
  }
}
14+
15+
# Workload-identity variant of the adapter's service account: annotated with
# the GCP service account created below so GKE maps the KSA onto it.
resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-wi" {
  count = var.workload_identity.enabled ? 1 : 0
  metadata {
    name      = "custom-metrics-stackdriver-adapter"
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    annotations = {
      "iam.gke.io/gcp-service-account" = google_service_account.cmsa-sa[0].email
    }
  }
}
25+
26+
# Lets the adapter delegate authn/authz decisions to the API server
# (standard requirement for aggregated API servers).
resource "kubernetes_cluster_role_binding_v1" "custom-metrics-system-auth-delegator" {
  metadata {
    name = "custom-metrics:system:auth-delegator"
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = "system:auth-delegator"
  }
  subject {
    kind = "ServiceAccount"
    # Bind to whichever service account variant was actually created.
    name = (var.workload_identity.enabled
      ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name
      : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
    )
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
  }
}
44+
45+
# Grants the adapter read access to the extension-apiserver authentication
# configmap in kube-system (needed to validate incoming requests).
resource "kubernetes_role_binding_v1" "custom-metrics-auth-reader" {
  metadata {
    name      = "custom-metrics-auth-reader"
    namespace = "kube-system"
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "Role"
    name      = "extension-apiserver-authentication-reader"
  }
  subject {
    kind = "ServiceAccount"
    # Bind to whichever service account variant was actually created.
    name = (var.workload_identity.enabled
      ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name
      : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
    )
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
  }
}
64+
65+
# Read-only access to pods and nodes, which the adapter uses to map
# metrics onto Kubernetes objects.
resource "kubernetes_cluster_role_v1" "custom-metrics-resource-reader" {
  metadata {
    name = "custom-metrics-resource-reader"
  }
  rule {
    api_groups = [""]
    resources  = ["pods", "nodes", "nodes/stats"]
    verbs      = ["get", "list", "watch"]
  }
}
75+
76+
# Binds the resource-reader ClusterRole above to the adapter's
# service account.
resource "kubernetes_cluster_role_binding_v1" "custom-metrics-resource-reader" {
  metadata {
    name = "custom-metrics-resource-reader"
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role_v1.custom-metrics-resource-reader.metadata[0].name
  }
  subject {
    kind = "ServiceAccount"
    # Bind to whichever service account variant was actually created.
    name = (var.workload_identity.enabled
      ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name
      : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
    )
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
  }
}
94+
95+
# The adapter Deployment itself: a single replica running the GKE-released
# custom-metrics-stackdriver-adapter image with the new resource model.
resource "kubernetes_deployment_v1" "custom-metrics-stackdriver-adapter" {
  metadata {
    name      = "custom-metrics-stackdriver-adapter"
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    labels = {
      run     = "custom-metrics-stackdriver-adapter"
      k8s-app = "custom-metrics-stackdriver-adapter"
    }
  }
  spec {
    replicas = 1

    selector {
      match_labels = {
        run     = "custom-metrics-stackdriver-adapter"
        k8s-app = "custom-metrics-stackdriver-adapter"
      }
    }

    template {
      metadata {
        labels = {
          run                             = "custom-metrics-stackdriver-adapter"
          k8s-app                         = "custom-metrics-stackdriver-adapter"
          "kubernetes.io/cluster-service" = "true"
        }
      }

      spec {
        # Run as whichever service account variant was actually created.
        service_account_name = (var.workload_identity.enabled
          ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name
          : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name
        )

        container {
          image             = "gcr.io/gke-release/custom-metrics-stackdriver-adapter:v0.14.2-gke.0"
          image_pull_policy = "Always"
          name              = "pod-custom-metrics-stackdriver-adapter"
          command           = ["/adapter", "--use-new-resource-model=true", "--fallback-for-container-metrics=true"]
          resources {
            limits = {
              cpu    = "250m"
              memory = "200Mi"
            }
            requests = {
              cpu    = "250m"
              memory = "200Mi"
            }
          }
        }
      }
    }
  }
}
149+
150+
# ClusterIP Service fronting the adapter; the APIService resources below
# point the metrics API groups at this service.
resource "kubernetes_service_v1" "custom-metrics-stackdriver-adapter" {
  metadata {
    name      = "custom-metrics-stackdriver-adapter"
    namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    labels = {
      run                             = "custom-metrics-stackdriver-adapter"
      k8s-app                         = "custom-metrics-stackdriver-adapter"
      "kubernetes.io/cluster-service" = "true"
      "kubernetes.io/name"            = "Adapter"
    }
  }
  spec {
    selector = {
      run     = "custom-metrics-stackdriver-adapter"
      k8s-app = "custom-metrics-stackdriver-adapter"
    }
    port {
      port        = 443
      protocol    = "TCP"
      target_port = 443
    }
    type = "ClusterIP"
  }
}
174+
175+
# Registers custom.metrics.k8s.io/v1beta1 with the aggregation layer,
# served by the adapter Service.
# NOTE(review): insecure_skip_tls_verify=true mirrors the upstream adapter
# manifest; a CA bundle would be required to verify the backend instead.
resource "kubernetes_api_service_v1" "v1beta1-custom-metrics-k8s-io" {
  metadata {
    name = "v1beta1.custom.metrics.k8s.io"
  }
  spec {
    insecure_skip_tls_verify = true
    group                    = "custom.metrics.k8s.io"
    group_priority_minimum   = 100
    version_priority         = 100
    service {
      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    }
    version = "v1beta1"
  }
}
191+
192+
# Registers custom.metrics.k8s.io/v1beta2 (higher version_priority than
# v1beta1, so it is preferred) with the aggregation layer.
resource "kubernetes_api_service_v1" "v1beta2-custom-metrics-k8s-io" {
  metadata {
    name = "v1beta2.custom.metrics.k8s.io"
  }
  spec {
    insecure_skip_tls_verify = true
    group                    = "custom.metrics.k8s.io"
    group_priority_minimum   = 100
    version_priority         = 200
    service {
      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    }
    version = "v1beta2"
  }
}
208+
209+
# Registers external.metrics.k8s.io/v1beta1 with the aggregation layer,
# also served by the adapter Service.
resource "kubernetes_api_service_v1" "v1beta1-external-metrics-k8s-io" {
  metadata {
    name = "v1beta1.external.metrics.k8s.io"
  }
  spec {
    insecure_skip_tls_verify = true
    group                    = "external.metrics.k8s.io"
    group_priority_minimum   = 100
    version_priority         = 100
    service {
      name      = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name
      namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name
    }
    version = "v1beta1"
  }
}
225+
226+
# Lets the HPA controller (kube-system/horizontal-pod-autoscaler) read
# external metrics via the external-metrics-reader ClusterRole.
resource "kubernetes_cluster_role_binding_v1" "external-metrics-reader" {
  metadata {
    name = "external-metrics-reader"
  }
  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = "external-metrics-reader"
  }
  subject {
    kind      = "ServiceAccount"
    name      = "horizontal-pod-autoscaler"
    namespace = "kube-system"
  }
}
241+
242+
243+
# If workload identity is enabled, extra steps are required. We need to:
# - create a GCP service account
# - grant it the monitoring.viewer IAM role
# - bind it to the workload identity user for the cmsa
# - annotate the cmsa Kubernetes service account (done above)

# GCP service account the adapter impersonates under workload identity.
resource "google_service_account" "cmsa-sa" {
  count      = var.workload_identity.enabled ? 1 : 0
  account_id = "cmsa-sa"
  project    = var.workload_identity.project_id
}
254+
255+
# Grant the adapter's GCP service account read access to Cloud Monitoring.
# Equivalent to:
#   gcloud projects add-iam-policy-binding PROJECT_ID \
#     --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \
#     --role=roles/monitoring.viewer
#
# NOTE(review): google_project_iam_binding is AUTHORITATIVE for this role —
# it will remove any other members holding roles/monitoring.viewer on the
# project. If that is not intended, switch to google_project_iam_member.
resource "google_project_iam_binding" "cmsa-project-binding" {
  count   = var.workload_identity.enabled ? 1 : 0
  project = var.workload_identity.project_id
  role    = "roles/monitoring.viewer"
  members = [
    # Use the provider-computed email instead of hand-assembling it from
    # account_id, which silently breaks if the SA email format differs.
    "serviceAccount:${google_service_account.cmsa-sa[0].email}"
  ]
}
267+
268+
# Allow the Kubernetes service account (custom-metrics namespace) to
# impersonate the GCP service account via workload identity.
# Equivalent to:
#   gcloud iam service-accounts add-iam-policy-binding \
#     --role roles/iam.workloadIdentityUser \
#     --member "serviceAccount:PROJECT_ID.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" \
#     cmsa-sa@PROJECT_ID.iam.gserviceaccount.com
resource "google_service_account_iam_member" "cmsa-bind-to-gsa" {
  count              = var.workload_identity.enabled ? 1 : 0
  service_account_id = google_service_account.cmsa-sa[0].name
  role               = "roles/iam.workloadIdentityUser"
  member             = "serviceAccount:${var.workload_identity.project_id}.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]"
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Workload identity configuration for the adapter.
# When enabled = true, project_id must also be set so the module can create
# a GCP service account and bind it via roles/iam.workloadIdentityUser.
variable "workload_identity" {
  type = object({
    enabled    = bool
    project_id = optional(string)
  })
  default = {
    enabled = false
  }
  validation {
    # Equivalent to: enabled implies project_id is set.
    condition = (
      !var.workload_identity.enabled || var.workload_identity.project_id != null
    )
    # Fixed: the message previously referenced a non-existent
    # "workload_identity_enabled" variable name.
    error_message = "A project_id must be specified if workload_identity.enabled is set to true."
  }
}

benchmarks/inference-server/text-generation-inference/hpa-templates/hpa.cpu.yaml.tftpl

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ spec:
1010
name: tgi
1111
minReplicas: ${hpa_min_replicas}
1212
maxReplicas: ${hpa_max_replicas}
13-
targetCPUUtilizationPercentage: 50
13+
targetCPUUtilizationPercentage: ${hpa_averagevalue_target}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# HorizontalPodAutoscaler template for TGI, scaling on a custom Prometheus
# metric exported via Google Managed Prometheus (templated by Terraform).
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: tgi
  namespace: ${namespace}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: tgi
  minReplicas: ${hpa_min_replicas}
  maxReplicas: ${hpa_max_replicas}
  metrics:
  - type: Pods
    pods:
      metric:
        # GMP metric names are exposed as prometheus.googleapis.com|<name>|gauge
        name: prometheus.googleapis.com|${custom_metric_name}|gauge
      target:
        type: AverageValue
        averageValue: ${hpa_averagevalue_target}

0 commit comments

Comments
 (0)