Skip to content

Commit 88956b9

Browse files
authored
Cherry pick- branch ENI operation op latency metrics (#487)
* update branch ENI operation metrics & dev guide (#465) * measure branch ENI operation latency in seconds (#469)
1 parent bbad908 commit 88956b9

File tree

5 files changed

+46
-29
lines changed

5 files changed

+46
-29
lines changed

DEVELOPER_GUIDE.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,12 @@ make toolchain # Install required to develop the project
88

99
## Testing a code change
1010

11-
Deploy your changes to a local development cluster and run the tests against it. You will need to allowlist your account
12-
for ENI trunking before the deployment.
11+
Deploy your changes to a local development cluster and run the tests against it. You will need to allowlist your account for ENI trunking before the deployment.
12+
13+
If you are testing on an EKS beta cluster, set the following variable (it defaults to `false` in the Makefile):
14+
```sh
15+
BETA_CLUSTER=true
16+
```
1317

1418
```sh
1519
make apply-dependencies # install the cert manager and certificate

Makefile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ GOLANG_VERSION ?= $(shell cat .go-version)
1616
BUILD_IMAGE ?= public.ecr.aws/docker/library/golang:$(GOLANG_VERSION)
1717
GOARCH ?= amd64
1818
PLATFORM ?= linux/amd64
19+
USER_ROLE_ARN ?= arn:aws:iam::$(AWS_ACCOUNT):role/VPCResourceControllerRole
20+
BETA_CLUSTER ?= false
1921

2022
help: ## Display help
2123
@awk 'BEGIN {FS = ":.*##"; printf "Usage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
@@ -51,14 +53,19 @@ toolchain: ## Install developer toolchain
5153
./hack/toolchain.sh
5254

5355
apply: image check-deployment-env check-env ## Deploy controller to ~/.kube/config
56+
ifeq ($(BETA_CLUSTER), true)
57+
VPC_ID=$(shell aws eks describe-cluster --name ${CLUSTER_NAME} --region ${AWS_REGION} --endpoint https://api.beta.us-west-2.wesley.amazonaws.com --query "cluster.resourcesVpcConfig" --output json | jq '.vpcId')
58+
else
59+
VPC_ID=$(shell aws eks describe-cluster --name ${CLUSTER_NAME} --region ${AWS_REGION} --query "cluster.resourcesVpcConfig" --output json | jq '.vpcId')
60+
endif
5461
eksctl create iamserviceaccount vpc-resource-controller --namespace kube-system --cluster ${CLUSTER_NAME} --region ${AWS_REGION} \
5562
--role-name VPCResourceControllerRole \
5663
--attach-policy-arn=arn:aws:iam::aws:policy/AdministratorAccess \
5764
--override-existing-serviceaccounts \
5865
--approve
5966
kustomize build config/crd | kubectl apply -f -
6067
cd config/controller && kustomize edit set image controller=${IMAGE}
61-
kustomize build config/default | sed "s|CLUSTER_NAME|${CLUSTER_NAME}|g;s|USER_ROLE_ARN|${USER_ROLE_ARN}|g" | kubectl apply -f -
68+
kustomize build config/default | sed "s|CLUSTER_NAME|${CLUSTER_NAME}|g;s|USER_ROLE_ARN|${USER_ROLE_ARN}|g;s|VPC_ID|${VPC_ID}|g" | kubectl apply -f -
6269
kubectl patch rolebinding eks-vpc-resource-controller-rolebinding -n kube-system --patch '{"subjects":[{"kind":"ServiceAccount","name":"vpc-resource-controller","namespace":"kube-system"}]}'
6370
kubectl patch clusterrolebinding vpc-resource-controller-rolebinding --patch '{"subjects":[{"kind":"ServiceAccount","name":"vpc-resource-controller","namespace":"kube-system"}]}'
6471

config/controller/controller.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ spec:
3333
- --role-arn=USER_ROLE_ARN
3434
- --leader-elect
3535
- --metrics-bind-address=:8443
36+
- --introspect-bind-addr=:22775
37+
- --vpc-id=VPC_ID
3638
image: controller:latest
3739
name: controller
3840
resources:

pkg/provider/branch/provider.go

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -45,44 +45,47 @@ import (
4545
"sigs.k8s.io/controller-runtime/pkg/metrics"
4646
)
4747

48+
const (
49+
operationCreateBranchENI = "create_branch_eni"
50+
operationAnnotateBranchENI = "annotate_branch_eni"
51+
operationInitTrunk = "init_trunk"
52+
resourceCountLabel = "resource_count"
53+
operationLabel = "branch_provider_operation"
54+
55+
ReasonSecurityGroupRequested = "SecurityGroupRequested"
56+
ReasonResourceAllocated = "ResourceAllocated"
57+
ReasonBranchAllocationFailed = "BranchAllocationFailed"
58+
ReasonBranchENIAnnotationFailed = "BranchENIAnnotationFailed"
59+
60+
ReasonTrunkENICreationFailed = "TrunkENICreationFailed"
61+
)
62+
4863
var (
4964
branchProviderOperationsErrCount = prometheus.NewCounterVec(
5065
prometheus.CounterOpts{
5166
Name: "branch_provider_operations_err_count",
5267
Help: "The number of errors encountered for branch provider operations",
5368
},
54-
[]string{"operation"},
69+
[]string{operationLabel},
5570
)
5671

5772
branchProviderOperationLatency = prometheus.NewSummaryVec(
5873
prometheus.SummaryOpts{
59-
Name: "branch_provider_operation_latency",
60-
Help: "Branch Provider operations latency in ms",
74+
Name: "branch_provider_operation_latency",
75+
Help: "Branch Provider operations latency in seconds",
76+
Objectives: map[float64]float64{0: 0, 0.5: 0.05, 0.9: 0.01, 0.99: 0.001, 1: 0},
6177
},
62-
[]string{"operation", "resource_count"},
78+
[]string{operationLabel, resourceCountLabel},
6379
)
6480

65-
operationCreateBranchENI = "create_branch_eni"
66-
operationCreateBranchENIAndAnnotate = "create_and_annotate_branch_eni"
67-
operationInitTrunk = "init_trunk"
68-
69-
ReasonSecurityGroupRequested = "SecurityGroupRequested"
70-
ReasonResourceAllocated = "ResourceAllocated"
71-
ReasonBranchAllocationFailed = "BranchAllocationFailed"
72-
ReasonBranchENIAnnotationFailed = "BranchENIAnnotationFailed"
73-
74-
ReasonTrunkENICreationFailed = "TrunkENICreationFailed"
75-
7681
deleteQueueRequeueRequest = ctrl.Result{RequeueAfter: time.Second * 30, Requeue: true}
7782

7883
// NodeDeleteRequeueRequestDelay represents the time after which the resources belonging to a node will be cleaned
7984
// up after receiving the actual node delete event.
8085
NodeDeleteRequeueRequestDelay = time.Minute * 5
8186

8287
prometheusRegistered = false
83-
)
8488

85-
var (
8689
ErrTrunkExistInCache = fmt.Errorf("trunk eni already exist in cache")
8790
ErrTrunkNotInCache = fmt.Errorf("trunk eni not present in cache")
8891
)
@@ -131,9 +134,9 @@ func prometheusRegister() {
131134
}
132135
}
133136

134-
// timeSinceMs returns the time since MS from the start time
135-
func timeSinceMs(start time.Time) float64 {
136-
return float64(time.Since(start).Milliseconds())
137+
// timeSinceSeconds returns the time elapsed in seconds from the start time
138+
func timeSinceSeconds(start time.Time) float64 {
139+
return float64(time.Since(start).Seconds())
137140
}
138141

139142
// InitResource initializes the resource for the given node name. The initialized trunk ENI is stored in
@@ -172,9 +175,9 @@ func (b *branchENIProvider) InitResource(instance ec2.EC2Instance) error {
172175

173176
utils.SendNodeEventWithNodeName(b.apiWrapper.K8sAPI, nodeName, utils.NodeTrunkFailedInitializationReason, "The node failed initializing trunk interface", v1.EventTypeNormal, b.log)
174177
branchProviderOperationsErrCount.WithLabelValues("init").Inc()
175-
return fmt.Errorf("initalizing trunk, %w", err)
178+
return fmt.Errorf("initializing trunk, %w", err)
176179
}
177-
branchProviderOperationLatency.WithLabelValues(operationInitTrunk, "1").Observe(timeSinceMs(start))
180+
branchProviderOperationLatency.WithLabelValues(operationInitTrunk, "1").Observe(timeSinceSeconds(start))
178181

179182
// Add the Trunk ENI to cache
180183
if err := b.addTrunkToCache(nodeName, trunkENI); err != nil {
@@ -367,7 +370,7 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN
367370
}
368371

369372
branchProviderOperationLatency.WithLabelValues(operationCreateBranchENI, strconv.Itoa(resourceCount)).
370-
Observe(timeSinceMs(start))
373+
Observe(timeSinceSeconds(start))
371374

372375
jsonBytes, err := json.Marshal(branchENIs)
373376
if err != nil {
@@ -377,6 +380,7 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN
377380
return ctrl.Result{}, err
378381
}
379382

383+
start = time.Now()
380384
// Annotate the pod with the created resources
381385
err = b.apiWrapper.PodAPI.AnnotatePod(pod.Namespace, pod.Name, pod.UID,
382386
config.ResourceNamePodENI, string(jsonBytes))
@@ -393,8 +397,8 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN
393397
b.apiWrapper.K8sAPI.BroadcastEvent(pod, ReasonResourceAllocated,
394398
fmt.Sprintf("Allocated %s to the pod", string(jsonBytes)), v1.EventTypeNormal)
395399

396-
branchProviderOperationLatency.WithLabelValues(operationCreateBranchENIAndAnnotate, strconv.Itoa(resourceCount)).
397-
Observe(timeSinceMs(start))
400+
branchProviderOperationLatency.WithLabelValues(operationAnnotateBranchENI, strconv.Itoa(resourceCount)).
401+
Observe(timeSinceSeconds(start))
398402

399403
log.Info("created and annotated branch interface/s successfully", "branches", branchENIs)
400404

scripts/test/lib/config.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ function add_suffix() {
1414

1515
# IAM Role Name for Linux Node Role where VPC Resource Controller Runs. It should
1616
# have the Trunk Association Policy
17-
TRUNK_ASSOC_POLICY_NAME=$(add_suffix "AssociateTrunkInterfcePolicy")
17+
TRUNK_ASSOC_POLICY_NAME=$(add_suffix "AssociateTrunkInterfacePolicy")
1818
INSTANCE_ROLE_NAME=$(add_suffix "LinuxNodeRole")
1919

2020
# IAM Role and its Policy Names which have the permission to manage Trunk/Branch

0 commit comments

Comments
 (0)