
Commit 0a0c86b

Add security group pods scale test in ginkgo (#457)
* Add security group pods scale test in ginkgo
* Add instructions to run scale tests manually
* Fix typo in README
1 parent 9947178 commit 0a0c86b

File tree

8 files changed: +377 -25 lines changed
scripts/test/create-cluster-karpenter.sh

Lines changed: 147 additions & 0 deletions

@@ -0,0 +1,147 @@
#!/usr/bin/env bash

# Create EKS cluster with Karpenter using eksctl
set -eo pipefail

SCRIPTS_DIR=$(cd "$(dirname "$0")" || exit 1; pwd)
source "$SCRIPTS_DIR/lib/common.sh"
check_is_installed eksctl
check_is_installed helm
check_is_installed aws

export KARPENTER_NAMESPACE="kube-system"
export KARPENTER_VERSION="1.0.1"
export K8S_VERSION="1.30"

export AWS_PARTITION="aws" # if you are not using standard partitions, you may need to configure to aws-cn / aws-us-gov
export CLUSTER_NAME="${USER}-sgp-scaletest"
export AWS_DEFAULT_REGION="us-west-2"
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
export TEMPOUT="$(mktemp)"

# Deploy CFN stack to enable Karpenter to create and manage nodes
echo "Deploying Karpenter CFN stack"
curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \
  && aws cloudformation deploy \
    --stack-name "Karpenter-${CLUSTER_NAME}" \
    --template-file "${TEMPOUT}" \
    --capabilities CAPABILITY_NAMED_IAM \
    --parameter-overrides "ClusterName=${CLUSTER_NAME}"

# Create EKS cluster
echo "Creating EKS cluster"
eksctl create cluster -f - <<EOF
---
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${AWS_DEFAULT_REGION}
  version: "${K8S_VERSION}"
  tags:
    karpenter.sh/discovery: ${CLUSTER_NAME}

iam:
  withOIDC: true
  podIdentityAssociations:
  - namespace: "${KARPENTER_NAMESPACE}"
    serviceAccountName: karpenter
    roleName: ${CLUSTER_NAME}-karpenter
    permissionPolicyARNs:
    - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}

iamIdentityMappings:
- arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
  username: system:node:{{EC2PrivateDNSName}}
  groups:
  - system:bootstrappers
  - system:nodes

managedNodeGroups:
- instanceType: c5.xlarge
  amiFamily: AmazonLinux2
  name: ${CLUSTER_NAME}-ng
  desiredCapacity: 2
  minSize: 1
  maxSize: 10

addons:
- name: eks-pod-identity-agent
EOF

export CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --query "cluster.endpoint" --output text)"
export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"

# Log out of ECR Public registry and perform unauthenticated image pull
docker logout public.ecr.aws
helm registry logout public.ecr.aws

# Install Karpenter
echo "Installing Karpenter"
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \
  --set "settings.clusterName=${CLUSTER_NAME}" \
  --set "settings.interruptionQueue=${CLUSTER_NAME}" \
  --set controller.resources.requests.cpu=1 \
  --set controller.resources.requests.memory=1Gi \
  --set controller.resources.limits.cpu=1 \
  --set controller.resources.limits.memory=1Gi \
  --wait

# Create NodePool and EC2NodeClass.
# NodePool sets constraints on the nodes that can be created by Karpenter and the pods that can run on those nodes.
# EC2NodeClass is used to configure AWS-specific settings like AMI type, AMI ID, EC2 security groups, etc.
cat <<EOF | envsubst | kubectl apply -f -
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c"]
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["2"]
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: default
      expireAfter: 720h
  limits:
    cpu: 1000
  disruption:
    consolidationPolicy: WhenEmptyOrUnderutilized
    consolidateAfter: 1m
---
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiFamily: AL2
  role: "KarpenterNodeRole-${CLUSTER_NAME}"
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
  amiSelectorTerms:
    - alias: al2@latest
EOF

echo "Enabling security group for pods on cluster"
kubectl set env daemonset aws-node -n kube-system ENABLE_POD_ENI=true
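
After `ENABLE_POD_ENI` is set, the VPC resource controller attaches trunk ENIs and eligible nodes begin advertising pod-ENI capacity. A quick sanity check before running the tests might look like the following (illustrative commands; `vpc.amazonaws.com/pod-eni` is the extended resource name the controller advertises):

```
# Confirm the flag was applied to the aws-node daemonset
kubectl get daemonset aws-node -n kube-system \
  -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ENABLE_POD_ENI")].value}'

# Nodes that support security groups for pods report allocatable pod-eni capacity
kubectl get nodes -o custom-columns='NAME:.metadata.name,POD-ENI:.status.allocatable.vpc\.amazonaws\.com/pod-eni'
```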
scripts/test/delete-cluster-karpenter.sh

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
#!/usr/bin/env bash

# Delete EKS cluster & related resources created via script create-cluster-karpenter.sh
set -eo pipefail

SCRIPTS_DIR=$(cd "$(dirname "$0")" || exit 1; pwd)
source "$SCRIPTS_DIR/lib/common.sh"
check_is_installed helm
check_is_installed eksctl
check_is_installed jq
check_is_installed aws

export KARPENTER_NAMESPACE="kube-system"
export CLUSTER_NAME="${USER}-sgp-scaletest" # Update cluster name if it is different
echo "Uninstalling Karpenter"
helm uninstall karpenter --namespace "${KARPENTER_NAMESPACE}"
echo "Deleting Karpenter CFN stack"
aws cloudformation delete-stack --stack-name "Karpenter-${CLUSTER_NAME}"
aws ec2 describe-launch-templates --filters "Name=tag:karpenter.k8s.aws/cluster,Values=${CLUSTER_NAME}" |
  jq -r ".LaunchTemplates[].LaunchTemplateName" |
  xargs -I{} aws ec2 delete-launch-template --launch-template-name {}
echo "Deleting EKS cluster"
eksctl delete cluster --name "${CLUSTER_NAME}"
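
Note that `aws cloudformation delete-stack` only initiates deletion and returns immediately. If a cleanup pipeline needs to block until the stack is actually gone, a waiter can be appended after the delete call (optional sketch, reusing the stack name from above):

```
aws cloudformation wait stack-delete-complete --stack-name "Karpenter-${CLUSTER_NAME}"
```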

test/README.md

Lines changed: 39 additions & 4 deletions
@@ -82,7 +82,42 @@ The Integration test suite provides the following focuses.
This is intended for the purposes of local development, testing, and CI setup. For more details, refer to the steps provided in `scripts/test/README.md`.

-### Future Work
-- Once we have more test suites, we can provide a script instead of invoking each suite manually.
-- Add Windows tests to the list once the support is enabled.
-- Move the script based tests in `integration-test` to Ginkgo Based integration/e2e test.

### Running Scale Tests

#### Test Pod startup latency

For each release, verify that pod startup latency is comparable to the previous release. This helps detect regressions that impact controller performance in the new release.

To run the test manually:

##### 1. Create EKS cluster and install Karpenter.

Karpenter provides node lifecycle management for Kubernetes clusters. It automates provisioning and deprovisioning of nodes based on the scheduling needs of pods, allowing efficient scaling and cost optimization.

The script will provision all required resources for the test:
1. Deploy a CFN stack to set up the EKS cluster infrastructure
2. Create the EKS cluster using eksctl
3. Install Karpenter on the cluster via helm
4. Deploy the default NodePool and EC2NodeClass. The NodePool sets constraints on the nodes that can be created by Karpenter and the pods that can run on those nodes. The EC2NodeClass is used to configure AWS-specific settings such as AMI type, AMI ID, and EC2 security groups.

Refer to the Karpenter documentation for further details.
```
./scripts/test/create-cluster-karpenter.sh
```
The scripts are located in the `scripts/test` directory.
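
Before moving on, it can be worth confirming that the Karpenter controller came up cleanly (an illustrative check; the `app.kubernetes.io/name=karpenter` label is the one applied by the Karpenter Helm chart):

```
kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter
kubectl logs -n kube-system -l app.kubernetes.io/name=karpenter --tail=20
```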

##### 2. Run the scale tests.

The scale tests are located in the `test/integration/scale` directory. The test creates a deployment with 1000 pods and measures the pod startup latency, asserting that all 1000 pods become ready within 5 minutes. The test is run three times on repeat and must pass each time.
```
KUBE_CONFIG_PATH=<path-to-kube-config> # Update the kube-config path
ginkgo -v --timeout 30m -- --cluster-kubeconfig=$KUBE_CONFIG_PATH --cluster-name=$CLUSTER_NAME --aws-region=$AWS_REGION --aws-vpc-id=$VPC_ID
```
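
The command above also references `$CLUSTER_NAME`, `$AWS_REGION`, and `$VPC_ID`. One way to populate them, assuming the cluster was created by the script in step 1 (the values shown are illustrative):

```
export KUBE_CONFIG_PATH="$HOME/.kube/config"
export CLUSTER_NAME="${USER}-sgp-scaletest"   # name used by create-cluster-karpenter.sh
export AWS_REGION="us-west-2"                 # region used by create-cluster-karpenter.sh
export VPC_ID="$(aws eks describe-cluster --name "$CLUSTER_NAME" \
  --query 'cluster.resourcesVpcConfig.vpcId' --output text)"
```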

##### 3. Delete EKS cluster and other resources.

The script below uninstalls Karpenter from the cluster, deletes the Karpenter CFN stack and any Karpenter-created launch templates, and finally deletes the EKS cluster.
```
./scripts/test/delete-cluster-karpenter.sh
```

References:
1. Karpenter Getting Started Guide: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/

test/framework/resource/aws/ec2/manager.go

Lines changed: 17 additions & 0 deletions
@@ -280,3 +280,20 @@ func (d *Manager) DeleteNetworkInterface(nwInterfaceID string) error {
     })
     return err
 }
+func (d *Manager) ReCreateSG(securityGroupName string, ctx context.Context) (string, error) {
+    groupID, err := d.GetSecurityGroupID(securityGroupName)
+    // If the security group already exists, no error is returned.
+    // In that case, delete the security group so that ingress/egress
+    // rules from the last run don't interfere with the current test.
+    if err == nil {
+        if err = d.DeleteSecurityGroup(ctx, groupID); err != nil {
+            return "", err
+        }
+    }
+    // At this point the security group either never existed or was just
+    // deleted, so create a new one.
+    if groupID, err = d.CreateSecurityGroup(securityGroupName); err != nil {
+        return "", err
+    }
+    return groupID, nil
+}

test/framework/utils/resource.go

Lines changed: 1 addition & 0 deletions
@@ -15,4 +15,5 @@ package utils

 const (
     ResourceNamePrefix = "vpc-resource-controller-integration-"
+    TestNameSpace      = "test-ns"
 )

test/integration/perpodsg/perpodsg_suite_test.go

Lines changed: 4 additions & 21 deletions
@@ -47,8 +47,10 @@ var _ = BeforeSuite(func() {
     ctx = context.Background()
     verify = verifier.NewPodVerification(frameWork, ctx)

-    securityGroupID1 = reCreateSGIfAlreadyExists(utils.ResourceNamePrefix + "sg-1")
-    securityGroupID2 = reCreateSGIfAlreadyExists(utils.ResourceNamePrefix + "sg-2")
+    securityGroupID1, err = frameWork.EC2Manager.ReCreateSG(utils.ResourceNamePrefix+"sg-1", ctx)
+    Expect(err).ToNot(HaveOccurred())
+    securityGroupID2, err = frameWork.EC2Manager.ReCreateSG(utils.ResourceNamePrefix+"sg-2", ctx)
+    Expect(err).ToNot(HaveOccurred())

     nodeList = node.GetNodeAndWaitTillCapacityPresent(frameWork.NodeManager, "linux",
         config.ResourceNamePodENI)

@@ -60,22 +62,3 @@ var _ = AfterSuite(func() {
     Expect(frameWork.EC2Manager.DeleteSecurityGroup(ctx, securityGroupID1)).To(Succeed())
     Expect(frameWork.EC2Manager.DeleteSecurityGroup(ctx, securityGroupID2)).To(Succeed())
 })
-
-func reCreateSGIfAlreadyExists(securityGroupName string) string {
-    groupID, err := frameWork.EC2Manager.GetSecurityGroupID(securityGroupName)
-    // If the security group already exists, no error will be returned
-    // We need to delete the security Group in this case so ingres/egress
-    // rules from last run don't interfere with the current test
-    if err == nil {
-        By("deleting the older security group" + groupID)
-        err = frameWork.EC2Manager.DeleteSecurityGroup(ctx, groupID)
-        Expect(err).ToNot(HaveOccurred())
-    }
-    // If error is not nil, then the Security Group doesn't exists, we need
-    // to create new rule
-    By("creating a new security group with name " + securityGroupName)
-    groupID, err = frameWork.EC2Manager.CreateSecurityGroup(securityGroupName)
-    Expect(err).ToNot(HaveOccurred())
-
-    return groupID
-}

test/integration/scale (new test file)

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
//     http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package scale_test

import (
    "time"

    "github.com/aws/amazon-vpc-resource-controller-k8s/apis/vpcresources/v1beta1"
    "github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/manifest"
    deploymentWrapper "github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/resource/k8s/deployment"
    sgpWrapper "github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/resource/k8s/sgp"
    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
    v1 "k8s.io/api/apps/v1"
)

var _ = Describe("Security group per pod scale test", func() {
    var (
        sgpLabelKey         string
        sgpLabelValue       string
        securityGroups      []string
        securityGroupPolicy *v1beta1.SecurityGroupPolicy
        err                 error
    )

    BeforeEach(func() {
        sgpLabelKey = "role"
        sgpLabelValue = "db"
        securityGroups = []string{securityGroupID}
    })

    JustBeforeEach(func() {
        // create SGP
        securityGroupPolicy, err = manifest.NewSGPBuilder().
            Namespace(namespace).
            PodMatchLabel(sgpLabelKey, sgpLabelValue).
            SecurityGroup(securityGroups).Build()
        Expect(err).NotTo(HaveOccurred())
    })

    JustAfterEach(func() {
        By("deleting security group policy")
        err = frameWork.SGPManager.DeleteAndWaitTillSecurityGroupIsDeleted(ctx, securityGroupPolicy)
        Expect(err).NotTo(HaveOccurred())
    })

    Describe("creating deployment", func() {
        var deployment *v1.Deployment

        JustBeforeEach(func() {
            deployment = manifest.NewDefaultDeploymentBuilder().
                Namespace(namespace).
                Replicas(1000).
                PodLabel(sgpLabelKey, sgpLabelValue).Build()
        })

        JustAfterEach(func() {
            By("deleting the deployment")
            err = frameWork.DeploymentManager.DeleteAndWaitUntilDeploymentDeleted(ctx, deployment)
            Expect(err).ToNot(HaveOccurred())
            time.Sleep(time.Minute) // allow time for pods to terminate
        })

        Context("when deployment is created", func() {
            It("should have all the pods running", MustPassRepeatedly(3), func() {
                start := time.Now()
                sgpWrapper.CreateSecurityGroupPolicy(frameWork.K8sClient, ctx, securityGroupPolicy)
                deploymentWrapper.
                    CreateAndWaitForDeploymentToStart(frameWork.DeploymentManager, ctx, deployment)
                duration := time.Since(start)
                verify.VerifyNetworkingOfAllPodUsingENI(namespace, sgpLabelKey, sgpLabelValue,
                    securityGroups)
                Expect(duration.Minutes()).To(BeNumerically("<", 5.0))
            })
        })
    })
})
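
To iterate on this spec alone, ginkgo's `--focus` flag can be combined with the same test flags the README uses (an illustrative invocation; the focus string matches the `Describe` text above):

```
cd test/integration/scale
ginkgo -v --timeout 30m --focus "Security group per pod scale test" -- \
  --cluster-kubeconfig=$KUBE_CONFIG_PATH --cluster-name=$CLUSTER_NAME \
  --aws-region=$AWS_REGION --aws-vpc-id=$VPC_ID
```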
