Skip to content

Commit 7da77c5

Browse files
author
Lindsay Hanks
committed
Add metrics, dashboard, and test suite
1 parent 2835a36 commit 7da77c5

15 files changed

+3864
-0
lines changed

config/grafana/grafana_dashboard.json

+2,404
Large diffs are not rendered by default.

pkg/ipamd/datastore/data_store.go

+40
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,19 @@ var (
122122
},
123123
[]string{"cidr"},
124124
)
125+
noAvailableAddrs = prometheus.NewCounter(
126+
prometheus.CounterOpts{
127+
Name: "awscni_err_no_avail_addrs",
128+
Help: "The number of IP/Prefix assignments that fail due to no available addresses at the ENI level",
129+
},
130+
)
131+
eniUtilization = prometheus.NewGaugeVec(
132+
prometheus.GaugeOpts{
133+
Name: "awscni_eni_util",
134+
Help: "The number of allocated ips partitioned by eni",
135+
},
136+
[]string{"fn"},
137+
)
125138
prometheusRegistered = false
126139
)
127140

@@ -344,6 +357,8 @@ func prometheusRegister() {
344357
prometheus.MustRegister(forceRemovedIPs)
345358
prometheus.MustRegister(totalPrefixes)
346359
prometheus.MustRegister(ipsPerCidr)
360+
prometheus.MustRegister(noAvailableAddrs)
361+
prometheus.MustRegister(eniUtilization)
347362
prometheusRegistered = true
348363
}
349364
}
@@ -521,6 +536,7 @@ func (ds *DataStore) AddENI(eniID string, deviceNumber int, isPrimary, isTrunk,
521536
DeviceNumber: deviceNumber,
522537
AvailableIPv4Cidrs: make(map[string]*CidrInfo)}
523538

539+
ds.GetENIUtilization()
524540
enis.Set(float64(len(ds.eniPool)))
525541
return nil
526542
}
@@ -714,6 +730,7 @@ func (ds *DataStore) AssignPodIPv6Address(ipamKey IPAMKey, ipamMetadata IPAMMeta
714730
return addr.Address, eni.DeviceNumber, nil
715731
}
716732
}
733+
noAvailableAddrs.Inc()
717734
return "", -1, errors.New("assignPodIPv6AddressUnsafe: no available IP addresses")
718735
}
719736

@@ -781,6 +798,7 @@ func (ds *DataStore) AssignPodIPv4Address(ipamKey IPAMKey, ipamMetadata IPAMMeta
781798
ds.log.Debugf("AssignPodIPv4Address: ENI %s does not have available addresses", eni.ID)
782799
}
783800

801+
noAvailableAddrs.Inc()
784802
ds.log.Errorf("DataStore has no available IP/Prefix addresses")
785803
return "", -1, errors.New("assignPodIPv4AddressUnsafe: no available IP/Prefix addresses")
786804
}
@@ -797,6 +815,7 @@ func (ds *DataStore) assignPodIPAddressUnsafe(addr *AddressInfo, ipamKey IPAMKey
797815
addr.IPAMMetadata = ipamMetadata
798816
addr.AssignedTime = assignedTime
799817

818+
ds.log.Debugf("IP allocation request")
800819
ds.assigned++
801820
// Prometheus gauge
802821
assignedIPs.Set(float64(ds.assigned))
@@ -813,6 +832,7 @@ func (ds *DataStore) unassignPodIPAddressUnsafe(addr *AddressInfo) {
813832
addr.IPAMKey = IPAMKey{} // unassign the addr
814833
addr.IPAMMetadata = IPAMMetadata{}
815834
ds.assigned--
835+
ds.log.Debugf("IP deallocation request")
816836
// Prometheus gauge
817837
assignedIPs.Set(float64(ds.assigned))
818838
}
@@ -866,6 +886,24 @@ func (ds *DataStore) GetIPStats(addressFamily string) *DataStoreStats {
866886
return stats
867887
}
868888

889+
// GetENIUtilization updates a Prometheus gauge vector with each ENIs id and how many ip addresses are assigned on it
890+
func (ds *DataStore) GetENIUtilization() {
891+
//eniUtilization.Reset()
892+
for _, eni := range ds.eniPool {
893+
count := 0
894+
for _, assignedAddr := range eni.AvailableIPv4Cidrs {
895+
for _, addr := range assignedAddr.IPAddresses {
896+
if addr.Assigned() {
897+
count += 1
898+
}
899+
}
900+
}
901+
utilization := count
902+
eniID := eni.ID
903+
eniUtilization.WithLabelValues(eniID).Set(float64(utilization))
904+
}
905+
}
906+
869907
// GetTrunkENI returns the trunk ENI ID or an empty string
870908
func (ds *DataStore) GetTrunkENI() string {
871909
ds.lock.Lock()
@@ -1072,6 +1110,7 @@ func (ds *DataStore) RemoveUnusedENIFromStore(warmIPTarget, minimumIPTarget, war
10721110

10731111
// Prometheus update
10741112
enis.Set(float64(len(ds.eniPool)))
1113+
ds.GetENIUtilization()
10751114
totalIPs.Set(float64(ds.total))
10761115
return removableENI
10771116
}
@@ -1126,6 +1165,7 @@ func (ds *DataStore) RemoveENIFromDataStore(eniID string, force bool) error {
11261165

11271166
// Prometheus gauge
11281167
enis.Set(float64(len(ds.eniPool)))
1168+
ds.GetENIUtilization()
11291169
return nil
11301170
}
11311171

pkg/ipamd/ipamd.go

+2
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,8 @@ func (c *IPAMContext) updateIPPoolIfRequired(ctx context.Context) {
679679
if c.shouldRemoveExtraENIs() {
680680
c.tryFreeENI()
681681
}
682+
// Prometheus Metric
683+
c.dataStore.GetENIUtilization()
682684
}
683685

684686
// decreaseDatastorePool runs every `interval` and attempts to return unused ENIs and IPs
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package warm_pool
2+
3+
import (
4+
k8sUtils "github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/utils"
5+
. "github.com/onsi/ginkgo/v2"
6+
)
7+
8+
// Environment variables are not reset before and after each test so that way multiple tests can be run to
9+
// evaluate behavior. You can run this test which will unset all warm pool environment variables. Or, if you
10+
// want to test the behavior with some of those environment variables set, alter them in that file and run it once before
11+
// you run the desired tests.
12+
var _ = Describe("clear warm env", func() {
13+
Context("Clear out environment variables for warm pool for testing", func() {
14+
15+
It("Unsetting env variables", func() {
16+
k8sUtils.UpdateEnvVarOnDaemonSetAndWaitUntilReady(f, "aws-node", "kube-system",
17+
"aws-node", map[string]string{},
18+
map[string]struct{}{
19+
"WARM_ENI_TARGET": {},
20+
"WARM_IP_TARGET": {},
21+
"MINIMUM_IP_TARGET": {},
22+
"WARM_PREFIX_TARGET": {},
23+
})
24+
})
25+
})
26+
})
+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package warm_pool
2+
3+
import (
4+
k8sUtils "github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/utils"
5+
"github.com/aws/amazon-vpc-cni-k8s/test/framework/utils"
6+
. "github.com/onsi/ginkgo/v2"
7+
"strconv"
8+
)
9+
10+
// Environment variables are not reset before and after each test so that way multiple tests can be run to
11+
// evaluate behavior. You can run this test which will unset all warm pool environment variables. Or, if you
12+
// want to test the behavior with some of those environment variables set, alter them in that file and run it once before
13+
// you run the desired tests.
14+
var _ = Describe("set warm env", func() {
15+
Context("Sets env variables", func() {
16+
17+
It("Sets env variables", func() {
18+
k8sUtils.AddEnvVarToDaemonSetAndWaitTillUpdated(f,
19+
utils.AwsNodeName, utils.AwsNodeNamespace, utils.AwsNodeName,
20+
map[string]string{
21+
"WARM_IP_TARGET": strconv.Itoa(0),
22+
"ENABLE_DYNAMIC_WARM_POOL": strconv.FormatBool(true),
23+
})
24+
})
25+
})
26+
})
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
package warm_pool
2+
3+
import (
4+
"fmt"
5+
"github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/manifest"
6+
"github.com/aws/amazon-vpc-cni-k8s/test/framework/utils"
7+
. "github.com/onsi/ginkgo/v2"
8+
. "github.com/onsi/gomega"
9+
v1 "k8s.io/api/core/v1"
10+
"time"
11+
)
12+
13+
// primaryNode is the node whose name is passed to NodeName(...) when the specs in this package
// build the busybox deployment and the curl pod.
// NOTE(review): nothing visible here assigns it — presumably suite setup does; confirm.
var primaryNode v1.Node
14+
15+
// This test scales up the cluster to maxPods, then scales it back down to minPods.
16+
var _ = Describe("use case 1", func() {
17+
Context("Quick Scale Up and Down", func() {
18+
19+
BeforeEach(func() {
20+
By("Getting Warm Pool Environment Variables Before Test")
21+
getWarmPoolEnvVars()
22+
})
23+
24+
It("Scales the cluster and checks warm pool before and after", func() {
25+
fmt.Fprintf(GinkgoWriter, "Deploying %v minimum pods\n", minPods)
26+
27+
start := time.Now().Unix()
28+
29+
fmt.Fprintf(GinkgoWriter, "Scaling cluster up to %v pods\n", minPods)
30+
deploymentSpec := manifest.NewBusyBoxDeploymentBuilder(f.Options.TestImageRegistry).
31+
Namespace("default").
32+
Name("busybox").
33+
NodeName(primaryNode.Name).
34+
Namespace(utils.DefaultTestNamespace).
35+
Replicas(minPods).
36+
Build()
37+
38+
_, err := f.K8sResourceManagers.
39+
DeploymentManager().
40+
CreateAndWaitTillDeploymentIsReady(deploymentSpec, utils.DefaultDeploymentReadyTimeout*5)
41+
Expect(err).ToNot(HaveOccurred())
42+
43+
if minPods != 0 {
44+
time.Sleep(sleep)
45+
}
46+
47+
fmt.Fprintf(GinkgoWriter, "Scaling cluster up to %v pods\n", maxPods)
48+
quickScale(maxPods)
49+
50+
Expect(maxPods).To(Equal(busyboxPodCnt()))
51+
52+
fmt.Fprintf(GinkgoWriter, "Scaling cluster down to %v pods\n", minPods)
53+
quickScale(minPods)
54+
55+
end := time.Now().Unix()
56+
57+
fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Start Time: %v\n", start))
58+
fmt.Fprintf(GinkgoWriter, fmt.Sprintf("End Time: %v\n", end))
59+
60+
By("Starting Curl Container")
61+
curlContainer := manifest.NewCurlContainer().
62+
Command([]string{"sleep", "1000"}).Build()
63+
64+
getCurlPod := manifest.NewDefaultPodBuilder().
65+
Name("curl-pod").
66+
Namespace(utils.DefaultTestNamespace).
67+
NodeName(primaryNode.Name).
68+
HostNetwork(true).
69+
Container(curlContainer).
70+
Build()
71+
72+
testPod, err := f.K8sResourceManagers.PodManager().
73+
CreateAndWaitTillPodCompleted(getCurlPod)
74+
75+
logs, errLogs := f.K8sResourceManagers.PodManager().
76+
PodLogs(testPod.Namespace, testPod.Name)
77+
Expect(errLogs).ToNot(HaveOccurred())
78+
fmt.Fprintln(GinkgoWriter, logs)
79+
80+
By("Fetching metrics via Curl Container")
81+
getMetrics(start, end)
82+
83+
By("Deleting the deployment")
84+
err = f.K8sResourceManagers.DeploymentManager().DeleteAndWaitTillDeploymentIsDeleted(deploymentSpec)
85+
Expect(err).NotTo(HaveOccurred())
86+
87+
By("Deleting Curl Container")
88+
err = f.K8sResourceManagers.PodManager().DeleteAndWaitTillPodDeleted(getCurlPod)
89+
Expect(err).NotTo(HaveOccurred())
90+
})
91+
92+
AfterEach(func() {
93+
By("Getting Warm Pool Environment Variables After Test")
94+
getWarmPoolEnvVars()
95+
96+
})
97+
})
98+
})
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package warm_pool
2+
3+
import (
4+
"fmt"
5+
"github.com/aws/amazon-vpc-cni-k8s/test/framework/resources/k8s/manifest"
6+
"github.com/aws/amazon-vpc-cni-k8s/test/framework/utils"
7+
"strconv"
8+
"time"
9+
10+
. "github.com/onsi/ginkgo/v2"
11+
. "github.com/onsi/gomega"
12+
)
13+
14+
// This test replicates sawtooth behavior by adding a fixed amount of pods and removing the same fixed amount of pods
15+
// over a preset number of iterations.
16+
var _ = Describe("use case 2", func() {
17+
Context("Sawtooth Fixed Add and Subtract", func() {
18+
19+
BeforeEach(func() {
20+
By("Getting Warm Pool Environment Variables Before Test")
21+
getWarmPoolEnvVars()
22+
})
23+
24+
It("Scales the cluster and checks warm pool before and after", func() {
25+
replicas := minPods
26+
27+
start := time.Now().Unix()
28+
29+
fmt.Fprintf(GinkgoWriter, "Deploying %v minimum pods\n", minPods)
30+
deploymentSpec := manifest.NewBusyBoxDeploymentBuilder(f.Options.TestImageRegistry).
31+
Namespace("default").
32+
Name("busybox").
33+
NodeName(primaryNode.Name).
34+
Namespace(utils.DefaultTestNamespace).
35+
Replicas(replicas).
36+
Build()
37+
38+
_, err := f.K8sResourceManagers.
39+
DeploymentManager().
40+
CreateAndWaitTillDeploymentIsReady(deploymentSpec, utils.DefaultDeploymentReadyTimeout*5)
41+
Expect(err).ToNot(HaveOccurred())
42+
43+
if minPods != 0 {
44+
time.Sleep(sleep)
45+
}
46+
47+
for i := 0; i < iterations; i++ {
48+
By("Loop " + strconv.Itoa(i))
49+
replicas = checkInRange(replicas + iterPods)
50+
fmt.Fprintf(GinkgoWriter, "Scaling cluster up to %v pods\n", replicas)
51+
quickScale(replicas)
52+
Expect(replicas).To(Equal(busyboxPodCnt()))
53+
54+
replicas = checkInRange(replicas - iterPods)
55+
fmt.Fprintf(GinkgoWriter, "Scaling cluster down to %v pods\n", replicas)
56+
quickScale(replicas)
57+
Expect(replicas).To(Equal(busyboxPodCnt()))
58+
}
59+
60+
Expect(minPods).To(Equal(busyboxPodCnt()))
61+
62+
end := time.Now().Unix()
63+
64+
fmt.Fprintf(GinkgoWriter, fmt.Sprintf("Start Time: %v\n", start))
65+
fmt.Fprintf(GinkgoWriter, fmt.Sprintf("End Time: %v\n", end))
66+
67+
By("Starting Curl Container")
68+
curlContainer := manifest.NewCurlContainer().
69+
Command([]string{"sleep", "3600"}).Build()
70+
71+
getCurlPod := manifest.NewDefaultPodBuilder().
72+
Name("curl-pod").
73+
Namespace(utils.DefaultTestNamespace).
74+
NodeName(primaryNode.Name).
75+
HostNetwork(true).
76+
Container(curlContainer).
77+
Build()
78+
79+
testPod, err := f.K8sResourceManagers.PodManager().
80+
CreateAndWaitTillPodCompleted(getCurlPod)
81+
82+
logs, errLogs := f.K8sResourceManagers.PodManager().
83+
PodLogs(testPod.Namespace, testPod.Name)
84+
Expect(errLogs).ToNot(HaveOccurred())
85+
fmt.Fprintln(GinkgoWriter, logs)
86+
87+
By("Fetching metrics via Curl Container")
88+
getMetrics(start, end)
89+
90+
By("Deleting the deployment")
91+
err = f.K8sResourceManagers.DeploymentManager().DeleteAndWaitTillDeploymentIsDeleted(deploymentSpec)
92+
Expect(err).NotTo(HaveOccurred())
93+
94+
By("Deleting Curl Container")
95+
err = f.K8sResourceManagers.PodManager().DeleteAndWaitTillPodDeleted(getCurlPod)
96+
Expect(err).NotTo(HaveOccurred())
97+
})
98+
99+
AfterEach(func() {
100+
By("Getting Warm Pool Environment Variables After Test")
101+
getWarmPoolEnvVars()
102+
})
103+
})
104+
})

0 commit comments

Comments
 (0)