Skip to content

Commit b5699de

Browse files
authored
fix: paginate DescribeNetworkInterfaces with deep filters (aws#375)
* fix: paginate DescribeNetworkInterfaces with deep filters * update metrics and address review comments * minor updates to address comments
1 parent c10421f commit b5699de

File tree

11 files changed

+193
-132
lines changed

11 files changed

+193
-132
lines changed

main.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ func main() {
107107
var healthCheckTimeout int
108108
var enableWindowsPrefixDelegation bool
109109
var region string
110+
var vpcID string
110111

111112
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080",
112113
"The address the metric endpoint binds to.")
@@ -141,6 +142,7 @@ func main() {
141142
flag.BoolVar(&enableWindowsPrefixDelegation, "enable-windows-prefix-delegation", false,
142143
"Enable the feature flag for Windows prefix delegation")
143144
flag.StringVar(&region, "aws-region", "", "The aws region of the k8s cluster")
145+
flag.StringVar(&vpcID, "vpc-id", "", "The vpc-id where EKS cluster is deployed")
144146

145147
flag.Parse()
146148

@@ -183,6 +185,11 @@ func main() {
183185
os.Exit(1)
184186
}
185187

188+
if vpcID == "" {
189+
setupLog.Error(fmt.Errorf("vpc-id is a required parameter"), "unable to start the controller")
190+
os.Exit(1)
191+
}
192+
186193
// Profiler disabled by default, to enable set the enableProfiling argument
187194
if enableProfiling {
188195
// To use the profiler - https://golang.org/pkg/net/http/pprof/
@@ -336,6 +343,7 @@ func main() {
336343
EC2Wrapper: ec2Wrapper,
337344
ClusterName: clusterName,
338345
Log: ctrl.Log.WithName("eni cleaner"),
346+
VPCID: vpcID,
339347
}).SetupWithManager(ctx, mgr, healthzHandler); err != nil {
340348
setupLog.Error(err, "unable to start eni cleaner")
341349
os.Exit(1)

mocks/amazon-vcp-resource-controller-k8s/pkg/aws/ec2/api/mock_ec2_apihelper.go

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

mocks/amazon-vcp-resource-controller-k8s/pkg/aws/ec2/api/mock_ec2_wrapper.go

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/aws/ec2/api/eni_cleanup.go

Lines changed: 64 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ type ENICleaner struct {
3434
EC2Wrapper EC2Wrapper
3535
ClusterName string
3636
Log logr.Logger
37+
VPCID string
3738

3839
availableENIs map[string]struct{}
3940
shutdown bool
@@ -42,16 +43,22 @@ type ENICleaner struct {
4243
}
4344

4445
var (
45-
vpcCniLeakedENICleanupCnt = prometheus.NewCounter(
46-
prometheus.CounterOpts{
47-
Name: "vpc_cni_created_leaked_eni_cleanup_count",
48-
Help: "The number of leaked ENIs created by VPC-CNI that is cleaned up by the controller",
46+
vpccniAvailableENICnt = prometheus.NewGauge(
47+
prometheus.GaugeOpts{
48+
Name: "vpc_cni_created_available_eni_count",
49+
Help: "The number of available ENIs created by VPC-CNI that controller will try to delete in each cleanup cycle",
4950
},
5051
)
51-
vpcrcLeakedENICleanupCnt = prometheus.NewCounter(
52-
prometheus.CounterOpts{
53-
Name: "vpc_rc_created_leaked_eni_cleanup_count",
54-
Help: "The number of leaked ENIs created by VPC-RC that is cleaned up by the controller",
52+
vpcrcAvailableENICnt = prometheus.NewGauge(
53+
prometheus.GaugeOpts{
54+
Name: "vpc_rc_created_available_eni_count",
55+
Help: "The number of available ENIs created by VPC-RC that controller will try to delete in each cleanup cycle",
56+
},
57+
)
58+
leakedENICnt = prometheus.NewGauge(
59+
prometheus.GaugeOpts{
60+
Name: "leaked_eni_count",
61+
Help: "The number of available ENIs that failed to be deleted by the controller in each cleanup cycle",
5562
},
5663
)
5764
)
@@ -101,6 +108,9 @@ func (e *ENICleaner) Start(ctx context.Context) error {
101108
// interval between cycle 1 and 2 and hence can be safely deleted. And we can also conclude that Interface 1 was
102109
// created but not attached at the the time when 1st cycle ran and hence it should not be deleted.
103110
func (e *ENICleaner) cleanUpAvailableENIs() {
111+
vpcrcAvailableCount := 0
112+
vpccniAvailableCount := 0
113+
leakedENICount := 0
104114
describeNetworkInterfaceIp := &ec2.DescribeNetworkInterfacesInput{
105115
Filters: []*ec2.Filter{
106116
{
@@ -116,63 +126,65 @@ func (e *ENICleaner) cleanUpAvailableENIs() {
116126
Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue,
117127
config.NetworkInterfaceOwnerVPCCNITagValue}),
118128
},
129+
{
130+
Name: aws.String("vpc-id"),
131+
Values: []*string{aws.String(e.VPCID)},
132+
},
119133
},
120134
}
121135

122136
availableENIs := make(map[string]struct{})
123137

124-
for {
125-
describeNetworkInterfaceOp, err := e.EC2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfaceIp)
126-
if err != nil {
127-
e.Log.Error(err, "failed to describe network interfaces, will retry")
128-
return
129-
}
130-
131-
for _, networkInterface := range describeNetworkInterfaceOp.NetworkInterfaces {
132-
if _, exists := e.availableENIs[*networkInterface.NetworkInterfaceId]; exists {
133-
// Increment promethues metrics for number of leaked ENIs cleaned up
134-
if tagIdx := slices.IndexFunc(networkInterface.TagSet, func(tag *ec2.Tag) bool {
135-
return *tag.Key == config.NetworkInterfaceOwnerTagKey
136-
}); tagIdx != -1 {
137-
switch *networkInterface.TagSet[tagIdx].Value {
138-
case config.NetworkInterfaceOwnerTagValue:
139-
vpcrcLeakedENICleanupCnt.Inc()
140-
case config.NetworkInterfaceOwnerVPCCNITagValue:
141-
vpcCniLeakedENICleanupCnt.Inc()
142-
default:
143-
// We will not hit this case as we only filter for above two tag values, adding it for any future use cases
144-
e.Log.Info("found available ENI not created by VPC-CNI/VPC-RC")
145-
}
146-
}
138+
networkInterfaces, err := e.EC2Wrapper.DescribeNetworkInterfacesPages(describeNetworkInterfaceIp)
139+
if err != nil {
140+
e.Log.Error(err, "failed to describe network interfaces, cleanup will be retried in next cycle")
141+
return
142+
}
147143

148-
// The ENI in available state has been sitting for at least the eni clean up interval and it should
149-
// be removed
150-
_, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{
151-
NetworkInterfaceId: networkInterface.NetworkInterfaceId,
152-
})
153-
if err != nil {
154-
// Log and continue, if the ENI is still present it will be cleaned up in next 2 cycles
155-
e.Log.Error(err, "failed to delete the dangling network interface",
156-
"id", *networkInterface.NetworkInterfaceId)
144+
for _, networkInterface := range networkInterfaces {
145+
if _, exists := e.availableENIs[*networkInterface.NetworkInterfaceId]; exists {
146+
// Increment promethues metrics for number of leaked ENIs cleaned up
147+
if tagIdx := slices.IndexFunc(networkInterface.TagSet, func(tag *ec2.Tag) bool {
148+
return *tag.Key == config.NetworkInterfaceOwnerTagKey
149+
}); tagIdx != -1 {
150+
switch *networkInterface.TagSet[tagIdx].Value {
151+
case config.NetworkInterfaceOwnerTagValue:
152+
vpcrcAvailableCount += 1
153+
case config.NetworkInterfaceOwnerVPCCNITagValue:
154+
vpccniAvailableCount += 1
155+
default:
156+
// We should not hit this case as we only filter for relevant tag values, log error and continue if unexpected ENIs found
157+
e.Log.Error(fmt.Errorf("found available ENI not created by VPC-CNI/VPC-RC"), "eniID", *networkInterface.NetworkInterfaceId)
157158
continue
158159
}
159-
e.Log.Info("deleted dangling ENI successfully",
160-
"eni id", networkInterface.NetworkInterfaceId)
161-
} else {
162-
// Seeing the ENI for the first time, add it to the new list of available network interfaces
163-
availableENIs[*networkInterface.NetworkInterfaceId] = struct{}{}
164-
e.Log.V(1).Info("adding eni to to the map of available ENIs, will be removed if present in "+
165-
"next run too", "id", *networkInterface.NetworkInterfaceId)
166160
}
167-
}
168161

169-
if describeNetworkInterfaceOp.NextToken == nil {
170-
break
162+
// The ENI in available state has been sitting for at least the eni clean up interval and it should
163+
// be removed
164+
_, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{
165+
NetworkInterfaceId: networkInterface.NetworkInterfaceId,
166+
})
167+
if err != nil {
168+
leakedENICount += 1
169+
// Log and continue, if the ENI is still present it will be cleaned up in next 2 cycles
170+
e.Log.Error(err, "failed to delete the dangling network interface",
171+
"id", *networkInterface.NetworkInterfaceId)
172+
continue
173+
}
174+
e.Log.Info("deleted dangling ENI successfully",
175+
"eni id", networkInterface.NetworkInterfaceId)
176+
} else {
177+
// Seeing the ENI for the first time, add it to the new list of available network interfaces
178+
availableENIs[*networkInterface.NetworkInterfaceId] = struct{}{}
179+
e.Log.V(1).Info("adding eni to to the map of available ENIs, will be removed if present in "+
180+
"next run too", "id", *networkInterface.NetworkInterfaceId)
171181
}
172-
173-
describeNetworkInterfaceIp.NextToken = describeNetworkInterfaceOp.NextToken
174182
}
175183

184+
// Update leaked ENI metrics
185+
vpcrcAvailableENICnt.Set(float64(vpcrcAvailableCount))
186+
vpccniAvailableENICnt.Set(float64(vpccniAvailableCount))
187+
leakedENICnt.Set(float64(leakedENICount))
176188
// Set the available ENIs to the list of ENIs seen in the current cycle
177189
e.availableENIs = availableENIs
178190
}

pkg/aws/ec2/api/eni_cleanup_test.go

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ var (
3737
mockNetworkInterfaceId2 = "eni-000000000000001"
3838
mockNetworkInterfaceId3 = "eni-000000000000002"
3939

40+
mockVPCID = "vpc-0000000000000000"
41+
4042
mockDescribeNetworkInterfaceIp = &ec2.DescribeNetworkInterfacesInput{
4143
Filters: []*ec2.Filter{
4244
{
@@ -52,19 +54,19 @@ var (
5254
Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue,
5355
config.NetworkInterfaceOwnerVPCCNITagValue}),
5456
},
57+
{
58+
Name: aws.String("vpc-id"),
59+
Values: []*string{aws.String(mockVPCID)},
60+
},
5561
},
5662
}
57-
mockDescribeInterfaceOpWith1And2 = &ec2.DescribeNetworkInterfacesOutput{
58-
NetworkInterfaces: []*ec2.NetworkInterface{
59-
{NetworkInterfaceId: &mockNetworkInterfaceId1},
60-
{NetworkInterfaceId: &mockNetworkInterfaceId2},
61-
},
63+
mockDescribeInterfaceOpWith1And2 = []*ec2.NetworkInterface{
64+
{NetworkInterfaceId: &mockNetworkInterfaceId1},
65+
{NetworkInterfaceId: &mockNetworkInterfaceId2},
6266
}
63-
mockDescribeInterfaceOpWith1And3 = &ec2.DescribeNetworkInterfacesOutput{
64-
NetworkInterfaces: []*ec2.NetworkInterface{
65-
{NetworkInterfaceId: &mockNetworkInterfaceId1},
66-
{NetworkInterfaceId: &mockNetworkInterfaceId3},
67-
},
67+
mockDescribeInterfaceOpWith1And3 = []*ec2.NetworkInterface{
68+
{NetworkInterfaceId: &mockNetworkInterfaceId1},
69+
{NetworkInterfaceId: &mockNetworkInterfaceId3},
6870
}
6971
)
7072

@@ -74,6 +76,7 @@ func getMockENICleaner(ctrl *gomock.Controller) (*ENICleaner, *mock_api.MockEC2W
7476
EC2Wrapper: mockEC2Wrapper,
7577
availableENIs: map[string]struct{}{},
7678
Log: zap.New(zap.UseDevMode(true)),
79+
VPCID: mockVPCID,
7780
clusterNameTagKey: mockClusterNameTagKey,
7881
ctx: context.Background(),
7982
}, mockEC2Wrapper
@@ -85,10 +88,10 @@ func TestENICleaner_cleanUpAvailableENIs(t *testing.T) {
8588

8689
gomock.InOrder(
8790
// Return network interface 1 and 2 in first cycle
88-
mockWrapper.EXPECT().DescribeNetworkInterfaces(mockDescribeNetworkInterfaceIp).
91+
mockWrapper.EXPECT().DescribeNetworkInterfacesPages(mockDescribeNetworkInterfaceIp).
8992
Return(mockDescribeInterfaceOpWith1And2, nil),
9093
// Return network interface 1 and 3 in the second cycle
91-
mockWrapper.EXPECT().DescribeNetworkInterfaces(mockDescribeNetworkInterfaceIp).
94+
mockWrapper.EXPECT().DescribeNetworkInterfacesPages(mockDescribeNetworkInterfaceIp).
9295
Return(mockDescribeInterfaceOpWith1And3, nil),
9396
// Expect to delete the network interface 1
9497
mockWrapper.EXPECT().DeleteNetworkInterface(

pkg/aws/ec2/api/helper.go

Lines changed: 13 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ type EC2APIHelper interface {
7979
ipResourceCount *config.IPResourceCount, interfaceType *string) (*ec2.NetworkInterface, error)
8080
DeleteNetworkInterface(interfaceId *string) error
8181
GetSubnet(subnetId *string) (*ec2.Subnet, error)
82-
GetBranchNetworkInterface(trunkID *string) ([]*ec2.NetworkInterface, error)
82+
GetBranchNetworkInterface(trunkID *string, subnetID *string) ([]*ec2.NetworkInterface, error)
8383
GetInstanceNetworkInterface(instanceId *string) ([]*ec2.InstanceNetworkInterface, error)
8484
DescribeNetworkInterfaces(nwInterfaceIds []*string) ([]*ec2.NetworkInterface, error)
8585
DescribeTrunkInterfaceAssociation(trunkInterfaceId *string) ([]*ec2.TrunkInterfaceAssociation, error)
@@ -562,43 +562,20 @@ func (h *ec2APIHelper) UnassignIPv4Resources(eniID string, resourceType config.R
562562
return err
563563
}
564564

565-
func (h *ec2APIHelper) GetBranchNetworkInterface(trunkID *string) ([]*ec2.NetworkInterface, error) {
566-
filters := []*ec2.Filter{{
567-
Name: aws.String("tag:" + config.TrunkENIIDTag),
568-
Values: []*string{trunkID},
569-
}}
570-
571-
describeNetworkInterfacesInput := &ec2.DescribeNetworkInterfacesInput{Filters: filters}
572-
var nwInterfaces []*ec2.NetworkInterface
573-
for {
574-
describeNetworkInterfaceOutput, err := h.ec2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfacesInput)
575-
if err != nil {
576-
return nil, err
577-
}
578-
579-
if describeNetworkInterfaceOutput == nil || describeNetworkInterfaceOutput.NetworkInterfaces == nil ||
580-
len(describeNetworkInterfaceOutput.NetworkInterfaces) == 0 {
581-
// No more interface associated with the trunk, return the result
582-
break
583-
}
584-
585-
// One or more interface associated with the trunk, return the result
586-
for _, nwInterface := range describeNetworkInterfaceOutput.NetworkInterfaces {
587-
// Only attach the required details to avoid consuming extra memory
588-
nwInterfaces = append(nwInterfaces, &ec2.NetworkInterface{
589-
NetworkInterfaceId: nwInterface.NetworkInterfaceId,
590-
TagSet: nwInterface.TagSet,
591-
})
592-
}
593-
594-
if describeNetworkInterfaceOutput.NextToken == nil {
595-
break
596-
}
597-
598-
describeNetworkInterfacesInput.NextToken = describeNetworkInterfaceOutput.NextToken
565+
func (h *ec2APIHelper) GetBranchNetworkInterface(trunkID *string, subnetID *string) ([]*ec2.NetworkInterface, error) {
566+
filters := []*ec2.Filter{
567+
{
568+
Name: aws.String("tag:" + config.TrunkENIIDTag),
569+
Values: []*string{trunkID},
570+
},
571+
{
572+
Name: aws.String("subnet-id"),
573+
Values: []*string{subnetID},
574+
},
599575
}
600576

601-
return nwInterfaces, nil
577+
describeNetworkInterfacesInput := &ec2.DescribeNetworkInterfacesInput{Filters: filters}
578+
return h.ec2Wrapper.DescribeNetworkInterfacesPages(describeNetworkInterfacesInput)
602579
}
603580

604581
// DetachAndDeleteNetworkInterface detaches the network interface first and then deletes it

0 commit comments

Comments
 (0)