Skip to content

Commit ef0591a

Browse files
committed
Centralized leaked ENI cleanup- refactor periodic cleanup & add node termination cleaner
1 parent bd3ec5d commit ef0591a

File tree

17 files changed

+956
-542
lines changed

17 files changed

+956
-542
lines changed

main.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
corecontroller "github.com/aws/amazon-vpc-resource-controller-k8s/controllers/core"
2929
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/api"
3030
ec2API "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/ec2/api"
31+
eniCleaner "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/ec2/api/cleanup"
3132
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/condition"
3233
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
3334
rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz"
@@ -362,12 +363,17 @@ func main() {
362363
os.Exit(1)
363364
}
364365

365-
if err := (&ec2API.ENICleaner{
366-
EC2Wrapper: ec2Wrapper,
366+
cleaner := &eniCleaner.ClusterENICleaner{
367367
ClusterName: clusterName,
368-
Log: ctrl.Log.WithName("eni cleaner"),
369-
VPCID: vpcID,
370-
}).SetupWithManager(ctx, mgr, healthzHandler); err != nil {
368+
}
369+
cleaner.ENICleaner = &eniCleaner.ENICleaner{
370+
EC2Wrapper: ec2Wrapper,
371+
Manager: cleaner,
372+
VpcId: vpcID,
373+
Log: ctrl.Log.WithName("eniCleaner").WithName("cluster"),
374+
}
375+
376+
if err := cleaner.SetupWithManager(ctx, mgr, healthzHandler); err != nil {
371377
setupLog.Error(err, "unable to start eni cleaner")
372378
os.Exit(1)
373379
}

mocks/amazon-vcp-resource-controller-k8s/pkg/aws/ec2/api/mock_ec2_apihelper.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

mocks/amazon-vcp-resource-controller-k8s/pkg/aws/ec2/api/mock_ec2_wrapper.go

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

mocks/amazon-vcp-resource-controller-k8s/pkg/provider/branch/trunk/mock_trunk.go

Lines changed: 0 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License"). You may
4+
// not use this file except in compliance with the License. A copy of the
5+
// License is located at
6+
//
7+
// http://aws.amazon.com/apache2.0/
8+
//
9+
// or in the "license" file accompanying this file. This file is distributed
10+
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11+
// express or implied. See the License for the specific language governing
12+
// permissions and limitations under the License.
13+
14+
package cleanup
15+
16+
import (
17+
"context"
18+
"fmt"
19+
"strings"
20+
"time"
21+
22+
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/ec2/api"
23+
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
24+
rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz"
25+
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/utils"
26+
27+
ec2Errors "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/errors"
28+
"github.com/aws/aws-sdk-go/aws"
29+
"github.com/aws/aws-sdk-go/service/ec2"
30+
"github.com/go-logr/logr"
31+
kerrors "k8s.io/apimachinery/pkg/util/errors"
32+
ctrl "sigs.k8s.io/controller-runtime"
33+
"sigs.k8s.io/controller-runtime/pkg/healthz"
34+
)
35+
36+
// NetworkInterfaceManager interface allows to define the ENI filters and checks if ENI should be deleted for different callers like in the periodic cleanup routine or
37+
// during node termination
38+
type NetworkInterfaceManager interface {
39+
GetENITagFilters() []*ec2.Filter
40+
ShouldDeleteENI(eniID *string) bool
41+
UpdateAvailableENIsIfNeeded(eniMap *map[string]struct{})
42+
UpdateCleanupMetrics(vpcrcAvailableCount int, vpccniAvailableCount int, leakedENICount int)
43+
}
44+
45+
type ENICleaner struct {
46+
EC2Wrapper api.EC2Wrapper
47+
Manager NetworkInterfaceManager
48+
VpcId string
49+
Log logr.Logger
50+
}
51+
52+
// common filters for describing network interfaces
53+
var CommonNetworkInterfaceFilters = []*ec2.Filter{
54+
{
55+
Name: aws.String("status"),
56+
Values: []*string{aws.String(ec2.NetworkInterfaceStatusAvailable)},
57+
},
58+
{
59+
Name: aws.String("tag:" + config.NetworkInterfaceOwnerTagKey),
60+
Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue,
61+
config.NetworkInterfaceOwnerVPCCNITagValue}),
62+
},
63+
}
64+
65+
// ClusterENICleaner periodically deletes leaked network interfaces(provisioned by the controller or VPC-CNI) in the cluster
66+
type ClusterENICleaner struct {
67+
ClusterName string
68+
shutdown bool
69+
ctx context.Context
70+
availableENIs map[string]struct{}
71+
*ENICleaner
72+
}
73+
74+
func (e *ClusterENICleaner) SetupWithManager(ctx context.Context, mgr ctrl.Manager, healthzHandler *rcHealthz.HealthzHandler) error {
75+
e.ctx = ctx
76+
e.availableENIs = make(map[string]struct{})
77+
healthzHandler.AddControllersHealthCheckers(
78+
map[string]healthz.Checker{
79+
"health-interface-cleaner": rcHealthz.SimplePing("interface cleanup", e.Log),
80+
},
81+
)
82+
83+
return mgr.Add(e)
84+
}
85+
86+
// StartENICleaner starts the ENI Cleaner routine that cleans up dangling ENIs created by the controller
87+
func (e *ClusterENICleaner) Start(ctx context.Context) error {
88+
e.Log.Info("starting eni clean up routine")
89+
90+
// Start routine to listen for shut down signal, on receiving the signal it set shutdown to true
91+
go func() {
92+
<-ctx.Done()
93+
e.shutdown = true
94+
}()
95+
// Perform ENI cleanup after fixed time intervals till shut down variable is set to true on receiving the shutdown
96+
// signal
97+
for !e.shutdown {
98+
e.DeleteLeakedResources()
99+
time.Sleep(config.ENICleanUpInterval)
100+
}
101+
102+
return nil
103+
}
104+
105+
// DeleteLeakedResources describes all the network interfaces in available status that are created by the controller or VPC-CNI
106+
// This is called by periodically by ClusterENICleaner which deletes available ENIs cluster-wide, and by the NodeTermination cleaner on node termination
107+
// The available ENIs are deleted if ShouldDeleteENI is true, defined in the respective cleaners
108+
// The function also updates metrics for the periodic cleanup routine and the node termination cleanup
109+
func (e *ENICleaner) DeleteLeakedResources() error {
110+
var errors []error
111+
availableENIs := make(map[string]struct{})
112+
vpcrcAvailableCount := 0
113+
vpccniAvailableCount := 0
114+
leakedENICount := 0
115+
116+
filters := CommonNetworkInterfaceFilters
117+
// Append the VPC-ID deep filter for the paginated call
118+
filters = append(filters, []*ec2.Filter{
119+
{
120+
Name: aws.String("vpc-id"),
121+
Values: []*string{aws.String(e.VpcId)},
122+
},
123+
}...)
124+
// get cleaner specific filters
125+
filters = append(filters, e.Manager.GetENITagFilters()...)
126+
describeNetworkInterfaceIp := &ec2.DescribeNetworkInterfacesInput{
127+
Filters: filters,
128+
}
129+
for {
130+
describeNetworkInterfaceOp, err := e.EC2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfaceIp)
131+
if err != nil {
132+
e.Log.Error(err, "failed to describe network interfaces, cleanup will be retried in next cycle")
133+
return err
134+
}
135+
for _, nwInterface := range describeNetworkInterfaceOp.NetworkInterfaces {
136+
if e.Manager.ShouldDeleteENI(nwInterface.NetworkInterfaceId) {
137+
tagMap := utils.GetTagKeyValueMap(nwInterface.TagSet)
138+
if val, ok := tagMap[config.NetworkInterfaceOwnerTagKey]; ok {
139+
// Increment promethues metrics for number of leaked ENIs cleaned up
140+
switch val {
141+
case config.NetworkInterfaceOwnerTagValue:
142+
vpcrcAvailableCount += 1
143+
case config.NetworkInterfaceOwnerVPCCNITagValue:
144+
vpccniAvailableCount += 1
145+
default:
146+
// We should not hit this case as we only filter for relevant tag values, log error and continue if unexpected ENIs found
147+
e.Log.Error(fmt.Errorf("found available ENI not created by VPC-CNI/VPC-RC"), "eniID", *nwInterface.NetworkInterfaceId)
148+
continue
149+
}
150+
}
151+
_, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{
152+
NetworkInterfaceId: nwInterface.NetworkInterfaceId,
153+
})
154+
if err != nil {
155+
if !strings.Contains(err.Error(), ec2Errors.NotFoundInterfaceID) { // ignore InvalidNetworkInterfaceID.NotFound error
156+
// append err and continue, we will retry deletion in the next period/reconcile
157+
leakedENICount += 1
158+
errors = append(errors, fmt.Errorf("failed to delete leaked network interface %v:%v", *nwInterface.NetworkInterfaceId, err))
159+
e.Log.Error(err, "failed to delete the leaked network interface",
160+
"id", *nwInterface.NetworkInterfaceId)
161+
}
162+
continue
163+
}
164+
e.Log.Info("deleted leaked ENI successfully", "eni id", nwInterface.NetworkInterfaceId)
165+
} else {
166+
// Seeing the ENI for the first time, add it to the new list of available network interfaces
167+
availableENIs[*nwInterface.NetworkInterfaceId] = struct{}{}
168+
e.Log.Info("adding eni to to the map of available ENIs, will be removed if present in "+
169+
"next run too", "id", *nwInterface.NetworkInterfaceId)
170+
}
171+
}
172+
173+
if describeNetworkInterfaceOp.NextToken == nil {
174+
break
175+
}
176+
describeNetworkInterfaceIp.NextToken = describeNetworkInterfaceOp.NextToken
177+
}
178+
e.Manager.UpdateCleanupMetrics(vpcrcAvailableCount, vpccniAvailableCount, leakedENICount)
179+
e.Manager.UpdateAvailableENIsIfNeeded(&availableENIs)
180+
return kerrors.NewAggregate(errors)
181+
}
182+
183+
func (e *ClusterENICleaner) GetENITagFilters() []*ec2.Filter {
184+
clusterNameTagKey := fmt.Sprintf(config.ClusterNameTagKeyFormat, e.ClusterName)
185+
return []*ec2.Filter{
186+
{
187+
Name: aws.String("tag:" + clusterNameTagKey),
188+
Values: []*string{aws.String(config.ClusterNameTagValue)},
189+
},
190+
}
191+
}
192+
193+
// ShouldDeleteENI returns true if the ENI should be deleted.
194+
func (e *ClusterENICleaner) ShouldDeleteENI(eniID *string) bool {
195+
if _, exists := e.availableENIs[*eniID]; exists {
196+
return true
197+
}
198+
return false
199+
}
200+
201+
// Set the available ENIs to the list of ENIs seen in the current cycle
202+
// This adds ENIs that should not be deleted in the current cleanup cycle to the internal cache so it can be deleted in next cycle
203+
// This prevents the clean up routine to remove ENIs that are created by another routines and are yet not attached to
204+
// an instance or associated with a trunk interface in the periodic cleanup routine
205+
206+
// Example
207+
// 1st cycle, Describe Available NetworkInterface Result - Interface 1, Interface 2, Interface 3
208+
// 2nd cycle, Describe Available NetworkInterface Result - Interface 2, Interface 3
209+
// In the second cycle we can conclude that Interface 2 and 3 are leaked because they have been sitting for the time
210+
// interval between cycle 1 and 2 and hence can be safely deleted. And we can also conclude that Interface 1 was
211+
// created but not attached at the the time when 1st cycle ran and hence it should not be deleted.
212+
func (e *ClusterENICleaner) UpdateAvailableENIsIfNeeded(eniMap *map[string]struct{}) {
213+
e.availableENIs = *eniMap
214+
}
215+
216+
// Update cluster cleanup metrics for the current cleanup cycle
217+
func (e *ClusterENICleaner) UpdateCleanupMetrics(vpcrcAvailableCount int, vpccniAvailableCount int, leakedENICount int) {
218+
api.VpcRcAvailableClusterENICnt.Set(float64(vpcrcAvailableCount))
219+
api.VpcCniAvailableClusterENICnt.Set(float64(vpccniAvailableCount))
220+
api.LeakedENIClusterCleanupCnt.Set(float64(leakedENICount))
221+
}

0 commit comments

Comments
 (0)