|
| 1 | +// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. |
| 2 | +// |
| 3 | +// Licensed under the Apache License, Version 2.0 (the "License"). You may |
| 4 | +// not use this file except in compliance with the License. A copy of the |
| 5 | +// License is located at |
| 6 | +// |
| 7 | +// http://aws.amazon.com/apache2.0/ |
| 8 | +// |
| 9 | +// or in the "license" file accompanying this file. This file is distributed |
| 10 | +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either |
| 11 | +// express or implied. See the License for the specific language governing |
| 12 | +// permissions and limitations under the License. |
| 13 | + |
| 14 | +package cleanup |
| 15 | + |
| 16 | +import ( |
| 17 | + "context" |
| 18 | + "fmt" |
| 19 | + "strings" |
| 20 | + "time" |
| 21 | + |
| 22 | + "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/ec2/api" |
| 23 | + "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config" |
| 24 | + rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz" |
| 25 | + "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/utils" |
| 26 | + |
| 27 | + ec2Errors "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/errors" |
| 28 | + "github.com/aws/aws-sdk-go/aws" |
| 29 | + "github.com/aws/aws-sdk-go/service/ec2" |
| 30 | + "github.com/go-logr/logr" |
| 31 | + kerrors "k8s.io/apimachinery/pkg/util/errors" |
| 32 | + ctrl "sigs.k8s.io/controller-runtime" |
| 33 | + "sigs.k8s.io/controller-runtime/pkg/healthz" |
| 34 | +) |
| 35 | + |
| 36 | +// NetworkInterfaceManager interface allows to define the ENI filters and checks if ENI should be deleted for different callers like in the periodic cleanup routine or |
| 37 | +// during node termination |
| 38 | +type NetworkInterfaceManager interface { |
| 39 | + GetENITagFilters() []*ec2.Filter |
| 40 | + ShouldDeleteENI(eniID *string) bool |
| 41 | + UpdateAvailableENIsIfNeeded(eniMap *map[string]struct{}) |
| 42 | + UpdateCleanupMetrics(vpcrcAvailableCount int, vpccniAvailableCount int, leakedENICount int) |
| 43 | +} |
| 44 | + |
| 45 | +type ENICleaner struct { |
| 46 | + EC2Wrapper api.EC2Wrapper |
| 47 | + Manager NetworkInterfaceManager |
| 48 | + VpcId string |
| 49 | + Log logr.Logger |
| 50 | +} |
| 51 | + |
| 52 | +// common filters for describing network interfaces |
| 53 | +var CommonNetworkInterfaceFilters = []*ec2.Filter{ |
| 54 | + { |
| 55 | + Name: aws.String("status"), |
| 56 | + Values: []*string{aws.String(ec2.NetworkInterfaceStatusAvailable)}, |
| 57 | + }, |
| 58 | + { |
| 59 | + Name: aws.String("tag:" + config.NetworkInterfaceOwnerTagKey), |
| 60 | + Values: aws.StringSlice([]string{config.NetworkInterfaceOwnerTagValue, |
| 61 | + config.NetworkInterfaceOwnerVPCCNITagValue}), |
| 62 | + }, |
| 63 | +} |
| 64 | + |
| 65 | +// ClusterENICleaner periodically deletes leaked network interfaces(provisioned by the controller or VPC-CNI) in the cluster |
| 66 | +type ClusterENICleaner struct { |
| 67 | + ClusterName string |
| 68 | + shutdown bool |
| 69 | + ctx context.Context |
| 70 | + availableENIs map[string]struct{} |
| 71 | + *ENICleaner |
| 72 | +} |
| 73 | + |
| 74 | +func (e *ClusterENICleaner) SetupWithManager(ctx context.Context, mgr ctrl.Manager, healthzHandler *rcHealthz.HealthzHandler) error { |
| 75 | + e.ctx = ctx |
| 76 | + e.availableENIs = make(map[string]struct{}) |
| 77 | + healthzHandler.AddControllersHealthCheckers( |
| 78 | + map[string]healthz.Checker{ |
| 79 | + "health-interface-cleaner": rcHealthz.SimplePing("interface cleanup", e.Log), |
| 80 | + }, |
| 81 | + ) |
| 82 | + |
| 83 | + return mgr.Add(e) |
| 84 | +} |
| 85 | + |
| 86 | +// StartENICleaner starts the ENI Cleaner routine that cleans up dangling ENIs created by the controller |
| 87 | +func (e *ClusterENICleaner) Start(ctx context.Context) error { |
| 88 | + e.Log.Info("starting eni clean up routine") |
| 89 | + |
| 90 | + // Start routine to listen for shut down signal, on receiving the signal it set shutdown to true |
| 91 | + go func() { |
| 92 | + <-ctx.Done() |
| 93 | + e.shutdown = true |
| 94 | + }() |
| 95 | + // Perform ENI cleanup after fixed time intervals till shut down variable is set to true on receiving the shutdown |
| 96 | + // signal |
| 97 | + for !e.shutdown { |
| 98 | + e.DeleteLeakedResources() |
| 99 | + time.Sleep(config.ENICleanUpInterval) |
| 100 | + } |
| 101 | + |
| 102 | + return nil |
| 103 | +} |
| 104 | + |
| 105 | +// DeleteLeakedResources describes all the network interfaces in available status that are created by the controller or VPC-CNI |
| 106 | +// This is called by periodically by ClusterENICleaner which deletes available ENIs cluster-wide, and by the NodeTermination cleaner on node termination |
| 107 | +// The available ENIs are deleted if ShouldDeleteENI is true, defined in the respective cleaners |
| 108 | +// The function also updates metrics for the periodic cleanup routine and the node termination cleanup |
| 109 | +func (e *ENICleaner) DeleteLeakedResources() error { |
| 110 | + var errors []error |
| 111 | + availableENIs := make(map[string]struct{}) |
| 112 | + vpcrcAvailableCount := 0 |
| 113 | + vpccniAvailableCount := 0 |
| 114 | + leakedENICount := 0 |
| 115 | + |
| 116 | + filters := CommonNetworkInterfaceFilters |
| 117 | + // Append the VPC-ID deep filter for the paginated call |
| 118 | + filters = append(filters, []*ec2.Filter{ |
| 119 | + { |
| 120 | + Name: aws.String("vpc-id"), |
| 121 | + Values: []*string{aws.String(e.VpcId)}, |
| 122 | + }, |
| 123 | + }...) |
| 124 | + // get cleaner specific filters |
| 125 | + filters = append(filters, e.Manager.GetENITagFilters()...) |
| 126 | + describeNetworkInterfaceIp := &ec2.DescribeNetworkInterfacesInput{ |
| 127 | + Filters: filters, |
| 128 | + } |
| 129 | + for { |
| 130 | + describeNetworkInterfaceOp, err := e.EC2Wrapper.DescribeNetworkInterfaces(describeNetworkInterfaceIp) |
| 131 | + if err != nil { |
| 132 | + e.Log.Error(err, "failed to describe network interfaces, cleanup will be retried in next cycle") |
| 133 | + return err |
| 134 | + } |
| 135 | + for _, nwInterface := range describeNetworkInterfaceOp.NetworkInterfaces { |
| 136 | + if e.Manager.ShouldDeleteENI(nwInterface.NetworkInterfaceId) { |
| 137 | + tagMap := utils.GetTagKeyValueMap(nwInterface.TagSet) |
| 138 | + if val, ok := tagMap[config.NetworkInterfaceOwnerTagKey]; ok { |
| 139 | + // Increment promethues metrics for number of leaked ENIs cleaned up |
| 140 | + switch val { |
| 141 | + case config.NetworkInterfaceOwnerTagValue: |
| 142 | + vpcrcAvailableCount += 1 |
| 143 | + case config.NetworkInterfaceOwnerVPCCNITagValue: |
| 144 | + vpccniAvailableCount += 1 |
| 145 | + default: |
| 146 | + // We should not hit this case as we only filter for relevant tag values, log error and continue if unexpected ENIs found |
| 147 | + e.Log.Error(fmt.Errorf("found available ENI not created by VPC-CNI/VPC-RC"), "eniID", *nwInterface.NetworkInterfaceId) |
| 148 | + continue |
| 149 | + } |
| 150 | + } |
| 151 | + _, err := e.EC2Wrapper.DeleteNetworkInterface(&ec2.DeleteNetworkInterfaceInput{ |
| 152 | + NetworkInterfaceId: nwInterface.NetworkInterfaceId, |
| 153 | + }) |
| 154 | + if err != nil { |
| 155 | + if !strings.Contains(err.Error(), ec2Errors.NotFoundInterfaceID) { // ignore InvalidNetworkInterfaceID.NotFound error |
| 156 | + // append err and continue, we will retry deletion in the next period/reconcile |
| 157 | + leakedENICount += 1 |
| 158 | + errors = append(errors, fmt.Errorf("failed to delete leaked network interface %v:%v", *nwInterface.NetworkInterfaceId, err)) |
| 159 | + e.Log.Error(err, "failed to delete the leaked network interface", |
| 160 | + "id", *nwInterface.NetworkInterfaceId) |
| 161 | + } |
| 162 | + continue |
| 163 | + } |
| 164 | + e.Log.Info("deleted leaked ENI successfully", "eni id", nwInterface.NetworkInterfaceId) |
| 165 | + } else { |
| 166 | + // Seeing the ENI for the first time, add it to the new list of available network interfaces |
| 167 | + availableENIs[*nwInterface.NetworkInterfaceId] = struct{}{} |
| 168 | + e.Log.Info("adding eni to to the map of available ENIs, will be removed if present in "+ |
| 169 | + "next run too", "id", *nwInterface.NetworkInterfaceId) |
| 170 | + } |
| 171 | + } |
| 172 | + |
| 173 | + if describeNetworkInterfaceOp.NextToken == nil { |
| 174 | + break |
| 175 | + } |
| 176 | + describeNetworkInterfaceIp.NextToken = describeNetworkInterfaceOp.NextToken |
| 177 | + } |
| 178 | + e.Manager.UpdateCleanupMetrics(vpcrcAvailableCount, vpccniAvailableCount, leakedENICount) |
| 179 | + e.Manager.UpdateAvailableENIsIfNeeded(&availableENIs) |
| 180 | + return kerrors.NewAggregate(errors) |
| 181 | +} |
| 182 | + |
| 183 | +func (e *ClusterENICleaner) GetENITagFilters() []*ec2.Filter { |
| 184 | + clusterNameTagKey := fmt.Sprintf(config.ClusterNameTagKeyFormat, e.ClusterName) |
| 185 | + return []*ec2.Filter{ |
| 186 | + { |
| 187 | + Name: aws.String("tag:" + clusterNameTagKey), |
| 188 | + Values: []*string{aws.String(config.ClusterNameTagValue)}, |
| 189 | + }, |
| 190 | + } |
| 191 | +} |
| 192 | + |
| 193 | +// ShouldDeleteENI returns true if the ENI should be deleted. |
| 194 | +func (e *ClusterENICleaner) ShouldDeleteENI(eniID *string) bool { |
| 195 | + if _, exists := e.availableENIs[*eniID]; exists { |
| 196 | + return true |
| 197 | + } |
| 198 | + return false |
| 199 | +} |
| 200 | + |
| 201 | +// Set the available ENIs to the list of ENIs seen in the current cycle |
| 202 | +// This adds ENIs that should not be deleted in the current cleanup cycle to the internal cache so it can be deleted in next cycle |
| 203 | +// This prevents the clean up routine to remove ENIs that are created by another routines and are yet not attached to |
| 204 | +// an instance or associated with a trunk interface in the periodic cleanup routine |
| 205 | + |
| 206 | +// Example |
| 207 | +// 1st cycle, Describe Available NetworkInterface Result - Interface 1, Interface 2, Interface 3 |
| 208 | +// 2nd cycle, Describe Available NetworkInterface Result - Interface 2, Interface 3 |
| 209 | +// In the second cycle we can conclude that Interface 2 and 3 are leaked because they have been sitting for the time |
| 210 | +// interval between cycle 1 and 2 and hence can be safely deleted. And we can also conclude that Interface 1 was |
| 211 | +// created but not attached at the the time when 1st cycle ran and hence it should not be deleted. |
| 212 | +func (e *ClusterENICleaner) UpdateAvailableENIsIfNeeded(eniMap *map[string]struct{}) { |
| 213 | + e.availableENIs = *eniMap |
| 214 | +} |
| 215 | + |
| 216 | +// Update cluster cleanup metrics for the current cleanup cycle |
| 217 | +func (e *ClusterENICleaner) UpdateCleanupMetrics(vpcrcAvailableCount int, vpccniAvailableCount int, leakedENICount int) { |
| 218 | + api.VpcRcAvailableClusterENICnt.Set(float64(vpcrcAvailableCount)) |
| 219 | + api.VpcCniAvailableClusterENICnt.Set(float64(vpccniAvailableCount)) |
| 220 | + api.LeakedENIClusterCleanupCnt.Set(float64(leakedENICount)) |
| 221 | +} |
0 commit comments