Skip to content

Commit de7470a

Browse files
yash97haouc
andauthored
Adding cninode leak metrics and cninode finalizer handler. (#476)
* add finalizer handler in v1.4 * fix an err variable * adding logs for mismatched CNINode * add metrics for mismatches Co-authored-by: Hao Zhou <[email protected]>
1 parent 42a911f commit de7470a

File tree

7 files changed

+166
-9
lines changed

7 files changed

+166
-9
lines changed

controllers/core/node_controller.go

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ package controllers
1515

1616
import (
1717
"context"
18+
"fmt"
1819
"net/http"
1920
"time"
2021

@@ -24,6 +25,8 @@ import (
2425
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/k8s"
2526
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/node/manager"
2627
"github.com/google/uuid"
28+
"github.com/prometheus/client_golang/prometheus"
29+
"sigs.k8s.io/controller-runtime/pkg/metrics"
2730

2831
"github.com/go-logr/logr"
2932
corev1 "k8s.io/api/core/v1"
@@ -33,16 +36,29 @@ import (
3336
ctrl "sigs.k8s.io/controller-runtime"
3437
"sigs.k8s.io/controller-runtime/pkg/client"
3538
"sigs.k8s.io/controller-runtime/pkg/controller"
39+
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3640
"sigs.k8s.io/controller-runtime/pkg/healthz"
3741
)
3842

43+
var (
44+
leakedCNINodeResourceCount = prometheus.NewCounter(
45+
prometheus.CounterOpts{
46+
Name: "orphaned_cninode_objects",
47+
Help: "The number of leaked cninode resources",
48+
},
49+
)
50+
51+
prometheusRegistered = false
52+
)
53+
3954
// MaxNodeConcurrentReconciles is the number of go routines that can invoke
4055
// Reconcile in parallel. Since Node Reconciler, performs local operation
4156
// on cache only a single go routine should be sufficient. Using more than
4257
// one routines to help high rate churn and larger nodes groups restarting
4358
// when the controller has to be restarted for various reasons.
4459
const (
4560
MaxNodeConcurrentReconciles = 10
61+
NodeTerminationFinalizer = "networking.k8s.aws/resource-cleanup"
4662
)
4763

4864
// NodeReconciler reconciles a Node object
@@ -73,27 +89,45 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
7389
}
7490

7591
node := &corev1.Node{}
76-
var err error
7792

7893
logger := r.Log.WithValues("node", req.NamespacedName)
7994

80-
if err := r.Client.Get(ctx, req.NamespacedName, node); err != nil {
81-
if errors.IsNotFound(err) {
82-
r.Log.V(1).Info("the requested node couldn't be found by k8s client", "Node", req.NamespacedName)
95+
if nodeErr := r.Client.Get(ctx, req.NamespacedName, node); nodeErr != nil {
96+
if errors.IsNotFound(nodeErr) {
97+
// clean up CNINode finalizer
98+
cniNode := &v1alpha1.CNINode{}
99+
if cninodeErr := r.Client.Get(ctx, req.NamespacedName, cniNode); cninodeErr == nil {
100+
if yes := controllerutil.ContainsFinalizer(cniNode, NodeTerminationFinalizer); yes {
101+
updated := cniNode.DeepCopy()
102+
if yes = controllerutil.RemoveFinalizer(updated, NodeTerminationFinalizer); yes {
103+
if err := r.Client.Patch(ctx, updated, client.MergeFrom(cniNode)); err != nil {
104+
return ctrl.Result{}, err
105+
}
106+
r.Log.Info("removed leaked CNINode resource's finalizer", "cninode", cniNode.Name)
107+
}
108+
leakedCNINodeResourceCount.Inc()
109+
}
110+
} else if !errors.IsNotFound(cninodeErr) {
111+
return ctrl.Result{}, fmt.Errorf("failed getting CNINode %s from cached client, %w", cniNode.Name, cninodeErr)
112+
}
113+
114+
// clean up local cached nodes
83115
_, found := r.Manager.GetNode(req.Name)
84116
if found {
85-
err := r.Manager.DeleteNode(req.Name)
86-
if err != nil {
117+
cacheErr := r.Manager.DeleteNode(req.Name)
118+
if cacheErr != nil {
87119
// The request is not retryable so not returning the error
88-
logger.Error(err, "failed to delete node from manager")
120+
logger.Error(cacheErr, "failed to delete node from manager")
89121
return ctrl.Result{}, nil
90122
}
91123
logger.V(1).Info("deleted the node from manager")
92124
}
93125
}
94-
return ctrl.Result{}, client.IgnoreNotFound(err)
126+
return ctrl.Result{}, client.IgnoreNotFound(nodeErr)
95127
}
96128

129+
var err error
130+
97131
_, found := r.Manager.GetNode(req.Name)
98132
if found {
99133
logger.V(1).Info("updating node")
@@ -115,6 +149,8 @@ func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager, healthzHandler *rcHe
115149
map[string]healthz.Checker{"health-node-controller": r.Check()},
116150
)
117151

152+
prometheusRegister()
153+
118154
return ctrl.NewControllerManagedBy(mgr).
119155
For(&corev1.Node{}).
120156
WithOptions(controller.Options{MaxConcurrentReconciles: MaxNodeConcurrentReconciles}).
@@ -152,3 +188,11 @@ func (r *NodeReconciler) Check() healthz.Checker {
152188
return err
153189
}
154190
}
191+
192+
func prometheusRegister() {
193+
if !prometheusRegistered {
194+
metrics.Registry.MustRegister(leakedCNINodeResourceCount)
195+
196+
prometheusRegistered = true
197+
}
198+
}

controllers/core/node_controller_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,21 @@ import (
1818
"testing"
1919
"time"
2020

21+
"github.com/aws/amazon-vpc-resource-controller-k8s/apis/vpcresources/v1alpha1"
2122
mock_condition "github.com/aws/amazon-vpc-resource-controller-k8s/mocks/amazon-vcp-resource-controller-k8s/pkg/condition"
2223
mock_node "github.com/aws/amazon-vpc-resource-controller-k8s/mocks/amazon-vcp-resource-controller-k8s/pkg/node"
2324
mock_manager "github.com/aws/amazon-vpc-resource-controller-k8s/mocks/amazon-vcp-resource-controller-k8s/pkg/node/manager"
2425

2526
"github.com/golang/mock/gomock"
2627
"github.com/stretchr/testify/assert"
2728
corev1 "k8s.io/api/core/v1"
29+
"k8s.io/apimachinery/pkg/api/errors"
2830
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2931
"k8s.io/apimachinery/pkg/runtime"
3032
"k8s.io/apimachinery/pkg/types"
3133
"sigs.k8s.io/controller-runtime/pkg/client"
3234
fakeClient "sigs.k8s.io/controller-runtime/pkg/client/fake"
35+
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3336
"sigs.k8s.io/controller-runtime/pkg/log/zap"
3437
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3538
)
@@ -61,6 +64,7 @@ func NewNodeMock(ctrl *gomock.Controller, mockObjects ...client.Object) NodeMock
6164

6265
scheme := runtime.NewScheme()
6366
_ = corev1.AddToScheme(scheme)
67+
_ = v1alpha1.AddToScheme(scheme)
6468
client := fakeClient.NewClientBuilder().WithScheme(scheme).WithObjects(mockObjects...).Build()
6569

6670
return NodeMock{
@@ -139,6 +143,43 @@ func TestNodeReconciler_Reconcile_DeleteNonExistentNode(t *testing.T) {
139143
assert.Equal(t, res, reconcile.Result{})
140144
}
141145

146+
func TestNodeReconciler_Reconcile_DeleteNonExistentNodesCNINode(t *testing.T) {
147+
ctrl := gomock.NewController(t)
148+
defer ctrl.Finish()
149+
150+
mock := NewNodeMock(ctrl)
151+
cniNode := &v1alpha1.CNINode{
152+
ObjectMeta: v1.ObjectMeta{
153+
Name: mockNodeName,
154+
Finalizers: []string{NodeTerminationFinalizer},
155+
},
156+
}
157+
mock.Reconciler.Client = fakeClient.NewClientBuilder().WithScheme(mock.Reconciler.Scheme).WithObjects(cniNode).Build()
158+
159+
mock.Conditions.EXPECT().GetPodDataStoreSyncStatus().Return(true)
160+
mock.Manager.EXPECT().GetNode(mockNodeName).Return(mock.MockNode, false)
161+
162+
original := &v1alpha1.CNINode{}
163+
err := mock.Reconciler.Client.Get(context.TODO(), types.NamespacedName{Name: cniNode.Name}, original)
164+
assert.NoError(t, err)
165+
assert.True(t, controllerutil.ContainsFinalizer(original, NodeTerminationFinalizer), "the CNINode has finalizer")
166+
167+
res, err := mock.Reconciler.Reconcile(context.TODO(), reconcileRequest)
168+
assert.NoError(t, err)
169+
assert.Equal(t, res, reconcile.Result{})
170+
171+
node := &corev1.Node{}
172+
updated := &v1alpha1.CNINode{}
173+
err = mock.Reconciler.Client.Get(context.TODO(), types.NamespacedName{Name: cniNode.Name}, node)
174+
assert.Error(t, err, "the node shouldn't existing")
175+
assert.True(t, errors.IsNotFound(err))
176+
177+
err = mock.Reconciler.Client.Get(context.TODO(), types.NamespacedName{Name: cniNode.Name}, updated)
178+
assert.NoError(t, err)
179+
assert.True(t, updated.Name == mockNodeName, "the CNINode should existing and waiting for finalizer removal")
180+
assert.False(t, controllerutil.ContainsFinalizer(updated, NodeTerminationFinalizer), "CNINode finalizer should be removed when the node is gone")
181+
}
182+
142183
func TestNodeReconciler_Reconcile_DeleteNonExistentUnmanagedNode(t *testing.T) {
143184
ctrl := gomock.NewController(t)
144185
defer ctrl.Finish()

mocks/amazon-vcp-resource-controller-k8s/pkg/aws/ec2/mock_instance.go

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/aws/ec2/instance.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ type EC2Instance interface {
7979
PrimaryNetworkInterfaceID() string
8080
CurrentInstanceSecurityGroups() []string
8181
SetNewCustomNetworkingSpec(subnetID string, securityGroup []string)
82+
GetCustomNetworkingSpec() (subnetID string, securityGroup []string)
8283
UpdateCurrentSubnetAndCidrBlock(helper api.EC2APIHelper) error
8384
}
8485

@@ -311,3 +312,10 @@ func (i *ec2Instance) updateCurrentSubnetAndCidrBlock(ec2APIHelper api.EC2APIHel
311312

312313
return nil
313314
}
315+
316+
func (i *ec2Instance) GetCustomNetworkingSpec() (subnetID string, securityGroup []string) {
317+
i.lock.RLock()
318+
defer i.lock.RUnlock()
319+
320+
return i.newCustomNetworkingSubnetID, i.newCustomNetworkingSecurityGroups
321+
}

pkg/node/manager/manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ func (m *manager) CreateCNINodeIfNotExisting(node *v1.Node) error {
228228
}
229229
return err
230230
} else {
231-
m.Log.V(1).Info("The CNINode is already existing", "CNINode", cniNode)
231+
m.Log.Info("The CNINode is already existing", "cninode", cniNode.Name, "features", cniNode.Spec.Features)
232232
return nil
233233
}
234234
}

pkg/provider/branch/trunk/trunk.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package trunk
1616
import (
1717
"encoding/json"
1818
"fmt"
19+
"slices"
1920
"strconv"
2021
"strings"
2122
"sync"
@@ -27,6 +28,7 @@ import (
2728
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/aws/vpc"
2829
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
2930
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/provider/branch/cooldown"
31+
"github.com/samber/lo"
3032

3133
"github.com/aws/aws-sdk-go/aws"
3234
awsEC2 "github.com/aws/aws-sdk-go/service/ec2"
@@ -62,6 +64,13 @@ var (
6264
},
6365
[]string{"operation"},
6466
)
67+
unreconciledTrunkENICount = prometheus.NewCounterVec(
68+
prometheus.CounterOpts{
69+
Name: "unreconciled_trunk_network_interfaces",
70+
Help: "The number of unreconciled trunk network interfaces",
71+
},
72+
[]string{"attribute"},
73+
)
6574
branchENIOperationsSuccessCount = prometheus.NewCounterVec(
6675
prometheus.CounterOpts{
6776
Name: "branch_eni_opeartions_success_count",
@@ -173,6 +182,7 @@ func NewTrunkENI(logger logr.Logger, instance ec2.EC2Instance, helper api.EC2API
173182
func PrometheusRegister() {
174183
if !prometheusRegistered {
175184
metrics.Registry.MustRegister(trunkENIOperationsErrCount)
185+
metrics.Registry.MustRegister(unreconciledTrunkENICount)
176186
metrics.Registry.MustRegister(branchENIOperationsSuccessCount)
177187
metrics.Registry.MustRegister(branchENIOperationsFailureCount)
178188

@@ -192,6 +202,7 @@ func (t *trunkENI) InitTrunk(instance ec2.EC2Instance, podList []v1.Pod) error {
192202
return err
193203
}
194204

205+
var trunk awsEC2.InstanceNetworkInterface
195206
// Get trunk network interface
196207
for _, nwInterface := range nwInterfaces {
197208
// It's possible to get an empty network interface response if the instance is being deleted.
@@ -206,6 +217,7 @@ func (t *trunkENI) InitTrunk(instance ec2.EC2Instance, podList []v1.Pod) error {
206217
} else {
207218
return fmt.Errorf("failed to verify network interface status attached for %v", *nwInterface.NetworkInterfaceId)
208219
}
220+
trunk = *nwInterface
209221
}
210222
}
211223

@@ -231,6 +243,41 @@ func (t *trunkENI) InitTrunk(instance ec2.EC2Instance, podList []v1.Pod) error {
231243
return nil
232244
}
233245

246+
// the node already have trunk, let's check if its SGs and Subnets match with expected
247+
expectedSubnetID, expectedSecurityGroups := t.instance.GetCustomNetworkingSpec()
248+
if len(expectedSecurityGroups) > 0 || expectedSubnetID != "" {
249+
slices.Sort(expectedSecurityGroups)
250+
trunkSGs := lo.Map(trunk.Groups, func(g *awsEC2.GroupIdentifier, _ int) string {
251+
return lo.FromPtr(g.GroupId)
252+
})
253+
slices.Sort(trunkSGs)
254+
255+
mismatchedSubnets := expectedSubnetID != lo.FromPtr(trunk.SubnetId)
256+
mismatchedSGs := !slices.Equal(expectedSecurityGroups, trunkSGs)
257+
258+
extraSGsInTrunk, missingSGsInTrunk := lo.Difference(trunkSGs, expectedSecurityGroups)
259+
t.log.Info("Observed trunk ENI config",
260+
"instanceID", t.instance.InstanceID(),
261+
"trunkENIID", lo.FromPtr(trunk.NetworkInterfaceId),
262+
"configuredTrunkSGs", trunkSGs,
263+
"configuredTrunkSubnet", lo.FromPtr(trunk.SubnetId),
264+
"desiredTrunkSGs", expectedSecurityGroups,
265+
"desiredTrunkSubnet", expectedSubnetID,
266+
"mismatchedSGs", mismatchedSGs,
267+
"mismatchedSubnets", mismatchedSubnets,
268+
"missingSGs", missingSGsInTrunk,
269+
"extraSGs", extraSGsInTrunk,
270+
)
271+
272+
if mismatchedSGs {
273+
unreconciledTrunkENICount.WithLabelValues("security_groups").Inc()
274+
}
275+
276+
if mismatchedSubnets {
277+
unreconciledTrunkENICount.WithLabelValues("subnet").Inc()
278+
}
279+
}
280+
234281
// Get the list of branch ENIs
235282
branchInterfaces, err := t.ec2ApiHelper.GetBranchNetworkInterface(&t.trunkENIId, aws.String(t.instance.SubnetID()))
236283
if err != nil {

pkg/provider/branch/trunk/trunk_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,7 @@ func TestTrunkENI_InitTrunk(t *testing.T) {
645645
name: "TrunkExists_WithBranches, verifies no error when trunk exists with branches",
646646
prepare: func(f *fields) {
647647
f.mockInstance.EXPECT().InstanceID().Return(InstanceId)
648+
f.mockInstance.EXPECT().GetCustomNetworkingSpec().Return("", []string{})
648649
f.mockEC2APIHelper.EXPECT().GetInstanceNetworkInterface(&InstanceId).Return(instanceNwInterfaces, nil)
649650
f.mockEC2APIHelper.EXPECT().WaitForNetworkInterfaceStatusChange(&trunkId, awsEc2.AttachmentStatusAttached).Return(nil)
650651
f.mockInstance.EXPECT().SubnetID().Return(SubnetId)
@@ -674,6 +675,7 @@ func TestTrunkENI_InitTrunk(t *testing.T) {
674675
name: "TrunkExists_DanglingENIs, verifies ENIs are pushed to delete queue if no pod exists",
675676
prepare: func(f *fields) {
676677
f.mockInstance.EXPECT().InstanceID().Return(InstanceId)
678+
f.mockInstance.EXPECT().GetCustomNetworkingSpec().Return("", []string{})
677679
f.mockEC2APIHelper.EXPECT().GetInstanceNetworkInterface(&InstanceId).Return(instanceNwInterfaces, nil)
678680
f.mockEC2APIHelper.EXPECT().WaitForNetworkInterfaceStatusChange(&trunkId, awsEc2.AttachmentStatusAttached).Return(nil)
679681
f.mockInstance.EXPECT().SubnetID().Return(SubnetId)

0 commit comments

Comments
 (0)