Skip to content

Commit bb5930e

Browse files
authored
Improve PodEvictor observability through EvictOptions (#1349)
* feat: profile name for pods_evicted metric Support new label "profile" for "pods_evicted" metric to allow understand which profiles are evicting more pods, allowing better observability * refactor: evictoptions improved observability Send profile and strategy names for EvictOptions, allowing Evictors to access observability information * cleanup: remove unnecessary evictoption reference * feat: evictoptions for nodeutilzation Explicit usage of options when invoking evictPods from the helper function from nodeutilization for both highnodeutilization and lownodeutilization
1 parent 6c865fd commit bb5930e

File tree

15 files changed

+36
-38
lines changed

15 files changed

+36
-38
lines changed

metrics/metrics.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ var (
3636
Name: "pods_evicted",
3737
Help: "Number of evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
3838
StabilityLevel: metrics.ALPHA,
39-
}, []string{"result", "strategy", "namespace", "node"})
39+
}, []string{"result", "strategy", "profile", "namespace", "node"})
4040

4141
buildInfo = metrics.NewGauge(
4242
&metrics.GaugeOpts{

pkg/descheduler/evictions/evictions.go

+11-12
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ func (pe *PodEvictor) NodeLimitExceeded(node *v1.Node) bool {
111111
type EvictOptions struct {
112112
// Reason allows for passing details about the specific eviction for logging.
113113
Reason string
114+
// ProfileName allows for passing details about profile for observability.
115+
ProfileName string
116+
// StrategyName allows for passing details about strategy for observability.
117+
StrategyName string
114118
}
115119

116120
// EvictPod evicts a pod while exercising eviction limits.
@@ -119,16 +123,11 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
119123
var span trace.Span
120124
ctx, span = tracing.Tracer().Start(ctx, "EvictPod", trace.WithAttributes(attribute.String("podName", pod.Name), attribute.String("podNamespace", pod.Namespace), attribute.String("reason", opts.Reason), attribute.String("operation", tracing.EvictOperation)))
121125
defer span.End()
122-
// TODO: Replace context-propagated Strategy name with a defined framework handle for accessing Strategy info
123-
strategy := ""
124-
if ctx.Value("strategyName") != nil {
125-
strategy = ctx.Value("strategyName").(string)
126-
}
127126

128127
if pod.Spec.NodeName != "" {
129128
if pe.maxPodsToEvictPerNode != nil && pe.nodepodCount[pod.Spec.NodeName]+1 > *pe.maxPodsToEvictPerNode {
130129
if pe.metricsEnabled {
131-
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
130+
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
132131
}
133132
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", "Maximum number of evicted pods per node reached")))
134133
klog.ErrorS(fmt.Errorf("maximum number of evicted pods per node reached"), "Error evicting pod", "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName)
@@ -138,7 +137,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
138137

139138
if pe.maxPodsToEvictPerNamespace != nil && pe.namespacePodCount[pod.Namespace]+1 > *pe.maxPodsToEvictPerNamespace {
140139
if pe.metricsEnabled {
141-
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
140+
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
142141
}
143142
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", "Maximum number of evicted pods per namespace reached")))
144143
klog.ErrorS(fmt.Errorf("maximum number of evicted pods per namespace reached"), "Error evicting pod", "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace)
@@ -151,7 +150,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
151150
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
152151
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", opts.Reason)
153152
if pe.metricsEnabled {
154-
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
153+
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
155154
}
156155
return false
157156
}
@@ -162,16 +161,16 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
162161
pe.namespacePodCount[pod.Namespace]++
163162

164163
if pe.metricsEnabled {
165-
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
164+
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
166165
}
167166

168167
if pe.dryRun {
169-
klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", opts.Reason, "strategy", strategy, "node", pod.Spec.NodeName)
168+
klog.V(1).InfoS("Evicted pod in dry run mode", "pod", klog.KObj(pod), "reason", opts.Reason, "strategy", opts.StrategyName, "node", pod.Spec.NodeName, "profile", opts.ProfileName)
170169
} else {
171-
klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", opts.Reason, "strategy", strategy, "node", pod.Spec.NodeName)
170+
klog.V(1).InfoS("Evicted pod", "pod", klog.KObj(pod), "reason", opts.Reason, "strategy", opts.StrategyName, "node", pod.Spec.NodeName, "profile", opts.ProfileName)
172171
reason := opts.Reason
173172
if len(reason) == 0 {
174-
reason = strategy
173+
reason = opts.StrategyName
175174
if len(reason) == 0 {
176175
reason = "NotSet"
177176
}

pkg/framework/plugins/nodeutilization/highnodeutilization.go

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"k8s.io/apimachinery/pkg/runtime"
2626
"k8s.io/klog/v2"
2727
"sigs.k8s.io/descheduler/pkg/api"
28+
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
2829
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
2930

3031
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
@@ -144,6 +145,7 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr
144145
sourceNodes,
145146
highNodes,
146147
h.handle.Evictor(),
148+
evictions.EvictOptions{StrategyName: HighNodeUtilizationPluginName},
147149
h.podFilter,
148150
resourceNames,
149151
continueEvictionCond)

pkg/framework/plugins/nodeutilization/lownodeutilization.go

+2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"k8s.io/apimachinery/pkg/api/resource"
2525
"k8s.io/apimachinery/pkg/runtime"
2626
"k8s.io/klog/v2"
27+
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
2728
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
2829
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
2930
frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types"
@@ -191,6 +192,7 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra
191192
sourceNodes,
192193
lowNodes,
193194
l.handle.Evictor(),
195+
evictions.EvictOptions{StrategyName: LowNodeUtilizationPluginName},
194196
l.podFilter,
195197
resourceNames,
196198
continueEvictionCond)

pkg/framework/plugins/nodeutilization/nodeutilization.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ func evictPodsFromSourceNodes(
222222
evictableNamespaces *api.Namespaces,
223223
sourceNodes, destinationNodes []NodeInfo,
224224
podEvictor frameworktypes.Evictor,
225+
evictOptions evictions.EvictOptions,
225226
podFilter func(pod *v1.Pod) bool,
226227
resourceNames []v1.ResourceName,
227228
continueEviction continueEvictionCond,
@@ -273,7 +274,7 @@ func evictPodsFromSourceNodes(
273274
klog.V(1).InfoS("Evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers")
274275
// sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers.
275276
podutil.SortPodsBasedOnPriorityLowToHigh(removablePods)
276-
evictPods(ctx, evictableNamespaces, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, continueEviction)
277+
evictPods(ctx, evictableNamespaces, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, evictOptions, continueEviction)
277278

278279
}
279280
}
@@ -286,6 +287,7 @@ func evictPods(
286287
totalAvailableUsage map[v1.ResourceName]*resource.Quantity,
287288
taintsOfLowNodes map[string][]v1.Taint,
288289
podEvictor frameworktypes.Evictor,
290+
evictOptions evictions.EvictOptions,
289291
continueEviction continueEvictionCond,
290292
) {
291293
var excludedNamespaces sets.Set[string]
@@ -310,7 +312,7 @@ func evictPods(
310312
}
311313

312314
if preEvictionFilterWithOptions(pod) {
313-
if podEvictor.Evict(ctx, pod, evictions.EvictOptions{}) {
315+
if podEvictor.Evict(ctx, pod, evictOptions) {
314316
klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod))
315317

316318
for name := range totalAvailableUsage {

pkg/framework/plugins/podlifetime/pod_lifetime.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ func (d *PodLifeTime) Deschedule(ctx context.Context, nodes []*v1.Node) *framewo
133133

134134
for _, pod := range podsToEvict {
135135
if !d.handle.Evictor().NodeLimitExceeded(nodeMap[pod.Spec.NodeName]) {
136-
d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{})
136+
d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{StrategyName: PluginName})
137137
}
138138
}
139139

pkg/framework/plugins/removeduplicates/removeduplicates.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ func (r *RemoveDuplicates) Balance(ctx context.Context, nodes []*v1.Node) *frame
210210
// It's assumed all duplicated pods are in the same priority class
211211
// TODO(jchaloup): check if the pod has a different node to lend to
212212
for _, pod := range pods[upperAvg-1:] {
213-
r.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{})
213+
r.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{StrategyName: PluginName})
214214
if r.handle.Evictor().NodeLimitExceeded(nodeMap[nodeName]) {
215215
continue loop
216216
}

pkg/framework/plugins/removefailedpods/failedpods.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func (d *RemoveFailedPods) Deschedule(ctx context.Context, nodes []*v1.Node) *fr
103103
}
104104
totalPods := len(pods)
105105
for i := 0; i < totalPods; i++ {
106-
d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{})
106+
d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{StrategyName: PluginName})
107107
if d.handle.Evictor().NodeLimitExceeded(node) {
108108
break
109109
}

pkg/framework/plugins/removepodshavingtoomanyrestarts/toomanyrestarts.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ func (d *RemovePodsHavingTooManyRestarts) Deschedule(ctx context.Context, nodes
123123
}
124124
totalPods := len(pods)
125125
for i := 0; i < totalPods; i++ {
126-
d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{})
126+
d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{StrategyName: PluginName})
127127
if d.handle.Evictor().NodeLimitExceeded(node) {
128128
break
129129
}

pkg/framework/plugins/removepodsviolatinginterpodantiaffinity/pod_antiaffinity.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ loop:
9898
totalPods := len(pods)
9999
for i := 0; i < totalPods; i++ {
100100
if checkPodsWithAntiAffinityExist(pods[i], podsInANamespace, nodeMap) && d.handle.Evictor().Filter(pods[i]) && d.handle.Evictor().PreEvictionFilter(pods[i]) {
101-
if d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{}) {
101+
if d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{StrategyName: PluginName}) {
102102
// Since the current pod is evicted all other pods which have anti-affinity with this
103103
// pod need not be evicted.
104104
// Update allPods.

pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ func (d *RemovePodsViolatingNodeAffinity) processNodes(ctx context.Context, node
136136

137137
for _, pod := range pods {
138138
klog.V(1).InfoS("Evicting pod", "pod", klog.KObj(pod))
139-
d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{})
139+
d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{StrategyName: PluginName})
140140
if d.handle.Evictor().NodeLimitExceeded(node) {
141141
break
142142
}

pkg/framework/plugins/removepodsviolatingnodetaints/node_taint.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ func (d *RemovePodsViolatingNodeTaints) Deschedule(ctx context.Context, nodes []
112112
d.taintFilterFnc,
113113
) {
114114
klog.V(2).InfoS("Not all taints with NoSchedule effect are tolerated after update for pod on node", "pod", klog.KObj(pods[i]), "node", klog.KObj(node))
115-
d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{})
115+
d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{ProfileName: PluginName})
116116
if d.handle.Evictor().NodeLimitExceeded(node) {
117117
break
118118
}

pkg/framework/plugins/removepodsviolatingtopologyspreadconstraint/topologyspreadconstraint.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ func (d *RemovePodsViolatingTopologySpreadConstraint) Balance(ctx context.Contex
235235
}
236236

237237
if d.handle.Evictor().PreEvictionFilter(pod) {
238-
d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{})
238+
d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{StrategyName: PluginName})
239239
}
240240
if d.handle.Evictor().NodeLimitExceeded(nodeMap[pod.Spec.NodeName]) {
241241
nodeLimitExceeded[pod.Spec.NodeName] = true

pkg/framework/profile/profile.go

+6-13
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import (
4040
// evictorImpl implements the Evictor interface so plugins
4141
// can evict a pod without importing a specific pod evictor
4242
type evictorImpl struct {
43+
profileName string
4344
podEvictor *evictions.PodEvictor
4445
filter podutil.FilterFunc
4546
preEvictionFilter podutil.FilterFunc
@@ -59,6 +60,7 @@ func (ei *evictorImpl) PreEvictionFilter(pod *v1.Pod) bool {
5960

6061
// Evict evicts a pod (no pre-check performed)
6162
func (ei *evictorImpl) Evict(ctx context.Context, pod *v1.Pod, opts evictions.EvictOptions) bool {
63+
opts.ProfileName = ei.profileName
6264
return ei.podEvictor.EvictPod(ctx, pod, opts)
6365
}
6466

@@ -252,7 +254,8 @@ func NewProfile(config api.DeschedulerProfile, reg pluginregistry.Registry, opts
252254
getPodsAssignedToNodeFunc: hOpts.getPodsAssignedToNodeFunc,
253255
sharedInformerFactory: hOpts.sharedInformerFactory,
254256
evictor: &evictorImpl{
255-
podEvictor: hOpts.podEvictor,
257+
profileName: config.Name,
258+
podEvictor: hOpts.podEvictor,
256259
},
257260
}
258261

@@ -307,13 +310,8 @@ func (d profileImpl) RunDeschedulePlugins(ctx context.Context, nodes []*v1.Node)
307310
ctx, span = tracing.Tracer().Start(ctx, pl.Name(), trace.WithAttributes(attribute.String("plugin", pl.Name()), attribute.String("profile", d.profileName), attribute.String("operation", tracing.DescheduleOperation)))
308311
defer span.End()
309312
evicted := d.podEvictor.TotalEvicted()
310-
// TODO: strategyName should be accessible from within the strategy using a framework
311-
// handle or function which the Evictor has access to. For migration/in-progress framework
312-
// work, we are currently passing this via context. To be removed
313-
// (See discussion thread https://github.com/kubernetes-sigs/descheduler/pull/885#discussion_r919962292)
314313
strategyStart := time.Now()
315-
childCtx := context.WithValue(ctx, "strategyName", pl.Name())
316-
status := pl.Deschedule(childCtx, nodes)
314+
status := pl.Deschedule(ctx, nodes)
317315
metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
318316

319317
if status != nil && status.Err != nil {
@@ -340,13 +338,8 @@ func (d profileImpl) RunBalancePlugins(ctx context.Context, nodes []*v1.Node) *f
340338
ctx, span = tracing.Tracer().Start(ctx, pl.Name(), trace.WithAttributes(attribute.String("plugin", pl.Name()), attribute.String("profile", d.profileName), attribute.String("operation", tracing.BalanceOperation)))
341339
defer span.End()
342340
evicted := d.podEvictor.TotalEvicted()
343-
// TODO: strategyName should be accessible from within the strategy using a framework
344-
// handle or function which the Evictor has access to. For migration/in-progress framework
345-
// work, we are currently passing this via context. To be removed
346-
// (See discussion thread https://github.com/kubernetes-sigs/descheduler/pull/885#discussion_r919962292)
347341
strategyStart := time.Now()
348-
childCtx := context.WithValue(ctx, "strategyName", pl.Name())
349-
status := pl.Balance(childCtx, nodes)
342+
status := pl.Balance(ctx, nodes)
350343
metrics.DeschedulerStrategyDuration.With(map[string]string{"strategy": pl.Name(), "profile": d.profileName}).Observe(time.Since(strategyStart).Seconds())
351344

352345
if status != nil && status.Err != nil {

pkg/framework/profile/profile_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ func TestProfileDescheduleBalanceExtensionPointsEviction(t *testing.T) {
185185
if test.extensionPoint == frameworktypes.DescheduleExtensionPoint {
186186
fakePlugin.AddReactor(string(frameworktypes.DescheduleExtensionPoint), func(action fakeplugin.Action) (handled, filter bool, err error) {
187187
if dAction, ok := action.(fakeplugin.DescheduleAction); ok {
188-
if dAction.Handle().Evictor().Evict(ctx, p1, evictions.EvictOptions{}) {
188+
if dAction.Handle().Evictor().Evict(ctx, p1, evictions.EvictOptions{StrategyName: fakePlugin.PluginName}) {
189189
return true, false, nil
190190
}
191191
return true, false, fmt.Errorf("pod not evicted")
@@ -196,7 +196,7 @@ func TestProfileDescheduleBalanceExtensionPointsEviction(t *testing.T) {
196196
if test.extensionPoint == frameworktypes.BalanceExtensionPoint {
197197
fakePlugin.AddReactor(string(frameworktypes.BalanceExtensionPoint), func(action fakeplugin.Action) (handled, filter bool, err error) {
198198
if dAction, ok := action.(fakeplugin.BalanceAction); ok {
199-
if dAction.Handle().Evictor().Evict(ctx, p1, evictions.EvictOptions{}) {
199+
if dAction.Handle().Evictor().Evict(ctx, p1, evictions.EvictOptions{StrategyName: fakePlugin.PluginName}) {
200200
return true, false, nil
201201
}
202202
return true, false, fmt.Errorf("pod not evicted")

0 commit comments

Comments
 (0)