Skip to content

Improve PodEvictor observability through EvictOptions #1349

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ var (
Name: "pods_evicted",
Help: "Number of evicted pods, by the result, by the strategy, by the namespace, by the node name. 'error' result means a pod could not be evicted",
StabilityLevel: metrics.ALPHA,
}, []string{"result", "strategy", "namespace", "node"})
}, []string{"result", "strategy", "profile", "namespace", "node"})

buildInfo = metrics.NewGauge(
&metrics.GaugeOpts{
Expand Down
15 changes: 11 additions & 4 deletions pkg/descheduler/evictions/evictions.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ type PodEvictor struct {
namespacePodCount namespacePodEvictCount
metricsEnabled bool
eventRecorder events.EventRecorder
profileName string
}

func NewPodEvictor(
Expand Down Expand Up @@ -82,6 +83,7 @@ func NewPodEvictor(
namespacePodCount: namespacePodCount,
metricsEnabled: metricsEnabled,
eventRecorder: eventRecorder,
profileName: "",
}
}

Expand All @@ -90,6 +92,11 @@ func (pe *PodEvictor) NodeEvicted(node *v1.Node) uint {
return pe.nodepodCount[node.Name]
}

// ProfileName sets a profile name used for observability
func (pe *PodEvictor) ProfileName(name string) {
pe.profileName = name
}

// TotalEvicted gives a number of pods evicted through all nodes
func (pe *PodEvictor) TotalEvicted() uint {
var total uint
Expand Down Expand Up @@ -128,7 +135,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
if pod.Spec.NodeName != "" {
if pe.maxPodsToEvictPerNode != nil && pe.nodepodCount[pod.Spec.NodeName]+1 > *pe.maxPodsToEvictPerNode {
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": pe.profileName}).Inc()
}
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", "Maximum number of evicted pods per node reached")))
klog.ErrorS(fmt.Errorf("maximum number of evicted pods per node reached"), "Error evicting pod", "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName)
Expand All @@ -138,7 +145,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio

if pe.maxPodsToEvictPerNamespace != nil && pe.namespacePodCount[pod.Namespace]+1 > *pe.maxPodsToEvictPerNamespace {
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": pe.profileName}).Inc()
}
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", "Maximum number of evicted pods per namespace reached")))
klog.ErrorS(fmt.Errorf("maximum number of evicted pods per namespace reached"), "Error evicting pod", "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace)
Expand All @@ -151,7 +158,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", opts.Reason)
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": pe.profileName}).Inc()
}
return false
}
Expand All @@ -162,7 +169,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
pe.namespacePodCount[pod.Namespace]++

if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName}).Inc()
metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": pe.profileName}).Inc()
}

if pe.dryRun {
Expand Down
2 changes: 2 additions & 0 deletions pkg/framework/profile/profile.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ func NewProfile(config api.DeschedulerProfile, reg pluginregistry.Registry, opts
return nil, fmt.Errorf("podEvictor missing")
}

hOpts.podEvictor.ProfileName(config.Name)

pi := &profileImpl{
profileName: config.Name,
podEvictor: hOpts.podEvictor,
Expand Down