Skip to content
This repository was archived by the owner on Apr 25, 2023. It is now read-only.

Commit 6c2b0b8

Browse files
committed
introduce standard controller runtime metrics
1 parent ba0bf52 commit 6c2b0b8

File tree

2 files changed

+83
-7
lines changed

2 files changed

+83
-7
lines changed

pkg/controller/util/worker.go

+30-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ import (
2323
"k8s.io/client-go/util/flowcontrol"
2424
"k8s.io/client-go/util/workqueue"
2525
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
26+
27+
"sigs.k8s.io/kubefed/pkg/metrics"
2628
)
2729

2830
type ReconcileFunc func(qualifiedName QualifiedName) ReconciliationStatus
@@ -128,6 +130,8 @@ func (w *asyncWorker) EnqueueWithDelay(qualifiedName QualifiedName, delay time.D
128130
}
129131

130132
func (w *asyncWorker) Run(stopChan <-chan struct{}) {
133+
w.initMetrics()
134+
131135
StartBackoffGC(w.backoff, stopChan)
132136
w.deliverer.StartWithHandler(func(item *DelayingDelivererItem) {
133137
qualifiedName, ok := item.Value.(*QualifiedName)
@@ -183,16 +187,41 @@ func (w *asyncWorker) reconcileOnce() bool {
183187
return true
184188
}
185189

190+
metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(1)
191+
defer metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(-1)
192+
defer metrics.UpdateControllerRuntimeReconcileTimeFromStart(w.name, time.Now())
193+
186194
status := w.reconcile(qualifiedName)
187195
switch status {
188196
case StatusAllOK:
189-
break
197+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Inc()
190198
case StatusError:
191199
w.EnqueueForError(qualifiedName)
200+
metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Inc()
201+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Inc()
192202
case StatusNeedsRecheck:
193203
w.EnqueueForRetry(qualifiedName)
204+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Inc()
194205
case StatusNotSynced:
195206
w.EnqueueForClusterSync(qualifiedName)
207+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Inc()
196208
}
197209
return true
198210
}
211+
212+
const (
213+
labelSuccess = "success"
214+
labelError = "error"
215+
labelNeedsRecheck = "needs_recheck"
216+
labelNotSynced = "not_synced"
217+
)
218+
219+
func (w *asyncWorker) initMetrics() {
220+
metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Set(0)
221+
metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Add(0)
222+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Add(0)
223+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Add(0)
224+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Add(0)
225+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Add(0)
226+
metrics.ControllerRuntimeWorkerCount.WithLabelValues(w.name).Set(float64(w.maxConcurrentReconciles))
227+
}

pkg/metrics/metrics.go

+53-6
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"time"
2121

2222
"github.com/prometheus/client_golang/prometheus"
23+
"github.com/prometheus/client_golang/prometheus/collectors"
2324
"k8s.io/klog/v2"
2425
"sigs.k8s.io/controller-runtime/pkg/metrics"
2526
)
@@ -58,7 +59,7 @@ var (
5859
reconcileFederatedResourcesDuration = prometheus.NewHistogram(
5960
prometheus.HistogramOpts{
6061
Name: "reconcile_federated_resources_duration_seconds",
61-
Help: "Time taken to reconcile federated resources in the target clusters.",
62+
Help: "[Deprecated] Time taken to reconcile federated resources in the target clusters. Replaced by controller_runtime_reconcile_time_seconds.",
6263
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
6364
},
6465
)
@@ -90,18 +91,45 @@ var (
9091
controllerRuntimeReconcileDuration = prometheus.NewHistogramVec(
9192
prometheus.HistogramOpts{
9293
Name: "controller_runtime_reconcile_duration_seconds",
93-
Help: "Time taken by various parts of Kubefed controllers reconciliation loops.",
94+
Help: "[Deprecated] Time taken by various parts of Kubefed controllers reconciliation loops. Replaced by controller_runtime_reconcile_time_seconds.",
9495
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
9596
}, []string{"controller"},
9697
)
9798

9899
controllerRuntimeReconcileDurationSummary = prometheus.NewSummaryVec(
99100
prometheus.SummaryOpts{
100101
Name: "controller_runtime_reconcile_quantile_seconds",
101-
Help: "Quantiles of time taken by various parts of Kubefed controllers reconciliation loops.",
102+
Help: "[Deprecated] Quantiles of time taken by various parts of Kubefed controllers reconciliation loops. Replaced by controller_runtime_reconcile_time_seconds.",
102103
MaxAge: time.Hour,
103104
}, []string{"controller"},
104105
)
106+
107+
ControllerRuntimeReconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
108+
Name: "controller_runtime_reconcile_total",
109+
Help: "Total number of reconciliations per controller",
110+
}, []string{"controller", "result"})
111+
112+
ControllerRuntimeReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
113+
Name: "controller_runtime_reconcile_errors_total",
114+
Help: "Total number of reconciliation errors per controller",
115+
}, []string{"controller"})
116+
117+
ControllerRuntimeReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
118+
Name: "controller_runtime_reconcile_time_seconds",
119+
Help: "Length of time per reconciliation per controller",
120+
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
121+
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60},
122+
}, []string{"controller"})
123+
124+
ControllerRuntimeWorkerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
125+
Name: "controller_runtime_max_concurrent_reconciles",
126+
Help: "Maximum number of concurrent reconciles per controller",
127+
}, []string{"controller"})
128+
129+
ControllerRuntimeActiveWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
130+
Name: "controller_runtime_active_workers",
131+
Help: "Number of currently used workers per controller",
132+
}, []string{"controller"})
105133
)
106134

107135
const (
@@ -117,6 +145,10 @@ const (
117145
// RegisterAll registers all metrics.
118146
func RegisterAll() {
119147
metrics.Registry.MustRegister(
148+
// expose process metrics like CPU, Memory, file descriptor usage etc.
149+
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
150+
// expose Go runtime metrics like GC stats, memory stats etc.
151+
collectors.NewGoCollector(),
120152
kubefedClusterTotal,
121153
joinedClusterTotal,
122154
reconcileFederatedResourcesDuration,
@@ -127,6 +159,11 @@ func RegisterAll() {
127159
dispatchOperationDuration,
128160
controllerRuntimeReconcileDuration,
129161
controllerRuntimeReconcileDurationSummary,
162+
ControllerRuntimeReconcileTotal,
163+
ControllerRuntimeReconcileErrors,
164+
ControllerRuntimeReconcileTime,
165+
ControllerRuntimeWorkerCount,
166+
ControllerRuntimeActiveWorkers,
130167
)
131168
}
132169

@@ -203,10 +240,20 @@ func UpdateControllerReconcileDurationFromStart(controller string, start time.Ti
203240

204241
// UpdateControllerReconcileDuration records the duration of the reconcile function of a controller
205242
func UpdateControllerReconcileDuration(controller string, duration time.Duration) {
243+
controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds())
244+
controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds())
245+
}
246+
247+
// UpdateControllerRuntimeReconcileTimeFromStart records the duration of the reconcile loop of a controller
248+
func UpdateControllerRuntimeReconcileTimeFromStart(controller string, start time.Time) {
249+
duration := time.Since(start)
250+
UpdateControllerRuntimeReconcileTime(controller, duration)
251+
}
252+
253+
// UpdateControllerRuntimeReconcileTime records the duration of the reconcile function of a controller
254+
func UpdateControllerRuntimeReconcileTime(controller string, duration time.Duration) {
206255
if duration > LogReconcileLongDurationThreshold {
207256
klog.V(4).Infof("Reconcile loop %s took %v to complete", controller, duration)
208257
}
209-
210-
controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds())
211-
controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds())
258+
ControllerRuntimeReconcileTime.WithLabelValues(controller).Observe(duration.Seconds())
212259
}

0 commit comments

Comments
 (0)