Skip to content
This repository was archived by the owner on Apr 25, 2023. It is now read-only.

Commit 43420dc

Browse files
committed
introduce standard controller runtime metrics
1 parent ba0bf52 commit 43420dc

File tree

2 files changed

+89
-7
lines changed

2 files changed

+89
-7
lines changed

pkg/controller/util/worker.go

+30-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ import (
2323
"k8s.io/client-go/util/flowcontrol"
2424
"k8s.io/client-go/util/workqueue"
2525
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"
26+
27+
"sigs.k8s.io/kubefed/pkg/metrics"
2628
)
2729

2830
type ReconcileFunc func(qualifiedName QualifiedName) ReconciliationStatus
@@ -128,6 +130,8 @@ func (w *asyncWorker) EnqueueWithDelay(qualifiedName QualifiedName, delay time.D
128130
}
129131

130132
func (w *asyncWorker) Run(stopChan <-chan struct{}) {
133+
w.initMetrics()
134+
131135
StartBackoffGC(w.backoff, stopChan)
132136
w.deliverer.StartWithHandler(func(item *DelayingDelivererItem) {
133137
qualifiedName, ok := item.Value.(*QualifiedName)
@@ -183,16 +187,41 @@ func (w *asyncWorker) reconcileOnce() bool {
183187
return true
184188
}
185189

190+
metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(1)
191+
defer metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(-1)
192+
defer metrics.UpdateControllerRuntimeReconcileTimeFromStart(w.name, time.Now())
193+
186194
status := w.reconcile(qualifiedName)
187195
switch status {
188196
case StatusAllOK:
189-
break
197+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Inc()
190198
case StatusError:
191199
w.EnqueueForError(qualifiedName)
200+
metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Inc()
201+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Inc()
192202
case StatusNeedsRecheck:
193203
w.EnqueueForRetry(qualifiedName)
204+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Inc()
194205
case StatusNotSynced:
195206
w.EnqueueForClusterSync(qualifiedName)
207+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Inc()
196208
}
197209
return true
198210
}
211+
212+
// Values passed as the "result" label of metrics.ControllerRuntimeReconcileTotal.
// reconcileOnce increments the counter with one of them for each
// ReconciliationStatus it handles.
const (
213+
labelSuccess = "success"
214+
labelError = "error"
215+
labelNeedsRecheck = "needs_recheck"
216+
labelNotSynced = "not_synced"
217+
)
218+
219+
// initMetrics touches every per-controller metric series of this worker, using
// w.name as the "controller" label. Run calls it first, before any other work.
// The active worker gauge is set to 0, the error counter and each
// result-labelled reconcile counter receive Add(0), and the worker count gauge
// is set to w.maxConcurrentReconciles.
// NOTE(review): presumably the Set(0)/Add(0) calls exist so that the series are
// exported with a zero value before the first reconcile — confirm.
func (w *asyncWorker) initMetrics() {
220+
metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Set(0)
221+
metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Add(0)
222+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Add(0)
223+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Add(0)
224+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Add(0)
225+
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Add(0)
226+
metrics.ControllerRuntimeWorkerCount.WithLabelValues(w.name).Set(float64(w.maxConcurrentReconciles))
227+
}

pkg/metrics/metrics.go

+59-6
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"time"
2121

2222
"github.com/prometheus/client_golang/prometheus"
23+
"github.com/prometheus/client_golang/prometheus/collectors"
2324
"k8s.io/klog/v2"
2425
"sigs.k8s.io/controller-runtime/pkg/metrics"
2526
)
@@ -55,10 +56,11 @@ var (
5556
},
5657
)
5758

59+
// reconcileFederatedResourcesDuration is an unlabelled histogram, in seconds,
// of the time taken to reconcile federated resources in the target clusters.
//
// Deprecated: use the more common ControllerRuntimeReconcileTime instead.
5860
reconcileFederatedResourcesDuration = prometheus.NewHistogram(
5961
prometheus.HistogramOpts{
6062
Name: "reconcile_federated_resources_duration_seconds",
61-
Help: "Time taken to reconcile federated resources in the target clusters.",
63+
Help: "[Deprecated] Time taken to reconcile federated resources in the target clusters.",
6264
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
6365
},
6466
)
@@ -87,21 +89,50 @@ var (
8789
}, []string{"action"},
8890
)
8991

92+
// controllerRuntimeReconcileDuration is a histogram, in seconds and labelled
// by "controller", of the time taken by Kubefed controller reconciliation loops.
//
// Deprecated: use the more common ControllerRuntimeReconcileTime instead.
9093
controllerRuntimeReconcileDuration = prometheus.NewHistogramVec(
9194
prometheus.HistogramOpts{
9295
Name: "controller_runtime_reconcile_duration_seconds",
93-
Help: "Time taken by various parts of Kubefed controllers reconciliation loops.",
96+
Help: "[Deprecated] Time taken by various parts of Kubefed controllers reconciliation loops.",
9497
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
9598
}, []string{"controller"},
9699
)
97100

101+
// controllerRuntimeReconcileDurationSummary is a summary, in seconds and
// labelled by "controller", of the time taken by Kubefed controller
// reconciliation loops; its MaxAge is one hour.
//
// Deprecated: use the more common ControllerRuntimeReconcileTime instead.
98102
controllerRuntimeReconcileDurationSummary = prometheus.NewSummaryVec(
99103
prometheus.SummaryOpts{
100104
Name: "controller_runtime_reconcile_quantile_seconds",
101-
Help: "Quantiles of time taken by various parts of Kubefed controllers reconciliation loops.",
105+
Help: "[Deprecated] Quantiles of time taken by various parts of Kubefed controllers reconciliation loops.",
102106
MaxAge: time.Hour,
103107
}, []string{"controller"},
104108
)
109+
110+
// ControllerRuntimeReconcileTotal counts reconciliations, labelled by
// "controller" and by "result" (the outcome of the reconciliation).
ControllerRuntimeReconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
111+
Name: "controller_runtime_reconcile_total",
112+
Help: "Total number of reconciliations per controller",
113+
}, []string{"controller", "result"})
114+
115+
// ControllerRuntimeReconcileErrors counts reconciliation errors, labelled by
// "controller".
ControllerRuntimeReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
116+
Name: "controller_runtime_reconcile_errors_total",
117+
Help: "Total number of reconciliation errors per controller",
118+
}, []string{"controller"})
119+
120+
// ControllerRuntimeReconcileTime is a histogram, in seconds and labelled by
// "controller", of the length of each reconciliation. Its buckets range from
// 5 milliseconds to 60 seconds.
ControllerRuntimeReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
121+
Name: "controller_runtime_reconcile_time_seconds",
122+
Help: "Length of time per reconciliation per controller",
123+
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
124+
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60},
125+
}, []string{"controller"})
126+
127+
// ControllerRuntimeWorkerCount is a gauge, labelled by "controller", holding
// the maximum number of concurrent reconciles configured for that controller.
ControllerRuntimeWorkerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
128+
Name: "controller_runtime_max_concurrent_reconciles",
129+
Help: "Maximum number of concurrent reconciles per controller",
130+
}, []string{"controller"})
131+
132+
// ControllerRuntimeActiveWorkers is a gauge, labelled by "controller", holding
// the number of workers of that controller that are currently in use.
ControllerRuntimeActiveWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
133+
Name: "controller_runtime_active_workers",
134+
Help: "Number of currently used workers per controller",
135+
}, []string{"controller"})
105136
)
106137

107138
const (
@@ -117,6 +148,10 @@ const (
117148
// RegisterAll registers all metrics.
118149
func RegisterAll() {
119150
metrics.Registry.MustRegister(
151+
// expose process metrics like CPU, Memory, file descriptor usage etc.
152+
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
153+
// expose Go runtime metrics like GC stats, memory stats etc.
154+
collectors.NewGoCollector(),
120155
kubefedClusterTotal,
121156
joinedClusterTotal,
122157
reconcileFederatedResourcesDuration,
@@ -127,6 +162,11 @@ func RegisterAll() {
127162
dispatchOperationDuration,
128163
controllerRuntimeReconcileDuration,
129164
controllerRuntimeReconcileDurationSummary,
165+
ControllerRuntimeReconcileTotal,
166+
ControllerRuntimeReconcileErrors,
167+
ControllerRuntimeReconcileTime,
168+
ControllerRuntimeWorkerCount,
169+
ControllerRuntimeActiveWorkers,
130170
)
131171
}
132172

@@ -188,25 +228,38 @@ func UnjoinedClusterDurationFromStart(start time.Time) {
188228
unjoinedClusterDuration.Observe(duration.Seconds())
189229
}
190230

231+
// ReconcileFederatedResourcesDurationFromStart records the duration of the federation of resources,
// measured as the time elapsed since start and observed in seconds.
191232
//
// Deprecated: use the more common UpdateControllerRuntimeReconcileTimeFromStart instead.
192233
func ReconcileFederatedResourcesDurationFromStart(start time.Time) {
193234
duration := time.Since(start)
194235
reconcileFederatedResourcesDuration.Observe(duration.Seconds())
195236
}
196237

238+
// UpdateControllerReconcileDurationFromStart records the duration of the reconcile loop
197239
// of a controller, measured as the time elapsed since start. It delegates to
// UpdateControllerReconcileDuration.
198240
//
// Deprecated: use the more common UpdateControllerRuntimeReconcileTimeFromStart instead.
199241
func UpdateControllerReconcileDurationFromStart(controller string, start time.Time) {
200242
duration := time.Since(start)
201243
UpdateControllerReconcileDuration(controller, duration)
202244
}
203245

246+
// UpdateControllerReconcileDuration records the duration of the reconcile function of a controller.
// The duration is observed, in seconds, in both the deprecated summary and the deprecated histogram.
204247
//
// Deprecated: use the more common UpdateControllerRuntimeReconcileTime instead.
205248
func UpdateControllerReconcileDuration(controller string, duration time.Duration) {
249+
controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds())
250+
controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds())
251+
}
252+
253+
// UpdateControllerRuntimeReconcileTimeFromStart records the duration of the reconcile loop of a controller,
// measured as the time elapsed since start. It delegates to UpdateControllerRuntimeReconcileTime.
254+
func UpdateControllerRuntimeReconcileTimeFromStart(controller string, start time.Time) {
255+
duration := time.Since(start)
256+
UpdateControllerRuntimeReconcileTime(controller, duration)
257+
}
258+
259+
// UpdateControllerRuntimeReconcileTime records the duration of the reconcile function of a controller.
// Durations above LogReconcileLongDurationThreshold are also logged at klog verbosity 4.
// The duration is observed, in seconds, in ControllerRuntimeReconcileTime under the controller label.
260+
func UpdateControllerRuntimeReconcileTime(controller string, duration time.Duration) {
206261
if duration > LogReconcileLongDurationThreshold {
207262
klog.V(4).Infof("Reconcile loop %s took %v to complete", controller, duration)
208263
}
209-
210-
controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds())
211-
controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds())
264+
ControllerRuntimeReconcileTime.WithLabelValues(controller).Observe(duration.Seconds())
212265
}

0 commit comments

Comments
 (0)