@@ -14,48 +14,50 @@ import (
14
14
)
15
15
16
16
type metricsExporter struct {
17
- apiClient sdk.APIClient
17
+ coreClient core.APIClient
18
+ authnClient authn.APIClient
18
19
scrapeInterval time.Duration
19
- totalProjects prometheus.Gauge
20
- totalUsers prometheus.Gauge
21
- totalServiceAccounts prometheus.Gauge
20
+ projectsGauge prometheus.Gauge
21
+ usersGauge prometheus.Gauge
22
+ serviceAccountsGauge prometheus.Gauge
22
23
allWorkersByPhase * prometheus.GaugeVec
23
- totalPendingJobs prometheus.Gauge
24
+ pendingJobsGauge prometheus.Gauge
24
25
}
25
26
26
27
func newMetricsExporter (
27
28
apiClient sdk.APIClient ,
28
29
scrapeInterval time.Duration ,
29
30
) * metricsExporter {
30
31
return & metricsExporter {
31
- apiClient : apiClient ,
32
+ coreClient : apiClient .Core (),
33
+ authnClient : apiClient .Authn (),
32
34
scrapeInterval : scrapeInterval ,
33
- totalProjects : promauto .NewGauge (
35
+ projectsGauge : promauto .NewGauge (
34
36
prometheus.GaugeOpts {
35
37
Name : "brigade_projects_total" ,
36
- Help : "The total number of brigade projects" ,
38
+ Help : "The total number of projects" ,
37
39
},
38
40
),
39
- totalUsers : promauto .NewGauge (
41
+ usersGauge : promauto .NewGauge (
40
42
prometheus.GaugeOpts {
41
43
Name : "brigade_users_total" ,
42
44
Help : "The total number of users" ,
43
45
},
44
46
),
45
- totalServiceAccounts : promauto .NewGauge (
47
+ serviceAccountsGauge : promauto .NewGauge (
46
48
prometheus.GaugeOpts {
47
49
Name : "brigade_service_accounts_total" ,
48
50
Help : "The total number of service accounts" ,
49
51
},
50
52
),
51
53
allWorkersByPhase : promauto .NewGaugeVec (
52
54
prometheus.GaugeOpts {
53
- Name : "brigade_all_workers_by_phase " ,
54
- Help : "All workers separated by phase" ,
55
+ Name : "brigade_events_by_worker_phase " ,
56
+ Help : "The total number of events grouped by worker phase" ,
55
57
},
56
58
[]string {"workerPhase" },
57
59
),
58
- totalPendingJobs : promauto .NewGauge (
60
+ pendingJobsGauge : promauto .NewGauge (
59
61
prometheus.GaugeOpts {
60
62
Name : "brigade_pending_jobs_total" ,
61
63
Help : "The total number of pending jobs" ,
@@ -70,112 +72,126 @@ func (m *metricsExporter) run(ctx context.Context) {
70
72
for {
71
73
select {
72
74
case <- ticker .C :
73
- m .recordMetrics ()
75
+ if err := m .recordProjectsCount (); err != nil {
76
+ log .Println (err )
77
+ }
78
+ if err := m .recordUsersCount (); err != nil {
79
+ log .Println (err )
80
+ }
81
+ if err := m .recordServiceAccountsCount (); err != nil {
82
+ log .Println (err )
83
+ }
84
+ if err := m .recordEventCountsByWorkersPhase (); err != nil {
85
+ log .Println (err )
86
+ }
87
+ if err := m .recordPendingJobsCount (); err != nil {
88
+ log .Println (err )
89
+ }
74
90
case <- ctx .Done ():
75
91
return
76
92
}
77
93
}
78
94
}
79
95
80
- func (m * metricsExporter ) recordMetrics () {
96
+ func (m * metricsExporter ) recordProjectsCount () error {
81
97
// brigade_projects_total
82
- projects , err := m .apiClient . Core () .Projects ().List (
98
+ projects , err := m .coreClient .Projects ().List (
83
99
context .Background (),
84
100
& core.ProjectsSelector {},
85
101
& meta.ListOptions {},
86
102
)
87
103
if err != nil {
88
- log .Println (err )
89
- } else {
90
- m .totalProjects .Set (float64 (len (projects .Items ) +
91
- int (projects .RemainingItemCount )))
104
+ return err
92
105
}
106
+ m .projectsGauge .Set (
107
+ float64 (len (projects .Items ) + int (projects .RemainingItemCount )),
108
+ )
109
+ return nil
110
+ }
93
111
112
+ func (m * metricsExporter ) recordUsersCount () error {
94
113
// brigade_users_total
95
- users , err := m .apiClient . Authn () .Users ().List (
114
+ users , err := m .authnClient .Users ().List (
96
115
context .Background (),
97
116
& authn.UsersSelector {},
98
117
& meta.ListOptions {},
99
118
)
100
119
if err != nil {
101
- log .Println (err )
102
- } else {
103
- m .totalUsers .Set (float64 (len (users .Items ) +
104
- int (users .RemainingItemCount )))
120
+ return err
105
121
}
122
+ m .usersGauge .Set (
123
+ float64 (int64 (len (users .Items )) + users .RemainingItemCount ),
124
+ )
125
+ return nil
126
+ }
106
127
128
+ func (m * metricsExporter ) recordServiceAccountsCount () error {
107
129
// brigade_service_accounts_total
108
- serviceAccounts , err := m .apiClient . Authn () .ServiceAccounts ().List (
130
+ serviceAccounts , err := m .authnClient .ServiceAccounts ().List (
109
131
context .Background (),
110
132
& authn.ServiceAccountsSelector {},
111
133
& meta.ListOptions {},
112
134
)
113
135
if err != nil {
114
- log .Println (err )
115
- } else {
116
- m .totalServiceAccounts .Set (
117
- float64 (
118
- len (serviceAccounts .Items ) +
119
- int (serviceAccounts .RemainingItemCount ),
120
- ),
121
- )
136
+ return err
122
137
}
138
+ m .serviceAccountsGauge .Set (
139
+ float64 (
140
+ int64 (len (serviceAccounts .Items )) + serviceAccounts .RemainingItemCount ,
141
+ ),
142
+ )
143
+ return nil
144
+ }
123
145
124
- // brigade_all_workers_by_phase
146
+ func (m * metricsExporter ) recordEventCountsByWorkersPhase () error {
147
+ // brigade_events_by_worker_phase
125
148
for _ , phase := range core .WorkerPhasesAll () {
126
- var events core.EventList
127
- events , err = m .apiClient .Core ().Events ().List (
149
+ events , err := m .coreClient .Events ().List (
128
150
context .Background (),
129
151
& core.EventsSelector {
130
152
WorkerPhases : []core.WorkerPhase {phase },
131
153
},
132
154
& meta.ListOptions {},
133
155
)
134
156
if err != nil {
135
- log .Println (err )
136
- } else {
137
- m .allWorkersByPhase .With (
138
- prometheus.Labels {"workerPhase" : string (phase )},
139
- ).Set (float64 (len (events .Items ) + int (events .RemainingItemCount )))
157
+ return err
140
158
}
159
+ m .allWorkersByPhase .With (
160
+ prometheus.Labels {"workerPhase" : string (phase )},
161
+ ).Set (float64 (len (events .Items ) + int (events .RemainingItemCount )))
162
+ }
163
+ return nil
164
+ }
141
165
142
- // brigade_pending_jobs_total
143
- //
144
- // There is no way to query the API directly for pending Jobs, but only
145
- // running Workers should ever HAVE pending Jobs, so if we're currently
146
- // counting running Workers, we can iterate over those to count pending
147
- // jobs. Note, there's a cap on the max number of workers that can run
148
- // concurrently, so we assume that as long as that cap isn't enormous (which
149
- // would only occur on an enormous cluster), it's practical to iterate over
150
- // all the running workers.
151
- if phase == core .WorkerPhaseRunning {
152
- var pendingJobs int
153
- for {
154
- for _ , event := range events .Items {
155
- for _ , job := range event .Worker .Jobs {
156
- if job .Status .Phase == core .JobPhasePending {
157
- pendingJobs ++
158
- }
159
- }
160
- }
161
- if events .Continue == "" {
162
- break
163
- }
164
- if events , err = m .apiClient .Core ().Events ().List (
165
- context .Background (),
166
- & core.EventsSelector {
167
- WorkerPhases : []core.WorkerPhase {phase },
168
- },
169
- & meta.ListOptions {Continue : events .Continue },
170
- ); err != nil {
171
- log .Println (err )
172
- break
166
+ func (m * metricsExporter ) recordPendingJobsCount () error {
167
+ // brigade_pending_jobs_total
168
+ var pendingJobs int
169
+ var continueValue string
170
+ for {
171
+ events , err := m .coreClient .Events ().List (
172
+ context .Background (),
173
+ & core.EventsSelector {
174
+ WorkerPhases : []core.WorkerPhase {core .WorkerPhaseRunning },
175
+ },
176
+ & meta.ListOptions {
177
+ Continue : continueValue ,
178
+ },
179
+ )
180
+ if err != nil {
181
+ return err
182
+ }
183
+ for _ , event := range events .Items {
184
+ for _ , job := range event .Worker .Jobs {
185
+ if job .Status .Phase == core .JobPhasePending {
186
+ pendingJobs ++
173
187
}
174
188
}
175
- if err == nil {
176
- m .totalPendingJobs .Set (float64 (pendingJobs ))
177
- }
178
189
}
190
+ if events .Continue == "" {
191
+ break
192
+ }
193
+ continueValue = events .Continue
179
194
}
180
-
195
+ m .pendingJobsGauge .Set (float64 (pendingJobs ))
196
+ return nil
181
197
}
0 commit comments