@@ -14,8 +14,10 @@ import (
14
14
"time"
15
15
16
16
"github.com/go-kit/kit/log"
17
+ "github.com/go-kit/kit/log/level"
17
18
"github.com/prometheus/alertmanager/api"
18
19
"github.com/prometheus/alertmanager/cluster"
20
+ "github.com/prometheus/alertmanager/cluster/clusterpb"
19
21
"github.com/prometheus/alertmanager/config"
20
22
"github.com/prometheus/alertmanager/dispatch"
21
23
"github.com/prometheus/alertmanager/inhibit"
@@ -53,13 +55,21 @@ type Config struct {
53
55
PeerTimeout time.Duration
54
56
Retention time.Duration
55
57
ExternalURL * url.URL
58
+
59
+ ShardingEnabled bool
60
+ ReplicationFactor int
61
+ ReplicateStateFunc func (context.Context , string , * clusterpb.Part ) error
62
+ // The alertmanager replication protocol relies on a position relative to other replicas.
63
+ // This position is then used to identify who should notify about the alert first.
64
+ GetPositionFunc func (userID string ) int
56
65
}
57
66
58
67
// An Alertmanager manages the alerts for one user.
59
68
type Alertmanager struct {
60
69
cfg * Config
61
70
api * api.API
62
71
logger log.Logger
72
+ state State
63
73
nflog * nflog.Log
64
74
silences * silence.Silences
65
75
marker types.Marker
@@ -96,6 +106,13 @@ func init() {
96
106
}()
97
107
}
98
108
109
+ // State helps with replication and synchronization of notifications and silences across several alertmanager replicas.
110
+ type State interface {
111
+ AddState (string , cluster.State , prometheus.Registerer ) cluster.ClusterChannel
112
+ Position () int
113
+ WaitReady ()
114
+ }
115
+
99
116
// New creates a new Alertmanager.
100
117
func New (cfg * Config , reg * prometheus.Registry ) (* Alertmanager , error ) {
101
118
am := & Alertmanager {
@@ -110,6 +127,22 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
110
127
111
128
am .registry = reg
112
129
130
+ // We currently have 3 operational modes:
131
+ // 1) Alertmanager clustering with upstream Gossip
132
+ // 2) Alertmanager sharding and ring-based replication
133
+ // 3) Alertmanager without replication
134
+ // These are covered in order.
135
+ if cfg .Peer != nil {
136
+ level .Debug (am .logger ).Log ("msg" , "starting tenant alertmanager with gossip-based replication" )
137
+ am .state = cfg .Peer
138
+ } else if cfg .ShardingEnabled {
139
+ level .Debug (am .logger ).Log ("msg" , "starting tenant alertmanager with ring-based replication" )
140
+ am .state = newReplicatedStates (cfg .UserID , cfg .ReplicationFactor , cfg .ReplicateStateFunc , cfg .GetPositionFunc , am .stop , am .logger , am .registry )
141
+ } else {
142
+ level .Debug (am .logger ).Log ("msg" , "starting tenant alertmanager without replication" )
143
+ am .state = & NilPeer {}
144
+ }
145
+
113
146
am .wg .Add (1 )
114
147
nflogID := fmt .Sprintf ("nflog:%s" , cfg .UserID )
115
148
var err error
@@ -123,10 +156,9 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
123
156
if err != nil {
124
157
return nil , fmt .Errorf ("failed to create notification log: %v" , err )
125
158
}
126
- if cfg .Peer != nil {
127
- c := cfg .Peer .AddState ("nfl:" + cfg .UserID , am .nflog , am .registry )
128
- am .nflog .SetBroadcast (c .Broadcast )
129
- }
159
+
160
+ c := am .state .AddState ("nfl:" + cfg .UserID , am .nflog , am .registry )
161
+ am .nflog .SetBroadcast (c .Broadcast )
130
162
131
163
am .marker = types .NewMarker (am .registry )
132
164
@@ -140,10 +172,9 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
140
172
if err != nil {
141
173
return nil , fmt .Errorf ("failed to create silences: %v" , err )
142
174
}
143
- if cfg .Peer != nil {
144
- c := cfg .Peer .AddState ("sil:" + cfg .UserID , am .silences , am .registry )
145
- am .silences .SetBroadcast (c .Broadcast )
146
- }
175
+
176
+ c = am .state .AddState ("sil:" + cfg .UserID , am .silences , am .registry )
177
+ am .silences .SetBroadcast (c .Broadcast )
147
178
148
179
am .pipelineBuilder = notify .NewPipelineBuilder (am .registry )
149
180
@@ -162,9 +193,10 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
162
193
Alerts : am .alerts ,
163
194
Silences : am .silences ,
164
195
StatusFunc : am .marker .Status ,
165
- Peer : & NilPeer {},
166
- Registry : am .registry ,
167
- Logger : log .With (am .logger , "component" , "api" ),
196
+ // Cortex should not expose cluster information back to its tenants.
197
+ Peer : & NilPeer {},
198
+ Registry : am .registry ,
199
+ Logger : log .With (am .logger , "component" , "api" ),
168
200
GroupFunc : func (f1 func (* dispatch.Route ) bool , f2 func (* types.Alert , time.Time ) bool ) (dispatch.AlertGroups , map [model.Fingerprint ][]string ) {
169
201
return am .dispatcher .Groups (f1 , f2 )
170
202
},
@@ -190,14 +222,16 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
190
222
}
191
223
192
224
am .dispatcherMetrics = dispatch .NewDispatcherMetrics (am .registry )
225
+
226
+ // TODO: From this point onward, the alertmanager _might_ receive requests, so we need to make sure it has settled and is ready.
193
227
return am , nil
194
228
}
195
229
196
230
// clusterWait returns a function that reports a wait duration of one base
// timeout for every position this replica holds (position 0 yields no wait).
//
// position is not read when clusterWait is called; it is invoked each time
// the returned function runs, so a later change in position is reflected in
// subsequent results. timeout is the base duration multiplied by the position.
func clusterWait(position func() int, timeout time.Duration) func() time.Duration {
	return func() time.Duration {
		return time.Duration(position()) * timeout
	}
}
203
237
@@ -230,7 +264,8 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg s
230
264
231
265
am .inhibitor = inhibit .NewInhibitor (am .alerts , conf .InhibitRules , am .marker , log .With (am .logger , "component" , "inhibitor" ))
232
266
233
- waitFunc := clusterWait (am .cfg .Peer , am .cfg .PeerTimeout )
267
+ waitFunc := clusterWait (am .state .Position , am .cfg .PeerTimeout )
268
+
234
269
timeoutFunc := func (d time.Duration ) time.Duration {
235
270
if d < notify .MinTimeout {
236
271
d = notify .MinTimeout
@@ -255,7 +290,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg s
255
290
silence .NewSilencer (am .silences , am .marker , am .logger ),
256
291
muteTimes ,
257
292
am .nflog ,
258
- am .cfg . Peer ,
293
+ am .state ,
259
294
)
260
295
am .dispatcher = dispatch .NewDispatcher (
261
296
am .alerts ,
@@ -293,6 +328,10 @@ func (am *Alertmanager) StopAndWait() {
293
328
am .wg .Wait ()
294
329
}
295
330
331
+ func (am * Alertmanager ) mergePartialExternalState (part * clusterpb.Part ) error {
332
+ return am .state .(* state ).MergePartialState (part )
333
+ }
334
+
296
335
// buildIntegrationsMap builds a map of name to the list of integration notifiers off of a
297
336
// list of receiver config.
298
337
func buildIntegrationsMap (nc []* config.Receiver , tmpl * template.Template , logger log.Logger ) (map [string ][]notify.Integration , error ) {
0 commit comments