
Commit 9037020

Read alertmanager state from storage if peer settling fails. (#4021)

* Read alertmanager state from storage if peer settling fails. Signed-off-by: Steve Simpson <[email protected]>
* Review comments. Signed-off-by: Steve Simpson <[email protected]>
* Review comments. Signed-off-by: Steve Simpson <[email protected]>

1 parent 88ebc30 · commit 9037020
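In short: when a tenant alertmanager starts and cannot settle its silence and notification-log state from peer replicas, it now falls back to reading the last full state snapshot persisted in object storage, and only starts empty if that also fails or nothing is stored. The sketch below condenses the new logic in pkg/alertmanager/state_replication.go into a self-contained program; settleState, stateStore, fullState, and mapStore are illustrative stand-ins, not the real Cortex API (which uses state.starting, alertstore.AlertStore, clusterpb.FullState, and alertspb.ErrNotFound):

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errNotFound, fullState, and stateStore are simplified stand-ins for
// alertspb.ErrNotFound, clusterpb.FullState, and alertstore.AlertStore.
var errNotFound = errors.New("alertmanager state not found")

type fullState struct{ parts map[string][]byte }

type stateStore interface {
	GetFullState(ctx context.Context, userID string) (fullState, error)
}

// settleState mirrors the order of operations this commit adds to
// state.starting(): settle from peer replicas first, fall back to object
// storage only if that fails, and never abort startup on error.
func settleState(ctx context.Context, settleFromPeers func(context.Context) error,
	store stateStore, userID string, storeReadTimeout time.Duration,
	merge func(fullState) error) error {

	if err := settleFromPeers(ctx); err == nil {
		return nil // settled via peers; storage is never consulted
	}

	// Peer settling failed: try a previously persisted full state instead.
	storeReadCtx, cancel := context.WithTimeout(ctx, storeReadTimeout)
	defer cancel()

	fs, err := store.GetFullState(storeReadCtx, userID)
	if errors.Is(err, errNotFound) {
		return nil // no state in storage; likely the user's first start
	}
	if err == nil {
		if err = merge(fs); err == nil {
			return nil // state recovered from storage
		}
	}

	// Reading or merging failed: log and continue, as before this commit.
	fmt.Println("failed to read state from storage; continuing anyway:", err)
	return nil
}

// mapStore is a toy in-memory store, similar in spirit to the fakeAlertStore
// used in the tests below.
type mapStore map[string]fullState

func (m mapStore) GetFullState(_ context.Context, userID string) (fullState, error) {
	if fs, ok := m[userID]; ok {
		return fs, nil
	}
	return fullState{}, errNotFound
}

func main() {
	store := mapStore{"user-1": {parts: map[string][]byte{"sil:user-1": []byte("...")}}}
	settle := func(context.Context) error { return errors.New("no peers to settle with") }
	merge := func(fs fullState) error {
		fmt.Printf("merged %d part(s) from storage\n", len(fs.parts))
		return nil
	}
	_ = settleState(context.Background(), settle, store, "user-1", 15*time.Second, merge)
}

Since the fake peer settle always fails, running this prints that one part was merged from storage.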

File tree

4 files changed: +78 -7 lines changed

* pkg/alertmanager/alertmanager.go (+3, -1)
* pkg/alertmanager/multitenant.go (+1, -0)
* pkg/alertmanager/state_replication.go (+28, -2)
* pkg/alertmanager/state_replication_test.go (+46, -4)

pkg/alertmanager/alertmanager.go (+3, -1)

@@ -43,6 +43,7 @@ import (
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/common/route"
 
+	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
 	"github.com/cortexproject/cortex/pkg/util/services"
 )
 
@@ -71,6 +72,7 @@ type Config struct {
 	ShardingEnabled   bool
 	ReplicationFactor int
 	Replicator        Replicator
+	Store             alertstore.AlertStore
 }
 
 // An Alertmanager manages the alerts for one user.
@@ -161,7 +163,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
 		am.state = cfg.Peer
 	} else if cfg.ShardingEnabled {
 		level.Debug(am.logger).Log("msg", "starting tenant alertmanager with ring-based replication")
-		am.state = newReplicatedStates(cfg.UserID, cfg.ReplicationFactor, cfg.Replicator, am.logger, am.registry)
+		am.state = newReplicatedStates(cfg.UserID, cfg.ReplicationFactor, cfg.Replicator, cfg.Store, am.logger, am.registry)
 	} else {
 		level.Debug(am.logger).Log("msg", "starting tenant alertmanager without replication")
 		am.state = &NilPeer{}

pkg/alertmanager/multitenant.go (+1, -0)

@@ -855,6 +855,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amco
 		ShardingEnabled:   am.cfg.ShardingEnabled,
 		Replicator:        am,
 		ReplicationFactor: am.cfg.ShardingRing.ReplicationFactor,
+		Store:             am.store,
 	}, reg)
 	if err != nil {
 		return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err)

pkg/alertmanager/state_replication.go (+28, -2)

@@ -15,11 +15,14 @@ import (
 	"github.com/prometheus/alertmanager/cluster/clusterpb"
 	"github.com/prometheus/client_golang/prometheus"
 
+	"github.com/cortexproject/cortex/pkg/alertmanager/alertspb"
+	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
 	"github.com/cortexproject/cortex/pkg/util/services"
 )
 
 const (
 	defaultSettleReadTimeout = 15 * time.Second
+	defaultStoreReadTimeout  = 15 * time.Second
 )
 
 // state represents the Alertmanager silences and notification log internal state.
@@ -31,12 +34,14 @@ type state struct {
 	reg prometheus.Registerer
 
 	settleReadTimeout time.Duration
+	storeReadTimeout  time.Duration
 
 	mtx    sync.Mutex
 	states map[string]cluster.State
 
 	replicationFactor int
 	replicator        Replicator
+	store             alertstore.AlertStore
 
 	partialStateMergesTotal  *prometheus.CounterVec
 	partialStateMergesFailed *prometheus.CounterVec
@@ -47,17 +52,19 @@ type state struct {
 }
 
 // newReplicatedStates creates a new state struct, which manages state to be replicated between alertmanagers.
-func newReplicatedStates(userID string, rf int, re Replicator, l log.Logger, r prometheus.Registerer) *state {
+func newReplicatedStates(userID string, rf int, re Replicator, st alertstore.AlertStore, l log.Logger, r prometheus.Registerer) *state {
 
 	s := &state{
 		logger:            l,
 		userID:            userID,
 		replicationFactor: rf,
 		replicator:        re,
+		store:             st,
 		states:            make(map[string]cluster.State, 2), // we use two, one for the notifications and one for silences.
 		msgc:              make(chan *clusterpb.Part),
 		reg:               r,
 		settleReadTimeout: defaultSettleReadTimeout,
+		storeReadTimeout:  defaultStoreReadTimeout,
 		partialStateMergesTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
 			Name: "alertmanager_partial_state_merges_total",
 			Help: "Number of times we have received a partial state to merge for a key.",
@@ -167,7 +174,26 @@ func (s *state) starting(ctx context.Context) error {
 		}
 	}
 
-	level.Info(s.logger).Log("msg", "state not settled but continuing anyway", "err", err)
+	level.Info(s.logger).Log("msg", "state not settled; trying to read from storage", "err", err)
+
+	// Attempt to read the state from persistent storage instead.
+	storeReadCtx, cancel := context.WithTimeout(ctx, s.storeReadTimeout)
+	defer cancel()
+
+	fullState, err := s.store.GetFullState(storeReadCtx, s.userID)
+	if errors.Is(err, alertspb.ErrNotFound) {
+		level.Info(s.logger).Log("msg", "no state for user in storage; proceeding", "user", s.userID)
+		return nil
+	}
+	if err == nil {
+		if err = s.mergeFullStates([]*clusterpb.FullState{fullState.State}); err == nil {
+			level.Info(s.logger).Log("msg", "state read from storage; proceeding")
+			return nil
+		}
+	}
+
+	level.Warn(s.logger).Log("msg", "failed to read state from storage; continuing anyway", "err", err)
+
 	return nil
 }
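Two details worth noting in this hunk: alertspb.ErrNotFound is deliberately treated as a clean outcome, since a user with no persisted state is most likely starting for the first time, and every other failure, whether reading from storage or merging, is logged and swallowed so the alertmanager still becomes ready, preserving the previous continue-anyway behaviour. The storage read also gets its own 15-second default timeout (defaultStoreReadTimeout), separate from the peer settling timeout.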

pkg/alertmanager/state_replication_test.go (+46, -4)

@@ -18,6 +18,8 @@ import (
 
 	"github.com/go-kit/kit/log"
 
+	"github.com/cortexproject/cortex/pkg/alertmanager/alertspb"
+	"github.com/cortexproject/cortex/pkg/alertmanager/alertstore"
 	"github.com/cortexproject/cortex/pkg/util/services"
 )
 
@@ -76,6 +78,25 @@ func (f *fakeReplicator) ReadFullStateForUser(ctx context.Context, userID string
 	return f.read.res, f.read.err
 }
 
+type fakeAlertStore struct {
+	alertstore.AlertStore
+
+	states map[string]alertspb.FullStateDesc
+}
+
+func newFakeAlertStore() *fakeAlertStore {
+	return &fakeAlertStore{
+		states: make(map[string]alertspb.FullStateDesc),
+	}
+}
+
+func (f *fakeAlertStore) GetFullState(ctx context.Context, user string) (alertspb.FullStateDesc, error) {
+	if result, ok := f.states[user]; ok {
+		return result, nil
+	}
+	return alertspb.FullStateDesc{}, alertspb.ErrNotFound
+}
+
 func TestStateReplication(t *testing.T) {
 	tc := []struct {
 		name              string
@@ -102,7 +123,8 @@ func TestStateReplication(t *testing.T) {
 			reg := prometheus.NewPedanticRegistry()
 			replicator := newFakeReplicator()
 			replicator.read = readStateResult{res: nil, err: nil}
-			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, log.NewNopLogger(), reg)
+			store := newFakeAlertStore()
+			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
 
 			require.False(t, s.Ready())
 			{
@@ -163,6 +185,7 @@ func TestStateReplication_Settle(t *testing.T) {
 		name              string
 		replicationFactor int
 		read              readStateResult
+		storeStates       map[string]alertspb.FullStateDesc
 		results           map[string][][]byte
 	}{
 		{
@@ -228,9 +251,26 @@ func TestStateReplication_Settle(t *testing.T) {
 			},
 		},
 		{
-			name:              "when reading the full state fails, still become ready.",
+			name:              "when reading from replicas fails, state is read from storage.",
+			replicationFactor: 3,
+			read:              readStateResult{err: errors.New("Read Error 1")},
+			storeStates: map[string]alertspb.FullStateDesc{
+				"user-1": {
+					State: &clusterpb.FullState{
+						Parts: []clusterpb.Part{{Key: "key1", Data: []byte("Datum1")}},
+					},
+				},
+			},
+			results: map[string][][]byte{
+				"key1": {[]byte("Datum1")},
+				"key2": nil,
+			},
+		},
+		{
+			name:              "when reading from replicas and from storage fails, still become ready.",
 			replicationFactor: 3,
 			read:              readStateResult{err: errors.New("Read Error 1")},
+			storeStates:       map[string]alertspb.FullStateDesc{},
 			results: map[string][][]byte{
 				"key1": nil,
 				"key2": nil,
@@ -253,7 +293,9 @@ func TestStateReplication_Settle(t *testing.T) {
 
 			replicator := newFakeReplicator()
 			replicator.read = tt.read
-			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, log.NewNopLogger(), reg)
+			store := newFakeAlertStore()
+			store.states = tt.storeStates
+			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
 
 			key1State := &fakeState{}
 			key2State := &fakeState{}
@@ -322,7 +364,7 @@ func TestStateReplication_GetFullState(t *testing.T) {
 	for _, tt := range tc {
 		t.Run(tt.name, func(t *testing.T) {
 			reg := prometheus.NewPedanticRegistry()
-			s := newReplicatedStates("user-1", 1, nil, log.NewNopLogger(), reg)
+			s := newReplicatedStates("user-1", 1, nil, nil, log.NewNopLogger(), reg)
 
 			for key, datum := range tt.data {
 				state := &fakeState{binary: datum}
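A small note on the test fake: fakeAlertStore embeds the alertstore.AlertStore interface so that only GetFullState needs an implementation; any other interface method called on it would panic via the nil embedded value, which is acceptable here because settling only ever reads state.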
