Commit cac96f5

56quarters authored and mason committed
Restore alertmanager state from storage as fallback (grafana#2293)

* Restore alertmanager state from storage as fallback

  In cortexproject/cortex#3925 the ability to restore alertmanager state from peer alertmanagers was added, short-circuiting if there is only a single replica of the alertmanager. In cortexproject/cortex#4021 a fallback to read state from storage was added in case reading from peers failed. However, the short-circuiting if there is only a single peer was not removed. This has the effect of never restoring state in an alertmanager if only running a single replica.

  Fixes grafana#2245

  Signed-off-by: Nick Pillitteri <[email protected]>

* Code review changes

  Signed-off-by: Nick Pillitteri <[email protected]>
1 parent 7ff1a55 commit cac96f5
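
Not part of the commit, just orientation for the diffs below: a minimal, self-contained Go sketch of the control flow this change establishes. The names here (peerReader, stateStore, restoreState, errStoreNotFound) are simplified stand-ins, not Mimir's actual Replicator and alertstore.AlertStore interfaces. Peers are consulted only when the replication factor is greater than 1, and storage is always tried afterwards when peer sync does not settle the state, which is the part that was previously skipped for a single replica.

package statefallback

import (
	"context"
	"errors"
)

// Simplified stand-ins for the peer replicator and the object-storage client.
type peerReader interface {
	ReadFullStateForUser(ctx context.Context, userID string) ([]byte, error)
}

type stateStore interface {
	GetFullState(ctx context.Context, userID string) ([]byte, error)
}

// Plays the role of alertspb.ErrNotFound in this sketch.
var errStoreNotFound = errors.New("alertmanager full state not found in storage")

// restoreState mirrors the post-fix starting() flow: read from peer replicas
// only when replicationFactor > 1, and fall back to storage whenever the peer
// sync did not succeed, including the single-replica case that previously
// returned early and skipped storage entirely.
func restoreState(ctx context.Context, replicationFactor int, peers peerReader, store stateStore, userID string) ([]byte, error) {
	if replicationFactor > 1 {
		st, err := peers.ReadFullStateForUser(ctx, userID)
		if err == nil {
			return st, nil // state settled from a peer replica
		}
		// The real code only counts this as a failure when the error is not
		// "user not found in any replica", then falls through to storage.
	}

	// With the fix, a single replica reaches this point instead of returning early.
	st, err := store.GetFullState(ctx, userID)
	if errors.Is(err, errStoreNotFound) {
		return nil, nil // nothing to restore; start with empty state
	}
	return st, err
}

The actual change in pkg/alertmanager/state_replication.go below keeps the existing metrics and logging around this flow.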

File tree

4 files changed (+80, -77 lines)


CHANGELOG.md (+1)

@@ -8,6 +8,7 @@
 * [CHANGE] Ruler: Remove unused CLI flags `-ruler.search-pending-for` and `-ruler.flush-period` (and their respective YAML config options). #2288
 * [ENHANCEMENT] Alertmanager: Allow the HTTP `proxy_url` configuration option in the receiver's configuration. #2317
 * [BUGFIX] Compactor: log the actual error on compaction failed. #2261
+* [BUGFIX] Alertmanager: restore state from storage even when running a single replica. #2293
 
 ### Mixin
 

pkg/alertmanager/alertmanager_test.go (+2)

@@ -65,6 +65,7 @@ func createAlertmanagerAndSendAlerts(t *testing.T, alertGroups, groupsLimit, exp
 		TenantDataDir:     t.TempDir(),
 		ExternalURL:       &url.URL{Path: "/am"},
 		ShardingEnabled:   true,
+		Store:             prepareInMemoryAlertStore(),
 		Replicator:        &stubReplicator{},
 		ReplicationFactor: 1,
 		// We have to set this interval non-zero, though we don't need the persister to do anything.
@@ -148,6 +149,7 @@ func TestDispatcherLoggerInsightKey(t *testing.T) {
 		TenantDataDir:     t.TempDir(),
 		ExternalURL:       &url.URL{Path: "/am"},
 		ShardingEnabled:   true,
+		Store:             prepareInMemoryAlertStore(),
 		Replicator:        &stubReplicator{},
 		ReplicationFactor: 1,
 		PersisterConfig:   PersisterConfig{Interval: time.Hour},

pkg/alertmanager/state_replication.go (+31, -32)

@@ -71,7 +71,7 @@ type state struct {
 func newReplicatedStates(userID string, rf int, re Replicator, st alertstore.AlertStore, l log.Logger, r prometheus.Registerer) *state {
 
 	s := &state{
-		logger:            l,
+		logger:            log.With(l, "user", userID),
 		userID:            userID,
 		replicationFactor: rf,
 		replicator:        re,
@@ -199,45 +199,44 @@ func (s *state) starting(ctx context.Context) error {
 	timer := prometheus.NewTimer(s.initialSyncDuration)
 	defer timer.ObserveDuration()
 
-	level.Info(s.logger).Log("msg", "Waiting for notification and silences to settle...")
-
-	// If the replication factor is <= 1, there is nowhere to obtain the state from.
-	if s.replicationFactor <= 1 {
-		level.Info(s.logger).Log("msg", "skipping settling (no replicas)")
-		return nil
-	}
-
-	// We can check other alertmanager(s) and explicitly ask them to propagate their state to us if available.
-	readCtx, cancel := context.WithTimeout(ctx, s.settleReadTimeout)
-	defer cancel()
+	// If replication factor is > 1 attempt to read state from other replicas, falling back to reading from
+	// storage if they are unavailable.
+	if s.replicationFactor > 1 {
+		level.Info(s.logger).Log("msg", "Waiting for notification and silences to settle...")
+
+		// We can check other alertmanager(s) and explicitly ask them to propagate their state to us if available.
+		readCtx, cancel := context.WithTimeout(ctx, s.settleReadTimeout)
+		defer cancel()
+
+		s.fetchReplicaStateTotal.Inc()
+		fullStates, err := s.replicator.ReadFullStateForUser(readCtx, s.userID)
+		if err == nil {
+			if err = s.mergeFullStates(fullStates); err == nil {
+				level.Info(s.logger).Log("msg", "state settled; proceeding")
+				s.initialSyncCompleted.WithLabelValues(syncFromReplica).Inc()
+				return nil
+			}
+		}
 
-	s.fetchReplicaStateTotal.Inc()
-	fullStates, err := s.replicator.ReadFullStateForUser(readCtx, s.userID)
-	if err == nil {
-		if err = s.mergeFullStates(fullStates); err == nil {
-			level.Info(s.logger).Log("msg", "state settled; proceeding")
-			s.initialSyncCompleted.WithLabelValues(syncFromReplica).Inc()
-			return nil
+		// The user not being found in all of the replicas is not recorded as a failure, as this is
+		// expected when this is the first replica to come up for a user. Note that it is important
+		// to continue and try to read from the state from remote storage, as the replicas may have
+		// lost state due to an all-replica restart.
+		if err != errAllReplicasUserNotFound {
+			s.fetchReplicaStateFailed.Inc()
 		}
-	}
 
-	// The user not being found in all of the replicas is not recorded as a failure, as this is
-	// expected when this is the first replica to come up for a user. Note that it is important
-	// to continue and try to read from the state from remote storage, as the replicas may have
-	// lost state due to an all-replica restart.
-	if err != errAllReplicasUserNotFound {
-		s.fetchReplicaStateFailed.Inc()
+		level.Info(s.logger).Log("msg", "unable to read state from other Alertmanager replicas; trying to read from storage", "err", err)
 	}
 
-	level.Info(s.logger).Log("msg", "state not settled; trying to read from storage", "err", err)
-
+	level.Info(s.logger).Log("msg", "reading state from storage")
 	// Attempt to read the state from persistent storage instead.
 	storeReadCtx, cancel := context.WithTimeout(ctx, s.storeReadTimeout)
 	defer cancel()
 
 	fullState, err := s.store.GetFullState(storeReadCtx, s.userID)
 	if errors.Is(err, alertspb.ErrNotFound) {
-		level.Info(s.logger).Log("msg", "no state for user in storage; proceeding", "user", s.userID)
+		level.Info(s.logger).Log("msg", "no state for user in storage; proceeding")
 		s.initialSyncCompleted.WithLabelValues(syncUserNotFound).Inc()
 		return nil
 	}
@@ -271,11 +270,11 @@ func (s *state) mergeFullStates(fs []*clusterpb.FullState) error {
 
 	for _, f := range fs {
 		for _, p := range f.Parts {
-			level.Debug(s.logger).Log("msg", "merging full state", "user", s.userID, "key", p.Key, "bytes", len(p.Data))
+			level.Debug(s.logger).Log("msg", "merging full state", "key", p.Key, "bytes", len(p.Data))
 
 			st, ok := s.states[p.Key]
 			if !ok {
-				level.Error(s.logger).Log("msg", "key not found while merging full state", "user", s.userID, "key", p.Key)
+				level.Error(s.logger).Log("msg", "key not found while merging full state", "key", p.Key)
 				continue
 			}
 
@@ -300,7 +299,7 @@ func (s *state) running(ctx context.Context) error {
 			s.stateReplicationTotal.WithLabelValues(p.Key).Inc()
 			if err := s.replicator.ReplicateStateForUser(ctx, s.userID, p); err != nil {
 				s.stateReplicationFailed.WithLabelValues(p.Key).Inc()
-				level.Error(s.logger).Log("msg", "failed to replicate state to other alertmanagers", "user", s.userID, "key", p.Key, "err", err)
+				level.Error(s.logger).Log("msg", "failed to replicate state to other alertmanagers", "key", p.Key, "err", err)
 			}
 		case <-ctx.Done():
 			return nil
pkg/alertmanager/state_replication_test.go (+46, -45)

@@ -29,6 +29,8 @@ import (
 	"github.com/grafana/mimir/pkg/alertmanager/alertstore"
 )
 
+const testUserID = "user-1"
+
 type fakeState struct {
 	binary []byte
 	merges [][]byte
@@ -73,8 +75,8 @@ func (f *fakeReplicator) GetPositionForUser(_ string) int {
 }
 
 func (f *fakeReplicator) ReadFullStateForUser(ctx context.Context, userID string) ([]*clusterpb.FullState, error) {
-	if userID != "user-1" {
-		return nil, errors.New("Unexpected userID")
+	if userID != testUserID {
+		return nil, errors.New("unexpected userID")
 	}
 
 	if f.read.blocking {
@@ -96,31 +98,39 @@ func newFakeAlertStore() *fakeAlertStore {
 	}
 }
 
-func (f *fakeAlertStore) GetFullState(ctx context.Context, user string) (alertspb.FullStateDesc, error) {
+func (f *fakeAlertStore) GetFullState(_ context.Context, user string) (alertspb.FullStateDesc, error) {
 	if result, ok := f.states[user]; ok {
 		return result, nil
 	}
 	return alertspb.FullStateDesc{}, alertspb.ErrNotFound
 }
 
+func (f *fakeAlertStore) SetFullState(_ context.Context, user string, state alertspb.FullStateDesc) error {
+	f.states[user] = state
+	return nil
+}
+
 func TestStateReplication(t *testing.T) {
 	tc := []struct {
-		name              string
-		replicationFactor int
-		message           *clusterpb.Part
-		results           map[string]*clusterpb.Part
+		name               string
+		replicationFactor  int
+		message            *clusterpb.Part
+		replicationResults map[string]clusterpb.Part
+		storeResults       map[string]clusterpb.Part
 	}{
 		{
-			name:              "with a replication factor of <= 1, state is not replicated.",
-			replicationFactor: 1,
-			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
-			results:           map[string]*clusterpb.Part{},
+			name:               "with a replication factor of <= 1, state is not replicated but loaded from storage.",
+			replicationFactor:  1,
+			message:            &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
+			replicationResults: map[string]clusterpb.Part{},
+			storeResults:       map[string]clusterpb.Part{testUserID: {Key: "nflog", Data: []byte("OK")}},
 		},
 		{
-			name:              "with a replication factor of > 1, state is broadcasted for replication.",
-			replicationFactor: 3,
-			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
-			results:           map[string]*clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
+			name:               "with a replication factor of > 1, state is broadcasted for replication.",
+			replicationFactor:  3,
+			message:            &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
+			replicationResults: map[string]clusterpb.Part{testUserID: {Key: "nflog", Data: []byte("OK")}},
+			storeResults:       map[string]clusterpb.Part{},
 		},
 	}
 
@@ -129,9 +139,15 @@ func TestStateReplication(t *testing.T) {
 			reg := prometheus.NewPedanticRegistry()
 			replicator := newFakeReplicator()
 			replicator.read = readStateResult{res: nil, err: nil}
+
 			store := newFakeAlertStore()
-			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
+			for user, part := range tt.storeResults {
+				require.NoError(t, store.SetFullState(context.Background(), user, alertspb.FullStateDesc{
+					State: &clusterpb.FullState{Parts: []clusterpb.Part{part}},
+				}))
+			}
 
+			s := newReplicatedStates(testUserID, tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
 			require.False(t, s.Ready())
 			{
 				ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
@@ -161,47 +177,32 @@
 			require.Eventually(t, func() bool {
 				replicator.mtx.Lock()
 				defer replicator.mtx.Unlock()
-				return len(replicator.results) == len(tt.results)
+				return len(replicator.results) == len(tt.replicationResults)
 			}, time.Second, time.Millisecond)
 
 			if tt.replicationFactor > 1 {
+				// If the replication factor is greater than 1, we expect state to be loaded from other Alertmanagers
 				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
-# HELP alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
-# TYPE alertmanager_state_fetch_replica_state_failed_total counter
-alertmanager_state_fetch_replica_state_failed_total 0
-# HELP alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
-# TYPE alertmanager_state_fetch_replica_state_total counter
-alertmanager_state_fetch_replica_state_total 1
-# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
-# TYPE alertmanager_partial_state_merges_failed_total counter
-alertmanager_partial_state_merges_failed_total{key="nflog"} 0
-# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
-# TYPE alertmanager_partial_state_merges_total counter
-alertmanager_partial_state_merges_total{key="nflog"} 0
 # HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
 # TYPE alertmanager_state_initial_sync_completed_total counter
 alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
 alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 1
 alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 0
 alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
-# HELP alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or remote storage.
-# TYPE alertmanager_state_initial_sync_total counter
-alertmanager_state_initial_sync_total 1
-# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
-# TYPE alertmanager_state_replication_failed_total counter
-alertmanager_state_replication_failed_total{key="nflog"} 0
-# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
-# TYPE alertmanager_state_replication_total counter
-alertmanager_state_replication_total{key="nflog"} 1
 `),
-					"alertmanager_state_fetch_replica_state_failed_total",
-					"alertmanager_state_fetch_replica_state_total",
-					"alertmanager_partial_state_merges_failed_total",
-					"alertmanager_partial_state_merges_total",
 					"alertmanager_state_initial_sync_completed_total",
-					"alertmanager_state_initial_sync_total",
-					"alertmanager_state_replication_failed_total",
-					"alertmanager_state_replication_total",
+				))
+			} else {
+				// Replication factor is 1, we expect state to be loaded from storage *instead* of other Alertmanagers
+				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
+# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
+# TYPE alertmanager_state_initial_sync_completed_total counter
+alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
+alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 0
+alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 1
+alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
+`),
+					"alertmanager_state_initial_sync_completed_total",
 				))
 
 			}

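As a usage note, again a sketch rather than Mimir's real alertstore.AlertStore and alertspb.FullStateDesc types: the test's fakeAlertStore gains a SetFullState so the replication-factor-1 case can seed storage before start-up. A minimal standalone equivalent of that in-memory store, continuing the hypothetical statefallback package from the sketch near the top, plus how it feeds the fallback:

package statefallback

import (
	"context"
	"sync"
)

// memStore is an in-memory full-state store in the spirit of the test's
// fakeAlertStore; state is stored as opaque bytes to keep the sketch small.
type memStore struct {
	mu     sync.Mutex
	states map[string][]byte
}

func newMemStore() *memStore {
	return &memStore{states: map[string][]byte{}}
}

// GetFullState returns the stored state, or errStoreNotFound when the user has
// no state yet; the startup fallback treats that as "nothing to restore".
func (m *memStore) GetFullState(_ context.Context, userID string) ([]byte, error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if st, ok := m.states[userID]; ok {
		return st, nil
	}
	return nil, errStoreNotFound
}

// SetFullState persists the state, as the periodic persister does in Mimir.
func (m *memStore) SetFullState(_ context.Context, userID string, state []byte) error {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.states[userID] = state
	return nil
}

// exampleSingleReplicaRestore mirrors the new replication-factor-1 test case:
// seed the store, then restore with a single replica and get the bytes back.
func exampleSingleReplicaRestore(ctx context.Context) ([]byte, error) {
	store := newMemStore()
	_ = store.SetFullState(ctx, "user-1", []byte("nflog state"))
	return restoreState(ctx, 1, nil, store, "user-1")
}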