
Commit 9bb1ed6

Restore alertmanager state from storage as fallback
In cortexproject/cortex#3925 the ability to restore alertmanager state from peer alertmanagers was added, short-circuiting if there is only a single replica of the alertmanager. In cortexproject/cortex#4021 a fallback to read state from storage was added in case reading from peers failed. However, the short-circuit for a single replica was never removed, so an alertmanager running only a single replica never restored its state.

Fixes #2245

Signed-off-by: Nick Pillitteri <[email protected]>
1 parent d08f7da commit 9bb1ed6
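
To make the state_replication.go diff below easier to follow, here is a condensed sketch of the control flow in (*state).starting after this change. It is not the verbatim implementation: timeouts, metrics, and logging are elided, and readFullStateFromStorage is a hypothetical stand-in for the storage-read block that closes the real function.

	// Condensed sketch only; see the state_replication.go diff below for the actual code.
	func (s *state) starting(ctx context.Context) error {
		// Peer settling is now guarded by the replication factor instead of
		// returning early, so the storage fallback below always runs.
		if s.replicationFactor > 1 {
			if fullStates, err := s.replicator.ReadFullStateForUser(ctx, s.userID); err == nil {
				if err := s.mergeFullStates(fullStates); err == nil {
					return nil // state settled from a peer replica
				}
			}
			// Peers failed or had no state for this user: fall through to storage.
		}

		// Before this fix, a replicationFactor <= 1 check returned early above,
		// so a single-replica alertmanager never reached the storage read.
		return s.readFullStateFromStorage(ctx) // hypothetical helper for the storage-read block
	}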

File tree: 4 files changed, 70 additions and 70 deletions

  CHANGELOG.md
  pkg/alertmanager/alertmanager_test.go
  pkg/alertmanager/state_replication.go
  pkg/alertmanager/state_replication_test.go

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
 * [CHANGE] Ruler: Remove unused CLI flags `-ruler.search-pending-for` and `-ruler.flush-period` (and their respective YAML config options). #2288
 * [ENHANCEMENT] Alertmanager: Allow the HTTP `proxy_url` configuration option in the receiver's configuration. #2317
 * [BUGFIX] Compactor: log the actual error on compaction failed. #2261
+* [BUGFIX] Alertmanager: restore state from storage even when running a single replica. #2293
 
 ### Mixin
 
pkg/alertmanager/alertmanager_test.go

Lines changed: 2 additions & 0 deletions
@@ -65,6 +65,7 @@ func createAlertmanagerAndSendAlerts(t *testing.T, alertGroups, groupsLimit, exp
 		TenantDataDir:     t.TempDir(),
 		ExternalURL:       &url.URL{Path: "/am"},
 		ShardingEnabled:   true,
+		Store:             prepareInMemoryAlertStore(),
 		Replicator:        &stubReplicator{},
 		ReplicationFactor: 1,
 		// We have to set this interval non-zero, though we don't need the persister to do anything.
@@ -148,6 +149,7 @@ func TestDispatcherLoggerInsightKey(t *testing.T) {
 		TenantDataDir:     t.TempDir(),
 		ExternalURL:       &url.URL{Path: "/am"},
 		ShardingEnabled:   true,
+		Store:             prepareInMemoryAlertStore(),
 		Replicator:        &stubReplicator{},
 		ReplicationFactor: 1,
 		PersisterConfig:   PersisterConfig{Interval: time.Hour},

pkg/alertmanager/state_replication.go

Lines changed: 25 additions & 27 deletions
@@ -199,38 +199,36 @@ func (s *state) starting(ctx context.Context) error {
 	timer := prometheus.NewTimer(s.initialSyncDuration)
 	defer timer.ObserveDuration()
 
-	level.Info(s.logger).Log("msg", "Waiting for notification and silences to settle...")
-
-	// If the replication factor is <= 1, there is nowhere to obtain the state from.
-	if s.replicationFactor <= 1 {
-		level.Info(s.logger).Log("msg", "skipping settling (no replicas)")
-		return nil
-	}
-
-	// We can check other alertmanager(s) and explicitly ask them to propagate their state to us if available.
-	readCtx, cancel := context.WithTimeout(ctx, s.settleReadTimeout)
-	defer cancel()
+	// If replication factor is > 1 attempt to read state from other replicas, falling back to reading from
+	// storage if they are unavailable.
+	if s.replicationFactor > 1 {
+		level.Info(s.logger).Log("msg", "Waiting for notification and silences to settle...")
+
+		// We can check other alertmanager(s) and explicitly ask them to propagate their state to us if available.
+		readCtx, cancel := context.WithTimeout(ctx, s.settleReadTimeout)
+		defer cancel()
+
+		s.fetchReplicaStateTotal.Inc()
+		fullStates, err := s.replicator.ReadFullStateForUser(readCtx, s.userID)
+		if err == nil {
+			if err = s.mergeFullStates(fullStates); err == nil {
+				level.Info(s.logger).Log("msg", "state settled; proceeding")
+				s.initialSyncCompleted.WithLabelValues(syncFromReplica).Inc()
+				return nil
+			}
+		}
 
-	s.fetchReplicaStateTotal.Inc()
-	fullStates, err := s.replicator.ReadFullStateForUser(readCtx, s.userID)
-	if err == nil {
-		if err = s.mergeFullStates(fullStates); err == nil {
-			level.Info(s.logger).Log("msg", "state settled; proceeding")
-			s.initialSyncCompleted.WithLabelValues(syncFromReplica).Inc()
-			return nil
+		// The user not being found in all of the replicas is not recorded as a failure, as this is
+		// expected when this is the first replica to come up for a user. Note that it is important
+		// to continue and try to read from the state from remote storage, as the replicas may have
+		// lost state due to an all-replica restart.
+		if err != errAllReplicasUserNotFound {
+			s.fetchReplicaStateFailed.Inc()
 		}
-	}
 
-	// The user not being found in all of the replicas is not recorded as a failure, as this is
-	// expected when this is the first replica to come up for a user. Note that it is important
-	// to continue and try to read from the state from remote storage, as the replicas may have
-	// lost state due to an all-replica restart.
-	if err != errAllReplicasUserNotFound {
-		s.fetchReplicaStateFailed.Inc()
+		level.Info(s.logger).Log("msg", "state not settled; trying to read from storage", "err", err)
 	}
 
-	level.Info(s.logger).Log("msg", "state not settled; trying to read from storage", "err", err)
-
 	// Attempt to read the state from persistent storage instead.
 	storeReadCtx, cancel := context.WithTimeout(ctx, s.storeReadTimeout)
 	defer cancel()
pkg/alertmanager/state_replication_test.go

Lines changed: 42 additions & 43 deletions
@@ -96,31 +96,39 @@ func newFakeAlertStore() *fakeAlertStore {
 	}
 }
 
-func (f *fakeAlertStore) GetFullState(ctx context.Context, user string) (alertspb.FullStateDesc, error) {
+func (f *fakeAlertStore) GetFullState(_ context.Context, user string) (alertspb.FullStateDesc, error) {
 	if result, ok := f.states[user]; ok {
 		return result, nil
 	}
 	return alertspb.FullStateDesc{}, alertspb.ErrNotFound
 }
 
+func (f *fakeAlertStore) SetFullState(_ context.Context, user string, state alertspb.FullStateDesc) error {
+	f.states[user] = state
+	return nil
+}
+
 func TestStateReplication(t *testing.T) {
 	tc := []struct {
-		name              string
-		replicationFactor int
-		message           *clusterpb.Part
-		results           map[string]*clusterpb.Part
+		name               string
+		replicationFactor  int
+		message            *clusterpb.Part
+		replicationResults map[string]clusterpb.Part
+		storeResults       map[string]clusterpb.Part
 	}{
 		{
-			name:              "with a replication factor of <= 1, state is not replicated.",
-			replicationFactor: 1,
-			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
-			results:           map[string]*clusterpb.Part{},
+			name:               "with a replication factor of <= 1, state is not replicated but loaded from storage.",
+			replicationFactor:  1,
+			message:            &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
+			replicationResults: map[string]clusterpb.Part{},
+			storeResults:       map[string]clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
 		},
 		{
-			name:              "with a replication factor of > 1, state is broadcasted for replication.",
-			replicationFactor: 3,
-			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
-			results:           map[string]*clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
+			name:               "with a replication factor of > 1, state is broadcasted for replication.",
+			replicationFactor:  3,
+			message:            &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
+			replicationResults: map[string]clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
+			storeResults:       map[string]clusterpb.Part{},
 		},
 	}
 
@@ -129,9 +137,15 @@ func TestStateReplication(t *testing.T) {
 			reg := prometheus.NewPedanticRegistry()
 			replicator := newFakeReplicator()
 			replicator.read = readStateResult{res: nil, err: nil}
+
 			store := newFakeAlertStore()
-			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
+			for user, part := range tt.storeResults {
+				require.NoError(t, store.SetFullState(context.Background(), user, alertspb.FullStateDesc{
+					State: &clusterpb.FullState{Parts: []clusterpb.Part{part}},
+				}))
+			}
 
+			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
 			require.False(t, s.Ready())
 			{
 				ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
@@ -161,47 +175,32 @@
 			require.Eventually(t, func() bool {
 				replicator.mtx.Lock()
 				defer replicator.mtx.Unlock()
-				return len(replicator.results) == len(tt.results)
+				return len(replicator.results) == len(tt.replicationResults)
 			}, time.Second, time.Millisecond)
 
 			if tt.replicationFactor > 1 {
+				// If the replication factor is greater than 1, we expect state to be loaded from other Alertmanagers
 				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
-# HELP alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
-# TYPE alertmanager_state_fetch_replica_state_failed_total counter
-alertmanager_state_fetch_replica_state_failed_total 0
-# HELP alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
-# TYPE alertmanager_state_fetch_replica_state_total counter
-alertmanager_state_fetch_replica_state_total 1
-# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
-# TYPE alertmanager_partial_state_merges_failed_total counter
-alertmanager_partial_state_merges_failed_total{key="nflog"} 0
-# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
-# TYPE alertmanager_partial_state_merges_total counter
-alertmanager_partial_state_merges_total{key="nflog"} 0
 # HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
 # TYPE alertmanager_state_initial_sync_completed_total counter
 alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
 alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 1
 alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 0
 alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
-# HELP alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or remote storage.
-# TYPE alertmanager_state_initial_sync_total counter
-alertmanager_state_initial_sync_total 1
-# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
-# TYPE alertmanager_state_replication_failed_total counter
-alertmanager_state_replication_failed_total{key="nflog"} 0
-# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
-# TYPE alertmanager_state_replication_total counter
-alertmanager_state_replication_total{key="nflog"} 1
 `),
-					"alertmanager_state_fetch_replica_state_failed_total",
-					"alertmanager_state_fetch_replica_state_total",
-					"alertmanager_partial_state_merges_failed_total",
-					"alertmanager_partial_state_merges_total",
 					"alertmanager_state_initial_sync_completed_total",
-					"alertmanager_state_initial_sync_total",
-					"alertmanager_state_replication_failed_total",
-					"alertmanager_state_replication_total",
+				))
+			} else {
+				// Replication factor is 1, we expect state to be loaded from storage *instead* of other Alertmanagers
+				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
+# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
+# TYPE alertmanager_state_initial_sync_completed_total counter
+alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
+alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 0
+alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 1
+alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
+`),
+					"alertmanager_state_initial_sync_completed_total",
 				))
 
 			}