@@ -96,31 +96,39 @@ func newFakeAlertStore() *fakeAlertStore {
 	}
 }
 
-func (f *fakeAlertStore) GetFullState(ctx context.Context, user string) (alertspb.FullStateDesc, error) {
+func (f *fakeAlertStore) GetFullState(_ context.Context, user string) (alertspb.FullStateDesc, error) {
 	if result, ok := f.states[user]; ok {
 		return result, nil
 	}
 	return alertspb.FullStateDesc{}, alertspb.ErrNotFound
 }
 
+func (f *fakeAlertStore) SetFullState(_ context.Context, user string, state alertspb.FullStateDesc) error {
+	f.states[user] = state
+	return nil
+}
+
 func TestStateReplication(t *testing.T) {
 	tc := []struct {
-		name              string
-		replicationFactor int
-		message           *clusterpb.Part
-		results           map[string]*clusterpb.Part
+		name               string
+		replicationFactor  int
+		message            *clusterpb.Part
+		replicationResults map[string]clusterpb.Part
+		storeResults       map[string]clusterpb.Part
 	}{
 		{
-			name:              "with a replication factor of <= 1, state is not replicated.",
-			replicationFactor: 1,
-			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
-			results:           map[string]*clusterpb.Part{},
+			name:               "with a replication factor of <= 1, state is not replicated but loaded from storage.",
+			replicationFactor:  1,
+			message:            &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
+			replicationResults: map[string]clusterpb.Part{},
+			storeResults:       map[string]clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
 		},
 		{
-			name:              "with a replication factor of > 1, state is broadcasted for replication.",
-			replicationFactor: 3,
-			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
-			results:           map[string]*clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
+			name:               "with a replication factor of > 1, state is broadcasted for replication.",
+			replicationFactor:  3,
+			message:            &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
+			replicationResults: map[string]clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
+			storeResults:       map[string]clusterpb.Part{},
 		},
 	}
 
@@ -129,9 +137,15 @@ func TestStateReplication(t *testing.T) {
 			reg := prometheus.NewPedanticRegistry()
 			replicator := newFakeReplicator()
 			replicator.read = readStateResult{res: nil, err: nil}
+
 			store := newFakeAlertStore()
-			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
+			for user, part := range tt.storeResults {
+				require.NoError(t, store.SetFullState(context.Background(), user, alertspb.FullStateDesc{
+					State: &clusterpb.FullState{Parts: []clusterpb.Part{part}},
+				}))
+			}
 
+			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
 			require.False(t, s.Ready())
 			{
 				ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
@@ -161,47 +175,32 @@ func TestStateReplication(t *testing.T) {
 			require.Eventually(t, func() bool {
 				replicator.mtx.Lock()
 				defer replicator.mtx.Unlock()
-				return len(replicator.results) == len(tt.results)
+				return len(replicator.results) == len(tt.replicationResults)
 			}, time.Second, time.Millisecond)
 
 			if tt.replicationFactor > 1 {
+				// If the replication factor is greater than 1, we expect state to be loaded from other Alertmanagers
 				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
-	# HELP alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
-	# TYPE alertmanager_state_fetch_replica_state_failed_total counter
-	alertmanager_state_fetch_replica_state_failed_total 0
-	# HELP alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
-	# TYPE alertmanager_state_fetch_replica_state_total counter
-	alertmanager_state_fetch_replica_state_total 1
-	# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
-	# TYPE alertmanager_partial_state_merges_failed_total counter
-	alertmanager_partial_state_merges_failed_total{key="nflog"} 0
-	# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
-	# TYPE alertmanager_partial_state_merges_total counter
-	alertmanager_partial_state_merges_total{key="nflog"} 0
 	# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
 	# TYPE alertmanager_state_initial_sync_completed_total counter
 	alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
 	alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 1
 	alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 0
 	alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
-	# HELP alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or remote storage.
-	# TYPE alertmanager_state_initial_sync_total counter
-	alertmanager_state_initial_sync_total 1
-	# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
-	# TYPE alertmanager_state_replication_failed_total counter
-	alertmanager_state_replication_failed_total{key="nflog"} 0
-	# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
-	# TYPE alertmanager_state_replication_total counter
-	alertmanager_state_replication_total{key="nflog"} 1
 	`),
-					"alertmanager_state_fetch_replica_state_failed_total",
-					"alertmanager_state_fetch_replica_state_total",
-					"alertmanager_partial_state_merges_failed_total",
-					"alertmanager_partial_state_merges_total",
 					"alertmanager_state_initial_sync_completed_total",
-					"alertmanager_state_initial_sync_total",
-					"alertmanager_state_replication_failed_total",
-					"alertmanager_state_replication_total",
+				))
+			} else {
+				// Replication factor is 1, we expect state to be loaded from storage *instead* of other Alertmanagers
+				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
+	# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
+	# TYPE alertmanager_state_initial_sync_completed_total counter
+	alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
+	alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 0
+	alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 1
+	alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
+	`),
+					"alertmanager_state_initial_sync_completed_total",
 				))
 			}
 		})