@@ -29,6 +29,8 @@ import (
 	"github.com/grafana/mimir/pkg/alertmanager/alertstore"
 )
 
+const testUserID = "user-1"
+
 type fakeState struct {
 	binary []byte
 	merges [][]byte
@@ -73,8 +75,8 @@ func (f *fakeReplicator) GetPositionForUser(_ string) int {
 }
 
 func (f *fakeReplicator) ReadFullStateForUser(ctx context.Context, userID string) ([]*clusterpb.FullState, error) {
-	if userID != "user-1" {
-		return nil, errors.New("Unexpected userID")
+	if userID != testUserID {
+		return nil, errors.New("unexpected userID")
 	}
 
 	if f.read.blocking {
@@ -96,31 +98,39 @@ func newFakeAlertStore() *fakeAlertStore {
 	}
 }
 
-func (f *fakeAlertStore) GetFullState(ctx context.Context, user string) (alertspb.FullStateDesc, error) {
+func (f *fakeAlertStore) GetFullState(_ context.Context, user string) (alertspb.FullStateDesc, error) {
 	if result, ok := f.states[user]; ok {
 		return result, nil
 	}
 	return alertspb.FullStateDesc{}, alertspb.ErrNotFound
 }
 
+func (f *fakeAlertStore) SetFullState(_ context.Context, user string, state alertspb.FullStateDesc) error {
+	f.states[user] = state
+	return nil
+}
+
 func TestStateReplication(t *testing.T) {
 	tc := []struct {
-		name              string
-		replicationFactor int
-		message           *clusterpb.Part
-		results           map[string]*clusterpb.Part
+		name               string
+		replicationFactor  int
+		message            *clusterpb.Part
+		replicationResults map[string]clusterpb.Part
+		storeResults       map[string]clusterpb.Part
 	}{
 		{
-			name:              "with a replication factor of <= 1, state is not replicated.",
-			replicationFactor: 1,
-			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
-			results:           map[string]*clusterpb.Part{},
+			name:               "with a replication factor of <= 1, state is not replicated but loaded from storage.",
+			replicationFactor:  1,
+			message:            &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
+			replicationResults: map[string]clusterpb.Part{},
+			storeResults:       map[string]clusterpb.Part{testUserID: {Key: "nflog", Data: []byte("OK")}},
 		},
 		{
-			name:              "with a replication factor of > 1, state is broadcasted for replication.",
-			replicationFactor: 3,
-			message:           &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
-			results:           map[string]*clusterpb.Part{"user-1": {Key: "nflog", Data: []byte("OK")}},
+			name:               "with a replication factor of > 1, state is broadcasted for replication.",
+			replicationFactor:  3,
+			message:            &clusterpb.Part{Key: "nflog", Data: []byte("OK")},
+			replicationResults: map[string]clusterpb.Part{testUserID: {Key: "nflog", Data: []byte("OK")}},
+			storeResults:       map[string]clusterpb.Part{},
 		},
 	}
 
@@ -129,9 +139,15 @@ func TestStateReplication(t *testing.T) {
 			reg := prometheus.NewPedanticRegistry()
 			replicator := newFakeReplicator()
 			replicator.read = readStateResult{res: nil, err: nil}
+
 			store := newFakeAlertStore()
-			s := newReplicatedStates("user-1", tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
+			for user, part := range tt.storeResults {
+				require.NoError(t, store.SetFullState(context.Background(), user, alertspb.FullStateDesc{
+					State: &clusterpb.FullState{Parts: []clusterpb.Part{part}},
+				}))
+			}
 
+			s := newReplicatedStates(testUserID, tt.replicationFactor, replicator, store, log.NewNopLogger(), reg)
 			require.False(t, s.Ready())
 			{
 				ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
@@ -161,47 +177,32 @@ func TestStateReplication(t *testing.T) {
 			require.Eventually(t, func() bool {
 				replicator.mtx.Lock()
 				defer replicator.mtx.Unlock()
-				return len(replicator.results) == len(tt.results)
+				return len(replicator.results) == len(tt.replicationResults)
 			}, time.Second, time.Millisecond)
 
 			if tt.replicationFactor > 1 {
+				// If the replication factor is greater than 1, we expect state to be loaded from other Alertmanagers
 				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
-	# HELP alertmanager_state_fetch_replica_state_failed_total Number of times we have failed to read and merge the full state from another replica.
-	# TYPE alertmanager_state_fetch_replica_state_failed_total counter
-	alertmanager_state_fetch_replica_state_failed_total 0
-	# HELP alertmanager_state_fetch_replica_state_total Number of times we have tried to read and merge the full state from another replica.
-	# TYPE alertmanager_state_fetch_replica_state_total counter
-	alertmanager_state_fetch_replica_state_total 1
-	# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
-	# TYPE alertmanager_partial_state_merges_failed_total counter
-	alertmanager_partial_state_merges_failed_total{key="nflog"} 0
-	# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
-	# TYPE alertmanager_partial_state_merges_total counter
-	alertmanager_partial_state_merges_total{key="nflog"} 0
 	# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
 	# TYPE alertmanager_state_initial_sync_completed_total counter
 	alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
 	alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 1
 	alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 0
 	alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
-	# HELP alertmanager_state_initial_sync_total Number of times we have tried to sync initial state from peers or remote storage.
-	# TYPE alertmanager_state_initial_sync_total counter
-	alertmanager_state_initial_sync_total 1
-	# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
-	# TYPE alertmanager_state_replication_failed_total counter
-	alertmanager_state_replication_failed_total{key="nflog"} 0
-	# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
-	# TYPE alertmanager_state_replication_total counter
-	alertmanager_state_replication_total{key="nflog"} 1
 `),
-					"alertmanager_state_fetch_replica_state_failed_total",
-					"alertmanager_state_fetch_replica_state_total",
-					"alertmanager_partial_state_merges_failed_total",
-					"alertmanager_partial_state_merges_total",
 					"alertmanager_state_initial_sync_completed_total",
-					"alertmanager_state_initial_sync_total",
-					"alertmanager_state_replication_failed_total",
-					"alertmanager_state_replication_total",
+				))
+			} else {
+				// Replication factor is 1, we expect state to be loaded from storage *instead* of other Alertmanagers
+				assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
+	# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
+	# TYPE alertmanager_state_initial_sync_completed_total counter
+	alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
+	alertmanager_state_initial_sync_completed_total{outcome="from-replica"} 0
+	alertmanager_state_initial_sync_completed_total{outcome="from-storage"} 1
+	alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
+`),
+					"alertmanager_state_initial_sync_completed_total",
 				))
 			}
 