Skip to content

Commit 68048b7

Browse files
alanprot, stevesg, and jtlisi
authored
Implementing Zone aware sharding for alert manager (#4204)
* Implementing Zone aware sharding for alert manager
  Signed-off-by: Alan Protasio <[email protected]>
* Update pkg/alertmanager/multitenant_test.go
  Co-authored-by: Steve Simpson <[email protected]>
  Signed-off-by: Alan Protasio <[email protected]>
* Update CHANGELOG.md
  Co-authored-by: Steve Simpson <[email protected]>
  Signed-off-by: Alan Protasio <[email protected]>
* Addressing comments
  Signed-off-by: Alan Protasio <[email protected]>
* Update pkg/alertmanager/multitenant.go
  Co-authored-by: Jacob Lisi <[email protected]>
  Signed-off-by: Alan Protasio <[email protected]>
* Update pkg/alertmanager/alertmanager_ring.go
  Co-authored-by: Jacob Lisi <[email protected]>
  Signed-off-by: Alan Protasio <[email protected]>
* Update pkg/alertmanager/multitenant.go
  Co-authored-by: Jacob Lisi <[email protected]>
  Signed-off-by: Alan Protasio <[email protected]>
* fix lint issue
  Signed-off-by: Alan Protasio <[email protected]>

Co-authored-by: Steve Simpson <[email protected]>
Co-authored-by: Jacob Lisi <[email protected]>
1 parent 89a0232 commit 68048b7

File tree

5 files changed

+107
-7
lines changed

5 files changed

+107
-7
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* `cortex_alertmanager_state_persist_failed_total`
2222
* [ENHANCEMENT] Blocks storage: support ingesting exemplars. Enabled by setting new CLI flag `-blocks-storage.tsdb.max-exemplars=<n>` or config option `blocks_storage.tsdb.max_exemplars` to positive value. #4124
2323
* [ENHANCEMENT] Distributor: Added distributors ring status section in the admin page. #4151
24+
* [ENHANCEMENT] Added zone-awareness support to alertmanager for use when sharding is enabled. When zone-awareness is enabled, alerts will be replicated across availability zones. #4204
2425
* [ENHANCEMENT] Added `tenant_ids` tag to tracing spans #4147
2526
* [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128
2627
* [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176

docs/configuration/config-file-reference.md

+10
Original file line numberDiff line numberDiff line change
@@ -1903,10 +1903,20 @@ sharding_ring:
19031903
# CLI flag: -alertmanager.sharding-ring.replication-factor
19041904
[replication_factor: <int> | default = 3]
19051905
1906+
# True to enable zone-awareness and replicate alerts across different
1907+
# availability zones.
1908+
# CLI flag: -alertmanager.sharding-ring.zone-awareness-enabled
1909+
[zone_awareness_enabled: <boolean> | default = false]
1910+
19061911
# Name of network interface to read address from.
19071912
# CLI flag: -alertmanager.sharding-ring.instance-interface-names
19081913
[instance_interface_names: <list of string> | default = [eth0 en0]]
19091914
1915+
# The availability zone where this instance is running. Required if
1916+
# zone-awareness is enabled.
1917+
# CLI flag: -alertmanager.sharding-ring.instance-availability-zone
1918+
[instance_availability_zone: <string> | default = ""]
1919+
19101920
# Filename of fallback config to use if none specified for instance.
19111921
# CLI flag: -alertmanager.configs.fallback
19121922
[fallback_config_file: <string> | default = ""]

pkg/alertmanager/alertmanager_ring.go

+10-4
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,18 @@ var SyncRingOp = ring.NewOp([]ring.InstanceState{ring.ACTIVE, ring.JOINING}, fun
4242
// is used to strip down the config to the minimum, and avoid confusion
4343
// to the user.
4444
type RingConfig struct {
45-
KVStore kv.Config `yaml:"kvstore" doc:"description=The key-value store used to share the hash ring across multiple instances."`
46-
HeartbeatPeriod time.Duration `yaml:"heartbeat_period"`
47-
HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout"`
48-
ReplicationFactor int `yaml:"replication_factor"`
45+
KVStore kv.Config `yaml:"kvstore" doc:"description=The key-value store used to share the hash ring across multiple instances."`
46+
HeartbeatPeriod time.Duration `yaml:"heartbeat_period"`
47+
HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout"`
48+
ReplicationFactor int `yaml:"replication_factor"`
49+
ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"`
4950

5051
// Instance details
5152
InstanceID string `yaml:"instance_id" doc:"hidden"`
5253
InstanceInterfaceNames []string `yaml:"instance_interface_names"`
5354
InstancePort int `yaml:"instance_port" doc:"hidden"`
5455
InstanceAddr string `yaml:"instance_addr" doc:"hidden"`
56+
InstanceZone string `yaml:"instance_availability_zone"`
5557

5658
// Injected internally
5759
ListenPort int `yaml:"-"`
@@ -77,13 +79,15 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
7779
f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.")
7880
f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring.")
7981
f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.")
82+
f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.")
8083

8184
// Instance flags
8285
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
8386
f.Var((*flagext.StringSlice)(&cfg.InstanceInterfaceNames), rfprefix+"instance-interface-names", "Name of network interface to read address from.")
8487
f.StringVar(&cfg.InstanceAddr, rfprefix+"instance-addr", "", "IP address to advertise in the ring.")
8588
f.IntVar(&cfg.InstancePort, rfprefix+"instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).")
8689
f.StringVar(&cfg.InstanceID, rfprefix+"instance-id", hostname, "Instance ID to register in the ring.")
90+
f.StringVar(&cfg.InstanceZone, rfprefix+"instance-availability-zone", "", "The availability zone where this instance is running. Required if zone-awareness is enabled.")
8791

8892
cfg.RingCheckPeriod = 5 * time.Second
8993
}
@@ -103,6 +107,7 @@ func (cfg *RingConfig) ToLifecyclerConfig() (ring.BasicLifecyclerConfig, error)
103107
Addr: fmt.Sprintf("%s:%d", instanceAddr, instancePort),
104108
HeartbeatPeriod: cfg.HeartbeatPeriod,
105109
TokensObservePeriod: 0,
110+
Zone: cfg.InstanceZone,
106111
NumTokens: RingNumTokens,
107112
}, nil
108113
}
@@ -114,6 +119,7 @@ func (cfg *RingConfig) ToRingConfig() ring.Config {
114119
rc.KVStore = cfg.KVStore
115120
rc.HeartbeatTimeout = cfg.HeartbeatTimeout
116121
rc.ReplicationFactor = cfg.ReplicationFactor
122+
rc.ZoneAwarenessEnabled = cfg.ZoneAwarenessEnabled
117123

118124
return rc
119125
}

pkg/alertmanager/multitenant.go

+7-3
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,10 @@ const (
8686
var (
8787
statusTemplate *template.Template
8888

89-
errInvalidExternalURL = errors.New("the configured external URL is invalid: should not end with /")
90-
errShardingLegacyStorage = errors.New("deprecated -alertmanager.storage.* not supported with -alertmanager.sharding-enabled, use -alertmanager-storage.*")
91-
errShardingUnsupportedStorage = errors.New("the configured alertmanager storage backend is not supported when sharding is enabled")
89+
errInvalidExternalURL = errors.New("the configured external URL is invalid: should not end with /")
90+
errShardingLegacyStorage = errors.New("deprecated -alertmanager.storage.* not supported with -alertmanager.sharding-enabled, use -alertmanager-storage.*")
91+
errShardingUnsupportedStorage = errors.New("the configured alertmanager storage backend is not supported when sharding is enabled")
92+
errZoneAwarenessEnabledWithoutZoneInfo = errors.New("the configured alertmanager has zone awareness enabled but zone is not set")
9293
)
9394

9495
func init() {
@@ -197,6 +198,9 @@ func (cfg *MultitenantAlertmanagerConfig) Validate(storageCfg alertstore.Config)
197198
if !storageCfg.IsFullStateSupported() {
198199
return errShardingUnsupportedStorage
199200
}
201+
if cfg.ShardingRing.ZoneAwarenessEnabled && cfg.ShardingRing.InstanceZone == "" {
202+
return errZoneAwarenessEnabledWithoutZoneInfo
203+
}
200204
}
201205

202206
return nil

pkg/alertmanager/multitenant_test.go

+79
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,13 @@ func TestMultitenantAlertmanagerConfig_Validate(t *testing.T) {
153153
},
154154
expected: errShardingLegacyStorage,
155155
},
156+
"should fail if zone aware is enabled but zone is not set": {
157+
setup: func(t *testing.T, cfg *MultitenantAlertmanagerConfig, storageCfg *alertstore.Config) {
158+
cfg.ShardingEnabled = true
159+
cfg.ShardingRing.ZoneAwarenessEnabled = true
160+
},
161+
expected: errZoneAwarenessEnabledWithoutZoneInfo,
162+
},
156163
}
157164

158165
for testName, testData := range tests {
@@ -601,6 +608,78 @@ func TestMultitenantAlertmanager_deleteUnusedLocalUserState(t *testing.T) {
601608
require.NotZero(t, dirs[user2]) // has config, files survived
602609
}
603610

611+
func TestMultitenantAlertmanager_zoneAwareSharding(t *testing.T) {
612+
ctx := context.Background()
613+
alertStore := prepareInMemoryAlertStore()
614+
ringStore := consul.NewInMemoryClient(ring.GetCodec())
615+
const (
616+
user1 = "user1"
617+
user2 = "user2"
618+
user3 = "user3"
619+
)
620+
621+
createInstance := func(i int, zone string, registries *util.UserRegistries) *MultitenantAlertmanager {
622+
reg := prometheus.NewPedanticRegistry()
623+
cfg := mockAlertmanagerConfig(t)
624+
instanceID := fmt.Sprintf("instance-%d", i)
625+
registries.AddUserRegistry(instanceID, reg)
626+
627+
cfg.ShardingRing.ReplicationFactor = 2
628+
cfg.ShardingRing.InstanceID = instanceID
629+
cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.1-%d", i)
630+
cfg.ShardingEnabled = true
631+
cfg.ShardingRing.ZoneAwarenessEnabled = true
632+
cfg.ShardingRing.InstanceZone = zone
633+
634+
am, err := createMultitenantAlertmanager(cfg, nil, nil, alertStore, ringStore, nil, log.NewLogfmtLogger(os.Stdout), reg)
635+
require.NoError(t, err)
636+
t.Cleanup(func() {
637+
require.NoError(t, services.StopAndAwaitTerminated(ctx, am))
638+
})
639+
require.NoError(t, services.StartAndAwaitRunning(ctx, am))
640+
641+
return am
642+
}
643+
644+
registriesZoneA := util.NewUserRegistries()
645+
registriesZoneB := util.NewUserRegistries()
646+
647+
am1ZoneA := createInstance(1, "zoneA", registriesZoneA)
648+
am2ZoneA := createInstance(2, "zoneA", registriesZoneA)
649+
am1ZoneB := createInstance(3, "zoneB", registriesZoneB)
650+
651+
{
652+
require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{
653+
User: user1,
654+
RawConfig: simpleConfigOne,
655+
Templates: []*alertspb.TemplateDesc{},
656+
}))
657+
require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{
658+
User: user2,
659+
RawConfig: simpleConfigOne,
660+
Templates: []*alertspb.TemplateDesc{},
661+
}))
662+
require.NoError(t, alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{
663+
User: user3,
664+
RawConfig: simpleConfigOne,
665+
Templates: []*alertspb.TemplateDesc{},
666+
}))
667+
668+
err := am1ZoneA.loadAndSyncConfigs(context.Background(), reasonPeriodic)
669+
require.NoError(t, err)
670+
err = am2ZoneA.loadAndSyncConfigs(context.Background(), reasonPeriodic)
671+
require.NoError(t, err)
672+
err = am1ZoneB.loadAndSyncConfigs(context.Background(), reasonPeriodic)
673+
require.NoError(t, err)
674+
}
675+
676+
metricsZoneA := registriesZoneA.BuildMetricFamiliesPerUser()
677+
metricsZoneB := registriesZoneB.BuildMetricFamiliesPerUser()
678+
679+
assert.Equal(t, float64(3), metricsZoneA.GetSumOfGauges("cortex_alertmanager_tenants_owned"))
680+
assert.Equal(t, float64(3), metricsZoneB.GetSumOfGauges("cortex_alertmanager_tenants_owned"))
681+
}
682+
604683
func TestMultitenantAlertmanager_deleteUnusedRemoteUserState(t *testing.T) {
605684
ctx := context.Background()
606685

0 commit comments

Comments
 (0)