Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alertmanager template limits. #4223

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* [FEATURE] Querier: Added new `-querier.max-fetched-series-per-query` flag. When Cortex is running with blocks storage, the max series per query limit is enforced in the querier and applies to unique series received from ingesters and store-gateway (long-term storage). #4179
* [FEATURE] Alertmanager: Added rate-limits to notifiers. Rate limits used by all integrations can be configured using `-alertmanager.notification-rate-limit`, while per-integration rate limits can be specified via `-alertmanager.notification-rate-limit-per-integration` parameter. Both shared and per-integration limits can be overwritten using overrides mechanism. These limits are applied on individual (per-tenant) alertmanagers. Rate-limited notifications are failed notifications. It is possible to monitor rate-limited notifications via new `cortex_alertmanager_notification_rate_limited_total` metric. #4135 #4163
* [FEATURE] Alertmanager: Added `-alertmanager.max-config-size-bytes` limit to control size of configuration files that Cortex users can upload to Alertmanager via API. This limit is configurable per-tenant. #4201
* [FEATURE] Alertmanager: Added `-alertmanager.max-templates-count` and `-alertmanager.max-template-size-bytes` options to control number and size of templates uploaded to Alertmanager via API. These limits are configurable per-tenant. #4223
* [FEATURE] Added flag `-debug.block-profile-rate` to enable goroutine blocking events profiling. #4217
* [ENHANCEMENT] Alertmanager: introduced new metrics to monitor operation when using `-alertmanager.sharding-enabled`: #4149
* `cortex_alertmanager_state_fetch_replica_state_total`
Expand Down
10 changes: 10 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -4133,6 +4133,16 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
# Alertmanager API. 0 = no limit.
# CLI flag: -alertmanager.max-config-size-bytes
[alertmanager_max_config_size_bytes: <int> | default = 0]

# Maximum number of templates in tenant's Alertmanager configuration uploaded
# via Alertmanager API. 0 = no limit.
# CLI flag: -alertmanager.max-templates-count
[alertmanager_max_templates_count: <int> | default = 0]

# Maximum size of single template in tenant's Alertmanager configuration
# uploaded via Alertmanager API. 0 = no limit.
# CLI flag: -alertmanager.max-template-size-bytes
[alertmanager_max_template_size_bytes: <int> | default = 0]
```

### `redis_config`
Expand Down
19 changes: 17 additions & 2 deletions pkg/alertmanager/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ const (
errNoOrgID = "unable to determine the OrgID"
errListAllUser = "unable to list the Alertmanager users"
errConfigurationTooBig = "Alertmanager configuration is too big, limit: %d bytes"
errTooManyTemplates = "too many templates in the configuration: %d (limit: %d)"
errTemplateTooBig = "template %s is too big: %d bytes (limit: %d bytes)"

fetchConcurrency = 16
)
Expand Down Expand Up @@ -133,7 +135,7 @@ func (am *MultitenantAlertmanager) SetUserConfig(w http.ResponseWriter, r *http.
}

cfgDesc := alertspb.ToProto(cfg.AlertmanagerConfig, cfg.TemplateFiles, userID)
if err := validateUserConfig(logger, cfgDesc); err != nil {
if err := validateUserConfig(logger, cfgDesc, am.limits, userID); err != nil {
level.Warn(logger).Log("msg", errValidatingConfig, "err", err.Error())
http.Error(w, fmt.Sprintf("%s: %s", errValidatingConfig, err.Error()), http.StatusBadRequest)
return
Expand Down Expand Up @@ -171,7 +173,7 @@ func (am *MultitenantAlertmanager) DeleteUserConfig(w http.ResponseWriter, r *ht
}

// Partially copied from: https://github.com/prometheus/alertmanager/blob/8e861c646bf67599a1704fc843c6a94d519ce312/cli/check_config.go#L65-L96
func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error {
func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc, limits Limits, user string) error {
// We don't have a valid use case for empty configurations. If a tenant does not have a
// configuration set and issues a request to the Alertmanager, we'll a) upload an empty
// config and b) immediately start an Alertmanager instance for them if a fallback
Expand All @@ -197,6 +199,19 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error {
}
}

// Check template limits.
if l := limits.AlertmanagerMaxTemplatesCount(user); l > 0 && len(cfg.Templates) > l {
return fmt.Errorf(errTooManyTemplates, len(cfg.Templates), l)
}

if maxSize := limits.AlertmanagerMaxTemplateSize(user); maxSize > 0 {
for _, tmpl := range cfg.Templates {
if size := len(tmpl.GetBody()); size > maxSize {
return fmt.Errorf(errTemplateTooBig, tmpl.GetFilename(), size, maxSize)
}
}
}

// Validate template files.
for _, tmpl := range cfg.Templates {
if err := validateTemplateFilename(tmpl.Filename); err != nil {
Expand Down
74 changes: 71 additions & 3 deletions pkg/alertmanager/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,11 @@ import (

func TestAMConfigValidationAPI(t *testing.T) {
testCases := []struct {
name string
cfg string
maxConfigSize int
name string
cfg string
maxConfigSize int
maxTemplates int
maxTemplateSize int

response string
err error
Expand Down Expand Up @@ -486,6 +488,70 @@ alertmanager_config: |
maxConfigSize: 1000,
err: nil,
},
{
name: "templates limit reached",
cfg: `
alertmanager_config: |
route:
receiver: 'default-receiver'
receivers:
- name: default-receiver
template_files:
"t1.tmpl": "Some template"
"t2.tmpl": "Some template"
"t3.tmpl": "Some template"
"t4.tmpl": "Some template"
"t5.tmpl": "Some template"
`,
maxTemplates: 3,
err: errors.Wrap(fmt.Errorf(errTooManyTemplates, 5, 3), "error validating Alertmanager config"),
},
{
name: "templates limit not reached",
cfg: `
alertmanager_config: |
route:
receiver: 'default-receiver'
receivers:
- name: default-receiver
template_files:
"t1.tmpl": "Some template"
"t2.tmpl": "Some template"
"t3.tmpl": "Some template"
"t4.tmpl": "Some template"
"t5.tmpl": "Some template"
`,
maxTemplates: 10,
err: nil,
},
{
name: "template size limit reached",
cfg: `
alertmanager_config: |
route:
receiver: 'default-receiver'
receivers:
- name: default-receiver
template_files:
"t1.tmpl": "Very big template"
`,
maxTemplateSize: 5,
err: errors.Wrap(fmt.Errorf(errTemplateTooBig, "t1.tmpl", 17, 5), "error validating Alertmanager config"),
},
{
name: "template size limit ok",
cfg: `
alertmanager_config: |
route:
receiver: 'default-receiver'
receivers:
- name: default-receiver
template_files:
"t1.tmpl": "Very big template"
`,
maxTemplateSize: 20,
err: nil,
},
}

limits := &mockAlertManagerLimits{}
Expand All @@ -497,6 +563,8 @@ alertmanager_config: |
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
limits.maxConfigSize = tc.maxConfigSize
limits.maxTemplatesCount = tc.maxTemplates
limits.maxSizeOfTemplate = tc.maxTemplateSize

req := httptest.NewRequest(http.MethodPost, "http://alertmanager/api/v1/alerts", bytes.NewReader([]byte(tc.cfg)))
ctx := user.InjectOrgID(req.Context(), "testing")
Expand Down
6 changes: 6 additions & 0 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,12 @@ type Limits interface {

// AlertmanagerMaxConfigSize returns max size of configuration file that user is allowed to upload. If 0, there is no limit.
AlertmanagerMaxConfigSize(tenant string) int

// AlertmanagerMaxTemplatesCount returns max number of templates that tenant can use in the configuration. 0 = no limit.
AlertmanagerMaxTemplatesCount(tenant string) int

// AlertmanagerMaxTemplateSize returns max size of individual template. 0 = no limit.
AlertmanagerMaxTemplateSize(tenant string) int
}

// A MultitenantAlertmanager manages Alertmanager instances for multiple
Expand Down
10 changes: 10 additions & 0 deletions pkg/alertmanager/multitenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1943,12 +1943,22 @@ type mockAlertManagerLimits struct {
emailNotificationRateLimit rate.Limit
emailNotificationBurst int
maxConfigSize int
maxTemplatesCount int
maxSizeOfTemplate int
}

// AlertmanagerMaxConfigSize returns the mock's maxConfigSize field. The tenant
// argument is ignored, so every tenant sees the same value.
func (m *mockAlertManagerLimits) AlertmanagerMaxConfigSize(tenant string) int {
	configSizeLimit := m.maxConfigSize
	return configSizeLimit
}

// AlertmanagerMaxTemplatesCount returns the mock's maxTemplatesCount field. The
// tenant argument is ignored, so every tenant sees the same value.
func (m *mockAlertManagerLimits) AlertmanagerMaxTemplatesCount(tenant string) int {
	templatesLimit := m.maxTemplatesCount
	return templatesLimit
}

// AlertmanagerMaxTemplateSize returns the mock's maxSizeOfTemplate field. The
// tenant argument is ignored, so every tenant sees the same value.
func (m *mockAlertManagerLimits) AlertmanagerMaxTemplateSize(tenant string) int {
	templateSizeLimit := m.maxSizeOfTemplate
	return templateSizeLimit
}

// AlertmanagerReceiversBlockCIDRNetworks is not implemented by this mock: it
// unconditionally panics with "implement me" when called.
func (m *mockAlertManagerLimits) AlertmanagerReceiversBlockCIDRNetworks(user string) []flagext.CIDR {
panic("implement me")
}
Expand Down
14 changes: 13 additions & 1 deletion pkg/util/validation/limits.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ type Limits struct {
NotificationRateLimit float64 `yaml:"alertmanager_notification_rate_limit" json:"alertmanager_notification_rate_limit"`
NotificationRateLimitPerIntegration NotificationRateLimitMap `yaml:"alertmanager_notification_rate_limit_per_integration" json:"alertmanager_notification_rate_limit_per_integration"`

AlertmanagerMaxConfigSizeBytes int `yaml:"alertmanager_max_config_size_bytes" json:"alertmanager_max_config_size_bytes"`
AlertmanagerMaxConfigSizeBytes int `yaml:"alertmanager_max_config_size_bytes" json:"alertmanager_max_config_size_bytes"`
AlertmanagerMaxTemplatesCount int `yaml:"alertmanager_max_templates_count" json:"alertmanager_max_templates_count"`
AlertmanagerMaxTemplateSizeBytes int `yaml:"alertmanager_max_template_size_bytes" json:"alertmanager_max_template_size_bytes"`
}

// RegisterFlags adds the flags required to config this to the given FlagSet
Expand Down Expand Up @@ -177,6 +179,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
}
f.Var(&l.NotificationRateLimitPerIntegration, "alertmanager.notification-rate-limit-per-integration", "Per-integration notification rate limits. Value is a map, where each key is integration name and value is a rate-limit (float). On command line, this map is given in JSON format. Rate limit has the same meaning as -alertmanager.notification-rate-limit, but only applies for specific integration. Allowed integration names: "+strings.Join(allowedIntegrationNames, ", ")+".")
f.IntVar(&l.AlertmanagerMaxConfigSizeBytes, "alertmanager.max-config-size-bytes", 0, "Maximum size of configuration file for Alertmanager that tenant can upload via Alertmanager API. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxTemplatesCount, "alertmanager.max-templates-count", 0, "Maximum number of templates in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.")
f.IntVar(&l.AlertmanagerMaxTemplateSizeBytes, "alertmanager.max-template-size-bytes", 0, "Maximum size of single template in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.")
}

// Validate the limits config and returns an error if the validation
Expand Down Expand Up @@ -587,6 +591,14 @@ func (o *Overrides) AlertmanagerMaxConfigSize(userID string) int {
return o.getOverridesForUser(userID).AlertmanagerMaxConfigSizeBytes
}

// AlertmanagerMaxTemplatesCount returns the maximum number of templates allowed
// in the Alertmanager configuration of the given user, as found in the limits
// resolved by getOverridesForUser. 0 = no limit.
func (o *Overrides) AlertmanagerMaxTemplatesCount(userID string) int {
	userLimits := o.getOverridesForUser(userID)
	return userLimits.AlertmanagerMaxTemplatesCount
}

// AlertmanagerMaxTemplateSize returns the maximum size, in bytes, of a single
// template in the Alertmanager configuration of the given user, as found in the
// limits resolved by getOverridesForUser. 0 = no limit.
func (o *Overrides) AlertmanagerMaxTemplateSize(userID string) int {
	userLimits := o.getOverridesForUser(userID)
	return userLimits.AlertmanagerMaxTemplateSizeBytes
}

func (o *Overrides) getOverridesForUser(userID string) *Limits {
if o.tenantLimits != nil {
l := o.tenantLimits.ByUserID(userID)
Expand Down