Skip to content

Commit 32f1847

Browse files
pstibrany pracucci
authored and committed
Cleanup obsolete local files for alertmanager. (cortexproject#3910)
* Cleanup obsolete local files for alertmanager. Signed-off-by: Peter Štibraný <[email protected]>
* CHANGELOG.md Signed-off-by: Peter Štibraný <[email protected]>
* Comment. Signed-off-by: Peter Štibraný <[email protected]>
* Don't ignore directories. Log error when deletion fails instead. Signed-off-by: Peter Štibraný <[email protected]>
* Address review feedback. Signed-off-by: Peter Štibraný <[email protected]>
* Move per-tenant state into tenant directory to simplify cleanup. Signed-off-by: Peter Štibraný <[email protected]>
* Move migration to separate function. Add test for migration. Fix test for deletion of unused dirs. Signed-off-by: Peter Štibraný <[email protected]>
* Store templates to correct place. Signed-off-by: Peter Štibraný <[email protected]>
* CHANGELOG.md Signed-off-by: Peter Štibraný <[email protected]>
* Verify that templates are stored properly into correct location. Signed-off-by: Peter Štibraný <[email protected]>
* Comments. Signed-off-by: Peter Štibraný <[email protected]>
* Comments. Signed-off-by: Peter Štibraný <[email protected]>
* Apply suggestions from code review Co-authored-by: Marco Pracucci <[email protected]> Signed-off-by: Peter Štibraný <[email protected]>
* Review feedback. Signed-off-by: Peter Štibraný <[email protected]>
Co-authored-by: Marco Pracucci <[email protected]>
1 parent e841b03 commit 32f1847

File tree

6 files changed

+436
-49
lines changed

6 files changed

+436
-49
lines changed

CHANGELOG.md

+2
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22

33
## master / unreleased
44

5+
* [CHANGE] Alertmanager now removes local files once the Alertmanager is no longer running for a removed or resharded user. #3910
6+
* [CHANGE] Alertmanager now stores local files in per-tenant folders. Files previously stored by Alertmanager are migrated to the new hierarchy. Support for this migration will be removed in Cortex 1.10. #3910
57
* [ENHANCEMENT] Ruler: optimized `<prefix>/api/v1/rules` and `<prefix>/api/v1/alerts` when ruler sharding is enabled. #3916
68
* [ENHANCEMENT] Ruler: added the following metrics when ruler sharding is enabled: #3916
79
* `cortex_ruler_clients`

development/tsdb-blocks-storage-s3/config/rules.yaml

+10
Original file line number | Diff line number | Diff line change
@@ -4,3 +4,13 @@ groups:
44
rules:
55
- record: up:count
66
expr: count(up)
7+
8+
- name: example2
9+
rules:
10+
- alert: TooManyServices
11+
expr: count(up) > 1
12+
for: 1m
13+
labels:
14+
severity: page
15+
annotations:
16+
summary: Too many services

pkg/alertmanager/alertmanager.go

+23-11
Original file line number | Diff line number | Diff line change
@@ -46,19 +46,28 @@ import (
4646
"github.com/cortexproject/cortex/pkg/util/services"
4747
)
4848

49-
const notificationLogMaintenancePeriod = 15 * time.Minute
49+
const (
50+
// maintenancePeriod is used for periodic storing of silences and notifications to local files.
51+
maintenancePeriod = 15 * time.Minute
52+
53+
// Filenames used within tenant-directory
54+
notificationLogSnapshot = "notifications"
55+
silencesSnapshot = "silences"
56+
templatesDir = "templates"
57+
)
5058

5159
// Config configures an Alertmanager.
5260
type Config struct {
53-
UserID string
54-
// Used to persist notification logs and silences on disk.
55-
DataDir string
61+
UserID string
5662
Logger log.Logger
5763
Peer *cluster.Peer
5864
PeerTimeout time.Duration
5965
Retention time.Duration
6066
ExternalURL *url.URL
6167

68+
// Tenant-specific local directory where AM can store its state (notifications, silences, templates). When AM is stopped, entire dir is removed.
69+
TenantDataDir string
70+
6271
ShardingEnabled bool
6372
ReplicationFactor int
6473
ReplicateStateFunc func(context.Context, string, *clusterpb.Part) error
@@ -118,6 +127,10 @@ type State interface {
118127

119128
// New creates a new Alertmanager.
120129
func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
130+
if cfg.TenantDataDir == "" {
131+
return nil, fmt.Errorf("directory for tenant-specific AlertManager is not configured")
132+
}
133+
121134
am := &Alertmanager{
122135
cfg: cfg,
123136
logger: log.With(cfg.Logger, "user", cfg.UserID),
@@ -153,12 +166,11 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
153166
}
154167

155168
am.wg.Add(1)
156-
nflogID := fmt.Sprintf("nflog:%s", cfg.UserID)
157169
var err error
158170
am.nflog, err = nflog.New(
159171
nflog.WithRetention(cfg.Retention),
160-
nflog.WithSnapshot(filepath.Join(cfg.DataDir, nflogID)),
161-
nflog.WithMaintenance(notificationLogMaintenancePeriod, am.stop, am.wg.Done),
172+
nflog.WithSnapshot(filepath.Join(cfg.TenantDataDir, notificationLogSnapshot)),
173+
nflog.WithMaintenance(maintenancePeriod, am.stop, am.wg.Done),
162174
nflog.WithMetrics(am.registry),
163175
nflog.WithLogger(log.With(am.logger, "component", "nflog")),
164176
)
@@ -171,9 +183,9 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
171183

172184
am.marker = types.NewMarker(am.registry)
173185

174-
silencesID := fmt.Sprintf("silences:%s", cfg.UserID)
186+
silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot)
175187
am.silences, err = silence.New(silence.Options{
176-
SnapshotFile: filepath.Join(cfg.DataDir, silencesID),
188+
SnapshotFile: silencesFile,
177189
Retention: cfg.Retention,
178190
Logger: log.With(am.logger, "component", "silences"),
179191
Metrics: am.registry,
@@ -189,7 +201,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
189201

190202
am.wg.Add(1)
191203
go func() {
192-
am.silences.Maintenance(15*time.Minute, filepath.Join(cfg.DataDir, silencesID), am.stop)
204+
am.silences.Maintenance(maintenancePeriod, silencesFile, am.stop)
193205
am.wg.Done()
194206
}()
195207

@@ -249,7 +261,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg s
249261
templateFiles := make([]string, len(conf.Templates))
250262
if len(conf.Templates) > 0 {
251263
for i, t := range conf.Templates {
252-
templateFiles[i] = filepath.Join(am.cfg.DataDir, "templates", userID, t)
264+
templateFiles[i] = filepath.Join(am.cfg.TenantDataDir, templatesDir, t)
253265
}
254266
}
255267

pkg/alertmanager/api.go

+4-4
Original file line number | Diff line number | Diff line change
@@ -153,14 +153,14 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error {
153153
// not to configured data dir, and on the flipside, it'll fail if we can't write
154154
// to tmpDir. Ignoring both cases for now as they're ultra rare but will revisit if
155155
// we see this in the wild.
156-
tmpDir, err := ioutil.TempDir("", "validate-config")
156+
userTempDir, err := ioutil.TempDir("", "validate-config-"+cfg.User)
157157
if err != nil {
158158
return err
159159
}
160-
defer os.RemoveAll(tmpDir)
160+
defer os.RemoveAll(userTempDir)
161161

162162
for _, tmpl := range cfg.Templates {
163-
_, err := createTemplateFile(tmpDir, cfg.User, tmpl.Filename, tmpl.Body)
163+
_, err := storeTemplateFile(userTempDir, tmpl.Filename, tmpl.Body)
164164
if err != nil {
165165
level.Error(logger).Log("msg", "unable to create template file", "err", err, "user", cfg.User)
166166
return fmt.Errorf("unable to create template file '%s'", tmpl.Filename)
@@ -169,7 +169,7 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error {
169169

170170
templateFiles := make([]string, len(amCfg.Templates))
171171
for i, t := range amCfg.Templates {
172-
templateFiles[i] = filepath.Join(tmpDir, "templates", cfg.User, t)
172+
templateFiles[i] = filepath.Join(userTempDir, t)
173173
}
174174

175175
_, err = template.FromGlobs(templateFiles...)

0 commit comments

Comments
 (0)