
Commit 5a7ba2f

Fix support for keep_firing_for field in alert rules (#5823)
* Support keep_firing_for field for alert rules
* Include keepFiringFor and keepFiringSince in API response
* Add integration test for keep_firing_for field

Signed-off-by: Mustafain Ali Khan <[email protected]>
1 parent 4b75a5c commit 5a7ba2f
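For context, keep_firing_for is the standard Prometheus rule-file field that this commit wires through the Cortex ruler: it holds an alert in the firing state for the given duration after its expression stops returning results. A rule group using it might look like the following sketch (group name, rule name, expression, and the 10s hold are taken from the integration test below; the file itself is illustrative, not part of this commit):

groups:
  - name: rule_group_1
    interval: 10s
    rules:
      - alert: rule_keep_firing
        expr: vector(1) > 0
        # Hold the alert in the firing state for 10s after the expression resolves.
        keep_firing_for: 10s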

File tree: 7 files changed (+384 −98 lines changed)

integration/ruler_test.go

+166 −4
@@ -8,6 +8,7 @@ import (
 	"context"
 	"crypto/x509"
 	"crypto/x509/pkix"
+	"encoding/json"
 	"fmt"
 	"math"
 	"math/rand"
@@ -19,10 +20,7 @@ import (
 	"testing"
 	"time"
 
-	"github.com/cortexproject/cortex/pkg/ruler"
-
-	"github.com/cortexproject/cortex/pkg/storage/tsdb"
-
+	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/model/labels"
 	"github.com/prometheus/prometheus/model/rulefmt"
@@ -37,6 +35,8 @@ import (
 	"github.com/cortexproject/cortex/integration/e2e"
 	e2edb "github.com/cortexproject/cortex/integration/e2e/db"
 	"github.com/cortexproject/cortex/integration/e2ecortex"
+	"github.com/cortexproject/cortex/pkg/ruler"
+	"github.com/cortexproject/cortex/pkg/storage/tsdb"
 )
 
 func TestRulerAPI(t *testing.T) {
@@ -1038,6 +1038,168 @@ func TestRulerDisablesRuleGroups(t *testing.T) {
 	})
 }
 
+func TestRulerKeepFiring(t *testing.T) {
+	s, err := e2e.NewScenario(networkName)
+	require.NoError(t, err)
+	defer s.Close()
+
+	// Start dependencies.
+	consul := e2edb.NewConsul()
+	minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
+	require.NoError(t, s.StartAndWaitReady(consul, minio))
+
+	// Configure the ruler.
+	flags := mergeFlags(
+		BlocksStorageFlags(),
+		RulerFlags(),
+		map[string]string{
+			// Since we're not going to run any rule (our only rule is invalid), we don't need the
+			// store-gateway to be configured to a valid address.
+			"-querier.store-gateway-addresses": "localhost:12345",
+			// Enable the bucket index so we can skip the initial bucket scan.
+			"-blocks-storage.bucket-store.bucket-index.enabled": "true",
+			// Evaluate rules often, so that we don't need to wait for metrics to show up.
+			"-ruler.evaluation-interval": "2s",
+			"-ruler.poll-interval":       "2s",
+			// No delay
+			"-ruler.evaluation-delay-duration": "0",
+
+			"-blocks-storage.tsdb.block-ranges-period":   "1h",
+			"-blocks-storage.bucket-store.sync-interval": "1s",
+			"-blocks-storage.tsdb.retention-period":      "2h",
+
+			// We run single ingester only, no replication.
+			"-distributor.replication-factor": "1",
+
+			"-querier.max-fetched-chunks-per-query": "50",
+		},
+	)
+
+	const namespace = "test"
+	const user = "user"
+
+	distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")
+	ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler))
+
+	// Wait until both the distributor and ruler have updated the ring. The querier will also watch
+	// the store-gateway ring if blocks sharding is enabled.
+	require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+	require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+
+	c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
+	require.NoError(t, err)
+
+	expression := "vector(1) > 0" // Alert will fire
+	groupName := "rule_group_1"
+	ruleName := "rule_keep_firing"
+
+	require.NoError(t, c.SetRuleGroup(alertRuleWithKeepFiringFor(groupName, ruleName, expression, model.Duration(10*time.Second)), namespace))
+
+	m := ruleGroupMatcher(user, namespace, groupName)
+
+	// Wait until ruler has loaded the group.
+	require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+	// Wait until rule group has tried to evaluate the rule.
+	require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+	groups, err := c.GetPrometheusRules(e2ecortex.RuleFilter{
+		RuleNames: []string{ruleName},
+	})
+	require.NoError(t, err)
+	require.NotEmpty(t, groups)
+	require.Equal(t, 1, len(groups[0].Rules))
+	alert := parseAlertFromRule(t, groups[0].Rules[0])
+	require.Equal(t, float64(10), alert.KeepFiringFor)
+	require.Equal(t, 1, len(alert.Alerts))
+	require.Empty(t, alert.Alerts[0].KeepFiringSince) // Alert expression not resolved, keepFiringSince should be empty
+
+	expression = "vector(1) > 1" // Resolve, should keep firing for set duration
+	ts := time.Now()
+	require.NoError(t, c.SetRuleGroup(alertRuleWithKeepFiringFor(groupName, ruleName, expression, model.Duration(10*time.Second)), namespace))
+	// Wait until rule group has tried to evaluate the rule.
+	require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(2), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+	updatedGroups, err := c.GetPrometheusRules(e2ecortex.RuleFilter{
+		RuleNames: []string{ruleName},
+	})
+	require.NoError(t, err)
+	require.NotEmpty(t, updatedGroups)
+	require.Equal(t, 1, len(updatedGroups[0].Rules))
+
+	alert = parseAlertFromRule(t, updatedGroups[0].Rules[0])
+	require.Equal(t, "firing", alert.State)
+	require.Equal(t, float64(10), alert.KeepFiringFor)
+	require.Equal(t, 1, len(alert.Alerts))
+	require.NotEmpty(t, alert.Alerts[0].KeepFiringSince)
+	require.Greater(t, alert.Alerts[0].KeepFiringSince.UnixNano(), ts.UnixNano(), "KeepFiringSince value should be after expression is resolved")
+
+	time.Sleep(10 * time.Second) // Sleep beyond keepFiringFor time
+	updatedGroups, err = c.GetPrometheusRules(e2ecortex.RuleFilter{
+		RuleNames: []string{ruleName},
+	})
+	require.NoError(t, err)
+	require.NotEmpty(t, updatedGroups)
+	require.Equal(t, 1, len(updatedGroups[0].Rules))
+	alert = parseAlertFromRule(t, updatedGroups[0].Rules[0])
+	require.Equal(t, 0, len(alert.Alerts)) // Alert should be resolved once keepFiringFor time expires
+}
+
+func parseAlertFromRule(t *testing.T, rules interface{}) *alertingRule {
+	responseJson, err := json.Marshal(rules)
+	require.NoError(t, err)
+
+	alertResp := &alertingRule{}
+	require.NoError(t, json.Unmarshal(responseJson, alertResp))
+	return alertResp
+}
+
+type alertingRule struct {
+	// State can be "pending", "firing", "inactive".
+	State          string        `json:"state"`
+	Name           string        `json:"name"`
+	Query          string        `json:"query"`
+	Duration       float64       `json:"duration"`
+	KeepFiringFor  float64       `json:"keepFiringFor"`
+	Labels         labels.Labels `json:"labels"`
+	Annotations    labels.Labels `json:"annotations"`
+	Alerts         []*Alert      `json:"alerts"`
+	Health         string        `json:"health"`
+	LastError      string        `json:"lastError"`
+	Type           v1.RuleType   `json:"type"`
+	LastEvaluation time.Time     `json:"lastEvaluation"`
+	EvaluationTime float64       `json:"evaluationTime"`
+}
+
+// Alert has info for an alert.
+type Alert struct {
+	Labels          labels.Labels `json:"labels"`
+	Annotations     labels.Labels `json:"annotations"`
+	State           string        `json:"state"`
+	ActiveAt        *time.Time    `json:"activeAt"`
+	KeepFiringSince *time.Time    `json:"keepFiringSince,omitempty"`
+	Value           string        `json:"value"`
+}
+
+func alertRuleWithKeepFiringFor(groupName string, ruleName string, expression string, keepFiring model.Duration) rulefmt.RuleGroup {
+	var recordNode = yaml.Node{}
+	var exprNode = yaml.Node{}
+
+	recordNode.SetString(ruleName)
+	exprNode.SetString(expression)
+
+	return rulefmt.RuleGroup{
+		Name:     groupName,
+		Interval: 10,
+		Rules: []rulefmt.RuleNode{{
+			Alert:         recordNode,
+			Expr:          exprNode,
+			KeepFiringFor: keepFiring,
+		}},
+	}
+}
+
 func ruleGroupMatcher(user, namespace, groupName string) *labels.Matcher {
 	return labels.MustNewMatcher(labels.MatchEqual, "rule_group", fmt.Sprintf("/rules/%s/%s;%s", user, namespace, groupName))
 }
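The timeline the test above walks through (expression fires, expression resolves, alert is held for keep_firing_for, alert finally drops) can be sketched in isolation. The following is a minimal, self-contained approximation of the semantics with invented names; it is not the Prometheus implementation, which additionally tracks notification state:

package main

import (
	"fmt"
	"time"
)

// alertState is a toy model of keep_firing_for semantics.
type alertState struct {
	keepFiringFor   time.Duration
	firing          bool
	keepFiringSince time.Time // zero while the expression still evaluates true
}

// observe takes one evaluation result (did the alert expression return
// samples?) and reports whether the alert should still be treated as firing.
func (a *alertState) observe(exprActive bool, now time.Time) bool {
	switch {
	case exprActive:
		// Expression is (still) true: fire and reset any keep-firing window.
		a.firing = true
		a.keepFiringSince = time.Time{}
	case a.firing && a.keepFiringSince.IsZero():
		// Expression just resolved: remember when the hold period started.
		a.keepFiringSince = now
	case a.firing && now.Sub(a.keepFiringSince) >= a.keepFiringFor:
		// Hold period has elapsed: the alert finally resolves.
		a.firing = false
	}
	return a.firing
}

func main() {
	a := &alertState{keepFiringFor: 10 * time.Second}
	t0 := time.Now()
	fmt.Println(a.observe(true, t0))                      // true: expression fires
	fmt.Println(a.observe(false, t0.Add(2*time.Second)))  // still true: held by keep_firing_for
	fmt.Println(a.observe(false, t0.Add(12*time.Second))) // false: the 10s hold has elapsed
}

Here keepFiringSince plays the role of the KeepFiringSince timestamp the test asserts on: it is set when the expression first resolves, cleared if the expression fires again, and its age decides when the alert may drop.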

pkg/ruler/api.go

+20 −9
@@ -46,11 +46,12 @@ type AlertDiscovery struct {
 
 // Alert has info for an alert.
 type Alert struct {
-	Labels      labels.Labels `json:"labels"`
-	Annotations labels.Labels `json:"annotations"`
-	State       string        `json:"state"`
-	ActiveAt    *time.Time    `json:"activeAt"`
-	Value       string        `json:"value"`
+	Labels          labels.Labels `json:"labels"`
+	Annotations     labels.Labels `json:"annotations"`
+	State           string        `json:"state"`
+	ActiveAt        *time.Time    `json:"activeAt"`
+	KeepFiringSince *time.Time    `json:"keepFiringSince,omitempty"`
+	Value           string        `json:"value"`
 }
 
 // RuleDiscovery has info for all rules
@@ -80,6 +81,7 @@ type alertingRule struct {
 	Name        string        `json:"name"`
 	Query       string        `json:"query"`
 	Duration    float64       `json:"duration"`
+	KeepFiringFor float64     `json:"keepFiringFor"`
 	Labels      labels.Labels `json:"labels"`
 	Annotations labels.Labels `json:"annotations"`
 	Alerts      []*Alert      `json:"alerts"`
@@ -211,13 +213,17 @@ func (a *API) PrometheusRules(w http.ResponseWriter, req *http.Request) {
 			if g.ActiveRules[i].Rule.Alert != "" {
 				alerts := make([]*Alert, 0, len(rl.Alerts))
 				for _, a := range rl.Alerts {
-					alerts = append(alerts, &Alert{
+					alert := &Alert{
 						Labels:      cortexpb.FromLabelAdaptersToLabels(a.Labels),
 						Annotations: cortexpb.FromLabelAdaptersToLabels(a.Annotations),
 						State:       a.GetState(),
 						ActiveAt:    &a.ActiveAt,
 						Value:       strconv.FormatFloat(a.Value, 'e', -1, 64),
-					})
+					}
+					if !a.KeepFiringSince.IsZero() {
+						alert.KeepFiringSince = &a.KeepFiringSince
+					}
+					alerts = append(alerts, alert)
 				}
 				grp.Rules[i] = alertingRule{
 					State: rl.GetState(),
@@ -232,6 +238,7 @@ func (a *API) PrometheusRules(w http.ResponseWriter, req *http.Request) {
 					LastEvaluation: rl.GetEvaluationTimestamp(),
 					EvaluationTime: rl.GetEvaluationDuration().Seconds(),
 					Type:           v1.RuleTypeAlerting,
+					KeepFiringFor:  rl.Rule.KeepFiringFor.Seconds(),
 				}
 			} else {
 				grp.Rules[i] = recordingRule{
@@ -296,13 +303,17 @@ func (a *API) PrometheusAlerts(w http.ResponseWriter, req *http.Request) {
 		for _, rl := range g.ActiveRules {
 			if rl.Rule.Alert != "" {
 				for _, a := range rl.Alerts {
-					alerts = append(alerts, &Alert{
+					alert := &Alert{
 						Labels:      cortexpb.FromLabelAdaptersToLabels(a.Labels),
 						Annotations: cortexpb.FromLabelAdaptersToLabels(a.Annotations),
 						State:       a.GetState(),
 						ActiveAt:    &a.ActiveAt,
 						Value:       strconv.FormatFloat(a.Value, 'e', -1, 64),
-					})
+					}
+					if !a.KeepFiringSince.IsZero() {
+						alert.KeepFiringSince = &a.KeepFiringSince
+					}
+					alerts = append(alerts, alert)
 				}
 			}
 		}
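With these changes, an alerting rule in the ruler's Prometheus-compatible rules response carries keepFiringFor (the configured hold, in seconds) and, once the expression has resolved but the alert is still held, a keepFiringSince timestamp on each alert. A rough sketch of one rule in the payload, with invented values, matching the JSON tags above:

{
  "state": "firing",
  "name": "rule_keep_firing",
  "query": "vector(1) > 1",
  "duration": 0,
  "keepFiringFor": 10,
  "labels": {},
  "annotations": {},
  "alerts": [
    {
      "labels": {},
      "annotations": {},
      "state": "firing",
      "activeAt": "2024-01-01T10:00:00Z",
      "keepFiringSince": "2024-01-01T10:05:00Z",
      "value": "1e+00"
    }
  ],
  "health": "ok",
  "lastError": "",
  "type": "alerting",
  "lastEvaluation": "2024-01-01T10:05:02Z",
  "evaluationTime": 0.001
}

Note the design choice: KeepFiringSince is a *time.Time with omitempty, and the explicit IsZero guard above only sets the pointer once the value is meaningful, so alerts whose expression is still active omit the field instead of serializing the zero timestamp 0001-01-01T00:00:00Z.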

pkg/ruler/ruler.go

+10 −9
@@ -829,15 +829,16 @@ func (r *Ruler) getLocalRules(userID string, rulesRequest RulesRequest) ([]*Grou
 			alerts := []*AlertStateDesc{}
 			for _, a := range rule.ActiveAlerts() {
 				alerts = append(alerts, &AlertStateDesc{
-					State:       a.State.String(),
-					Labels:      cortexpb.FromLabelsToLabelAdapters(a.Labels),
-					Annotations: cortexpb.FromLabelsToLabelAdapters(a.Annotations),
-					Value:       a.Value,
-					ActiveAt:    a.ActiveAt,
-					FiredAt:     a.FiredAt,
-					ResolvedAt:  a.ResolvedAt,
-					LastSentAt:  a.LastSentAt,
-					ValidUntil:  a.ValidUntil,
+					State:           a.State.String(),
+					Labels:          cortexpb.FromLabelsToLabelAdapters(a.Labels),
+					Annotations:     cortexpb.FromLabelsToLabelAdapters(a.Annotations),
+					Value:           a.Value,
+					ActiveAt:        a.ActiveAt,
+					FiredAt:         a.FiredAt,
+					ResolvedAt:      a.ResolvedAt,
+					LastSentAt:      a.LastSentAt,
+					ValidUntil:      a.ValidUntil,
+					KeepFiringSince: a.KeepFiringSince,
 				})
 			}
 			ruleDesc = &RuleStateDesc{
