Convert all validation.Limits durations to model.Duration (#4044)

56quarters · jtlisi · web-flow · commit cb8ed0e4eaaa · 2021-04-02T14:47:52.000+02:00
* Test demonstrating time.Duration handling issue in JSON

Signed-off-by: Nick Pillitteri &lt;nick.pillitteri@grafana.com&gt;

* Convert all validation.Limits durations to model.Duration

Use the prometheus/common model.Duration for all duration related
limits instead of the standard library time.Duration type. This works
around an issue with the JSON representation of the standard library
type.

For example:
* The YAML representation of time.Duration(1 * time.Second) is the string `1s`
* The JSON representation of time.Duration(1 * time.Second) is the integer `1000000000`

By contrast:
* The YAML representation of model.Duration(1 * time.Second) is the string `1s`
* The JSON representation of model.Duration(1 * time.Second) is the string `1s`

Signed-off-by: Nick Pillitteri &lt;nick.pillitteri@grafana.com&gt;

* Update default configuration to match Limits struct

Signed-off-by: Nick Pillitteri &lt;nick.pillitteri@grafana.com&gt;

* Update CHANGELOG.md

Co-authored-by: Jacob Lisi &lt;jlisi@grafana.com&gt;
Signed-off-by: Nick Pillitteri &lt;nick.pillitteri@grafana.com&gt;

* Code review feedback

Signed-off-by: Nick Pillitteri &lt;nick.pillitteri@grafana.com&gt;

Co-authored-by: Jacob Lisi &lt;jlisi@grafana.com&gt;
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@
 * [ENHANCEMENT] Ingester: reduce CPU and memory when an high number of errors are returned by the ingester on the write path with the blocks storage. #3969 #3971 #3973
 * [ENHANCEMENT] Distributor: reduce CPU and memory when an high number of errors are returned by the distributor on the write path. #3990
 * [ENHANCEMENT] Put metric before label value in the "label value too long" error message. #4018
+* [ENHANCEMENT] Allow use of `y|w|d` suffixes for duration related limits and per-tenant limits. #4044
 * [BUGFIX] Ruler-API: fix bug where `/api/v1/rules/<namespace>/<group_name>` endpoint return `400` instead of `404`. #4013
 * [BUGFIX] Distributor: reverted changes done to rate limiting in #3825. #3948
 * [BUGFIX] Ingester: Fix race condition when opening and closing tsdb concurrently. #3959
diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md
@@ -3829,7 +3829,7 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
 
 # Maximum accepted sample age before rejecting.
 # CLI flag: -validation.reject-old-samples.max-age
-[reject_old_samples_max_age: <duration> | default = 336h]
+[reject_old_samples_max_age: <duration> | default = 2w]
 
 # Duration which table will be created/deleted before/after it's needed; we
 # won't accept sample from before this time.
diff --git a/pkg/chunk/chunk_store_test.go b/pkg/chunk/chunk_store_test.go
@@ -86,7 +86,7 @@ func newTestChunkStoreConfigWithMockStorage(t require.TestingT, schemaCfg Schema
 
 	var limits validation.Limits
 	flagext.DefaultValues(&limits)
-	limits.MaxQueryLength = 30 * 24 * time.Hour
+	limits.MaxQueryLength = model.Duration(30 * 24 * time.Hour)
 	overrides, err := validation.NewOverrides(limits, nil)
 	require.NoError(t, err)
 
diff --git a/pkg/chunk/testutils/testutils.go b/pkg/chunk/testutils/testutils.go
@@ -126,7 +126,7 @@ func SetupTestChunkStoreWithClients(indexClient chunk.IndexClient, chunksClient
 
 	var limits validation.Limits
 	flagext.DefaultValues(&limits)
-	limits.MaxQueryLength = 30 * 24 * time.Hour
+	limits.MaxQueryLength = model.Duration(30 * 24 * time.Hour)
 	overrides, err := validation.NewOverrides(limits, nil)
 	if err != nil {
 		return nil, err
diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go
@@ -151,7 +151,7 @@ func (t *Cortex) initRing() (serv services.Service, err error) {
 func (t *Cortex) initRuntimeConfig() (services.Service, error) {
 	if t.Cfg.RuntimeConfig.LoadPath == "" {
 		t.Cfg.RuntimeConfig.LoadPath = t.Cfg.LimitsConfig.PerTenantOverrideConfig
-		t.Cfg.RuntimeConfig.ReloadPeriod = t.Cfg.LimitsConfig.PerTenantOverridePeriod
+		t.Cfg.RuntimeConfig.ReloadPeriod = time.Duration(t.Cfg.LimitsConfig.PerTenantOverridePeriod)
 	}
 
 	if t.Cfg.RuntimeConfig.LoadPath == "" {
diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go
@@ -1112,7 +1112,7 @@ func BenchmarkDistributor_Push(b *testing.B) {
 		"timestamp too old": {
 			prepareConfig: func(limits *validation.Limits) {
 				limits.RejectOldSamples = true
-				limits.RejectOldSamplesMaxAge = time.Hour
+				limits.RejectOldSamplesMaxAge = model.Duration(time.Hour)
 			},
 			prepareSeries: func() ([]labels.Labels, []cortexpb.Sample) {
 				metrics := make([]labels.Labels, numSeriesPerRequest)
@@ -1137,7 +1137,7 @@ func BenchmarkDistributor_Push(b *testing.B) {
 		},
 		"timestamp too new": {
 			prepareConfig: func(limits *validation.Limits) {
-				limits.CreationGracePeriod = time.Minute
+				limits.CreationGracePeriod = model.Duration(time.Minute)
 			},
 			prepareSeries: func() ([]labels.Labels, []cortexpb.Sample) {
 				metrics := make([]labels.Labels, numSeriesPerRequest)
@@ -2055,9 +2055,9 @@ func TestDistributorValidation(t *testing.T) {
 			var limits validation.Limits
 			flagext.DefaultValues(&limits)
 
-			limits.CreationGracePeriod = 2 * time.Hour
+			limits.CreationGracePeriod = model.Duration(2 * time.Hour)
 			limits.RejectOldSamples = true
-			limits.RejectOldSamplesMaxAge = 24 * time.Hour
+			limits.RejectOldSamplesMaxAge = model.Duration(24 * time.Hour)
 			limits.MaxLabelNamesPerSeries = 2
 
 			ds, _, r, _ := prepare(t, prepConfig{
diff --git a/pkg/querier/querier_test.go b/pkg/querier/querier_test.go
@@ -439,7 +439,7 @@ func TestQuerier_ValidateQueryTimeRange_MaxQueryLength(t *testing.T) {
 			flagext.DefaultValues(&cfg)
 
 			limits := defaultLimitsConfig()
-			limits.MaxQueryLength = maxQueryLength
+			limits.MaxQueryLength = model.Duration(maxQueryLength)
 			overrides, err := validation.NewOverrides(limits, nil)
 			require.NoError(t, err)
 
diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go
@@ -46,8 +46,8 @@ type Limits struct {
 	MaxLabelNamesPerSeries    int                 `yaml:"max_label_names_per_series" json:"max_label_names_per_series"`
 	MaxMetadataLength         int                 `yaml:"max_metadata_length" json:"max_metadata_length"`
 	RejectOldSamples          bool                `yaml:"reject_old_samples" json:"reject_old_samples"`
-	RejectOldSamplesMaxAge    time.Duration       `yaml:"reject_old_samples_max_age" json:"reject_old_samples_max_age"`
-	CreationGracePeriod       time.Duration       `yaml:"creation_grace_period" json:"creation_grace_period"`
+	RejectOldSamplesMaxAge    model.Duration      `yaml:"reject_old_samples_max_age" json:"reject_old_samples_max_age"`
+	CreationGracePeriod       model.Duration      `yaml:"creation_grace_period" json:"creation_grace_period"`
 	EnforceMetadataMetricName bool                `yaml:"enforce_metadata_metric_name" json:"enforce_metadata_metric_name"`
 	EnforceMetricName         bool                `yaml:"enforce_metric_name" json:"enforce_metric_name"`
 	IngestionTenantShardSize  int                 `yaml:"ingestion_tenant_shard_size" json:"ingestion_tenant_shard_size"`
@@ -71,17 +71,17 @@ type Limits struct {
 	// Querier enforced limits.
 	MaxChunksPerQuery    int            `yaml:"max_chunks_per_query" json:"max_chunks_per_query"`
 	MaxQueryLookback     model.Duration `yaml:"max_query_lookback" json:"max_query_lookback"`
-	MaxQueryLength       time.Duration  `yaml:"max_query_length" json:"max_query_length"`
+	MaxQueryLength       model.Duration `yaml:"max_query_length" json:"max_query_length"`
 	MaxQueryParallelism  int            `yaml:"max_query_parallelism" json:"max_query_parallelism"`
 	CardinalityLimit     int            `yaml:"cardinality_limit" json:"cardinality_limit"`
-	MaxCacheFreshness    time.Duration  `yaml:"max_cache_freshness" json:"max_cache_freshness"`
+	MaxCacheFreshness    model.Duration `yaml:"max_cache_freshness" json:"max_cache_freshness"`
 	MaxQueriersPerTenant int            `yaml:"max_queriers_per_tenant" json:"max_queriers_per_tenant"`
 
 	// Ruler defaults and limits.
-	RulerEvaluationDelay        time.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"`
-	RulerTenantShardSize        int           `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"`
-	RulerMaxRulesPerRuleGroup   int           `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"`
-	RulerMaxRuleGroupsPerTenant int           `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"`
+	RulerEvaluationDelay        model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"`
+	RulerTenantShardSize        int            `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"`
+	RulerMaxRulesPerRuleGroup   int            `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"`
+	RulerMaxRuleGroupsPerTenant int            `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"`
 
 	// Store-gateway.
 	StoreGatewayTenantShardSize int `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"`
@@ -96,8 +96,8 @@ type Limits struct {
 	S3SSEKMSEncryptionContext string `yaml:"s3_sse_kms_encryption_context" json:"s3_sse_kms_encryption_context" doc:"nocli|description=S3 server-side encryption KMS encryption context. If unset and the key ID override is set, the encryption context will not be provided to S3. Ignored if the SSE type override is not set."`
 
 	// Config for overrides, convenient if it goes here. [Deprecated in favor of RuntimeConfig flag in cortex.Config]
-	PerTenantOverrideConfig string        `yaml:"per_tenant_override_config" json:"per_tenant_override_config"`
-	PerTenantOverridePeriod time.Duration `yaml:"per_tenant_override_period" json:"per_tenant_override_period"`
+	PerTenantOverrideConfig string         `yaml:"per_tenant_override_config" json:"per_tenant_override_config"`
+	PerTenantOverridePeriod model.Duration `yaml:"per_tenant_override_period" json:"per_tenant_override_period"`
 }
 
 // RegisterFlags adds the flags required to config this to the given FlagSet
@@ -116,8 +116,10 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&l.MaxLabelNamesPerSeries, "validation.max-label-names-per-series", 30, "Maximum number of label names per series.")
 	f.IntVar(&l.MaxMetadataLength, "validation.max-metadata-length", 1024, "Maximum length accepted for metric metadata. Metadata refers to Metric Name, HELP and UNIT.")
 	f.BoolVar(&l.RejectOldSamples, "validation.reject-old-samples", false, "Reject old samples.")
-	f.DurationVar(&l.RejectOldSamplesMaxAge, "validation.reject-old-samples.max-age", 14*24*time.Hour, "Maximum accepted sample age before rejecting.")
-	f.DurationVar(&l.CreationGracePeriod, "validation.create-grace-period", 10*time.Minute, "Duration which table will be created/deleted before/after it's needed; we won't accept sample from before this time.")
+	_ = l.RejectOldSamplesMaxAge.Set("14d")
+	f.Var(&l.RejectOldSamplesMaxAge, "validation.reject-old-samples.max-age", "Maximum accepted sample age before rejecting.")
+	_ = l.CreationGracePeriod.Set("10m")
+	f.Var(&l.CreationGracePeriod, "validation.create-grace-period", "Duration which table will be created/deleted before/after it's needed; we won't accept sample from before this time.")
 	f.BoolVar(&l.EnforceMetricName, "validation.enforce-metric-name", true, "Enforce every sample has a metric name.")
 	f.BoolVar(&l.EnforceMetadataMetricName, "validation.enforce-metadata-metric-name", true, "Enforce every metadata has a metric name.")
 
@@ -135,22 +137,24 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&l.MaxGlobalMetadataPerMetric, "ingester.max-global-metadata-per-metric", 0, "The maximum number of metadata per metric, across the cluster. 0 to disable.")
 
 	f.IntVar(&l.MaxChunksPerQuery, "store.query-chunk-limit", 2e6, "Maximum number of chunks that can be fetched in a single query. This limit is enforced when fetching chunks from the long-term storage. When running the Cortex chunks storage, this limit is enforced in the querier, while when running the Cortex blocks storage this limit is both enforced in the querier and store-gateway. 0 to disable.")
-	f.DurationVar(&l.MaxQueryLength, "store.max-query-length", 0, "Limit the query time range (end - start time). This limit is enforced in the query-frontend (on the received query), in the querier (on the query possibly split by the query-frontend) and in the chunks storage. 0 to disable.")
+	f.Var(&l.MaxQueryLength, "store.max-query-length", "Limit the query time range (end - start time). This limit is enforced in the query-frontend (on the received query), in the querier (on the query possibly split by the query-frontend) and in the chunks storage. 0 to disable.")
 	f.Var(&l.MaxQueryLookback, "querier.max-query-lookback", "Limit how long back data (series and metadata) can be queried, up until <lookback> duration ago. This limit is enforced in the query-frontend, querier and ruler. If the requested time range is outside the allowed range, the request will not fail but will be manipulated to only query data within the allowed time range. 0 to disable.")
 	f.IntVar(&l.MaxQueryParallelism, "querier.max-query-parallelism", 14, "Maximum number of split queries will be scheduled in parallel by the frontend.")
 	f.IntVar(&l.CardinalityLimit, "store.cardinality-limit", 1e5, "Cardinality limit for index queries. This limit is ignored when running the Cortex blocks storage. 0 to disable.")
-	f.DurationVar(&l.MaxCacheFreshness, "frontend.max-cache-freshness", 1*time.Minute, "Most recent allowed cacheable result per-tenant, to prevent caching very recent results that might still be in flux.")
+	_ = l.MaxCacheFreshness.Set("1m")
+	f.Var(&l.MaxCacheFreshness, "frontend.max-cache-freshness", "Most recent allowed cacheable result per-tenant, to prevent caching very recent results that might still be in flux.")
 	f.IntVar(&l.MaxQueriersPerTenant, "frontend.max-queriers-per-tenant", 0, "Maximum number of queriers that can handle requests for a single tenant. If set to 0 or value higher than number of available queriers, *all* queriers will handle requests for the tenant. Each frontend (or query-scheduler, if used) will select the same set of queriers for the same tenant (given that all queriers are connected to all frontends / query-schedulers). This option only works with queriers connecting to the query-frontend / query-scheduler, not when using downstream URL.")
 
-	f.DurationVar(&l.RulerEvaluationDelay, "ruler.evaluation-delay-duration", 0, "Duration to delay the evaluation of rules to ensure the underlying metrics have been pushed to Cortex.")
+	f.Var(&l.RulerEvaluationDelay, "ruler.evaluation-delay-duration", "Duration to delay the evaluation of rules to ensure the underlying metrics have been pushed to Cortex.")
 	f.IntVar(&l.RulerTenantShardSize, "ruler.tenant-shard-size", 0, "The default tenant's shard size when the shuffle-sharding strategy is used by ruler. When this setting is specified in the per-tenant overrides, a value of 0 disables shuffle sharding for the tenant.")
 	f.IntVar(&l.RulerMaxRulesPerRuleGroup, "ruler.max-rules-per-rule-group", 0, "Maximum number of rules per rule group per-tenant. 0 to disable.")
 	f.IntVar(&l.RulerMaxRuleGroupsPerTenant, "ruler.max-rule-groups-per-tenant", 0, "Maximum number of rule groups per-tenant. 0 to disable.")
 
 	f.Var(&l.CompactorBlocksRetentionPeriod, "compactor.blocks-retention-period", "Delete blocks containing samples older than the specified retention period. 0 to disable.")
 
 	f.StringVar(&l.PerTenantOverrideConfig, "limits.per-user-override-config", "", "File name of per-user overrides. [deprecated, use -runtime-config.file instead]")
-	f.DurationVar(&l.PerTenantOverridePeriod, "limits.per-user-override-period", 10*time.Second, "Period with which to reload the overrides. [deprecated, use -runtime-config.reload-period instead]")
+	_ = l.PerTenantOverridePeriod.Set("10s")
+	f.Var(&l.PerTenantOverridePeriod, "limits.per-user-override-period", "Period with which to reload the overrides. [deprecated, use -runtime-config.reload-period instead]")
 
 	// Store-gateway.
 	f.IntVar(&l.StoreGatewayTenantShardSize, "store-gateway.tenant-shard-size", 0, "The default tenant's shard size when the shuffle-sharding strategy is used. Must be set when the store-gateway sharding is enabled with the shuffle-sharding strategy. When this setting is specified in the per-tenant overrides, a value of 0 disables shuffle sharding for the tenant.")
@@ -299,13 +303,13 @@ func (o *Overrides) RejectOldSamples(userID string) bool {
 
 // RejectOldSamplesMaxAge returns the age at which samples should be rejected.
 func (o *Overrides) RejectOldSamplesMaxAge(userID string) time.Duration {
-	return o.getOverridesForUser(userID).RejectOldSamplesMaxAge
+	return time.Duration(o.getOverridesForUser(userID).RejectOldSamplesMaxAge)
 }
 
 // CreationGracePeriod is misnamed, and actually returns how far into the future
 // we should accept samples.
 func (o *Overrides) CreationGracePeriod(userID string) time.Duration {
-	return o.getOverridesForUser(userID).CreationGracePeriod
+	return time.Duration(o.getOverridesForUser(userID).CreationGracePeriod)
 }
 
 // MaxSeriesPerQuery returns the maximum number of series a query is allowed to hit.
@@ -350,13 +354,13 @@ func (o *Overrides) MaxQueryLookback(userID string) time.Duration {
 
 // MaxQueryLength returns the limit of the length (in time) of a query.
 func (o *Overrides) MaxQueryLength(userID string) time.Duration {
-	return o.getOverridesForUser(userID).MaxQueryLength
+	return time.Duration(o.getOverridesForUser(userID).MaxQueryLength)
 }
 
 // MaxCacheFreshness returns the period after which results are cacheable,
 // to prevent caching of very recent results.
 func (o *Overrides) MaxCacheFreshness(userID string) time.Duration {
-	return o.getOverridesForUser(userID).MaxCacheFreshness
+	return time.Duration(o.getOverridesForUser(userID).MaxCacheFreshness)
 }
 
 // MaxQueriersPerUser returns the maximum number of queriers that can handle requests for this user.
@@ -417,7 +421,7 @@ func (o *Overrides) IngestionTenantShardSize(userID string) int {
 
 // EvaluationDelay returns the rules evaluation delay for a given user.
 func (o *Overrides) EvaluationDelay(userID string) time.Duration {
-	return o.getOverridesForUser(userID).RulerEvaluationDelay
+	return time.Duration(o.getOverridesForUser(userID).RulerEvaluationDelay)
 }
 
 // CompactorBlocksRetentionPeriod returns the retention period for a given user.
diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go
@@ -148,6 +148,40 @@ func TestLimitsTagsYamlMatchJson(t *testing.T) {
 	assert.Empty(t, mismatch, "expected no mismatched JSON and YAML tags")
 }
 
+func TestLimitsStringDurationYamlMatchJson(t *testing.T) {
+	inputYAML := `
+max_query_lookback: 1s
+max_query_length: 1s
+`
+	inputJSON := `{"max_query_lookback": "1s", "max_query_length": "1s"}`
+
+	limitsYAML := Limits{}
+	err := yaml.Unmarshal([]byte(inputYAML), &limitsYAML)
+	require.NoError(t, err, "expected to be able to unmarshal from YAML")
+
+	limitsJSON := Limits{}
+	err = json.Unmarshal([]byte(inputJSON), &limitsJSON)
+	require.NoError(t, err, "expected to be able to unmarshal from JSON")
+
+	assert.Equal(t, limitsYAML, limitsJSON)
+}
+
+func TestLimitsAlwaysUsesPromDuration(t *testing.T) {
+	stdlibDuration := reflect.TypeOf(time.Duration(0))
+	limits := reflect.TypeOf(Limits{})
+	n := limits.NumField()
+	var badDurationType []string
+
+	for i := 0; i < n; i++ {
+		field := limits.Field(i)
+		if field.Type == stdlibDuration {
+			badDurationType = append(badDurationType, field.Name)
+		}
+	}
+
+	assert.Empty(t, badDurationType, "some Limits fields are using stdlib time.Duration instead of model.Duration")
+}
+
 func TestMetricRelabelConfigLimitsLoadingFromYaml(t *testing.T) {
 	SetDefaultLimitsForYAMLUnmarshalling(Limits{})
 
@@ -238,10 +272,10 @@ func TestSmallestPositiveNonZeroIntPerTenant(t *testing.T) {
 func TestSmallestPositiveNonZeroDurationPerTenant(t *testing.T) {
 	tenantLimits := map[string]*Limits{
 		"tenant-a": {
-			MaxQueryLength: time.Hour,
+			MaxQueryLength: model.Duration(time.Hour),
 		},
 		"tenant-b": {
-			MaxQueryLength: 4 * time.Hour,
+			MaxQueryLength: model.Duration(4 * time.Hour),
 		},
 	}
 

Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,7 @@ func (t *Cortex) initRing() (serv services.Service, err error) {`
`151`	`151`	`func (t *Cortex) initRuntimeConfig() (services.Service, error) {`
`152`	`152`	`if t.Cfg.RuntimeConfig.LoadPath == "" {`
`153`	`153`	`t.Cfg.RuntimeConfig.LoadPath = t.Cfg.LimitsConfig.PerTenantOverrideConfig`
`154`		`- t.Cfg.RuntimeConfig.ReloadPeriod = t.Cfg.LimitsConfig.PerTenantOverridePeriod`
	`154`	`+ t.Cfg.RuntimeConfig.ReloadPeriod = time.Duration(t.Cfg.LimitsConfig.PerTenantOverridePeriod)`
`155`	`155`	`}`
`156`	`156`
`157`	`157`	`if t.Cfg.RuntimeConfig.LoadPath == "" {`