Skip to content

Commit 7f379b9

Browse files
authored
enhancement(sample transform): add stratified sampling capability (#21274)
* (enhancement sample transform): add stratified sampling capability * Convert group_by key to Template * Generate documentation * Add changelog * Fix typo in docs * more documentation clean up * Fix formatting to pass linter, fix punctuation in docs
1 parent d2f855c commit 7f379b9

File tree

4 files changed

+106
-6
lines changed

4 files changed

+106
-6
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The `sample` transform can now take in a `group_by` configuration option that will allow logs with unique values for the patterns passed in to be sampled independently. This can reduce the complexity of the topology, since users would no longer need to create separate samplers with similar configuration to handle different log streams.
2+
3+
authors: hillmandj

src/transforms/sample/config.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use crate::{
1010
TransformOutput,
1111
},
1212
schema,
13+
template::Template,
1314
transforms::Transform,
1415
};
1516

@@ -44,6 +45,16 @@ pub struct SampleConfig {
4445
#[configurable(metadata(docs::examples = "message"))]
4546
pub key_field: Option<String>,
4647

48+
/// The value to group events into separate buckets to be sampled independently.
49+
///
50+
/// If left unspecified, or if the event doesn't have `group_by`, then the event is not
51+
/// sampled separately.
52+
#[configurable(metadata(
53+
docs::examples = "{{ service }}",
54+
docs::examples = "{{ hostname }}-{{ service }}"
55+
))]
56+
pub group_by: Option<Template>,
57+
4758
/// A logical condition used to exclude events from sampling.
4859
pub exclude: Option<AnyCondition>,
4960
}
@@ -53,6 +64,7 @@ impl GenerateConfig for SampleConfig {
5364
toml::Value::try_from(Self {
5465
rate: 10,
5566
key_field: None,
67+
group_by: None,
5668
exclude: None::<AnyCondition>,
5769
})
5870
.unwrap()
@@ -67,6 +79,7 @@ impl TransformConfig for SampleConfig {
6779
Self::NAME.to_string(),
6880
self.rate,
6981
self.key_field.clone(),
82+
self.group_by.clone(),
7083
self.exclude
7184
.as_ref()
7285
.map(|condition| condition.build(&context.enrichment_tables))
@@ -126,6 +139,7 @@ mod tests {
126139
let config = SampleConfig {
127140
rate: 1,
128141
key_field: None,
142+
group_by: None,
129143
exclude: None,
130144
};
131145
let (tx, rx) = mpsc::channel(1);

src/transforms/sample/transform.rs

Lines changed: 76 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
use std::collections::HashMap;
12
use vector_lib::config::LegacyKey;
23
use vrl::event_path;
34

45
use crate::{
56
conditions::Condition,
67
event::Event,
78
internal_events::SampleEventDiscarded,
9+
sinks::prelude::TemplateRenderingError,
10+
template::Template,
811
transforms::{FunctionTransform, OutputBuffer},
912
};
1013

@@ -13,26 +16,29 @@ pub struct Sample {
1316
name: String,
1417
rate: u64,
1518
key_field: Option<String>,
19+
group_by: Option<Template>,
1620
exclude: Option<Condition>,
17-
count: u64,
21+
counter: HashMap<Option<String>, u64>,
1822
}
1923

2024
impl Sample {
2125
// This function is dead code when the feature flag `transforms-impl-sample` is specified but not
2226
// `transforms-sample`.
2327
#![allow(dead_code)]
24-
pub const fn new(
28+
pub fn new(
2529
name: String,
2630
rate: u64,
2731
key_field: Option<String>,
32+
group_by: Option<Template>,
2833
exclude: Option<Condition>,
2934
) -> Self {
3035
Self {
3136
name,
3237
rate,
3338
key_field,
39+
group_by,
3440
exclude,
35-
count: 0,
41+
counter: HashMap::new(),
3642
}
3743
}
3844
}
@@ -69,13 +75,42 @@ impl FunctionTransform for Sample {
6975
})
7076
.map(|v| v.to_string_lossy());
7177

78+
// Fetch actual field value if group_by option is set.
79+
let group_by_key = self.group_by.as_ref().and_then(|group_by| match &event {
80+
Event::Log(event) => group_by
81+
.render_string(event)
82+
.map_err(|error| {
83+
emit!(TemplateRenderingError {
84+
error,
85+
field: Some("group_by"),
86+
drop_event: false,
87+
})
88+
})
89+
.ok(),
90+
Event::Trace(event) => group_by
91+
.render_string(event)
92+
.map_err(|error| {
93+
emit!(TemplateRenderingError {
94+
error,
95+
field: Some("group_by"),
96+
drop_event: false,
97+
})
98+
})
99+
.ok(),
100+
Event::Metric(_) => panic!("component can never receive metric events"),
101+
});
102+
103+
let counter_value: u64 = *self.counter.entry(group_by_key.clone()).or_default();
104+
72105
let num = if let Some(value) = value {
73106
seahash::hash(value.as_bytes())
74107
} else {
75-
self.count
108+
counter_value
76109
};
77110

78-
self.count = (self.count + 1) % self.rate;
111+
// reset counter for particular key, or default key if group_by option isn't provided
112+
let increment: u64 = (counter_value + 1) % self.rate;
113+
self.counter.insert(group_by_key.clone(), increment);
79114

80115
if num % self.rate == 0 {
81116
match event {
@@ -134,6 +169,7 @@ mod tests {
134169
"sample".to_string(),
135170
2,
136171
log_schema().message_key().map(ToString::to_string),
172+
None,
137173
Some(condition_contains(
138174
log_schema().message_key().unwrap().to_string().as_str(),
139175
"na",
@@ -156,6 +192,7 @@ mod tests {
156192
"sample".to_string(),
157193
25,
158194
log_schema().message_key().map(ToString::to_string),
195+
None,
159196
Some(condition_contains(
160197
log_schema().message_key().unwrap().to_string().as_str(),
161198
"na",
@@ -181,6 +218,7 @@ mod tests {
181218
"sample".to_string(),
182219
2,
183220
log_schema().message_key().map(ToString::to_string),
221+
None,
184222
Some(condition_contains(
185223
log_schema().message_key().unwrap().to_string().as_str(),
186224
"na",
@@ -216,6 +254,7 @@ mod tests {
216254
"sample".to_string(),
217255
0,
218256
key_field.clone(),
257+
None,
219258
Some(condition_contains(
220259
log_schema().message_key().unwrap().to_string().as_str(),
221260
"important",
@@ -232,6 +271,33 @@ mod tests {
232271
}
233272
}
234273

274+
#[test]
275+
fn handles_group_by() {
276+
for group_by in &[None, Some(Template::try_from("{{ other_field }}").unwrap())] {
277+
let mut event = Event::Log(LogEvent::from("nananana"));
278+
let log = event.as_mut_log();
279+
log.insert("other_field", "foo");
280+
let mut sampler = Sample::new(
281+
"sample".to_string(),
282+
0,
283+
log_schema().message_key().map(ToString::to_string),
284+
group_by.clone(),
285+
Some(condition_contains(
286+
log_schema().message_key().unwrap().to_string().as_str(),
287+
"na",
288+
)),
289+
);
290+
let iterations = 0..1000;
291+
let total_passed = iterations
292+
.filter_map(|_| {
293+
transform_one(&mut sampler, event.clone())
294+
.map(|result| assert_eq!(result, event))
295+
})
296+
.count();
297+
assert_eq!(total_passed, 1000);
298+
}
299+
}
300+
235301
#[test]
236302
fn handles_key_field() {
237303
for key_field in &[None, Some("other_field".into())] {
@@ -242,6 +308,7 @@ mod tests {
242308
"sample".to_string(),
243309
0,
244310
key_field.clone(),
311+
None,
245312
Some(condition_contains("other_field", "foo")),
246313
);
247314
let iterations = 0..1000;
@@ -264,6 +331,7 @@ mod tests {
264331
"sample".to_string(),
265332
10,
266333
key_field.clone(),
334+
None,
267335
Some(condition_contains(&message_key, "na")),
268336
);
269337
let passing = events
@@ -278,6 +346,7 @@ mod tests {
278346
"sample".to_string(),
279347
25,
280348
key_field.clone(),
349+
None,
281350
Some(condition_contains(&message_key, "na")),
282351
);
283352
let passing = events
@@ -292,6 +361,7 @@ mod tests {
292361
"sample".to_string(),
293362
25,
294363
key_field.clone(),
364+
None,
295365
Some(condition_contains(&message_key, "na")),
296366
);
297367
let event = Event::Log(LogEvent::from("nananana"));
@@ -304,7 +374,7 @@ mod tests {
304374
fn handles_trace_event() {
305375
let event: TraceEvent = LogEvent::from("trace").into();
306376
let trace = Event::Trace(event);
307-
let mut sampler = Sample::new("sample".to_string(), 2, None, None);
377+
let mut sampler = Sample::new("sample".to_string(), 2, None, None, None);
308378
let iterations = 0..2;
309379
let total_passed = iterations
310380
.filter_map(|_| transform_one(&mut sampler, trace.clone()))

website/cue/reference/components/transforms/base/sample.cue

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,19 @@ base: components: transforms: sample: configuration: {
66
required: false
77
type: condition: {}
88
}
9+
group_by: {
10+
description: """
11+
The value to group events into separate buckets to be sampled independently.
12+
13+
If left unspecified, or if the event doesn't have `group_by`, then the event is not
14+
sampled separately.
15+
"""
16+
required: false
17+
type: string: {
18+
examples: ["{{ service }}", "{{ hostname }}-{{ service }}"]
19+
syntax: "template"
20+
}
21+
}
922
key_field: {
1023
description: """
1124
The name of the field whose value is hashed to determine if the event should be

0 commit comments

Comments
 (0)