Skip to content

Commit 94c7c84

Browse files
committed
[Refactor] Complete metrics overhaul
Metrics got an entire overhaul. Instead of relying on a broken prometheus library to publish our metrics, we now use the `tracing` library together with OpenTelemetry; we bind the two together and then publish the result into a prometheus library. Metrics are now mostly derive-macros. This means that a struct can express what it wants to export along with a help text. The library will choose whether it is able to export it. Tracing now works by calling `.publish()` on the parent structs; those structs need to call `.publish()` on all the child members they wish to publish data about. If a "group" is requested, use the `group!()` macro, which under the hood calls `tracing::span` with some special labels. At primitive layers, it will call the `publish!()` macro, which under the hood calls the `tracing::event!()` macro with some special fields set. A custom `tracing::Subscriber` will intercept all the events and spans and convert them into a json-like object. This object can then be exported as real json or encoded into other formats like otel/prometheus. closes: #1164, #650, #384, #209 towards: #206
1 parent 3574149 commit 94c7c84

File tree

72 files changed

+2647
-1466
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+2647
-1466
lines changed

Cargo.lock

Lines changed: 306 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ nativelink-service = { path = "nativelink-service" }
4040
nativelink-store = { path = "nativelink-store" }
4141
nativelink-util = { path = "nativelink-util" }
4242
nativelink-worker = { path = "nativelink-worker" }
43+
nativelink-metric = { path = "nativelink-metric" }
44+
nativelink-metric-collector = { path = "nativelink-metric-collector" }
4345

4446
async-lock = "3.3.0"
4547
axum = "0.6.20"
@@ -58,3 +60,12 @@ tokio-rustls = "0.25.0"
5860
tonic = { version = "0.11.0", features = ["gzip", "tls"] }
5961
tower = "0.4.13"
6062
tracing = "0.1.40"
63+
opentelemetry_sdk = { version = "0.23.0", features = ["metrics"] }
64+
tracing-subscriber = "0.3.18"
65+
tracing-opentelemetry = { version = "0.25.0", features = ["metrics"] }
66+
opentelemetry-stdout = "0.5.0"
67+
opentelemetry_api = { version = "0.20.0", features = ["metrics"] }
68+
opentelemetry = { version = "0.23.0", features = ["metrics"] }
69+
prometheus = "0.13.4"
70+
opentelemetry-prometheus = "0.16.0"
71+
serde_json = "1.0.120"

nativelink-error/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ autobenches = false
99

1010
[dependencies]
1111
nativelink-proto = { path = "../nativelink-proto" }
12+
nativelink-metric = { path = "../nativelink-metric" }
1213

1314
hex = "0.4.3"
1415
prost = "0.12.4"

nativelink-error/src/lib.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
use nativelink_metric::{
16+
MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent,
17+
};
1518
use prost_types::TimestampError;
1619
use serde::{Deserialize, Serialize};
1720

@@ -47,6 +50,16 @@ pub struct Error {
4750
pub messages: Vec<String>,
4851
}
4952

53+
impl MetricsComponent for Error {
54+
fn publish(
55+
&self,
56+
kind: MetricKind,
57+
field_metadata: MetricFieldData,
58+
) -> Result<MetricPublishKnownKindData, nativelink_metric::Error> {
59+
self.to_string().publish(kind, field_metadata)
60+
}
61+
}
62+
5063
impl Error {
5164
pub fn new(code: Code, msg: String) -> Self {
5265
let mut msgs = Vec::with_capacity(1);
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[package]
2+
name = "nativelink-metric-collector"
3+
version = "0.4.0"
4+
edition = "2021"
5+
rust-version = "1.79.0"
6+
7+
[dependencies]
8+
nativelink-metric = { path = "../nativelink-metric" }
9+
10+
tracing = "0.1.40"
11+
tracing-subscriber = "0.3.18"
12+
opentelemetry = { version = "0.23.0", features = ["metrics"] }
13+
parking_lot = "0.12.2"
14+
serde = "1.0.204"
15+
16+
[dev-dependencies]
17+
nativelink-macro = { path = "../nativelink-macro" }
18+
nativelink-error = { path = "../nativelink-error" }
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// Copyright 2024 The NativeLink Authors. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
pub use otel_exporter::otel_export;
16+
pub use tracing_layers::MetricsCollectorLayer;
17+
18+
mod metrics_collection;
19+
mod metrics_visitors;
20+
mod otel_exporter;
21+
mod tracing_layers;
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
// Copyright 2024 The NativeLink Authors. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use std::{
16+
borrow::Cow,
17+
collections::HashMap,
18+
ops::{Deref, DerefMut},
19+
};
20+
21+
use serde::Serialize;
22+
23+
use crate::metrics_visitors::CollectionKind;
24+
25+
/// The final-metric primitive value that was collected with type.
26+
#[derive(Debug, Serialize)]
27+
#[serde(untagged)]
28+
pub enum CollectedMetricPrimitiveValue {
29+
Counter(u64),
30+
String(Cow<'static, str>),
31+
}
32+
33+
/// The final-metric primitive field that was collected.
34+
#[derive(Default, Debug)]
35+
pub struct CollectedMetricPrimitive {
36+
pub value: Option<CollectedMetricPrimitiveValue>,
37+
pub help: String,
38+
pub value_type: CollectionKind,
39+
}
40+
41+
impl Serialize for CollectedMetricPrimitive {
42+
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
43+
where
44+
S: serde::Serializer,
45+
{
46+
match &self.value {
47+
Some(CollectedMetricPrimitiveValue::Counter(value)) => serializer.serialize_u64(*value),
48+
Some(CollectedMetricPrimitiveValue::String(value)) => serializer.serialize_str(value),
49+
None => serializer.serialize_none(),
50+
}
51+
}
52+
}
53+
54+
/// Key-value represented output.
55+
pub type CollectedMetricChildren = HashMap<String, CollectedMetrics>;
56+
57+
/// The type of the collected metric (eg: nested vs primitive).
58+
#[derive(Debug, Serialize)]
59+
#[serde(untagged)]
60+
pub enum CollectedMetrics {
61+
Primitive(CollectedMetricPrimitive),
62+
Component(Box<CollectedMetricChildren>),
63+
}
64+
65+
impl CollectedMetrics {
66+
pub fn new_component() -> Self {
67+
Self::Component(Box::new(CollectedMetricChildren::default()))
68+
}
69+
}
70+
71+
/// The root metric component that was collected.
72+
#[derive(Default, Debug, Serialize)]
73+
pub struct RootMetricCollectedMetrics {
74+
#[serde(flatten)]
75+
inner: CollectedMetricChildren,
76+
}
77+
78+
impl Deref for RootMetricCollectedMetrics {
79+
type Target = CollectedMetricChildren;
80+
81+
fn deref(&self) -> &Self::Target {
82+
&self.inner
83+
}
84+
}
85+
86+
impl DerefMut for RootMetricCollectedMetrics {
87+
fn deref_mut(&mut self) -> &mut Self::Target {
88+
&mut self.inner
89+
}
90+
}
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// Copyright 2024 The NativeLink Authors. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use std::{borrow::Cow, fmt::Debug};
16+
17+
use nativelink_metric::MetricKind;
18+
use serde::Serialize;
19+
use tracing::field::{Field, Visit};
20+
21+
use crate::metrics_collection::{CollectedMetricPrimitive, CollectedMetricPrimitiveValue};
22+
23+
/// The type of the collected primitive metric.
24+
#[derive(Default, Debug, Serialize)]
25+
pub enum CollectionKind {
26+
#[default]
27+
Counter = 0,
28+
String = 1,
29+
}
30+
31+
impl From<MetricKind> for CollectionKind {
32+
fn from(kind: MetricKind) -> Self {
33+
match kind {
34+
MetricKind::Counter => CollectionKind::Counter,
35+
MetricKind::String => CollectionKind::String,
36+
_ => CollectionKind::String,
37+
}
38+
}
39+
}
40+
41+
/// The raw value captured for a metric, tagged with its primitive type.
#[derive(Debug)]
enum ValueWithPrimitiveType {
    /// A textual value.
    String(String),
    /// An unsigned 64-bit integer value.
    U64(u64),
}

/// The default captured value is the integer zero.
impl Default for ValueWithPrimitiveType {
    fn default() -> Self {
        Self::U64(0)
    }
}
53+
54+
/// An intermediate structed that will have it's contents populated
55+
/// by the `tracing` layer for a given field.
56+
/// This is done by implementing the `Visit` trait and asking the
57+
/// `tracing` library to visit the fields of the captured event
58+
/// and populate this struct.
59+
#[derive(Default, Debug)]
60+
pub struct MetricDataVisitor {
61+
pub name: String,
62+
value: ValueWithPrimitiveType,
63+
help: String,
64+
value_type: Option<CollectionKind>,
65+
}
66+
67+
impl From<MetricDataVisitor> for CollectedMetricPrimitive {
68+
fn from(visitor: MetricDataVisitor) -> Self {
69+
let (value, derived_type) = match visitor.value {
70+
ValueWithPrimitiveType::String(s) => (
71+
CollectedMetricPrimitiveValue::String(Cow::Owned(s)),
72+
CollectionKind::String,
73+
),
74+
ValueWithPrimitiveType::U64(u) => (
75+
CollectedMetricPrimitiveValue::Counter(u),
76+
CollectionKind::Counter,
77+
),
78+
};
79+
CollectedMetricPrimitive {
80+
value: Some(value),
81+
help: visitor.help,
82+
value_type: visitor.value_type.unwrap_or(derived_type),
83+
}
84+
}
85+
}
86+
87+
impl Visit for MetricDataVisitor {
88+
fn record_debug(&mut self, _field: &Field, _value: &dyn Debug) {}
89+
90+
fn record_f64(&mut self, field: &Field, value: f64) {
91+
if field.name() == "__value" {
92+
self.value = ValueWithPrimitiveType::String(value.to_string())
93+
}
94+
}
95+
fn record_i64(&mut self, field: &Field, value: i64) {
96+
if field.name() == "__value" {
97+
match u64::try_from(value) {
98+
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
99+
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
100+
}
101+
}
102+
}
103+
fn record_u64(&mut self, field: &Field, value: u64) {
104+
match field.name() {
105+
"__value" => self.value = ValueWithPrimitiveType::U64(value),
106+
"__type" => self.value_type = Some(MetricKind::from(value).into()),
107+
"__help" => self.help = value.to_string(),
108+
"__name" => self.name = value.to_string(),
109+
field => panic!("UNKNOWN FIELD {field}"),
110+
}
111+
}
112+
fn record_i128(&mut self, field: &Field, value: i128) {
113+
if field.name() == "__value" {
114+
match u64::try_from(value) {
115+
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
116+
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
117+
}
118+
}
119+
}
120+
fn record_u128(&mut self, field: &Field, value: u128) {
121+
if field.name() == "__value" {
122+
match u64::try_from(value) {
123+
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
124+
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
125+
}
126+
}
127+
}
128+
fn record_bool(&mut self, field: &Field, value: bool) {
129+
if field.name() == "__value" {
130+
self.value = ValueWithPrimitiveType::U64(u64::from(value));
131+
}
132+
}
133+
fn record_str(&mut self, field: &Field, value: &str) {
134+
match field.name() {
135+
"__value" => self.value = ValueWithPrimitiveType::String(value.to_string()),
136+
"__help" => self.help = value.to_string(),
137+
"__name" => self.name = value.to_string(),
138+
field => panic!("UNKNOWN FIELD {field}"),
139+
}
140+
}
141+
fn record_error(&mut self, _field: &Field, _value: &(dyn std::error::Error + 'static)) {}
142+
}
143+
144+
/// An intermediate struct that will have its contents populated
/// by the `tracing` layer for a given field.
///
/// This is the same idea as `MetricDataVisitor`, but it only captures
/// information about a span at span creation.
pub struct SpanFields {
    /// The span's name, taken from its `__name` field.
    pub name: Cow<'static, str>,
}
151+
152+
impl Visit for SpanFields {
153+
fn record_debug(&mut self, _field: &Field, _value: &dyn Debug) {}
154+
155+
fn record_str(&mut self, field: &Field, value: &str) {
156+
if field.name() == "__name" {
157+
self.name = Cow::Owned(value.to_string());
158+
}
159+
}
160+
}

0 commit comments

Comments
 (0)