31
31
import com .google .cloud .bigquery .storage .v1 .StreamConnection .RequestCallback ;
32
32
import com .google .common .annotations .VisibleForTesting ;
33
33
import com .google .common .base .Preconditions ;
34
+ import com .google .common .collect .ImmutableList ;
34
35
import com .google .common .util .concurrent .Uninterruptibles ;
35
36
import com .google .protobuf .Int64Value ;
36
37
import io .grpc .Status ;
40
41
import io .opentelemetry .api .common .Attributes ;
41
42
import io .opentelemetry .api .common .AttributesBuilder ;
42
43
import io .opentelemetry .api .metrics .LongCounter ;
44
+ import io .opentelemetry .api .metrics .LongHistogram ;
43
45
import io .opentelemetry .api .metrics .Meter ;
44
46
import io .opentelemetry .api .metrics .MeterProvider ;
45
47
import java .io .IOException ;
@@ -259,6 +261,7 @@ class ConnectionWorker implements AutoCloseable {
259
261
private static Pattern streamPatternTable = Pattern .compile (tableMatching );
260
262
private Meter writeMeter ;
261
263
static AttributeKey <String > telemetryKeyTableId = AttributeKey .stringKey ("table_id" );
264
+ static AttributeKey <String > telemetryKeyWriterId = AttributeKey .stringKey ("writer_id" );
262
265
private static String dataflowPrefix = "dataflow:" ;
263
266
static List <AttributeKey <String >> telemetryKeysTraceId =
264
267
new ArrayList <AttributeKey <String >>() {
@@ -268,10 +271,25 @@ class ConnectionWorker implements AutoCloseable {
268
271
add (AttributeKey .stringKey ("trace_field_3" ));
269
272
}
270
273
};
274
+ static AttributeKey <String > telemetryKeyErrorCode = AttributeKey .stringKey ("error_code" );
275
+ static AttributeKey <String > telemetryKeyIsRetry = AttributeKey .stringKey ("is_retry" );
271
276
private Attributes telemetryAttributes ;
272
- private LongCounter instrumentIncomingRequestCount ;
273
- private LongCounter instrumentIncomingRequestSize ;
274
- private LongCounter instrumentIncomingRequestRows ;
277
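+ // Latency bucket boundaries in milliseconds: a leading 0, then odd powers of 1.5 from 1.5^7 (~17) to 1.5^33 (~647160), so each boundary is ~2.25x the previous one.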
278
+ private static final List <Long > METRICS_MILLISECONDS_LATENCY_BUCKETS =
279
+ ImmutableList .of (
280
+ 0L , 17L , 38L , 86L , 195L , 438L , 985L , 2217L , 4988L , 11223L , 25251L , 56815L , 127834L ,
281
+ 287627L , 647160L );
282
+
283
+ private static final class OpenTelemetryMetrics {
284
+ private LongCounter instrumentAckedRequestCount ;
285
+ private LongCounter instrumentAckedRequestSize ;
286
+ private LongCounter instrumentAckedRequestRows ;
287
+ private LongHistogram instrumentNetworkResponseLatency ;
288
+ private LongCounter instrumentConnectionStartCount ;
289
+ private LongCounter instrumentConnectionEndCount ;
290
+ }
291
+
292
+ private OpenTelemetryMetrics telemetryMetrics = new OpenTelemetryMetrics ();
275
293
276
294
public static Boolean isDefaultStreamName (String streamName ) {
277
295
Matcher matcher = DEFAULT_STREAM_PATTERN .matcher (streamName );
@@ -327,16 +345,21 @@ private void setTraceIdAttributes(AttributesBuilder builder) {
327
345
}
328
346
}
329
347
348
+ // Specify common attributes for all metrics.
349
+ // For example, table name and writer id.
350
+ // Metrics dashboards can be filtered on available attributes.
330
351
private Attributes buildOpenTelemetryAttributes () {
331
352
AttributesBuilder builder = Attributes .builder ();
332
353
String tableName = getTableName ();
333
354
if (!tableName .isEmpty ()) {
334
355
builder .put (telemetryKeyTableId , tableName );
335
356
}
357
+ builder .put (telemetryKeyWriterId , writerId );
336
358
setTraceIdAttributes (builder );
337
359
return builder .build ();
338
360
}
339
361
362
+ // Refresh the table name attribute when multiplexing switches between tables.
340
363
private void refreshOpenTelemetryTableNameAttributes () {
341
364
String tableName = getTableName ();
342
365
if (!tableName .isEmpty ()
@@ -347,6 +370,22 @@ private void refreshOpenTelemetryTableNameAttributes() {
347
370
}
348
371
}
349
372
373
+ // Build new attributes augmented with an error code string.
374
+ private Attributes augmentAttributesWithErrorCode (Attributes attributes , String errorCode ) {
375
+ AttributesBuilder builder = attributes .toBuilder ();
376
+ if ((errorCode != null ) && !errorCode .isEmpty ()) {
377
+ builder .put (telemetryKeyErrorCode , errorCode );
378
+ }
379
+ return builder .build ();
380
+ }
381
+
382
+ // Build new attributes augmented with a flag indicating this was a retry.
383
+ private Attributes augmentAttributesWithRetry (Attributes attributes ) {
384
+ AttributesBuilder builder = attributes .toBuilder ();
385
+ builder .put (telemetryKeyIsRetry , "1" );
386
+ return builder .build ();
387
+ }
388
+
350
389
@ VisibleForTesting
351
390
Attributes getTelemetryAttributes () {
352
391
return telemetryAttributes ;
@@ -360,20 +399,72 @@ private void registerOpenTelemetryMetrics() {
360
399
.setInstrumentationVersion (
361
400
ConnectionWorker .class .getPackage ().getImplementationVersion ())
362
401
.build ();
363
- instrumentIncomingRequestCount =
402
+ telemetryMetrics .instrumentAckedRequestCount =
403
+ writeMeter
404
+ .counterBuilder ("append_requests_acked" )
405
+ .setDescription ("Counts number of requests acked by the server" )
406
+ .build ();
407
+ telemetryMetrics .instrumentAckedRequestSize =
408
+ writeMeter
409
+ .counterBuilder ("append_request_bytes_acked" )
410
+ .setDescription ("Counts byte size of requests acked by the server" )
411
+ .build ();
412
+ telemetryMetrics .instrumentAckedRequestRows =
413
+ writeMeter
414
+ .counterBuilder ("append_rows_acked" )
415
+ .setDescription ("Counts number of request rows acked by the server" )
416
+ .build ();
417
+ writeMeter
418
+ .gaugeBuilder ("active_connection_count" )
419
+ .ofLongs ()
420
+ .setDescription ("Reports number of active connections" )
421
+ .buildWithCallback (
422
+ measurement -> {
423
+ int count = 0 ;
424
+ this .lock .lock ();
425
+ try {
426
+ if (streamConnectionIsConnected ) {
427
+ count = 1 ;
428
+ }
429
+ } finally {
430
+ this .lock .unlock ();
431
+ }
432
+ measurement .record (count , getTelemetryAttributes ());
433
+ });
434
+ writeMeter
435
+ .gaugeBuilder ("inflight_queue_length" )
436
+ .ofLongs ()
437
+ .setDescription (
438
+ "Reports length of inflight queue. This queue contains sent append requests waiting for response from the server." )
439
+ .buildWithCallback (
440
+ measurement -> {
441
+ int length = 0 ;
442
+ this .lock .lock ();
443
+ try {
444
+ length = inflightRequestQueue .size ();
445
+ } finally {
446
+ this .lock .unlock ();
447
+ }
448
+ measurement .record (length , getTelemetryAttributes ());
449
+ });
450
+ telemetryMetrics .instrumentNetworkResponseLatency =
364
451
writeMeter
365
- .counterBuilder ("append_requests" )
366
- .setDescription ("Counts number of incoming requests" )
452
+ .histogramBuilder ("network_response_latency" )
453
+ .ofLongs ()
454
+ .setDescription (
455
+ "Reports time taken in milliseconds for a response to arrive once a message has been sent over the network." )
456
+ .setExplicitBucketBoundariesAdvice (METRICS_MILLISECONDS_LATENCY_BUCKETS )
367
457
.build ();
368
- instrumentIncomingRequestSize =
458
+ telemetryMetrics . instrumentConnectionStartCount =
369
459
writeMeter
370
- .counterBuilder ("append_request_bytes" )
371
- .setDescription ("Counts byte size of incoming requests" )
460
+ .counterBuilder ("connection_start_count" )
461
+ .setDescription (
462
+ "Counts number of connection attempts made, regardless of whether these are initial or retry." )
372
463
.build ();
373
- instrumentIncomingRequestRows =
464
+ telemetryMetrics . instrumentConnectionEndCount =
374
465
writeMeter
375
- .counterBuilder ("append_rows " )
376
- .setDescription ("Counts number of incoming request rows " )
466
+ .counterBuilder ("connection_end_count " )
467
+ .setDescription ("Counts number of connection end events. " )
377
468
.build ();
378
469
}
379
470
@@ -465,6 +556,7 @@ public void run() {
465
556
466
557
private void resetConnection () {
467
558
log .info ("Start connecting stream: " + streamName + " id: " + writerId );
559
+ telemetryMetrics .instrumentConnectionStartCount .add (1 , getTelemetryAttributes ());
468
560
if (this .streamConnection != null ) {
469
561
// It's safe to directly close the previous connection as the in flight messages
470
562
// will be picked up by the next connection.
@@ -618,9 +710,6 @@ private ApiFuture<AppendRowsResponse> appendInternal(
618
710
+ requestWrapper .messageSize )));
619
711
return requestWrapper .appendResult ;
620
712
}
621
- instrumentIncomingRequestCount .add (1 , getTelemetryAttributes ());
622
- instrumentIncomingRequestSize .add (requestWrapper .messageSize , getTelemetryAttributes ());
623
- instrumentIncomingRequestRows .add (message .getProtoRows ().getRows ().getSerializedRowsCount ());
624
713
this .lock .lock ();
625
714
try {
626
715
if (userClosed ) {
@@ -1214,6 +1303,13 @@ private void requestCallback(AppendRowsResponse response) {
1214
1303
connectionRetryStartTime = 0 ;
1215
1304
}
1216
1305
if (!this .inflightRequestQueue .isEmpty ()) {
1306
+ Instant sendInstant = inflightRequestQueue .getFirst ().requestSendTimeStamp ;
1307
+ if (sendInstant != null ) {
1308
+ Duration durationLatency = Duration .between (sendInstant , Instant .now ());
1309
+ telemetryMetrics .instrumentNetworkResponseLatency .record (
1310
+ durationLatency .toMillis (), getTelemetryAttributes ());
1311
+ }
1312
+
1217
1313
requestWrapper = pollFirstInflightRequestQueue ();
1218
1314
requestProfilerHook .endOperation (
1219
1315
RequestProfiler .OperationName .RESPONSE_LATENCY , requestWrapper .requestUniqueId );
@@ -1234,6 +1330,22 @@ private void requestCallback(AppendRowsResponse response) {
1234
1330
this .lock .unlock ();
1235
1331
}
1236
1332
1333
+ Attributes augmentedTelemetryAttributes =
1334
+ augmentAttributesWithErrorCode (
1335
+ getTelemetryAttributes (),
1336
+ Code .values ()[
1337
+ response .hasError () ? response .getError ().getCode () : Status .Code .OK .ordinal ()]
1338
+ .toString ());
1339
+ if (requestWrapper .retryCount > 0 ) {
1340
+ augmentedTelemetryAttributes = augmentAttributesWithRetry (augmentedTelemetryAttributes );
1341
+ }
1342
+ telemetryMetrics .instrumentAckedRequestCount .add (1 , augmentedTelemetryAttributes );
1343
+ telemetryMetrics .instrumentAckedRequestSize .add (
1344
+ requestWrapper .messageSize , augmentedTelemetryAttributes );
1345
+ telemetryMetrics .instrumentAckedRequestRows .add (
1346
+ requestWrapper .message .getProtoRows ().getRows ().getSerializedRowsCount (),
1347
+ augmentedTelemetryAttributes );
1348
+
1237
1349
// Retries need to happen on the same thread as queue locking may occur
1238
1350
if (response .hasError ()) {
1239
1351
if (retryOnRetryableError (Code .values ()[response .getError ().getCode ()], requestWrapper )) {
@@ -1316,6 +1428,11 @@ private void doneCallback(Throwable finalStatus) {
1316
1428
this .lock .lock ();
1317
1429
try {
1318
1430
this .streamConnectionIsConnected = false ;
1431
+ this .telemetryMetrics .instrumentConnectionEndCount .add (
1432
+ 1 ,
1433
+ augmentAttributesWithErrorCode (
1434
+ getTelemetryAttributes (),
1435
+ Code .values ()[Status .fromThrowable (finalStatus ).getCode ().ordinal ()].toString ()));
1319
1436
if (connectionFinalStatus == null ) {
1320
1437
if (connectionRetryStartTime == 0 ) {
1321
1438
connectionRetryStartTime = System .currentTimeMillis ();
@@ -1327,6 +1444,8 @@ private void doneCallback(Throwable finalStatus) {
1327
1444
|| System .currentTimeMillis () - connectionRetryStartTime
1328
1445
<= maxRetryDuration .toMillis ())) {
1329
1446
this .conectionRetryCountWithoutCallback ++;
1447
+ this .telemetryMetrics .instrumentConnectionStartCount .add (
1448
+ 1 , augmentAttributesWithRetry (getTelemetryAttributes ()));
1330
1449
log .info (
1331
1450
"Connection is going to be reestablished with the next request. Retriable error "
1332
1451
+ finalStatus .toString ()
0 commit comments