Skip to content

Commit afb3678

Browse files
authored
CA-412313: don't lose distributed tracing spans when XAPI is shut down (#6525)
XAPI may be restarted soon after Host.evacuate (e.g. on coordinator promotion). However, we only export traces every 30s, so we lose the spans from the last 30s, including the top-level Host.evacuate span (which, although long-running, is only emitted on completion). After this change, I'm now able to see Host.evacuate and all the migrate calls in the exported distributed trace.
2 parents 502fc45 + a54505e commit afb3678

File tree

6 files changed

+53
-33
lines changed

6 files changed

+53
-33
lines changed

ocaml/libs/tracing/tracing_export.ml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,8 @@ module Destination = struct
306306
(* Note this signal will flush the spans and terminate the exporter thread *)
307307
let signal () = Delay.signal delay
308308

309+
let wait_exit = Delay.make ()
310+
309311
let create_exporter () =
310312
enable_span_garbage_collector () ;
311313
Thread.create
@@ -319,7 +321,8 @@ module Destination = struct
319321
signaled := true
320322
) ;
321323
flush_spans ()
322-
done
324+
done ;
325+
Delay.signal wait_exit
323326
)
324327
()
325328

@@ -339,6 +342,12 @@ module Destination = struct
339342
)
340343
end
341344

342-
let flush_and_exit = Destination.signal
345+
let flush_and_exit ~max_wait () =
346+
D.debug "flush_and_exit: signaling thread to export now" ;
347+
Destination.signal () ;
348+
if Delay.wait Destination.wait_exit max_wait then
349+
D.info "flush_and_exit: timeout on span export"
350+
else
351+
D.debug "flush_and_exit: span export finished"
343352

344353
let main = Destination.main

ocaml/libs/tracing/tracing_export.mli

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,9 @@ module Destination : sig
8585
end
8686
end
8787

88-
val flush_and_exit : unit -> unit
89-
(** [flush_and_exit ()] sends a signal to flush the finish spans and terminate
90-
the exporter thread.
88+
val flush_and_exit : max_wait:float -> unit -> unit
89+
(** [flush_and_exit ~max_wait ()] sends a signal to flush the finished spans and terminate
90+
the exporter thread. It waits at most [max_wait] seconds.
9191
*)
9292

9393
val main : unit -> Thread.t

ocaml/tests/bench/bench_tracing.ml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ let export_thread =
2525
(* need to ensure this isn't running outside the benchmarked section,
2626
or bechamel might fail with 'Failed to stabilize GC'
2727
*)
28-
let after _ = Tracing_export.flush_and_exit () in
28+
let after _ = Tracing_export.flush_and_exit ~max_wait:0. () in
2929
Bechamel_simple_cli.thread_workload ~before:Tracing_export.main ~after
3030
~run:ignore
3131

@@ -52,7 +52,7 @@ let allocate () =
5252

5353
let free t =
5454
Tracing.TracerProvider.destroy ~uuid ;
55-
Tracing_export.flush_and_exit () ;
55+
Tracing_export.flush_and_exit ~max_wait:0. () ;
5656
Thread.join t
5757

5858
let test_tracing_on ?(overflow = false) ~name f =

ocaml/xapi/xapi_fuse.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ let light_fuse_and_run ?(fuse_length = !Constants.fuse_time) () =
5252
in
5353
let new_fuse_length = max 5. (fuse_length -. delay_so_far) in
5454
debug "light_fuse_and_run: current RRDs have been saved" ;
55+
ignore
56+
(Thread.create Tracing_export.(flush_and_exit ~max_wait:new_fuse_length) ()) ;
5557
ignore
5658
(Thread.create
5759
(fun () ->

ocaml/xs-trace/dune

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,18 @@
11
(executable
2-
(modes exe)
3-
(name xs_trace)
4-
(public_name xs-trace)
5-
(package xapi-tools)
6-
(libraries
7-
uri
8-
tracing
9-
cmdliner
10-
tracing_export
11-
xapi-stdext-unix
12-
zstd
13-
)
14-
)
2+
(modes exe)
3+
(name xs_trace)
4+
(public_name xs-trace)
5+
(package xapi-tools)
6+
(libraries uri tracing cmdliner tracing_export yojson xapi-stdext-unix zstd))
157

168
(rule
17-
(targets xs-trace.1)
18-
(deps (:exe xs_trace.exe))
19-
(action (with-stdout-to %{targets} (run %{exe} --help=groff)))
20-
)
9+
(targets xs-trace.1)
10+
(deps
11+
(:exe xs_trace.exe))
12+
(action
13+
(with-stdout-to
14+
%{targets}
15+
(run %{exe} --help=groff))))
2116

2217
; not expected by the specfile
2318
;(install

ocaml/xs-trace/xs_trace.ml

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,7 @@ module Exporter = struct
2525
| _ ->
2626
()
2727

28-
(** Export traces from file system to a remote endpoint. *)
29-
let export erase src dst =
30-
let dst = Uri.of_string dst in
31-
let submit_json = submit_json dst in
28+
let iter_src src f =
3229
let rec export_file = function
3330
| path when Sys.is_directory path ->
3431
(* Recursively export trace files. *)
@@ -38,7 +35,7 @@ module Exporter = struct
3835
(* Decompress compressed trace file and submit each line iteratively *)
3936
let args = [|"zstdcat"; path|] in
4037
let ic = Unix.open_process_args_in args.(0) args in
41-
Unixext.lines_iter submit_json ic ;
38+
Unixext.lines_iter f ic ;
4239
match Unix.close_process_in ic with
4340
| Unix.WEXITED 0 ->
4441
()
@@ -47,15 +44,27 @@ module Exporter = struct
4744
)
4845
| path when Filename.check_suffix path ".ndjson" ->
4946
(* Submit traces line by line. *)
50-
Unixext.readfile_line submit_json path
47+
Unixext.readfile_line f path
5148
| path ->
5249
(* Assume any other extension is a valid JSON file. *)
5350
let json = Unixext.string_of_file path in
54-
submit_json json
51+
f json
5552
in
56-
export_file src ;
53+
export_file src
54+
55+
(** Export traces from file system to a remote endpoint. *)
56+
let export erase src dst =
57+
let dst = Uri.of_string dst in
58+
let submit_json = submit_json dst in
59+
iter_src src submit_json ;
5760
if erase then
5861
Unixext.rm_rec ~rm_top:true src
62+
63+
let pretty_print src =
64+
iter_src src @@ fun line ->
65+
line
66+
|> Yojson.Safe.from_string
67+
|> Yojson.Safe.pretty_to_channel ~std:true stdout
5968
end
6069

6170
module Cli = struct
@@ -83,6 +92,11 @@ module Cli = struct
8392
let doc = "copy a trace to an endpoint and erase it afterwards" in
8493
Cmd.(v (info "mv" ~doc) term)
8594

95+
let pp_cmd =
96+
let term = Term.(const Exporter.pretty_print $ src) in
97+
let doc = "Pretty print NDJSON traces" in
98+
Cmd.(v (info "pp" ~doc) term)
99+
86100
let xs_trace_cmd =
87101
let man =
88102
[
@@ -94,7 +108,7 @@ module Cli = struct
94108
let doc = "utility for working with local trace files" in
95109
Cmd.info "xs-trace" ~doc ~version:"0.1" ~man
96110
in
97-
Cmd.group desc [cp_cmd; mv_cmd]
111+
Cmd.group desc [cp_cmd; mv_cmd; pp_cmd]
98112

99113
let main () = Cmd.eval xs_trace_cmd
100114
end

0 commit comments

Comments
 (0)