Skip to content

Commit 82af736

Browse files
committed
CP-54207: Move VBD_attach outside of VM migrate downtime
VBDs can be attached to multiple VMs, so now that VBD_plug has been split into VBD_attach and VBD_activate, the attach can happen outside of the VM migrate downtime. This doesn't change the overall duration of the migration but can reduce the downtime by several seconds. This new functionality depends on two flags. Firstly, xenopsd_vbd_plug_unplug_legacy must be false so that VBD_attach and VBD_activate are separate atoms. This is off by default. Secondly, there is another flag, can_attach_early, which is currently true iff the VBD's SM has required_api_version >= 3.0. Signed-off-by: Steven Woods <[email protected]>
1 parent 3ae8ff9 commit 82af736

File tree

5 files changed

+86
-10
lines changed

5 files changed

+86
-10
lines changed

ocaml/xapi-idl/xen/xenops_interface.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ module Vbd = struct
303303
; extra_private_keys: (string * string) list [@default []]
304304
; qos: qos option [@default None]
305305
; persistent: bool [@default true]
306+
; can_attach_early: bool [@default false]
306307
}
307308
[@@deriving rpcty]
308309

ocaml/xapi/xapi_sr.ml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,3 +1080,15 @@ let get_live_hosts ~__context ~sr =
10801080
Xapi_vm_helpers.assert_can_see_specified_SRs ~__context ~reqd_srs:[sr] ~host
10811081
in
10821082
Xapi_vm_helpers.possible_hosts ~__context ~choose_fn ()
1083+
1084+
let required_api_version_of_sr ~__context ~sr =
1085+
let sr_type = Db.SR.get_type ~__context ~self:sr in
1086+
let expr =
1087+
Xapi_database.Db_filter_types.(Eq (Field "type", Literal sr_type))
1088+
in
1089+
match Db.SM.get_records_where ~__context ~expr with
1090+
| (_, sm) :: _ ->
1091+
Some sm.API.sM_required_api_version
1092+
| [] ->
1093+
warn "Couldn't find SM with type %s" sr_type ;
1094+
None

ocaml/xapi/xapi_xenops.ml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,10 @@ let list_net_sriov_vf_pcis ~__context ~vm =
553553
None
554554
)
555555

556+
module StringMap = Map.Make (String)
557+
558+
let sr_version_cache = ref StringMap.empty
559+
556560
module MD = struct
557561
(** Convert between xapi DB records and xenopsd records *)
558562

@@ -684,6 +688,28 @@ module MD = struct
684688
) else
685689
disk_of_vdi ~__context ~self:vbd.API.vBD_VDI
686690
in
691+
let can_attach_early =
692+
let vdi = vbd.API.vBD_VDI in
693+
try
694+
let sr = Db.VDI.get_SR ~__context ~self:vdi in
695+
let sr_key = Ref.string_of sr in
696+
match StringMap.find_opt sr_key !sr_version_cache with
697+
| Some cached_api_version ->
698+
Version.String.ge cached_api_version "3.0"
699+
| None -> (
700+
match Xapi_sr.required_api_version_of_sr ~__context ~sr with
701+
| Some api_version ->
702+
sr_version_cache :=
703+
StringMap.add sr_key api_version !sr_version_cache ;
704+
Version.String.ge api_version "3.0"
705+
| None ->
706+
false
707+
)
708+
with Db_exn.DBCache_NotFound (_, _, _) as e ->
709+
info "Caught error %s; Defaulting can_attach_early to false"
710+
(Printexc.to_string e) ;
711+
false
712+
in
687713
{
688714
id= (vm.API.vM_uuid, Device_number.to_linux_device device_number)
689715
; position= Some device_number
@@ -707,6 +733,7 @@ module MD = struct
707733
( try Db.VDI.get_on_boot ~__context ~self:vbd.API.vBD_VDI = `persist
708734
with _ -> true
709735
)
736+
; can_attach_early
710737
}
711738

712739
let of_pvs_proxy ~__context vif proxy =

ocaml/xenopsd/cli/xn.ml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ let vbd_of_disk_info vm_id info =
278278
; extra_private_keys= []
279279
; qos= None
280280
; persistent= true
281+
; can_attach_early= false
281282
}
282283

283284
let print_disk vbd =

ocaml/xenopsd/lib/xenops_server.ml

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1763,7 +1763,8 @@ let rec atomics_of_operation = function
17631763
serial "VIF.activate_and_plug" ~id
17641764
[VIF_set_active (vif.Vif.id, true); VIF_plug vif.Vif.id]
17651765
)
1766-
| VM_restore_devices (id, restore_vifs) ->
1766+
| VM_restore_devices (id, migration) ->
1767+
let restore_vifs = not migration in
17671768
let vbds_rw, vbds_ro = VBD_DB.vbds id |> vbd_plug_sets in
17681769
let vgpus = VGPU_DB.vgpus id in
17691770
let pcis = PCI_DB.pcis id |> pci_plug_order in
@@ -1773,8 +1774,22 @@ let rec atomics_of_operation = function
17731774
let name_multi = pf "VBDs.activate_and_plug %s" typ in
17741775
let name_one = pf "VBD.activate_and_plug %s" typ in
17751776
parallel_map name_multi ~id vbds (fun vbd ->
1776-
serial name_one ~id
1777-
[VBD_set_active (vbd.Vbd.id, true); vbd_plug vbd.Vbd.id]
1777+
(* When migrating, attach early if the vbd's SM allows it.
1778+
Note: there is a bug here for SxM if migrating between API
1779+
versions as the Vbd's new SR won't have propagated to xenopsd
1780+
yet. This means can_attach_early will be based on the origin SR.
1781+
This is a non-issue as v1 <-> v3 migration is still experimental
1782+
and v1 is already early-attaching in SxM through mirroring.
1783+
*)
1784+
if
1785+
migration
1786+
&& (not !xenopsd_vbd_plug_unplug_legacy)
1787+
&& vbd.Vbd.can_attach_early
1788+
then
1789+
[VBD_activate vbd.Vbd.id]
1790+
else
1791+
serial name_one ~id
1792+
[VBD_set_active (vbd.Vbd.id, true); vbd_plug vbd.Vbd.id]
17781793
)
17791794
in
17801795
[
@@ -1897,7 +1912,7 @@ let rec atomics_of_operation = function
18971912
]
18981913
; vgpu_start_operations
18991914
; [VM_restore (id, data, vgpu_data)]
1900-
; atomics_of_operation (VM_restore_devices (id, true))
1915+
; atomics_of_operation (VM_restore_devices (id, false))
19011916
; [
19021917
(* At this point the domain is considered survivable. *)
19031918
VM_set_domain_action_request (id, None)
@@ -2696,9 +2711,9 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit =
26962711
| VM_restore_vifs id ->
26972712
debug "VM_restore_vifs %s" id ;
26982713
perform_atomics (atomics_of_operation op) t
2699-
| VM_restore_devices (id, restore_vifs) ->
2714+
| VM_restore_devices (id, migration) ->
27002715
(* XXX: this is delayed due to the 'attach'/'activate' behaviour *)
2701-
debug "VM_restore_devices %s %b" id restore_vifs ;
2716+
debug "VM_restore_devices %s %b" id migration ;
27022717
perform_atomics (atomics_of_operation op) t
27032718
| VM_resume (id, _data) ->
27042719
debug "VM.resume %s" id ;
@@ -3022,11 +3037,31 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit =
30223037
( try
30233038
let no_sharept = VGPU_DB.vgpus id |> List.exists is_no_sharept in
30243039
debug "VM %s no_sharept=%b (%s)" id no_sharept __LOC__ ;
3040+
(* If plug is split into activate and attach, we could attach
3041+
early so that it is outside of the VM downtime (if the SM
3042+
supports this) *)
3043+
let early_attach =
3044+
parallel_map "VBDs.set_active_and_attach" ~id (VBD_DB.vbds id)
3045+
(fun vbd ->
3046+
if
3047+
(not !xenopsd_vbd_plug_unplug_legacy)
3048+
&& vbd.Vbd.can_attach_early
3049+
then
3050+
serial "VBD.set_active_and_attach" ~id
3051+
[
3052+
VBD_set_active (vbd.Vbd.id, true)
3053+
; VBD_attach vbd.Vbd.id
3054+
]
3055+
else
3056+
[]
3057+
)
3058+
in
30253059
perform_atomics
30263060
([VM_create (id, Some memory_limit, Some final_id, no_sharept)]
3027-
@ (* Perform as many operations as possible on the destination
3028-
domain before pausing the original domain *)
3029-
atomics_of_operation (VM_restore_vifs id)
3061+
(* Perform as many operations as possible on the destination
3062+
domain before pausing the original domain *)
3063+
@ atomics_of_operation (VM_restore_vifs id)
3064+
@ early_attach
30303065
)
30313066
t ;
30323067
Handshake.send s Handshake.Success
@@ -3142,7 +3177,7 @@ and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit =
31423177
) ;
31433178
debug "VM.receive_memory: restoring remaining devices and unpausing" ;
31443179
perform_atomics
3145-
(atomics_of_operation (VM_restore_devices (final_id, false))
3180+
(atomics_of_operation (VM_restore_devices (final_id, true))
31463181
@ [
31473182
VM_unpause final_id
31483183
; VM_set_domain_action_request (final_id, None)

0 commit comments

Comments
 (0)