Skip to content

Commit 206d60e

Browse files
cunnie and wayneadams
authored and committed
🐞 Fix: deploying opsman to vSphere 15% boot fail
When deploying opsman to vSphere, the VM fails to boot about 15% of the time. The failure happens very early in the boot process, apparently even before the kernel is loaded. When viewing the opsman VM's console, the symptom is a flashing cursor in the upper left-hand corner of the screen.

This commit fixes that failure by waiting 80 seconds for the opsman VM to report its IP address to vCenter; if it hasn't reported its IP address by then, a hardware reset is sent to the VM. An opsman VM typically reports its IP address to vCenter 43 seconds after being powered on.

We verified this fix by successfully deploying & booting opsman 146 times in a row.

More about the boot failure:

- The boot failure only occurs the very first time an opsman is booted; subsequent boots always succeed. We tested 100 shutdown/boots to confirm.
- The failure was seen on both vSphere 7 and vSphere 8.
- Sending a reset or a Ctrl-Alt-Del to the machine within the first few seconds of being powered on reduced but did not eliminate the failure.

This fix should have negligible impact on the length of time to deploy opsman.

Typical output when resetting a failed initial boot:

```
Executing: "govc vm.info -vm.ipath=/dc/vm/pcf_vms/om.tas.nono.io -waitip"
This could take a few moments...
VM hasn't acquired IP, is probably stuck, resetting VM to free it
Executing: "govc vm.power -vm.ipath=/dc/vm/pcf_vms/om.tas.nono.io -reset"
This could take a few moments...
govc[stdout]: Reset VirtualMachine:vm-42616... OK
```
1 parent 2af88e8 commit 206d60e

File tree

4 files changed

+155
-10
lines changed

4 files changed

+155
-10
lines changed

vmlifecycle/runner/runner.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package runner
22

33
import (
44
"bytes"
5+
"context"
56
"fmt"
67
"github.com/fatih/color"
78
"github.com/onsi/gomega/gexec"
@@ -35,6 +36,10 @@ func (r *Runner) Execute(args []interface{}) (*bytes.Buffer, *bytes.Buffer, erro
3536
}
3637

3738
func (r *Runner) ExecuteWithEnvVars(env []string, args []interface{}) (*bytes.Buffer, *bytes.Buffer, error) {
39+
return r.ExecuteWithEnvVarsCtx(context.Background(), env, args)
40+
}
41+
42+
func (r *Runner) ExecuteWithEnvVarsCtx(ctx context.Context, env []string, args []interface{}) (*bytes.Buffer, *bytes.Buffer, error) {
3843
var outBufWriter bytes.Buffer
3944
var errBufWriter bytes.Buffer
4045

@@ -53,7 +58,7 @@ func (r *Runner) ExecuteWithEnvVars(env []string, args []interface{}) (*bytes.Bu
5358
}
5459
}
5560

56-
command := exec.Command(r.command, stringArgs...)
61+
command := exec.CommandContext(ctx, r.command, stringArgs...)
5762
if len(env) > 0 {
5863
command.Env = append(os.Environ(), env...)
5964
}

vmlifecycle/vmmanagers/fakes/govcRunner.go

+98
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vmlifecycle/vmmanagers/vsphere.go

+42-7
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@ package vmmanagers
33
import (
44
"archive/tar"
55
"bytes"
6+
"context"
67
"encoding/json"
78
"errors"
89
"fmt"
10+
"github.com/blang/semver"
11+
"github.com/pivotal-cf/om/vmlifecycle/extractopsmansemver"
912
"io/ioutil"
1013
"log"
1114
"os"
1215
"strconv"
1316
"strings"
14-
15-
"github.com/blang/semver"
16-
"github.com/pivotal-cf/om/vmlifecycle/extractopsmansemver"
17+
"time"
1718
)
1819

1920
type VcenterCredential struct {
@@ -74,6 +75,7 @@ type networkMapping struct {
7475
//go:generate counterfeiter -o ./fakes/govcRunner.go --fake-name GovcRunner . govcRunner
7576
// govcRunner is the command-execution seam used by this package's vSphere
// code (see the v.runner calls in createVM and resetVM). The go:generate
// directive above asks counterfeiter to emit a fake named GovcRunner into
// ./fakes/govcRunner.go for use in tests.
type govcRunner interface {
7677
// ExecuteWithEnvVars runs a command with the given extra environment
// entries and arguments. It returns two buffers and an error; callers in
// this file treat the second buffer as the error-output buffer
// (presumably stdout then stderr; confirm against the Runner implementation).
ExecuteWithEnvVars(env []string, args []interface{}) (*bytes.Buffer, *bytes.Buffer, error)
78+
// ExecuteWithEnvVarsCtx is the context-aware variant of ExecuteWithEnvVars:
// same arguments and results, plus a ctx that callers use to bound how long
// the command may run (createVM passes a context with a timeout).
ExecuteWithEnvVarsCtx(ctx context.Context, env []string, args []interface{}) (*bytes.Buffer, *bytes.Buffer, error)
7779
}
7880

7981
type VsphereVMManager struct {
@@ -166,7 +168,7 @@ func (v *VsphereVMManager) CreateVM() (Status, StateInfo, error) {
166168

167169
ipath := v.createIpath()
168170

169-
errBufWriter, err := v.createVM(env, optionFilename)
171+
errBufWriter, err := v.createVM(env, optionFilename, ipath)
170172
fullState := StateInfo{IAAS: "vsphere", ID: ipath}
171173

172174
if err != nil {
@@ -319,14 +321,47 @@ func (v *VsphereVMManager) validateImage() error {
319321
}
320322
}
321323

322-
func (v *VsphereVMManager) createVM(env []string, optionFilename string) (errorBuffer *bytes.Buffer, err error) {
323-
_, errBufWriter, err := v.runner.ExecuteWithEnvVars(env, []interface{}{
324+
func (v *VsphereVMManager) createVM(env []string, optionFilename string, ipath string) (errBufWriter *bytes.Buffer, err error) {
325+
_, errBufWriter, err = v.runner.ExecuteWithEnvVars(env, []interface{}{
324326
"import.ova",
325327
"-options=" + optionFilename,
326328
v.ImageOVA,
327329
})
330+
if err != nil {
331+
return errBufWriter, checkFormatedError("govc error: %s", err)
332+
}
328333

329-
return errBufWriter, checkFormatedError("govc error: %s", err)
334+
ctx, cancel := context.WithTimeout(context.Background(), 80*time.Second) // 80 seconds is adequate time for OM to get IP; typically it's 43 seconds
335+
defer cancel()
336+
// Wait 80 seconds for VM to boot and acquire its IP
337+
_, errBufWriter, err = v.runner.ExecuteWithEnvVarsCtx(ctx, env, []interface{}{
338+
"vm.info",
339+
fmt.Sprintf(`-vm.ipath=%s`, ipath),
340+
"-waitip",
341+
})
342+
if ctx.Err() != nil {
343+
// VM hasn't acquired IP, is likely stuck, reset VM to free it (to boot)
344+
buf, errPowerReset := v.resetVM(env, ipath)
345+
if errPowerReset != nil {
346+
// we don't need to return errBuffWriter because we already know it's nil
347+
// because the ExecuteWithEnvVarsCtx that sets it never completes
348+
return buf, fmt.Errorf("govc error: could not power-reset: %s", errPowerReset)
349+
}
350+
} else {
351+
if err != nil {
352+
return errBufWriter, checkFormatedError("govc error: %s", err)
353+
}
354+
}
355+
return errBufWriter, nil
356+
}
357+
358+
func (v *VsphereVMManager) resetVM(env []string, ipath string) (errBufWriter *bytes.Buffer, err error) {
359+
_, errBufWriter, err = v.runner.ExecuteWithEnvVars(env, []interface{}{
360+
"vm.power",
361+
fmt.Sprintf(`-vm.ipath=%s`, ipath),
362+
"-reset",
363+
})
364+
return errBufWriter, err
330365
}
331366

332367
func (v *VsphereVMManager) addDefaultConfigFields() {

vmlifecycle/vmmanagers/vsphere_test.go

+9-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
package vmmanagers_test
22

33
import (
4+
"archive/tar"
45
"fmt"
56
"io/ioutil"
67
"os"
78

8-
"archive/tar"
9-
109
"bytes"
1110
"errors"
1211
"io"
@@ -111,6 +110,14 @@ opsman-configuration:
111110
"-on=true",
112111
"-vm.ipath=/datacenter/vm/folder/vm_name",
113112
))
113+
114+
_, _, args = runner.ExecuteWithEnvVarsCtxArgsForCall(0)
115+
Expect(args).To(matchers.OrderedConsistOf(
116+
"vm.info",
117+
"-vm.ipath=/datacenter/vm/folder/vm_name",
118+
"-waitip",
119+
))
120+
Expect(runner.ExecuteWithEnvVarsCtxCallCount()).To(Equal(1))
114121
})
115122

116123
When("setting custom cpu and memory", func() {

0 commit comments

Comments
 (0)