Skip to content

Commit b53df26

Browse files
Try all available fault domains in case of out of host capacity
1 parent f638fc5 commit b53df26

File tree

3 files changed

+139
-3
lines changed

3 files changed

+139
-3
lines changed

cloud/ociutil/ociutil.go

+6
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"net/http"
23+
"strings"
2324
"time"
2425

2526
nlb "github.com/oracle/cluster-api-provider-oci/cloud/services/networkloadbalancer"
@@ -38,6 +39,7 @@ const (
3839
CreatedBy = "CreatedBy"
3940
OCIClusterAPIProvider = "OCIClusterAPIProvider"
4041
ClusterResourceIdentifier = "ClusterResourceIdentifier"
42+
OutOfHostCapacityErr = "Out of host capacity"
4143
)
4244

4345
// ErrNotFound is for simulation during testing, OCI SDK does not have a way
@@ -58,6 +60,10 @@ func IsNotFound(err error) bool {
5860
return ok && serviceErr.GetHTTPStatusCode() == http.StatusNotFound
5961
}
6062

63+
func IsOutOfHostCapacity(err error) bool {
64+
return strings.Contains(err.Error(), OutOfHostCapacityErr)
65+
}
66+
6167
// AwaitLBWorkRequest waits for the LB work request to either succeed or fail. See k8s.io/apimachinery/pkg/util/wait
6268
func AwaitLBWorkRequest(ctx context.Context, networkLoadBalancerClient nlb.NetworkLoadBalancerClient, workRequestId *string) (*networkloadbalancer.WorkRequest, error) {
6369
var wr *networkloadbalancer.WorkRequest

cloud/scope/machine.go

+38-3
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,11 @@ import (
2121
"encoding/base64"
2222
"fmt"
2323
"math/rand"
24-
"sigs.k8s.io/cluster-api/util/conditions"
2524
"strconv"
2625
"time"
2726

27+
"sigs.k8s.io/cluster-api/util/conditions"
28+
2829
"github.com/oracle/cluster-api-provider-oci/cloud/services/vcn"
2930

3031
"github.com/go-logr/logr"
@@ -246,16 +247,50 @@ func (m *MachineScope) GetOrCreateMachine(ctx context.Context) (*core.Instance,
246247
if (shapeConfig != core.LaunchInstanceShapeConfigDetails{}) {
247248
launchDetails.ShapeConfig = &shapeConfig
248249
}
249-
if faultDomain != "" {
250-
launchDetails.FaultDomain = common.String(faultDomain)
250+
initialFaultDomain := faultDomain
251+
adMap := m.OCICluster.Status.AvailabilityDomains[availabilityDomain]
252+
if initialFaultDomain == "" {
253+
// pick a random fault domain
254+
rand.Seed(time.Now().UnixNano())
255+
// rand.Intn(3) will produce a random number from 0(inclusive) to 3(exclusive)
256+
faultDomainIndex := rand.Intn(3)
257+
initialFaultDomain = adMap.FaultDomains[faultDomainIndex]
251258
}
259+
260+
m.Logger.Info("Fault Domain being used", "fault-domain", initialFaultDomain)
261+
m.Logger.Info("AD being used", "ad", availabilityDomain)
262+
263+
launchDetails.FaultDomain = common.String(initialFaultDomain)
252264
if nsgId != nil {
253265
launchDetails.CreateVnicDetails.NsgIds = []string{*nsgId}
254266
}
255267
req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails,
256268
OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))}
257269
resp, err := m.ComputeClient.LaunchInstance(ctx, req)
258270
if err != nil {
271+
// try other fault domains unless user specified a specific one
272+
if ociutil.IsOutOfHostCapacity(err) && faultDomain != "" {
273+
m.Logger.Info("The chosen fault domain did not have capacity, trying other fault domains")
274+
for fdIndex, fd := range adMap.FaultDomains {
275+
if fd != faultDomain {
276+
m.Logger.Info("Fault Domain being used for retry", "fault-domain", fd)
277+
launchDetails.FaultDomain = common.String(fd)
278+
req := core.LaunchInstanceRequest{LaunchInstanceDetails: launchDetails,
279+
OpcRetryToken: ociutil.GetOPCRetryToken(string(m.OCIMachine.UID))}
280+
resp, err = m.ComputeClient.LaunchInstance(ctx, req)
281+
if err != nil {
282+
// if another out of host error comes, try other fault domains
283+
// till we are out of fault domains, in which case return the last error
284+
if ociutil.IsOutOfHostCapacity(err) && fdIndex != (len(adMap.FaultDomains)-1) {
285+
continue
286+
} else {
287+
return nil, err
288+
}
289+
}
290+
return &resp.Instance, nil
291+
}
292+
}
293+
}
259294
return nil, err
260295
} else {
261296
return &resp.Instance, nil

cloud/scope/machine_test.go

+95
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ import (
4040
"sigs.k8s.io/controller-runtime/pkg/client/fake"
4141
)
4242

43+
// fdList holds the fault domain names shared by the test fixtures in this file.
var fdList = []string{"FAULT-DOMAIN-1", "FAULT-DOMAIN-2", "FAULT-DOMAIN-3"}
44+
4345
func TestInstanceReconciliation(t *testing.T) {
4446
var (
4547
ms *MachineScope
@@ -277,6 +279,47 @@ func TestInstanceReconciliation(t *testing.T) {
277279
})).Return(core.LaunchInstanceResponse{}, nil)
278280
},
279281
},
282+
{
283+
name: "try all fds",
284+
errorExpected: true,
285+
matchError: TestError{errorString: ociutil.OutOfHostCapacityErr},
286+
testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) {
287+
setupAllParams(ms)
288+
computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{
289+
DisplayName: common.String("name"),
290+
CompartmentId: common.String("test"),
291+
})).Return(core.ListInstancesResponse{}, nil)
292+
293+
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
294+
return instanceFDMatcher(request, "FAULT-DOMAIN-1")
295+
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
296+
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
297+
return instanceFDMatcher(request, "FAULT-DOMAIN-2")
298+
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
299+
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
300+
return instanceFDMatcher(request, "FAULT-DOMAIN-3")
301+
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
302+
},
303+
},
304+
{
305+
name: "second fd works",
306+
errorExpected: false,
307+
matchError: TestError{errorString: ociutil.OutOfHostCapacityErr},
308+
testSpecificSetup: func(machineScope *MachineScope, computeClient *mock_compute.MockComputeClient) {
309+
setupAllParams(ms)
310+
computeClient.EXPECT().ListInstances(gomock.Any(), gomock.Eq(core.ListInstancesRequest{
311+
DisplayName: common.String("name"),
312+
CompartmentId: common.String("test"),
313+
})).Return(core.ListInstancesResponse{}, nil)
314+
315+
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
316+
return anyFdMatcher(request)
317+
})).Return(core.LaunchInstanceResponse{}, TestError{errorString: ociutil.OutOfHostCapacityErr})
318+
computeClient.EXPECT().LaunchInstance(gomock.Any(), Eq(func(request interface{}) error {
319+
return anyFdMatcher(request)
320+
})).Return(core.LaunchInstanceResponse{}, nil)
321+
},
322+
},
280323
{
281324
name: "check compartment at cluster",
282325
errorExpected: false,
@@ -341,6 +384,7 @@ func TestInstanceReconciliation(t *testing.T) {
341384
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
342385
},
343386
AvailabilityDomain: common.String("ad2"),
387+
FaultDomain: common.String("FAULT-DOMAIN-2"),
344388
CompartmentId: common.String("test"),
345389
IsPvEncryptionInTransitEnabled: common.Bool(true),
346390
DefinedTags: map[string]map[string]interface{}{},
@@ -379,6 +423,7 @@ func TestInstanceReconciliation(t *testing.T) {
379423
},
380424
Shape: common.String("shape"),
381425
AvailabilityDomain: common.String("ad2"),
426+
FaultDomain: common.String("FAULT-DOMAIN-2"),
382427
CompartmentId: common.String("test"),
383428
IsPvEncryptionInTransitEnabled: common.Bool(true),
384429
DefinedTags: map[string]map[string]interface{}{},
@@ -425,6 +470,7 @@ func TestInstanceReconciliation(t *testing.T) {
425470
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
426471
},
427472
AvailabilityDomain: common.String("ad2"),
473+
FaultDomain: common.String("FAULT-DOMAIN-2"),
428474
CompartmentId: common.String("test"),
429475
IsPvEncryptionInTransitEnabled: common.Bool(true),
430476
DefinedTags: map[string]map[string]interface{}{},
@@ -473,6 +519,7 @@ func TestInstanceReconciliation(t *testing.T) {
473519
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
474520
},
475521
AvailabilityDomain: common.String("ad2"),
522+
FaultDomain: common.String("FAULT-DOMAIN-2"),
476523
CompartmentId: common.String("test"),
477524
IsPvEncryptionInTransitEnabled: common.Bool(true),
478525
DefinedTags: map[string]map[string]interface{}{},
@@ -525,6 +572,7 @@ func TestInstanceReconciliation(t *testing.T) {
525572
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
526573
},
527574
AvailabilityDomain: common.String("ad2"),
575+
FaultDomain: common.String("FAULT-DOMAIN-2"),
528576
CompartmentId: common.String("test"),
529577
IsPvEncryptionInTransitEnabled: common.Bool(true),
530578
DefinedTags: map[string]map[string]interface{}{},
@@ -578,6 +626,7 @@ func TestInstanceReconciliation(t *testing.T) {
578626
BaselineOcpuUtilization: core.LaunchInstanceShapeConfigDetailsBaselineOcpuUtilization8,
579627
},
580628
AvailabilityDomain: common.String("ad2"),
629+
FaultDomain: common.String("FAULT-DOMAIN-2"),
581630
CompartmentId: common.String("test"),
582631
IsPvEncryptionInTransitEnabled: common.Bool(true),
583632
DefinedTags: map[string]map[string]interface{}{},
@@ -636,6 +685,30 @@ func instanceCompartmentIDMatcher(request interface{}, matchStr string) error {
636685
return nil
637686
}
638687

688+
func instanceFDMatcher(request interface{}, matchStr string) error {
689+
r, ok := request.(core.LaunchInstanceRequest)
690+
if !ok {
691+
return errors.New("expecting LaunchInstanceRequest type")
692+
}
693+
if *r.LaunchInstanceDetails.FaultDomain != matchStr {
694+
return errors.New(fmt.Sprintf("expecting fd as %s", matchStr))
695+
}
696+
return nil
697+
}
698+
699+
func anyFdMatcher(request interface{}) error {
700+
r, ok := request.(core.LaunchInstanceRequest)
701+
if !ok {
702+
return errors.New("expecting LaunchInstanceRequest type")
703+
}
704+
for _, f := range fdList {
705+
if f == *r.FaultDomain {
706+
return nil
707+
}
708+
}
709+
return errors.New(fmt.Sprintf("invalid fd"))
710+
}
711+
639712
func TestLBReconciliationCreation(t *testing.T) {
640713
var (
641714
ms *MachineScope
@@ -1304,6 +1377,7 @@ func setupAllParams(ms *MachineScope) {
13041377
"2": {
13051378
Attributes: map[string]string{
13061379
"AvailabilityDomain": "ad2",
1380+
"FaultDomain": "FAULT-DOMAIN-2",
13071381
},
13081382
},
13091383
"3": {
@@ -1312,6 +1386,17 @@ func setupAllParams(ms *MachineScope) {
13121386
},
13131387
},
13141388
}
1389+
ms.OCICluster.Status.AvailabilityDomains = map[string]infrastructurev1beta1.OCIAvailabilityDomain{
1390+
"ad1": {
1391+
FaultDomains: fdList,
1392+
},
1393+
"ad2": {
1394+
FaultDomains: fdList,
1395+
},
1396+
"ad3": {
1397+
FaultDomains: fdList,
1398+
},
1399+
}
13151400
ms.Machine.Spec.FailureDomain = common.String("2")
13161401
ms.OCICluster.Spec.NetworkSpec.Vcn.Subnets = []*infrastructurev1beta1.Subnet{
13171402
{
@@ -1323,3 +1408,13 @@ func setupAllParams(ms *MachineScope) {
13231408
ms.OCICluster.Spec.OCIResourceIdentifier = "resource_uid"
13241409
ms.OCIMachine.UID = "machineuid"
13251410
}
1411+
1412+
// TestError is a minimal error implementation for tests. It carries a fixed
// message so that mocked calls can return an error with a chosen text.
type TestError struct {
	errorString string
}

// Error returns the message stored in the TestError, satisfying the error interface.
func (t TestError) Error() string {
	return t.errorString
}

0 commit comments

Comments
 (0)