Skip to content

Commit 7955d77

Browse files
committed
connect: fix failover through a mesh gateway to a remote datacenter
Failover is pushed entirely down to the data plane by creating envoy clusters and putting each successive destination in a different load assignment priority band. For example this shows that normally requests go to 1.2.3.4:8080 but when that fails they go to 6.7.8.9:8080:

    - name: foo
      load_assignment:
        cluster_name: foo
        policy:
          overprovisioning_factor: 100000
        endpoints:
        - priority: 0
          lb_endpoints:
          - endpoint:
              address:
                socket_address:
                  address: 1.2.3.4
                  port_value: 8080
        - priority: 1
          lb_endpoints:
          - endpoint:
              address:
                socket_address:
                  address: 6.7.8.9
                  port_value: 8080

Mesh gateways route requests based solely on the SNI header tacked onto the TLS layer. Envoy currently only lets you configure the outbound SNI header at the cluster layer. If you try to fail over through a mesh gateway you ideally would configure the SNI value per endpoint, but that's not possible in envoy today.

This PR introduces a simpler way around the problem for now:

1. We identify any target of failover that will use mesh gateway mode local or remote and then further isolate any resolver node in the compiled discovery chain that has a failover destination set to one of those targets.

2. For each of these resolvers we will perform a small measurement of the comparative health of the endpoints that come back from the health API for the set of primary target and serial failover targets. We walk the list of targets in order and if any endpoint is healthy we return that target, otherwise we move on to the next target.

3. The CDS and EDS endpoints both perform the measurements in (2) for the affected resolver nodes.

4. For CDS this measurement selects which TLS SNI field to use for the cluster (note the cluster is always going to be named for the primary target).

5. For EDS this measurement selects which set of endpoints will populate the cluster. Priority tiered failover is ignored.
One of the big downsides to this approach to failover is that the failover detection and correction is going to be controlled by Consul rather than deferring that entirely to the data plane as with the prior version. This also means that we are bound to only fail over using official health signals and cannot make use of data plane signals like outlier detection to affect failover. In this specific scenario the lack of data plane signals is OK because the effectiveness is already muted by the fact that the ultimate destination endpoints will have their data plane signals scrambled when they pass through the mesh gateway wrapper anyway, so we're not losing much.
1 parent 7bddbec commit 7955d77

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+1127
-72
lines changed

agent/cache-types/discovery_chain_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ func TestCompiledDiscoveryChain(t *testing.T) {
1717

1818
// just do the default chain
1919
entries := structs.NewDiscoveryChainConfigEntries()
20-
chain := discoverychain.TestCompileConfigEntries(t, "web", "default", "dc1", nil)
20+
chain := discoverychain.TestCompileConfigEntries(t, "web", "default", "dc1", "dc1", nil)
2121

2222
// Expect the proper RPC call. This also sets the expected value
2323
// since that is return-by-pointer in the arguments.

agent/consul/discovery_chain_endpoint.go

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func (c *DiscoveryChain) Get(args *structs.DiscoveryChainRequest, reply *structs
6060
ServiceName: args.Name,
6161
CurrentNamespace: evalNS,
6262
CurrentDatacenter: evalDC,
63+
UseInDatacenter: c.srv.config.Datacenter,
6364
OverrideMeshGateway: args.OverrideMeshGateway,
6465
OverrideProtocol: args.OverrideProtocol,
6566
OverrideConnectTimeout: args.OverrideConnectTimeout,

agent/consul/discoverychain/compile.go

+23-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@ import (
1212

1313
type CompileRequest struct {
1414
ServiceName string
15-
CurrentNamespace string
16-
CurrentDatacenter string
15+
CurrentNamespace string // TODO(rb): rename to EvaluateInNamespace
16+
CurrentDatacenter string // TODO(rb): rename to EvaluateInDatacenter
17+
UseInDatacenter string // where the results will be used from
1718

1819
// OverrideMeshGateway allows for the setting to be overridden for any
1920
// resolver in the compiled chain.
@@ -55,6 +56,7 @@ func Compile(req CompileRequest) (*structs.CompiledDiscoveryChain, error) {
5556
serviceName = req.ServiceName
5657
currentNamespace = req.CurrentNamespace
5758
currentDatacenter = req.CurrentDatacenter
59+
useInDatacenter = req.UseInDatacenter
5860
entries = req.Entries
5961
)
6062
if serviceName == "" {
@@ -66,6 +68,9 @@ func Compile(req CompileRequest) (*structs.CompiledDiscoveryChain, error) {
6668
if currentDatacenter == "" {
6769
return nil, fmt.Errorf("currentDatacenter is required")
6870
}
71+
if useInDatacenter == "" {
72+
return nil, fmt.Errorf("useInDatacenter is required")
73+
}
6974
if entries == nil {
7075
return nil, fmt.Errorf("entries is required")
7176
}
@@ -74,6 +79,7 @@ func Compile(req CompileRequest) (*structs.CompiledDiscoveryChain, error) {
7479
serviceName: serviceName,
7580
currentNamespace: currentNamespace,
7681
currentDatacenter: currentDatacenter,
82+
useInDatacenter: useInDatacenter,
7783
overrideMeshGateway: req.OverrideMeshGateway,
7884
overrideProtocol: req.OverrideProtocol,
7985
overrideConnectTimeout: req.OverrideConnectTimeout,
@@ -108,6 +114,7 @@ type compiler struct {
108114
serviceName string
109115
currentNamespace string
110116
currentDatacenter string
117+
useInDatacenter string
111118
overrideMeshGateway structs.MeshGatewayConfig
112119
overrideProtocol string
113120
overrideConnectTimeout time.Duration
@@ -252,6 +259,20 @@ func (c *compiler) compile() (*structs.CompiledDiscoveryChain, error) {
252259
return nil, err
253260
}
254261

262+
// Flip mesh gateway modes back to none if sharing a datacenter.
263+
// TODO (mesh-gateway)- maybe allow using a gateway within a datacenter at some point
264+
for target, targetConfig := range c.targets {
265+
meshGateway := structs.MeshGatewayModeDefault
266+
if target.Datacenter != c.useInDatacenter {
267+
meshGateway = targetConfig.MeshGateway.Mode
268+
}
269+
270+
if meshGateway != targetConfig.MeshGateway.Mode {
271+
targetConfig.MeshGateway.Mode = meshGateway
272+
c.targets[target] = targetConfig
273+
}
274+
}
275+
255276
if !enableAdvancedRoutingForProtocol(c.protocol) && c.usesAdvancedRoutingFeatures {
256277
return nil, &structs.ConfigEntryGraphError{
257278
Message: fmt.Sprintf(

agent/consul/discoverychain/compile_test.go

+58-12
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,11 @@ func TestCompile(t *testing.T) {
3838
"service redirect": testcase_ServiceRedirect(),
3939
"service and subset redirect": testcase_ServiceAndSubsetRedirect(),
4040
"datacenter redirect": testcase_DatacenterRedirect(),
41+
"datacenter redirect with mesh gateways": testcase_DatacenterRedirect_WithMeshGateways(),
4142
"service failover": testcase_ServiceFailover(),
4243
"service and subset failover": testcase_ServiceAndSubsetFailover(),
4344
"datacenter failover": testcase_DatacenterFailover(),
44-
"service failover with mesh gateways": testcase_ServiceFailover_WithMeshGateways(),
45+
"datacenter failover with mesh gateways": testcase_DatacenterFailover_WithMeshGateways(),
4546
"noop split to resolver with default subset": testcase_NoopSplit_WithDefaultSubset(),
4647
"resolver with default subset": testcase_Resolve_WithDefaultSubset(),
4748
"resolver with no entries and inferring defaults": testcase_DefaultResolver(),
@@ -90,6 +91,7 @@ func TestCompile(t *testing.T) {
9091
ServiceName: "main",
9192
CurrentNamespace: "default",
9293
CurrentDatacenter: "dc1",
94+
UseInDatacenter: "dc1",
9395
Entries: tc.entries,
9496
}
9597
if tc.setup != nil {
@@ -984,6 +986,52 @@ func testcase_DatacenterRedirect() compileTestCase {
984986
return compileTestCase{entries: entries, expect: expect}
985987
}
986988

989+
func testcase_DatacenterRedirect_WithMeshGateways() compileTestCase {
990+
entries := newEntries()
991+
entries.GlobalProxy = &structs.ProxyConfigEntry{
992+
Kind: structs.ProxyDefaults,
993+
Name: structs.ProxyConfigGlobal,
994+
MeshGateway: structs.MeshGatewayConfig{
995+
Mode: structs.MeshGatewayModeRemote,
996+
},
997+
}
998+
entries.AddResolvers(
999+
&structs.ServiceResolverConfigEntry{
1000+
Kind: "service-resolver",
1001+
Name: "main",
1002+
Redirect: &structs.ServiceResolverRedirect{
1003+
Datacenter: "dc9",
1004+
},
1005+
},
1006+
)
1007+
1008+
resolver := entries.GetResolver("main")
1009+
1010+
expect := &structs.CompiledDiscoveryChain{
1011+
Protocol: "tcp",
1012+
StartNode: "resolver:main,,,dc9",
1013+
Nodes: map[string]*structs.DiscoveryGraphNode{
1014+
"resolver:main,,,dc9": &structs.DiscoveryGraphNode{
1015+
Type: structs.DiscoveryGraphNodeTypeResolver,
1016+
Name: "main,,,dc9",
1017+
Resolver: &structs.DiscoveryResolver{
1018+
Definition: resolver,
1019+
ConnectTimeout: 5 * time.Second,
1020+
Target: newTarget("main", "", "default", "dc9"),
1021+
},
1022+
},
1023+
},
1024+
Targets: map[structs.DiscoveryTarget]structs.DiscoveryTargetConfig{
1025+
newTarget("main", "", "default", "dc9"): structs.DiscoveryTargetConfig{
1026+
MeshGateway: structs.MeshGatewayConfig{
1027+
Mode: structs.MeshGatewayModeRemote,
1028+
},
1029+
},
1030+
},
1031+
}
1032+
return compileTestCase{entries: entries, expect: expect}
1033+
}
1034+
9871035
func testcase_ServiceFailover() compileTestCase {
9881036
entries := newEntries()
9891037
entries.AddResolvers(
@@ -1125,7 +1173,7 @@ func testcase_DatacenterFailover() compileTestCase {
11251173
return compileTestCase{entries: entries, expect: expect}
11261174
}
11271175

1128-
func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
1176+
func testcase_DatacenterFailover_WithMeshGateways() compileTestCase {
11291177
entries := newEntries()
11301178
entries.GlobalProxy = &structs.ProxyConfigEntry{
11311179
Kind: structs.ProxyDefaults,
@@ -1139,13 +1187,12 @@ func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
11391187
Kind: "service-resolver",
11401188
Name: "main",
11411189
Failover: map[string]structs.ServiceResolverFailover{
1142-
"*": {Service: "backup"},
1190+
"*": {Datacenters: []string{"dc2", "dc4"}},
11431191
},
11441192
},
11451193
)
11461194

11471195
resolverMain := entries.GetResolver("main")
1148-
11491196
wildFail := resolverMain.Failover["*"]
11501197

11511198
expect := &structs.CompiledDiscoveryChain{
@@ -1162,19 +1209,21 @@ func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
11621209
Failover: &structs.DiscoveryFailover{
11631210
Definition: &wildFail,
11641211
Targets: []structs.DiscoveryTarget{
1165-
newTarget("backup", "", "default", "dc1"),
1212+
newTarget("main", "", "default", "dc2"),
1213+
newTarget("main", "", "default", "dc4"),
11661214
},
11671215
},
11681216
},
11691217
},
11701218
},
11711219
Targets: map[structs.DiscoveryTarget]structs.DiscoveryTargetConfig{
1172-
newTarget("backup", "", "default", "dc1"): structs.DiscoveryTargetConfig{
1220+
newTarget("main", "", "default", "dc1"): structs.DiscoveryTargetConfig{},
1221+
newTarget("main", "", "default", "dc2"): structs.DiscoveryTargetConfig{
11731222
MeshGateway: structs.MeshGatewayConfig{
11741223
Mode: structs.MeshGatewayModeRemote,
11751224
},
11761225
},
1177-
newTarget("main", "", "default", "dc1"): structs.DiscoveryTargetConfig{
1226+
newTarget("main", "", "default", "dc4"): structs.DiscoveryTargetConfig{
11781227
MeshGateway: structs.MeshGatewayConfig{
11791228
Mode: structs.MeshGatewayModeRemote,
11801229
},
@@ -1304,11 +1353,8 @@ func testcase_DefaultResolver_WithProxyDefaults() compileTestCase {
13041353
},
13051354
},
13061355
Targets: map[structs.DiscoveryTarget]structs.DiscoveryTargetConfig{
1307-
newTarget("main", "", "default", "dc1"): structs.DiscoveryTargetConfig{
1308-
MeshGateway: structs.MeshGatewayConfig{
1309-
Mode: structs.MeshGatewayModeRemote,
1310-
},
1311-
},
1356+
// mesh gateway mode is stripped because we are sharing a dc
1357+
newTarget("main", "", "default", "dc1"): structs.DiscoveryTargetConfig{},
13121358
},
13131359
}
13141360
return compileTestCase{entries: entries, expect: expect, expectIsDefault: true}

agent/consul/discoverychain/testing.go

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ func TestCompileConfigEntries(
1111
serviceName string,
1212
currentNamespace string,
1313
currentDatacenter string,
14+
useInDatacenter string,
1415
setup func(req *CompileRequest),
1516
entries ...structs.ConfigEntry,
1617
) *structs.CompiledDiscoveryChain {
@@ -22,6 +23,7 @@ func TestCompileConfigEntries(
2223
ServiceName: serviceName,
2324
CurrentNamespace: currentNamespace,
2425
CurrentDatacenter: currentDatacenter,
26+
UseInDatacenter: useInDatacenter,
2527
Entries: set,
2628
}
2729
if setup != nil {

agent/consul/state/config_entry.go

+1
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,7 @@ func (s *Store) testCompileDiscoveryChain(
450450
ServiceName: chainName,
451451
CurrentNamespace: "default",
452452
CurrentDatacenter: "dc1",
453+
UseInDatacenter: "dc1",
453454
Entries: speculativeEntries,
454455
}
455456
_, err = discoverychain.Compile(req)

agent/proxycfg/manager_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func TestManager_BasicLifecycle(t *testing.T) {
6363
Datacenter: "dc1",
6464
}
6565
dbDefaultChain := func() *structs.CompiledDiscoveryChain {
66-
return discoverychain.TestCompileConfigEntries(t, "db", "default", "dc1",
66+
return discoverychain.TestCompileConfigEntries(t, "db", "default", "dc1", "dc1",
6767
func(req *discoverychain.CompileRequest) {
6868
// This is because structs.TestUpstreams uses an opaque config
6969
// to override connect timeouts.
@@ -76,7 +76,7 @@ func TestManager_BasicLifecycle(t *testing.T) {
7676
)
7777
}
7878
dbSplitChain := func() *structs.CompiledDiscoveryChain {
79-
return discoverychain.TestCompileConfigEntries(t, "db", "default", "dc1",
79+
return discoverychain.TestCompileConfigEntries(t, "db", "default", "dc1", "dc1",
8080
func(req *discoverychain.CompileRequest) {
8181
// This is because structs.TestUpstreams uses an opaque config
8282
// to override connect timeouts.

agent/proxycfg/state.go

+1-7
Original file line numberDiff line numberDiff line change
@@ -587,13 +587,7 @@ func (s *state) resetWatchesFromChain(
587587

588588
ctx, cancel := context.WithCancel(s.ctx)
589589

590-
// TODO (mesh-gateway)- maybe allow using a gateway within a datacenter at some point
591-
meshGateway := structs.MeshGatewayModeDefault
592-
if target.Datacenter != s.source.Datacenter {
593-
meshGateway = targetConfig.MeshGateway.Mode
594-
}
595-
596-
// if the default mode
590+
meshGateway := targetConfig.MeshGateway.Mode
597591
if meshGateway == structs.MeshGatewayModeDefault {
598592
meshGateway = structs.MeshGatewayModeNone
599593
}

agent/proxycfg/state_test.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ func TestState_WatchesAndUpdates(t *testing.T) {
420420
cache.UpdateEvent{
421421
CorrelationID: "discovery-chain:api",
422422
Result: &structs.DiscoveryChainResponse{
423-
Chain: discoverychain.TestCompileConfigEntries(t, "api", "default", "dc1",
423+
Chain: discoverychain.TestCompileConfigEntries(t, "api", "default", "dc1", "dc1",
424424
func(req *discoverychain.CompileRequest) {
425425
req.OverrideMeshGateway.Mode = meshGatewayProxyConfigValue
426426
}),
@@ -430,7 +430,7 @@ func TestState_WatchesAndUpdates(t *testing.T) {
430430
cache.UpdateEvent{
431431
CorrelationID: "discovery-chain:api-failover-remote?dc=dc2",
432432
Result: &structs.DiscoveryChainResponse{
433-
Chain: discoverychain.TestCompileConfigEntries(t, "api-failover-remote", "default", "dc2",
433+
Chain: discoverychain.TestCompileConfigEntries(t, "api-failover-remote", "default", "dc2", "dc1",
434434
func(req *discoverychain.CompileRequest) {
435435
req.OverrideMeshGateway.Mode = structs.MeshGatewayModeRemote
436436
}),
@@ -440,7 +440,7 @@ func TestState_WatchesAndUpdates(t *testing.T) {
440440
cache.UpdateEvent{
441441
CorrelationID: "discovery-chain:api-failover-local?dc=dc2",
442442
Result: &structs.DiscoveryChainResponse{
443-
Chain: discoverychain.TestCompileConfigEntries(t, "api-failover-local", "default", "dc2",
443+
Chain: discoverychain.TestCompileConfigEntries(t, "api-failover-local", "default", "dc2", "dc1",
444444
func(req *discoverychain.CompileRequest) {
445445
req.OverrideMeshGateway.Mode = structs.MeshGatewayModeLocal
446446
}),
@@ -450,7 +450,7 @@ func TestState_WatchesAndUpdates(t *testing.T) {
450450
cache.UpdateEvent{
451451
CorrelationID: "discovery-chain:api-failover-direct?dc=dc2",
452452
Result: &structs.DiscoveryChainResponse{
453-
Chain: discoverychain.TestCompileConfigEntries(t, "api-failover-direct", "default", "dc2",
453+
Chain: discoverychain.TestCompileConfigEntries(t, "api-failover-direct", "default", "dc2", "dc1",
454454
func(req *discoverychain.CompileRequest) {
455455
req.OverrideMeshGateway.Mode = structs.MeshGatewayModeNone
456456
}),
@@ -460,7 +460,7 @@ func TestState_WatchesAndUpdates(t *testing.T) {
460460
cache.UpdateEvent{
461461
CorrelationID: "discovery-chain:api-dc2",
462462
Result: &structs.DiscoveryChainResponse{
463-
Chain: discoverychain.TestCompileConfigEntries(t, "api-dc2", "default", "dc1",
463+
Chain: discoverychain.TestCompileConfigEntries(t, "api-dc2", "default", "dc1", "dc1",
464464
func(req *discoverychain.CompileRequest) {
465465
req.OverrideMeshGateway.Mode = meshGatewayProxyConfigValue
466466
},

0 commit comments

Comments
 (0)