Skip to content

Commit 8e22d80

Browse files
authored
connect: fix failover through a mesh gateway to a remote datacenter (#6259)
Failover is pushed entirely down to the data plane by creating Envoy clusters and putting each successive destination in a different load assignment priority band. For example, this shows that requests normally go to 1.2.3.4:8080, but when that fails they go to 6.7.8.9:8080:

    - name: foo
      load_assignment:
        cluster_name: foo
        policy:
          overprovisioning_factor: 100000
        endpoints:
        - priority: 0
          lb_endpoints:
          - endpoint:
              address:
                socket_address:
                  address: 1.2.3.4
                  port_value: 8080
        - priority: 1
          lb_endpoints:
          - endpoint:
              address:
                socket_address:
                  address: 6.7.8.9
                  port_value: 8080

Mesh gateways route requests based solely on the SNI header tacked onto the TLS layer. Envoy currently only lets you configure the outbound SNI header at the cluster layer. If you try to fail over through a mesh gateway, you would ideally configure the SNI value per endpoint, but that's not possible in Envoy today. This PR introduces a simpler way around the problem for now:

1. We identify any target of failover that will use mesh gateway mode local or remote, and then further isolate any resolver node in the compiled discovery chain that has a failover destination set to one of those targets.
2. For each of these resolvers we will perform a small measurement of the comparative health of the endpoints that come back from the health API for the primary target and the serial failover targets. We walk the list of targets in order, and if any endpoint is healthy we return that target; otherwise we move on to the next target.
3. The CDS and EDS endpoints both perform the measurements in (2) for the affected resolver nodes.
4. For CDS this measurement selects which TLS SNI field to use for the cluster (note the cluster is always going to be named for the primary target).
5. For EDS this measurement selects which set of endpoints will populate the cluster. Priority tiered failover is ignored.
One of the big downsides to this approach to failover is that the failover detection and correction is going to be controlled by Consul rather than deferring that entirely to the data plane as with the prior version. This also means that we are bound to fail over using only official health signals and cannot make use of data plane signals like outlier detection to affect failover. In this specific scenario the lack of data plane signals is OK because the effectiveness is already muted by the fact that the ultimate destination endpoints will have their data plane signals scrambled when they pass through the mesh gateway wrapper anyway, so we're not losing much. Another related fix is that we now use the endpoint health from the underlying service, not the health of the gateway (regardless of failover mode).
1 parent 9f58504 commit 8e22d80

File tree

67 files changed

+2886
-205
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+2886
-205
lines changed

agent/cache-types/discovery_chain_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ func TestCompiledDiscoveryChain(t *testing.T) {
1616
typ := &CompiledDiscoveryChain{RPC: rpc}
1717

1818
// just do the default chain
19-
chain := discoverychain.TestCompileConfigEntries(t, "web", "default", "dc1", nil)
19+
chain := discoverychain.TestCompileConfigEntries(t, "web", "default", "dc1", "dc1", nil)
2020

2121
// Expect the proper RPC call. This also sets the expected value
2222
// since that is return-by-pointer in the arguments.

agent/consul/discovery_chain_endpoint.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,9 @@ func (c *DiscoveryChain) Get(args *structs.DiscoveryChainRequest, reply *structs
5858
// Then we compile it into something useful.
5959
chain, err := discoverychain.Compile(discoverychain.CompileRequest{
6060
ServiceName: args.Name,
61-
CurrentNamespace: evalNS,
62-
CurrentDatacenter: evalDC,
61+
EvaluateInNamespace: evalNS,
62+
EvaluateInDatacenter: evalDC,
63+
UseInDatacenter: c.srv.config.Datacenter,
6364
OverrideMeshGateway: args.OverrideMeshGateway,
6465
OverrideProtocol: args.OverrideProtocol,
6566
OverrideConnectTimeout: args.OverrideConnectTimeout,

agent/consul/discoverychain/compile.go

+42-30
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@ import (
1111
)
1212

1313
type CompileRequest struct {
14-
ServiceName string
15-
CurrentNamespace string
16-
CurrentDatacenter string
14+
ServiceName string
15+
EvaluateInNamespace string
16+
EvaluateInDatacenter string
17+
UseInDatacenter string // where the results will be used from
1718

1819
// OverrideMeshGateway allows for the setting to be overridden for any
1920
// resolver in the compiled chain.
@@ -52,28 +53,33 @@ type CompileRequest struct {
5253
// valid.
5354
func Compile(req CompileRequest) (*structs.CompiledDiscoveryChain, error) {
5455
var (
55-
serviceName = req.ServiceName
56-
currentNamespace = req.CurrentNamespace
57-
currentDatacenter = req.CurrentDatacenter
58-
entries = req.Entries
56+
serviceName = req.ServiceName
57+
evaluateInNamespace = req.EvaluateInNamespace
58+
evaluateInDatacenter = req.EvaluateInDatacenter
59+
useInDatacenter = req.UseInDatacenter
60+
entries = req.Entries
5961
)
6062
if serviceName == "" {
6163
return nil, fmt.Errorf("serviceName is required")
6264
}
63-
if currentNamespace == "" {
64-
return nil, fmt.Errorf("currentNamespace is required")
65+
if evaluateInNamespace == "" {
66+
return nil, fmt.Errorf("evaluateInNamespace is required")
6567
}
66-
if currentDatacenter == "" {
67-
return nil, fmt.Errorf("currentDatacenter is required")
68+
if evaluateInDatacenter == "" {
69+
return nil, fmt.Errorf("evaluateInDatacenter is required")
70+
}
71+
if useInDatacenter == "" {
72+
return nil, fmt.Errorf("useInDatacenter is required")
6873
}
6974
if entries == nil {
7075
return nil, fmt.Errorf("entries is required")
7176
}
7277

7378
c := &compiler{
7479
serviceName: serviceName,
75-
currentNamespace: currentNamespace,
76-
currentDatacenter: currentDatacenter,
80+
evaluateInNamespace: evaluateInNamespace,
81+
evaluateInDatacenter: evaluateInDatacenter,
82+
useInDatacenter: useInDatacenter,
7783
overrideMeshGateway: req.OverrideMeshGateway,
7884
overrideProtocol: req.OverrideProtocol,
7985
overrideConnectTimeout: req.OverrideConnectTimeout,
@@ -106,8 +112,9 @@ func Compile(req CompileRequest) (*structs.CompiledDiscoveryChain, error) {
106112
// for assembling a discovery chain from raw config entries.
107113
type compiler struct {
108114
serviceName string
109-
currentNamespace string
110-
currentDatacenter string
115+
evaluateInNamespace string
116+
evaluateInDatacenter string
117+
useInDatacenter string
111118
overrideMeshGateway structs.MeshGatewayConfig
112119
overrideProtocol string
113120
overrideConnectTimeout time.Duration
@@ -298,8 +305,8 @@ func (c *compiler) compile() (*structs.CompiledDiscoveryChain, error) {
298305

299306
return &structs.CompiledDiscoveryChain{
300307
ServiceName: c.serviceName,
301-
Namespace: c.currentNamespace,
302-
Datacenter: c.currentDatacenter,
308+
Namespace: c.evaluateInNamespace,
309+
Datacenter: c.evaluateInDatacenter,
303310
CustomizationHash: customizationHash,
304311
Protocol: c.protocol,
305312
StartNode: c.startNode,
@@ -590,8 +597,8 @@ func (c *compiler) newTarget(service, serviceSubset, namespace, datacenter strin
590597
t := structs.NewDiscoveryTarget(
591598
service,
592599
serviceSubset,
593-
defaultIfEmpty(namespace, c.currentNamespace),
594-
defaultIfEmpty(datacenter, c.currentDatacenter),
600+
defaultIfEmpty(namespace, c.evaluateInNamespace),
601+
defaultIfEmpty(datacenter, c.evaluateInDatacenter),
595602
)
596603

597604
prev, ok := c.loadedTargets[t.ID]
@@ -806,19 +813,24 @@ RESOLVE_AGAIN:
806813

807814
target.Subset = resolver.Subsets[target.ServiceSubset]
808815

809-
// Default mesh gateway settings
810-
if serviceDefault := c.entries.GetService(target.Service); serviceDefault != nil {
811-
target.MeshGateway = serviceDefault.MeshGateway
812-
}
816+
// TODO (mesh-gateway)- maybe allow using a gateway within a datacenter at some point
817+
if target.Datacenter == c.useInDatacenter {
818+
target.MeshGateway.Mode = structs.MeshGatewayModeDefault
819+
} else {
820+
// Default mesh gateway settings
821+
if serviceDefault := c.entries.GetService(target.Service); serviceDefault != nil {
822+
target.MeshGateway = serviceDefault.MeshGateway
823+
}
813824

814-
if c.entries.GlobalProxy != nil && target.MeshGateway.Mode == structs.MeshGatewayModeDefault {
815-
target.MeshGateway.Mode = c.entries.GlobalProxy.MeshGateway.Mode
816-
}
825+
if c.entries.GlobalProxy != nil && target.MeshGateway.Mode == structs.MeshGatewayModeDefault {
826+
target.MeshGateway.Mode = c.entries.GlobalProxy.MeshGateway.Mode
827+
}
817828

818-
if c.overrideMeshGateway.Mode != structs.MeshGatewayModeDefault {
819-
if target.MeshGateway.Mode != c.overrideMeshGateway.Mode {
820-
target.MeshGateway.Mode = c.overrideMeshGateway.Mode
821-
c.customizedBy.MeshGateway = true
829+
if c.overrideMeshGateway.Mode != structs.MeshGatewayModeDefault {
830+
if target.MeshGateway.Mode != c.overrideMeshGateway.Mode {
831+
target.MeshGateway.Mode = c.overrideMeshGateway.Mode
832+
c.customizedBy.MeshGateway = true
833+
}
822834
}
823835
}
824836

agent/consul/discoverychain/compile_test.go

+60-15
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,13 @@ func TestCompile(t *testing.T) {
3737
"service redirect": testcase_ServiceRedirect(),
3838
"service and subset redirect": testcase_ServiceAndSubsetRedirect(),
3939
"datacenter redirect": testcase_DatacenterRedirect(),
40+
"datacenter redirect with mesh gateways": testcase_DatacenterRedirect_WithMeshGateways(),
4041
"service failover": testcase_ServiceFailover(),
4142
"service failover through redirect": testcase_ServiceFailoverThroughRedirect(),
4243
"circular resolver failover": testcase_Resolver_CircularFailover(),
4344
"service and subset failover": testcase_ServiceAndSubsetFailover(),
4445
"datacenter failover": testcase_DatacenterFailover(),
45-
"service failover with mesh gateways": testcase_ServiceFailover_WithMeshGateways(),
46+
"datacenter failover with mesh gateways": testcase_DatacenterFailover_WithMeshGateways(),
4647
"noop split to resolver with default subset": testcase_NoopSplit_WithDefaultSubset(),
4748
"resolver with default subset": testcase_Resolve_WithDefaultSubset(),
4849
"resolver with no entries and inferring defaults": testcase_DefaultResolver(),
@@ -91,10 +92,11 @@ func TestCompile(t *testing.T) {
9192
}
9293

9394
req := CompileRequest{
94-
ServiceName: "main",
95-
CurrentNamespace: "default",
96-
CurrentDatacenter: "dc1",
97-
Entries: tc.entries,
95+
ServiceName: "main",
96+
EvaluateInNamespace: "default",
97+
EvaluateInDatacenter: "dc1",
98+
UseInDatacenter: "dc1",
99+
Entries: tc.entries,
98100
}
99101
if tc.setup != nil {
100102
tc.setup(&req)
@@ -941,6 +943,49 @@ func testcase_DatacenterRedirect() compileTestCase {
941943
return compileTestCase{entries: entries, expect: expect}
942944
}
943945

946+
func testcase_DatacenterRedirect_WithMeshGateways() compileTestCase {
947+
entries := newEntries()
948+
entries.GlobalProxy = &structs.ProxyConfigEntry{
949+
Kind: structs.ProxyDefaults,
950+
Name: structs.ProxyConfigGlobal,
951+
MeshGateway: structs.MeshGatewayConfig{
952+
Mode: structs.MeshGatewayModeRemote,
953+
},
954+
}
955+
entries.AddResolvers(
956+
&structs.ServiceResolverConfigEntry{
957+
Kind: "service-resolver",
958+
Name: "main",
959+
Redirect: &structs.ServiceResolverRedirect{
960+
Datacenter: "dc9",
961+
},
962+
},
963+
)
964+
965+
expect := &structs.CompiledDiscoveryChain{
966+
Protocol: "tcp",
967+
StartNode: "resolver:main.default.dc9",
968+
Nodes: map[string]*structs.DiscoveryGraphNode{
969+
"resolver:main.default.dc9": &structs.DiscoveryGraphNode{
970+
Type: structs.DiscoveryGraphNodeTypeResolver,
971+
Name: "main.default.dc9",
972+
Resolver: &structs.DiscoveryResolver{
973+
ConnectTimeout: 5 * time.Second,
974+
Target: "main.default.dc9",
975+
},
976+
},
977+
},
978+
Targets: map[string]*structs.DiscoveryTarget{
979+
"main.default.dc9": newTarget("main", "", "default", "dc9", func(t *structs.DiscoveryTarget) {
980+
t.MeshGateway = structs.MeshGatewayConfig{
981+
Mode: structs.MeshGatewayModeRemote,
982+
}
983+
}),
984+
},
985+
}
986+
return compileTestCase{entries: entries, expect: expect}
987+
}
988+
944989
func testcase_ServiceFailover() compileTestCase {
945990
entries := newEntries()
946991
entries.AddResolvers(
@@ -1145,7 +1190,7 @@ func testcase_DatacenterFailover() compileTestCase {
11451190
return compileTestCase{entries: entries, expect: expect}
11461191
}
11471192

1148-
func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
1193+
func testcase_DatacenterFailover_WithMeshGateways() compileTestCase {
11491194
entries := newEntries()
11501195
entries.GlobalProxy = &structs.ProxyConfigEntry{
11511196
Kind: structs.ProxyDefaults,
@@ -1159,7 +1204,7 @@ func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
11591204
Kind: "service-resolver",
11601205
Name: "main",
11611206
Failover: map[string]structs.ServiceResolverFailover{
1162-
"*": {Service: "backup"},
1207+
"*": {Datacenters: []string{"dc2", "dc4"}},
11631208
},
11641209
},
11651210
)
@@ -1175,18 +1220,22 @@ func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
11751220
ConnectTimeout: 5 * time.Second,
11761221
Target: "main.default.dc1",
11771222
Failover: &structs.DiscoveryFailover{
1178-
Targets: []string{"backup.default.dc1"},
1223+
Targets: []string{
1224+
"main.default.dc2",
1225+
"main.default.dc4",
1226+
},
11791227
},
11801228
},
11811229
},
11821230
},
11831231
Targets: map[string]*structs.DiscoveryTarget{
1184-
"main.default.dc1": newTarget("main", "", "default", "dc1", func(t *structs.DiscoveryTarget) {
1232+
"main.default.dc1": newTarget("main", "", "default", "dc1", nil),
1233+
"main.default.dc2": newTarget("main", "", "default", "dc2", func(t *structs.DiscoveryTarget) {
11851234
t.MeshGateway = structs.MeshGatewayConfig{
11861235
Mode: structs.MeshGatewayModeRemote,
11871236
}
11881237
}),
1189-
"backup.default.dc1": newTarget("backup", "", "default", "dc1", func(t *structs.DiscoveryTarget) {
1238+
"main.default.dc4": newTarget("main", "", "default", "dc4", func(t *structs.DiscoveryTarget) {
11901239
t.MeshGateway = structs.MeshGatewayConfig{
11911240
Mode: structs.MeshGatewayModeRemote,
11921241
}
@@ -1308,11 +1357,7 @@ func testcase_DefaultResolver_WithProxyDefaults() compileTestCase {
13081357
},
13091358
},
13101359
Targets: map[string]*structs.DiscoveryTarget{
1311-
"main.default.dc1": newTarget("main", "", "default", "dc1", func(t *structs.DiscoveryTarget) {
1312-
t.MeshGateway = structs.MeshGatewayConfig{
1313-
Mode: structs.MeshGatewayModeRemote,
1314-
}
1315-
}),
1360+
"main.default.dc1": newTarget("main", "", "default", "dc1", nil),
13161361
},
13171362
}
13181363
return compileTestCase{entries: entries, expect: expect, expectIsDefault: true}

agent/consul/discoverychain/testing.go

+8-6
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@ import (
99
func TestCompileConfigEntries(
1010
t testing.T,
1111
serviceName string,
12-
currentNamespace string,
13-
currentDatacenter string,
12+
evaluateInNamespace string,
13+
evaluateInDatacenter string,
14+
useInDatacenter string,
1415
setup func(req *CompileRequest),
1516
entries ...structs.ConfigEntry,
1617
) *structs.CompiledDiscoveryChain {
@@ -19,10 +20,11 @@ func TestCompileConfigEntries(
1920
set.AddEntries(entries...)
2021

2122
req := CompileRequest{
22-
ServiceName: serviceName,
23-
CurrentNamespace: currentNamespace,
24-
CurrentDatacenter: currentDatacenter,
25-
Entries: set,
23+
ServiceName: serviceName,
24+
EvaluateInNamespace: evaluateInNamespace,
25+
EvaluateInDatacenter: evaluateInDatacenter,
26+
UseInDatacenter: useInDatacenter,
27+
Entries: set,
2628
}
2729
if setup != nil {
2830
setup(&req)

agent/consul/state/config_entry.go

+7-4
Original file line numberDiff line numberDiff line change
@@ -444,11 +444,14 @@ func (s *Store) testCompileDiscoveryChain(
444444

445445
// Note we use an arbitrary namespace and datacenter as those would not
446446
// currently affect the graph compilation in ways that matter here.
447+
//
448+
// TODO(rb): we should thread a better value than "dc1" down here as that is going to sometimes show up in user facing errors
447449
req := discoverychain.CompileRequest{
448-
ServiceName: chainName,
449-
CurrentNamespace: "default",
450-
CurrentDatacenter: "dc1",
451-
Entries: speculativeEntries,
450+
ServiceName: chainName,
451+
EvaluateInNamespace: "default",
452+
EvaluateInDatacenter: "dc1",
453+
UseInDatacenter: "dc1",
454+
Entries: speculativeEntries,
452455
}
453456
_, err = discoverychain.Compile(req)
454457
return err

0 commit comments

Comments
 (0)