Skip to content

Commit 08406fa

Browse files
committed
connect: fix failover through a mesh gateway to a remote datacenter
Failover is pushed entirely down to the data plane by creating Envoy clusters and putting each successive destination in a different load assignment priority band. For example, this shows that normally requests go to 1.2.3.4:8080, but when that fails they go to 6.7.8.9:8080:

    - name: foo
      load_assignment:
        cluster_name: foo
        policy:
          overprovisioning_factor: 100000
        endpoints:
        - priority: 0
          lb_endpoints:
          - endpoint:
              address:
                socket_address:
                  address: 1.2.3.4
                  port_value: 8080
        - priority: 1
          lb_endpoints:
          - endpoint:
              address:
                socket_address:
                  address: 6.7.8.9
                  port_value: 8080

Mesh gateways route requests based solely on the SNI header tacked onto the TLS layer. Envoy currently only lets you configure the outbound SNI header at the cluster layer. If you try to fail over through a mesh gateway you ideally would configure the SNI value per endpoint, but that's not possible in Envoy today.

This PR introduces a simpler way around the problem for now:

1. We identify any target of failover that will use mesh gateway mode local or remote, and then further isolate any resolver node in the compiled discovery chain that has a failover destination set to one of those targets.

2. For each of these resolvers we will perform a small measurement of the comparative health of the endpoints that come back from the health API for the set of primary target and serial failover targets. We walk the list of targets in order and if any endpoint is healthy we return that target; otherwise we move on to the next target.

3. The CDS and EDS endpoints both perform the measurements in (2) for the affected resolver nodes.

4. For CDS this measurement selects which TLS SNI field to use for the cluster (note the cluster is always going to be named for the primary target).

5. For EDS this measurement selects which set of endpoints will populate the cluster. Priority tiered failover is ignored.
One of the big downsides to this approach to failover is that the failover detection and correction is going to be controlled by Consul rather than deferring that entirely to the data plane as with the prior version. This also means that we are bound to fail over using only official health signals and cannot make use of data plane signals like outlier detection to affect failover. In this specific scenario the lack of data plane signals is OK, because the effectiveness is already muted by the fact that the ultimate destination endpoints will have their data plane signals scrambled when they pass through the mesh gateway wrapper anyway, so we're not losing much.

Another related fix is that we now use the endpoint health from the underlying service, not the health of the gateway (regardless of failover mode).
1 parent 856090e commit 08406fa

File tree

53 files changed

+1775
-145
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+1775
-145
lines changed

agent/consul/discovery_chain_endpoint.go

+1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ func (c *DiscoveryChain) Get(args *structs.DiscoveryChainRequest, reply *structs
6060
ServiceName: args.Name,
6161
CurrentNamespace: evalNS,
6262
CurrentDatacenter: evalDC,
63+
UseInDatacenter: c.srv.config.Datacenter,
6364
OverrideMeshGateway: args.OverrideMeshGateway,
6465
OverrideProtocol: args.OverrideProtocol,
6566
OverrideConnectTimeout: args.OverrideConnectTimeout,

agent/consul/discoverychain/compile.go

+23-4
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@ import (
1212

1313
type CompileRequest struct {
1414
ServiceName string
15-
CurrentNamespace string
16-
CurrentDatacenter string
15+
CurrentNamespace string // TODO(rb): rename to EvaluateInNamespace
16+
CurrentDatacenter string // TODO(rb): rename to EvaluateInDatacenter
17+
UseInDatacenter string // where the results will be used from
1718

1819
// OverrideMeshGateway allows for the setting to be overridden for any
1920
// resolver in the compiled chain.
@@ -55,6 +56,7 @@ func Compile(req CompileRequest) (*structs.CompiledDiscoveryChain, error) {
5556
serviceName = req.ServiceName
5657
currentNamespace = req.CurrentNamespace
5758
currentDatacenter = req.CurrentDatacenter
59+
useInDatacenter = req.UseInDatacenter
5860
entries = req.Entries
5961
)
6062
if serviceName == "" {
@@ -66,6 +68,9 @@ func Compile(req CompileRequest) (*structs.CompiledDiscoveryChain, error) {
6668
if currentDatacenter == "" {
6769
return nil, fmt.Errorf("currentDatacenter is required")
6870
}
71+
if useInDatacenter == "" {
72+
return nil, fmt.Errorf("useInDatacenter is required")
73+
}
6974
if entries == nil {
7075
return nil, fmt.Errorf("entries is required")
7176
}
@@ -74,6 +79,7 @@ func Compile(req CompileRequest) (*structs.CompiledDiscoveryChain, error) {
7479
serviceName: serviceName,
7580
currentNamespace: currentNamespace,
7681
currentDatacenter: currentDatacenter,
82+
useInDatacenter: useInDatacenter,
7783
overrideMeshGateway: req.OverrideMeshGateway,
7884
overrideProtocol: req.OverrideProtocol,
7985
overrideConnectTimeout: req.OverrideConnectTimeout,
@@ -108,6 +114,7 @@ type compiler struct {
108114
serviceName string
109115
currentNamespace string
110116
currentDatacenter string
117+
useInDatacenter string
111118
overrideMeshGateway structs.MeshGatewayConfig
112119
overrideProtocol string
113120
overrideConnectTimeout time.Duration
@@ -250,8 +257,20 @@ func (c *compiler) compile() (*structs.CompiledDiscoveryChain, error) {
250257
return nil, err
251258
}
252259

253-
for targetID, _ := range c.loadedTargets {
254-
if _, ok := c.retainedTargets[targetID]; !ok {
260+
for targetID, target := range c.loadedTargets {
261+
if _, ok := c.retainedTargets[targetID]; ok {
262+
// Flip mesh gateway modes back to none if sharing a datacenter.
263+
// TODO (mesh-gateway)- maybe allow using a gateway within a datacenter at some point
264+
265+
meshGateway := structs.MeshGatewayModeDefault
266+
if target.Datacenter != c.useInDatacenter {
267+
meshGateway = target.MeshGateway.Mode
268+
}
269+
270+
if meshGateway != target.MeshGateway.Mode {
271+
target.MeshGateway.Mode = meshGateway
272+
}
273+
} else {
255274
delete(c.loadedTargets, targetID)
256275
}
257276
}

agent/consul/discoverychain/compile_test.go

+56-11
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,13 @@ func TestCompile(t *testing.T) {
3737
"service redirect": testcase_ServiceRedirect(),
3838
"service and subset redirect": testcase_ServiceAndSubsetRedirect(),
3939
"datacenter redirect": testcase_DatacenterRedirect(),
40+
"datacenter redirect with mesh gateways": testcase_DatacenterRedirect_WithMeshGateways(),
4041
"service failover": testcase_ServiceFailover(),
4142
"service failover through redirect": testcase_ServiceFailoverThroughRedirect(),
4243
"circular resolver failover": testcase_Resolver_CircularFailover(),
4344
"service and subset failover": testcase_ServiceAndSubsetFailover(),
4445
"datacenter failover": testcase_DatacenterFailover(),
45-
"service failover with mesh gateways": testcase_ServiceFailover_WithMeshGateways(),
46+
"datacenter failover with mesh gateways": testcase_DatacenterFailover_WithMeshGateways(),
4647
"noop split to resolver with default subset": testcase_NoopSplit_WithDefaultSubset(),
4748
"resolver with default subset": testcase_Resolve_WithDefaultSubset(),
4849
"resolver with no entries and inferring defaults": testcase_DefaultResolver(),
@@ -94,6 +95,7 @@ func TestCompile(t *testing.T) {
9495
ServiceName: "main",
9596
CurrentNamespace: "default",
9697
CurrentDatacenter: "dc1",
98+
UseInDatacenter: "dc1",
9799
Entries: tc.entries,
98100
}
99101
if tc.setup != nil {
@@ -941,6 +943,49 @@ func testcase_DatacenterRedirect() compileTestCase {
941943
return compileTestCase{entries: entries, expect: expect}
942944
}
943945

946+
func testcase_DatacenterRedirect_WithMeshGateways() compileTestCase {
947+
entries := newEntries()
948+
entries.GlobalProxy = &structs.ProxyConfigEntry{
949+
Kind: structs.ProxyDefaults,
950+
Name: structs.ProxyConfigGlobal,
951+
MeshGateway: structs.MeshGatewayConfig{
952+
Mode: structs.MeshGatewayModeRemote,
953+
},
954+
}
955+
entries.AddResolvers(
956+
&structs.ServiceResolverConfigEntry{
957+
Kind: "service-resolver",
958+
Name: "main",
959+
Redirect: &structs.ServiceResolverRedirect{
960+
Datacenter: "dc9",
961+
},
962+
},
963+
)
964+
965+
expect := &structs.CompiledDiscoveryChain{
966+
Protocol: "tcp",
967+
StartNode: "resolver:main.default.dc9",
968+
Nodes: map[string]*structs.DiscoveryGraphNode{
969+
"resolver:main.default.dc9": &structs.DiscoveryGraphNode{
970+
Type: structs.DiscoveryGraphNodeTypeResolver,
971+
Name: "main.default.dc9",
972+
Resolver: &structs.DiscoveryResolver{
973+
ConnectTimeout: 5 * time.Second,
974+
Target: "main.default.dc9",
975+
},
976+
},
977+
},
978+
Targets: map[string]*structs.DiscoveryTarget{
979+
"main.default.dc9": newTarget("main", "", "default", "dc9", func(t *structs.DiscoveryTarget) {
980+
t.MeshGateway = structs.MeshGatewayConfig{
981+
Mode: structs.MeshGatewayModeRemote,
982+
}
983+
}),
984+
},
985+
}
986+
return compileTestCase{entries: entries, expect: expect}
987+
}
988+
944989
func testcase_ServiceFailover() compileTestCase {
945990
entries := newEntries()
946991
entries.AddResolvers(
@@ -1145,7 +1190,7 @@ func testcase_DatacenterFailover() compileTestCase {
11451190
return compileTestCase{entries: entries, expect: expect}
11461191
}
11471192

1148-
func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
1193+
func testcase_DatacenterFailover_WithMeshGateways() compileTestCase {
11491194
entries := newEntries()
11501195
entries.GlobalProxy = &structs.ProxyConfigEntry{
11511196
Kind: structs.ProxyDefaults,
@@ -1159,7 +1204,7 @@ func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
11591204
Kind: "service-resolver",
11601205
Name: "main",
11611206
Failover: map[string]structs.ServiceResolverFailover{
1162-
"*": {Service: "backup"},
1207+
"*": {Datacenters: []string{"dc2", "dc4"}},
11631208
},
11641209
},
11651210
)
@@ -1175,18 +1220,22 @@ func testcase_ServiceFailover_WithMeshGateways() compileTestCase {
11751220
ConnectTimeout: 5 * time.Second,
11761221
Target: "main.default.dc1",
11771222
Failover: &structs.DiscoveryFailover{
1178-
Targets: []string{"backup.default.dc1"},
1223+
Targets: []string{
1224+
"main.default.dc2",
1225+
"main.default.dc4",
1226+
},
11791227
},
11801228
},
11811229
},
11821230
},
11831231
Targets: map[string]*structs.DiscoveryTarget{
1184-
"main.default.dc1": newTarget("main", "", "default", "dc1", func(t *structs.DiscoveryTarget) {
1232+
"main.default.dc1": newTarget("main", "", "default", "dc1", nil),
1233+
"main.default.dc2": newTarget("main", "", "default", "dc2", func(t *structs.DiscoveryTarget) {
11851234
t.MeshGateway = structs.MeshGatewayConfig{
11861235
Mode: structs.MeshGatewayModeRemote,
11871236
}
11881237
}),
1189-
"backup.default.dc1": newTarget("backup", "", "default", "dc1", func(t *structs.DiscoveryTarget) {
1238+
"main.default.dc4": newTarget("main", "", "default", "dc4", func(t *structs.DiscoveryTarget) {
11901239
t.MeshGateway = structs.MeshGatewayConfig{
11911240
Mode: structs.MeshGatewayModeRemote,
11921241
}
@@ -1308,11 +1357,7 @@ func testcase_DefaultResolver_WithProxyDefaults() compileTestCase {
13081357
},
13091358
},
13101359
Targets: map[string]*structs.DiscoveryTarget{
1311-
"main.default.dc1": newTarget("main", "", "default", "dc1", func(t *structs.DiscoveryTarget) {
1312-
t.MeshGateway = structs.MeshGatewayConfig{
1313-
Mode: structs.MeshGatewayModeRemote,
1314-
}
1315-
}),
1360+
"main.default.dc1": newTarget("main", "", "default", "dc1", nil),
13161361
},
13171362
}
13181363
return compileTestCase{entries: entries, expect: expect, expectIsDefault: true}

agent/consul/discoverychain/testing.go

+2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ func TestCompileConfigEntries(
1111
serviceName string,
1212
currentNamespace string,
1313
currentDatacenter string,
14+
useInDatacenter string,
1415
setup func(req *CompileRequest),
1516
entries ...structs.ConfigEntry,
1617
) *structs.CompiledDiscoveryChain {
@@ -22,6 +23,7 @@ func TestCompileConfigEntries(
2223
ServiceName: serviceName,
2324
CurrentNamespace: currentNamespace,
2425
CurrentDatacenter: currentDatacenter,
26+
UseInDatacenter: useInDatacenter,
2527
Entries: set,
2628
}
2729
if setup != nil {

agent/consul/state/config_entry.go

+3
Original file line numberDiff line numberDiff line change
@@ -444,10 +444,13 @@ func (s *Store) testCompileDiscoveryChain(
444444

445445
// Note we use an arbitrary namespace and datacenter as those would not
446446
// currently affect the graph compilation in ways that matter here.
447+
//
448+
// TODO(rb): we should thread a better value than "dc1" down here as that is going to sometimes show up in user facing errors
447449
req := discoverychain.CompileRequest{
448450
ServiceName: chainName,
449451
CurrentNamespace: "default",
450452
CurrentDatacenter: "dc1",
453+
UseInDatacenter: "dc1",
451454
Entries: speculativeEntries,
452455
}
453456
_, err = discoverychain.Compile(req)

agent/proxycfg/manager_test.go

+10-2
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func TestManager_BasicLifecycle(t *testing.T) {
4646
roots, leaf := TestCerts(t)
4747

4848
dbDefaultChain := func() *structs.CompiledDiscoveryChain {
49-
return discoverychain.TestCompileConfigEntries(t, "db", "default", "dc1",
49+
return discoverychain.TestCompileConfigEntries(t, "db", "default", "dc1", "dc1",
5050
func(req *discoverychain.CompileRequest) {
5151
// This is because structs.TestUpstreams uses an opaque config
5252
// to override connect timeouts.
@@ -59,7 +59,7 @@ func TestManager_BasicLifecycle(t *testing.T) {
5959
)
6060
}
6161
dbSplitChain := func() *structs.CompiledDiscoveryChain {
62-
return discoverychain.TestCompileConfigEntries(t, "db", "default", "dc1",
62+
return discoverychain.TestCompileConfigEntries(t, "db", "default", "dc1", "dc1",
6363
func(req *discoverychain.CompileRequest) {
6464
// This is because structs.TestUpstreams uses an opaque config
6565
// to override connect timeouts.
@@ -201,6 +201,10 @@ func TestManager_BasicLifecycle(t *testing.T) {
201201
"db.default.dc1": TestUpstreamNodes(t),
202202
},
203203
},
204+
WatchedGateways: nil, // Clone() clears this out
205+
WatchedGatewayEndpoints: map[string]map[string]structs.CheckServiceNodes{
206+
"db": {},
207+
},
204208
UpstreamEndpoints: map[string]structs.CheckServiceNodes{},
205209
},
206210
Datacenter: "dc1",
@@ -241,6 +245,10 @@ func TestManager_BasicLifecycle(t *testing.T) {
241245
"v2.db.default.dc1": TestUpstreamNodesAlternate(t),
242246
},
243247
},
248+
WatchedGateways: nil, // Clone() clears this out
249+
WatchedGatewayEndpoints: map[string]map[string]structs.CheckServiceNodes{
250+
"db": {},
251+
},
244252
UpstreamEndpoints: map[string]structs.CheckServiceNodes{},
245253
},
246254
Datacenter: "dc1",

agent/proxycfg/snapshot.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@ type configSnapshotConnectProxy struct {
1212
DiscoveryChain map[string]*structs.CompiledDiscoveryChain // this is keyed by the Upstream.Identifier(), not the chain name
1313
WatchedUpstreams map[string]map[string]context.CancelFunc
1414
WatchedUpstreamEndpoints map[string]map[string]structs.CheckServiceNodes
15-
UpstreamEndpoints map[string]structs.CheckServiceNodes // DEPRECATED:see:WatchedUpstreamEndpoints
15+
WatchedGateways map[string]map[string]context.CancelFunc
16+
WatchedGatewayEndpoints map[string]map[string]structs.CheckServiceNodes
17+
18+
UpstreamEndpoints map[string]structs.CheckServiceNodes // DEPRECATED:see:WatchedUpstreamEndpoints
1619
}
1720

1821
type configSnapshotMeshGateway struct {
@@ -74,6 +77,7 @@ func (s *ConfigSnapshot) Clone() (*ConfigSnapshot, error) {
7477
switch s.Kind {
7578
case structs.ServiceKindConnectProxy:
7679
snap.ConnectProxy.WatchedUpstreams = nil
80+
snap.ConnectProxy.WatchedGateways = nil
7781
case structs.ServiceKindMeshGateway:
7882
snap.MeshGateway.WatchedDatacenters = nil
7983
snap.MeshGateway.WatchedServices = nil

0 commit comments

Comments
 (0)