Skip to content

Commit f09af53

Browse files
schristoffbanks
andauthored
Remove failed nodes from serfWAN (#6028)
* Prune Servers from WAN and LAN * cleaned up and fixed LAN to WAN * moving things around * force-leave remove from serfWAN, create pruneSerfWAN * removed serfWAN remove, reduced complexity, fixed comments * add another place to remove from serfWAN * add nil check * Update agent/consul/server.go Co-Authored-By: Paul Banks <[email protected]>
1 parent 81d47a8 commit f09af53

File tree

3 files changed

+29
-10
lines changed

3 files changed

+29
-10
lines changed

agent/consul/autopilot.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ func (d *AutopilotDelegate) Raft() *raft.Raft {
7777
return d.server.raft
7878
}
7979

80-
func (d *AutopilotDelegate) Serf() *serf.Serf {
80+
func (d *AutopilotDelegate) SerfLAN() *serf.Serf {
8181
return d.server.serfLAN
8282
}
83+
84+
func (d *AutopilotDelegate) SerfWAN() *serf.Serf {
85+
return d.server.serfWAN
86+
}

agent/consul/autopilot/autopilot.go

+18-9
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ type Delegate interface {
2222
NotifyHealth(OperatorHealthReply)
2323
PromoteNonVoters(*Config, OperatorHealthReply) ([]raft.Server, error)
2424
Raft() *raft.Raft
25-
Serf() *serf.Serf
25+
SerfLAN() *serf.Serf
26+
SerfWAN() *serf.Serf
2627
}
2728

2829
// Autopilot is a mechanism for automatically managing the Raft
@@ -182,7 +183,7 @@ func (a *Autopilot) pruneDeadServers() error {
182183

183184
// Failed servers are known to Serf and marked failed, and stale servers
184185
// are known to Raft but not Serf.
185-
var failed []string
186+
var failed []serf.Member
186187
staleRaftServers := make(map[string]raft.Server)
187188
raftNode := a.delegate.Raft()
188189
future := raftNode.GetConfiguration()
@@ -194,8 +195,8 @@ func (a *Autopilot) pruneDeadServers() error {
194195
for _, server := range raftConfig.Servers {
195196
staleRaftServers[string(server.Address)] = server
196197
}
197-
198-
serfLAN := a.delegate.Serf()
198+
serfWAN := a.delegate.SerfWAN()
199+
serfLAN := a.delegate.SerfLAN()
199200
for _, member := range serfLAN.Members() {
200201
server, err := a.delegate.IsServer(member)
201202
if err != nil {
@@ -214,8 +215,12 @@ func (a *Autopilot) pruneDeadServers() error {
214215
if found && s.Suffrage == raft.Nonvoter {
215216
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", member.Name)
216217
go serfLAN.RemoveFailedNode(member.Name)
218+
if serfWAN != nil {
219+
go serfWAN.RemoveFailedNode(member.Name)
220+
}
217221
} else {
218-
failed = append(failed, member.Name)
222+
failed = append(failed, member)
223+
219224
}
220225
}
221226
}
@@ -231,8 +236,12 @@ func (a *Autopilot) pruneDeadServers() error {
231236
peers := NumPeers(raftConfig)
232237
if removalCount < peers/2 {
233238
for _, node := range failed {
234-
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node)
235-
go serfLAN.RemoveFailedNode(node)
239+
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
240+
go serfLAN.RemoveFailedNode(node.Name)
241+
if serfWAN != nil {
242+
go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
243+
}
244+
236245
}
237246

238247
minRaftProtocol, err := a.MinRaftProtocol()
@@ -260,7 +269,7 @@ func (a *Autopilot) pruneDeadServers() error {
260269

261270
// MinRaftProtocol returns the lowest supported Raft protocol among alive servers
262271
func (a *Autopilot) MinRaftProtocol() (int, error) {
263-
return minRaftProtocol(a.delegate.Serf().Members(), a.delegate.IsServer)
272+
return minRaftProtocol(a.delegate.SerfLAN().Members(), a.delegate.IsServer)
264273
}
265274

266275
func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (*ServerInfo, error)) (int, error) {
@@ -369,7 +378,7 @@ func (a *Autopilot) updateClusterHealth() error {
369378
// Get the the serf members which are Consul servers
370379
var serverMembers []serf.Member
371380
serverMap := make(map[string]*ServerInfo)
372-
for _, member := range a.delegate.Serf().Members() {
381+
for _, member := range a.delegate.SerfLAN().Members() {
373382
if member.Status == serf.StatusLeft {
374383
continue
375384
}

agent/consul/server.go

+6
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"path/filepath"
1414
"reflect"
1515
"strconv"
16+
"strings"
1617
"sync"
1718
"sync/atomic"
1819
"time"
@@ -1003,6 +1004,11 @@ func (s *Server) RemoveFailedNode(node string) error {
10031004
if err := s.serfLAN.RemoveFailedNode(node); err != nil {
10041005
return err
10051006
}
1007+
// The Serf WAN pool stores members as node.datacenter
1008+
// so the dc is appended if not present
1009+
if !strings.HasSuffix(node, "."+s.config.Datacenter) {
1010+
node = node + "." + s.config.Datacenter
1011+
}
10061012
if s.serfWAN != nil {
10071013
if err := s.serfWAN.RemoveFailedNode(node); err != nil {
10081014
return err

0 commit comments

Comments
 (0)