Skip to content

Commit 135c360

Browse files
yichengq authored and Michal Witkowski committed
etcdctl: refactor the way to check cluster health
This method uses the raft status exposed at /debug/vars to determine the health of the cluster. The cluster is considered healthy if the commit index increases between two samples, and a member is considered healthy if its match index increases. This could fix bug etcd-io#2711, where an unhealthy follower went undetected, because the check no longer relies on whether a message was sent on the long-polling connection. This health check is stricter than the old one, and reflects whether followers are healthy from the leader's point of view. For example, a follower that is receiving a snapshot will be reported as unhealthy because it is not moving forward. `etcdctl cluster-health` reflects health at the raft level, while connectivity checks reflect health at the transport level.
1 parent d7f9f83 commit 135c360

File tree

1 file changed

+42
-19
lines changed

1 file changed

+42
-19
lines changed

etcdctl/command/cluster_health.go

+42-19
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111

1212
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
1313
"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
14-
"github.com/coreos/etcd/etcdserver/stats"
1514
)
1615

1716
func NewClusterHealthCommand() cli.Command {
@@ -42,7 +41,7 @@ func handleClusterHealth(c *cli.Context) {
4241

4342
// check the /health endpoint of all members first
4443

45-
ep, ls0, err := getLeaderStats(tr, cl)
44+
ep, rs0, err := getLeaderStatus(tr, cl)
4645
if err != nil {
4746
fmt.Println("cluster may be unhealthy: failed to connect", cl)
4847
os.Exit(1)
@@ -51,27 +50,31 @@ func handleClusterHealth(c *cli.Context) {
5150
time.Sleep(time.Second)
5251

5352
// are all the members making progress?
54-
_, ls1, err := getLeaderStats(tr, []string{ep})
53+
_, rs1, err := getLeaderStatus(tr, []string{ep})
5554
if err != nil {
5655
fmt.Println("cluster is unhealthy")
5756
os.Exit(1)
5857
}
5958

60-
fmt.Println("cluster is healthy")
61-
// self is healthy
59+
if rs1.Commit > rs0.Commit {
60+
fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
61+
} else {
62+
fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
63+
}
64+
fmt.Printf("leader is %v\n", rs0.Lead)
65+
6266
var prints []string
6367

64-
prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
65-
for name, fs0 := range ls0.Followers {
66-
fs1, ok := ls1.Followers[name]
68+
for id, pr0 := range rs0.Progress {
69+
pr1, ok := rs1.Progress[id]
6770
if !ok {
6871
fmt.Println("Cluster configuration changed during health checking. Please retry.")
6972
os.Exit(1)
7073
}
71-
if fs1.Counts.Success <= fs0.Counts.Success {
72-
prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
74+
if pr1.Match <= pr0.Match {
75+
prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
7376
} else {
74-
prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
77+
prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
7578
}
7679
}
7780

@@ -82,15 +85,32 @@ func handleClusterHealth(c *cli.Context) {
8285
os.Exit(0)
8386
}
8487

85-
func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
86-
// go-etcd does not support cluster stats, use http client for now
87-
// TODO: use new etcd client with new member/stats endpoint
88+
type raftStatus struct {
89+
ID string `json:"id"`
90+
Term uint64 `json:"term"`
91+
Vote string `json:"vote"`
92+
Commit uint64 `json:"commit"`
93+
Lead string `json:"lead"`
94+
RaftState string `json:"raftState"`
95+
Progress map[string]struct {
96+
Match uint64 `json:"match"`
97+
Next uint64 `json:"next"`
98+
State string `json:"state"`
99+
} `json:"progress"`
100+
}
101+
102+
type vars struct {
103+
RaftStatus raftStatus `json:"raft.status"`
104+
}
105+
106+
func getLeaderStatus(tr *http.Transport, endpoints []string) (string, raftStatus, error) {
107+
// TODO: use new etcd client
88108
httpclient := http.Client{
89109
Transport: tr,
90110
}
91111

92112
for _, ep := range endpoints {
93-
resp, err := httpclient.Get(ep + "/v2/stats/leader")
113+
resp, err := httpclient.Get(ep + "/debug/vars")
94114
if err != nil {
95115
continue
96116
}
@@ -99,13 +119,16 @@ func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.Lead
99119
continue
100120
}
101121

102-
ls := &stats.LeaderStats{}
122+
vs := &vars{}
103123
d := json.NewDecoder(resp.Body)
104-
err = d.Decode(ls)
124+
err = d.Decode(vs)
105125
if err != nil {
106126
continue
107127
}
108-
return ep, ls, nil
128+
if vs.RaftStatus.Lead != vs.RaftStatus.ID {
129+
continue
130+
}
131+
return ep, vs.RaftStatus, nil
109132
}
110-
return "", nil, errors.New("no leader")
133+
return "", raftStatus{}, errors.New("no leader")
111134
}

0 commit comments

Comments
 (0)