Skip to content

Commit 8d8b3f5

Browse files
Vitess: ignore unhealthy replicas with realtime stats (#136)
* Ignore Vitess replicas without running replication * Fix typo * Method rename * Add to comment * Copy logic vtgate uses to filter tablets, minus lag+tablet count * Only check if &TabletRealtimeStats{} is nil, not the HealthError * Add test for nil realtime stats * Do not ignore 'serving: false' tablets * Improve test comments
1 parent 67e708a commit 8d8b3f5

File tree

2 files changed

+70
-19
lines changed

2 files changed

+70
-19
lines changed

pkg/vitess/api_client.go

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,25 @@ import (
1515

1616
const defaultTimeout = time.Duration(5) * time.Second
1717

18+
// TabletRealtimeStats represents realtime stats from a running instance of vttablet.
// Only the health error (JSON key "health_error") is decoded. Tablet.IsServeable checks
// whether the pointer to this struct is nil; it does not inspect HealthError.
19+
type TabletRealtimeStats struct {
20+
HealthError string `json:"health_error,omitempty"`
21+
}
22+
23+
// TabletStats represents stats from a running instance of vttablet.
24+
type TabletStats struct {
25+
LastError string `json:"last_error,omitempty"`
26+
Realtime *TabletRealtimeStats `json:"realtime,omitempty"`
27+
Serving bool `json:"serving,omitempty"`
28+
Up bool `json:"up,omitempty"`
29+
}
30+
1831
// Tablet represents information about a running instance of vttablet.
// Stats is a pointer so that a payload without a "stats" object leaves it nil; IsServeable
// treats a nil Stats as healthy.
1932
type Tablet struct {
2033
Alias *topodata.TabletAlias `json:"alias,omitempty"`
2134
MysqlHostname string `json:"mysql_hostname,omitempty"`
2235
MysqlPort int32 `json:"mysql_port,omitempty"`
36+
Stats *TabletStats `json:"stats,omitempty"`
2337
Type topodata.TabletType `json:"type,omitempty"`
2438
}
2539

@@ -36,9 +50,23 @@ func (t Tablet) HasValidCell(validCells []string) bool {
3650
return false
3751
}
3852

53+
// IsServeable returns a bool reflecting if a tablet is eligible to serve traffic based on tablet stats. For
54+
// backwards-compatibility, tablets are assumed to be healthy if realtime stats are disabled (Stats is nil). This method aims
55+
// to mimic the logic used by vtgate to select tablets for read queries without considering 'serving', minimum
56+
// tablet count (not important to freno) and replication lag (freno polls its own replication lag).
// When Stats is present, the tablet is serveable only if LastError is empty and Realtime is non-nil;
// Realtime.HealthError itself is not inspected.
57+
func (t Tablet) IsServeable() bool {
58+
if t.Stats != nil {
59+
return t.Stats.LastError == "" && t.Stats.Realtime != nil
60+
}
61+
return true
62+
}
63+
3964
// IsValidReplica returns a bool reflecting if a tablet is of type REPLICA and is eligible to serve traffic per IsServeable
4065
func (t Tablet) IsValidReplica() bool {
41-
return t.Type == topodata.TabletType_REPLICA
66+
if t.Type != topodata.TabletType_REPLICA {
67+
return false
68+
}
69+
return t.IsServeable()
4270
}
4371

4472
var httpClient = http.Client{

pkg/vitess/api_client_test.go

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package vitess
22

33
import (
44
"encoding/json"
5-
"fmt"
65
"net/http"
76
"net/http/httptest"
87
"testing"
@@ -16,44 +15,68 @@ func TestParseTablets(t *testing.T) {
1615
vitessApi := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
1716
switch r.URL.String() {
1817
case "/api/keyspace/test/tablets/00", "/api/keyspace/test/tablets/00?cells=cell2":
19-
data, _ := json.Marshal([]Tablet{
18+
json.NewEncoder(w).Encode([]Tablet{
2019
{
21-
Alias: &topodata.TabletAlias{Cell: "cell1"},
22-
MysqlHostname: "master",
23-
Type: topodata.TabletType_MASTER,
20+
// primary (should be ignored)
21+
Alias: &topodata.TabletAlias{Cell: "cell1"},
22+
Type: topodata.TabletType_MASTER,
2423
},
2524
{
25+
// replica without realtime tablet stats enabled (assumed to be healthy)
2626
Alias: &topodata.TabletAlias{Cell: "cell2"},
2727
MysqlHostname: "replica1",
2828
Type: topodata.TabletType_REPLICA,
2929
},
3030
{
31+
// replica with healthy realtime tablet stats
3132
Alias: &topodata.TabletAlias{Cell: "cell3"},
3233
MysqlHostname: "replica2",
33-
Type: topodata.TabletType_REPLICA,
34+
Stats: &TabletStats{
35+
Realtime: &TabletRealtimeStats{},
36+
},
37+
Type: topodata.TabletType_REPLICA,
38+
},
39+
{
40+
// replica with nil realtime stats (should be ignored)
41+
Alias: &topodata.TabletAlias{Cell: "cell1"},
42+
MysqlHostname: "replica3",
43+
Stats: &TabletStats{
44+
Realtime: nil,
45+
},
3446
},
3547
{
48+
// replica with realtime tablet stats and 'replication not running' error (should be ignored)
3649
Alias: &topodata.TabletAlias{Cell: "cell2"},
37-
MysqlHostname: "spare",
38-
Type: topodata.TabletType_SPARE,
50+
MysqlHostname: "replica4",
51+
Stats: &TabletStats{
52+
LastError: "vttablet error: replication is not running",
53+
Realtime: &TabletRealtimeStats{
54+
HealthError: "replication is not running",
55+
},
56+
},
57+
Type: topodata.TabletType_REPLICA,
3958
},
4059
{
41-
Alias: &topodata.TabletAlias{Cell: "cell3"},
42-
MysqlHostname: "batch",
43-
Type: topodata.TabletType_BATCH,
60+
// spare tablet (should be ignored)
61+
Alias: &topodata.TabletAlias{Cell: "cell2"},
62+
Type: topodata.TabletType_SPARE,
4463
},
4564
{
46-
Alias: &topodata.TabletAlias{Cell: "cell2"},
47-
MysqlHostname: "backup",
48-
Type: topodata.TabletType_BACKUP,
65+
// batch tablet (should be ignored)
66+
Alias: &topodata.TabletAlias{Cell: "cell3"},
67+
Type: topodata.TabletType_BATCH,
4968
},
5069
{
51-
Alias: &topodata.TabletAlias{Cell: "cell1"},
52-
MysqlHostname: "restore",
53-
Type: topodata.TabletType_RESTORE,
70+
// backup tablet (should be ignored)
71+
Alias: &topodata.TabletAlias{Cell: "cell2"},
72+
Type: topodata.TabletType_BACKUP,
73+
},
74+
{
75+
// restore tablet (should be ignored)
76+
Alias: &topodata.TabletAlias{Cell: "cell1"},
77+
Type: topodata.TabletType_RESTORE,
5478
},
5579
})
56-
fmt.Fprint(w, string(data))
5780
default:
5881
w.WriteHeader(http.StatusNotFound)
5982
}

0 commit comments

Comments
 (0)