Skip to content

Commit 0b51080

Browse files
authored
update defaultReplicationStrategy to not fail with extend-write when a single instance is unhealthy (#4636)
Signed-off-by: Roy Chiang <[email protected]>
1 parent ffc6158 commit 0b51080

File tree

3 files changed

+16
-17
lines changed

3 files changed

+16
-17
lines changed

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
* [BUGFIX] Set appropriate `Content-Type` header for /services endpoint, which previously hard-coded `text/plain`. #4596
2727
* [BUGFIX] Querier: Disable query scheduler SRV DNS lookup, which removes noisy log messages about "failed DNS SRV record lookup". #4601
2828
* [BUGFIX] Memberlist: fixed corrupted packets when sending compound messages with more than 255 messages or messages bigger than 64KB. #4601
29-
* [BUGIX] Query Frontend: If 'LogQueriesLongerThan' is set to < 0, log all queries as described in the docs. #4633
29+
* [BUGFIX] Query Frontend: If 'LogQueriesLongerThan' is set to < 0, log all queries as described in the docs. #4633
30+
* [BUGFIX] Distributor: update defaultReplicationStrategy to not fail with extend-write when a single instance is unhealthy. #4636
3031

3132
## 1.11.0 2021-11-25
3233

pkg/ring/replication_strategy.go

+9-11
Original file line numberDiff line numberDiff line change
@@ -26,19 +26,9 @@ func NewDefaultReplicationStrategy() ReplicationStrategy {
2626
// - Checks there are enough instances for an operation to succeed.
2727
// The instances argument may be overwritten.
2828
func (s *defaultReplicationStrategy) Filter(instances []InstanceDesc, op Operation, replicationFactor int, heartbeatTimeout time.Duration, zoneAwarenessEnabled bool) ([]InstanceDesc, int, error) {
29-
// We need a response from a quorum of instances, which is n/2 + 1. In the
30-
// case of a node joining/leaving, the actual replica set might be bigger
31-
// than the replication factor, so use the bigger or the two.
32-
if len(instances) > replicationFactor {
33-
replicationFactor = len(instances)
34-
}
35-
36-
minSuccess := (replicationFactor / 2) + 1
3729
now := time.Now()
3830

39-
// Skip those that have not heartbeated in a while. NB these are still
40-
// included in the calculation of minSuccess, so if too many failed instances
41-
// will cause the whole write to fail.
31+
// Skip those that have not heartbeated in a while.
4232
var unhealthy []string
4333
for i := 0; i < len(instances); {
4434
if instances[i].IsHealthy(op, heartbeatTimeout, now) {
@@ -49,6 +39,14 @@ func (s *defaultReplicationStrategy) Filter(instances []InstanceDesc, op Operati
4939
}
5040
}
5141

42+
// We need a response from a quorum of instances, which is n/2 + 1. In the
43+
// case of a node joining/leaving with extend-writes enabled, the actual replica
44+
// set will be bigger than the replication factor, so use the bigger of the two.
45+
if len(instances) > replicationFactor {
46+
replicationFactor = len(instances)
47+
}
48+
49+
minSuccess := (replicationFactor / 2) + 1
5250
// This is just a shortcut - if there are not minSuccess available instances,
5351
// after filtering out dead ones, don't even bother trying.
5452
if len(instances) < minSuccess {

pkg/ring/replication_strategy_test.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,14 @@ func TestRingReplicationStrategy(t *testing.T) {
6868
replicationFactor: 3,
6969
liveIngesters: 3,
7070
deadIngesters: 1,
71-
expectedMaxFailure: 0,
71+
expectedMaxFailure: 1,
7272
},
7373

7474
{
75-
replicationFactor: 3,
76-
liveIngesters: 2,
77-
deadIngesters: 2,
78-
expectedError: "at least 3 live replicas required, could only find 2 - unhealthy instances: dead1,dead2",
75+
replicationFactor: 3,
76+
liveIngesters: 2,
77+
deadIngesters: 2,
78+
expectedMaxFailure: 0,
7979
},
8080
} {
8181
ingesters := []InstanceDesc{}

0 commit comments

Comments
 (0)