Skip to content

BugFix: 1.0 Ingesters having availability zone value erased by pre-1.0 ingesters during rolling upgrade. #2404

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* [BUGFIX] Experimental TSDB: fixed chunk data corruption when querying back series using the experimental blocks storage. #2400
* [BUGFIX] Cassandra Storage: Fix endpoint TLS host verification. #2109
* [BUGFIX] Experimental TSDB: fixed response status code from `422` to `500` when an error occurs while iterating chunks with the experimental blocks storage. #2402
* [BUGFIX] Ring: Fixed a situation where upgrading from pre-1.0 Cortex with a rolling strategy caused new 1.0 ingesters to lose their zone value in the ring until manually forced to re-register. #2404

## 1.0.0 / 2020-04-02

Expand Down
5 changes: 1 addition & 4 deletions pkg/ring/lifecycler.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,6 @@ func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, ringNa
util.WarnExperimentalUse("Zone aware replication")
}

if zone == "" {
zone = cfg.ID
}

// We do allow a nil FlushTransferer, but to keep the ring logic easier we assume
// it's always set, so we use a noop FlushTransferer
if flushTransferer == nil {
Expand Down Expand Up @@ -667,6 +663,7 @@ func (i *Lifecycler) updateConsul(ctx context.Context) error {
ingesterDesc.Timestamp = time.Now().Unix()
ingesterDesc.State = i.GetState()
ingesterDesc.Addr = i.Addr
ingesterDesc.Zone = i.Zone
ringDesc.Ingesters[i.ID] = ingesterDesc
}

Expand Down
55 changes: 55 additions & 0 deletions pkg/ring/lifecycler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func testLifecyclerConfig(ringConfig Config, id string) LifecyclerConfig {
lifecyclerConfig.RingConfig = ringConfig
lifecyclerConfig.NumTokens = 1
lifecyclerConfig.ID = id
lifecyclerConfig.Zone = "zone1"
lifecyclerConfig.FinalSleep = 0
lifecyclerConfig.HeartbeatPeriod = 100 * time.Millisecond

Expand Down Expand Up @@ -390,3 +391,57 @@ func TestJoinInLeavingState(t *testing.T) {
len(desc.Ingesters["ing2"].Tokens) == 2
})
}

// TestRestoreOfZoneWhenOverwritten verifies that a lifecycler writes its
// configured zone back into the ring when the stored entry for its ID has an
// empty Zone field.
//
// This simulates a rolling upgrade from pre-1.0 Cortex: older ingesters do not
// have the zone field in their ring structs, so the value gets removed when
// they write the ring. The current version of the lifecycler should write it
// back on update during its next heartbeat.
func TestRestoreOfZoneWhenOverwritten(t *testing.T) {
	var ringConfig Config
	flagext.DefaultValues(&ringConfig)
	codec := GetCodec()
	ringConfig.KVStore.Mock = consul.NewInMemoryClient(codec)

	r, err := New(ringConfig, "ingester", IngesterRingKey)
	require.NoError(t, err)
	require.NoError(t, services.StartAndAwaitRunning(context.Background(), r))
	defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck

	cfg := testLifecyclerConfig(ringConfig, "ing1")

	// Seed the ring with an "ing1" entry that has no zone, plus a second
	// ingester whose (empty) zone must be left untouched by ing1's lifecycler.
	err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (interface{}, bool, error) {
		// Named "seeded" rather than "r" so it does not shadow the ring above.
		seeded := &Desc{
			Ingesters: map[string]IngesterDesc{
				"ing1": {
					State:  ACTIVE,
					Addr:   "0.0.0.0",
					Tokens: []uint32{1, 4},
				},
				"ing2": {
					Tokens: []uint32{2, 3},
				},
			},
		}

		return seeded, true, nil
	})
	require.NoError(t, err)

	l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true)
	require.NoError(t, err)
	require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1))
	// Stop the lifecycler when the test ends so it does not keep running
	// (and writing to the KV store) after this test has returned.
	defer services.StopAndAwaitTerminated(context.Background(), l1) //nolint:errcheck

	// Check that the lifecycler was able to reset the zone value to the expected setting
	test.Poll(t, 1000*time.Millisecond, true, func() interface{} {
		d, err := r.KVClient.Get(context.Background(), IngesterRingKey)
		require.NoError(t, err)
		desc, ok := d.(*Desc)
		return ok &&
			len(desc.Ingesters) == 2 &&
			desc.Ingesters["ing1"].Zone == l1.Zone &&
			desc.Ingesters["ing2"].Zone == ""
	})
}