Skip to content

Commit 2f2b6fb

Browse files
authored
Add retries to ILM client (#7120)
## Which problem is this PR solving? - Resolves #6838 ## Description of the changes - Added simple retry logic with exponential backoff to the Exists method in ILMClient to improve resilience. This helps prevent random test failures and mitigates issues caused by transient network errors. ## How was this change tested? - All existing tests for ILMClient passed. - An additional unit test was added to verify that if the HTTP request fails a few times, the client correctly retries and eventually completes the Exists operation successfully once the service becomes healthy. ## Checklist - [x] I have read https://github.com/jaegertracing/jaeger/blob/master/CONTRIBUTING_GUIDELINES.md - [x] I have signed all commits - [x] I have added unit tests for the new functionality - [x] I have run lint and test steps successfully - for `jaeger`: `make lint test` - for `jaeger-ui`: `npm run lint` and `npm run test` --------- Signed-off-by: Ilia Yavorov Petrov <[email protected]> Signed-off-by: Ilia Petrov <[email protected]>
1 parent 7b6f356 commit 2f2b6fb

File tree

3 files changed

+69
-5
lines changed

3 files changed

+69
-5
lines changed

cmd/es-rollover/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ func main() {
5454
}
5555
ilmClient := &client.ILMClient{
5656
Client: c,
57+
Logger: logger,
5758
}
5859
return &initialize.Action{
5960
IndicesClient: indicesClient,

internal/storage/elasticsearch/client/ilm_client.go

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,19 @@
44
package client
55

66
import (
7+
"context"
78
"errors"
89
"fmt"
910
"net/http"
11+
"time"
12+
13+
"github.com/cenkalti/backoff/v5"
14+
"go.uber.org/zap"
15+
)
16+
17+
const (
18+
maxTries = 3
19+
maxElapsedTime = 5 * time.Second
1020
)
1121

1222
var _ IndexManagementLifecycleAPI = (*ILMClient)(nil)
@@ -15,22 +25,39 @@ var _ IndexManagementLifecycleAPI = (*ILMClient)(nil)
1525
type ILMClient struct {
1626
Client
1727
MasterTimeoutSeconds int
28+
Logger *zap.Logger
1829
}
1930

2031
// Exists verify if a ILM policy exists
2132
func (i ILMClient) Exists(name string) (bool, error) {
22-
_, err := i.request(elasticRequest{
23-
endpoint: "_ilm/policy/" + name,
24-
method: http.MethodGet,
25-
})
33+
operation := func() ([]byte, error) {
34+
bytes, err := i.request(elasticRequest{
35+
endpoint: "_ilm/policy/" + name,
36+
method: http.MethodGet,
37+
})
38+
if err != nil {
39+
i.Logger.Warn("Retryable error while getting ILM policy",
40+
zap.String("name", name),
41+
zap.Error(err),
42+
)
43+
return bytes, err
44+
}
45+
return bytes, nil
46+
}
47+
48+
_, err := backoff.Retry(
49+
context.TODO(),
50+
operation,
51+
backoff.WithMaxTries(maxTries),
52+
backoff.WithMaxElapsedTime(maxElapsedTime),
53+
)
2654

2755
var respError ResponseError
2856
if errors.As(err, &respError) {
2957
if respError.StatusCode == http.StatusNotFound {
3058
return false, nil
3159
}
3260
}
33-
3461
if err != nil {
3562
return false, fmt.Errorf("failed to get ILM policy: %s, %w", name, err)
3663
}

internal/storage/elasticsearch/client/ilm_client_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111

1212
"github.com/stretchr/testify/assert"
1313
"github.com/stretchr/testify/require"
14+
"go.uber.org/zap"
1415
)
1516

1617
func TestExists(t *testing.T) {
@@ -55,6 +56,7 @@ func TestExists(t *testing.T) {
5556
Endpoint: testServer.URL,
5657
BasicAuth: "foobar",
5758
},
59+
Logger: zap.NewNop(),
5860
}
5961
result, err := c.Exists("jaeger-ilm-policy")
6062
if test.errContains != "" {
@@ -64,3 +66,37 @@ func TestExists(t *testing.T) {
6466
})
6567
}
6668
}
69+
70+
func TestExists_Retries(t *testing.T) {
71+
var callCount int
72+
73+
testServer := httptest.NewServer(http.HandlerFunc(func(res http.ResponseWriter, req *http.Request) {
74+
callCount++
75+
assert.True(t, strings.HasSuffix(req.URL.String(), "_ilm/policy/jaeger-ilm-policy"))
76+
assert.Equal(t, http.MethodGet, req.Method)
77+
assert.Equal(t, "Basic foobar", req.Header.Get("Authorization"))
78+
79+
if callCount < maxTries {
80+
res.WriteHeader(http.StatusInternalServerError)
81+
res.Write([]byte(`{"error": "server error"}`))
82+
return
83+
}
84+
85+
res.WriteHeader(http.StatusOK)
86+
}))
87+
defer testServer.Close()
88+
89+
c := &ILMClient{
90+
Client: Client{
91+
Client: testServer.Client(),
92+
Endpoint: testServer.URL,
93+
BasicAuth: "foobar",
94+
},
95+
Logger: zap.NewNop(),
96+
}
97+
98+
result, err := c.Exists("jaeger-ilm-policy")
99+
require.NoError(t, err)
100+
assert.True(t, result)
101+
assert.Equal(t, maxTries, callCount, "should retry twice before succeeding")
102+
}

0 commit comments

Comments
 (0)