Skip to content

Commit c72fd7e

Browse files
authored
Merge pull request #53 from stuartleeks/sllglk/client-retry
Implement retry on transient errors
2 parents 8d25e5e + fc395f4 commit c72fd7e

File tree

2,415 files changed

+1126
-1283004
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

2,415 files changed

+1126
-1283004
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -327,4 +327,5 @@ website/**/*.tar.gz
327327
website/**/*.zip
328328
website/public/**
329329

330-
.vscode/private.env
330+
.vscode/private.env
331+
tf.log

client/service/client.go

+81-24
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package service
22

33
import (
44
"bytes"
5+
"context"
56
"crypto/tls"
67
"encoding/json"
78
"fmt"
@@ -10,9 +11,11 @@ import (
1011
"net/http"
1112
"net/url"
1213
"reflect"
14+
"strings"
1315
"time"
1416

1517
"github.com/google/go-querystring/query"
18+
"github.com/hashicorp/go-retryablehttp"
1619
)
1720

1821
// CloudServiceProvider is a custom type for different types of cloud service providers
@@ -62,25 +65,87 @@ type DBApiClientConfig struct {
6265
DefaultHeaders map[string]string
6366
InsecureSkipVerify bool
6467
TimeoutSeconds int
65-
client http.Client
68+
client *retryablehttp.Client
69+
}
70+
71+
var transientErrorStringMatches []string = []string{ // TODO: Should we make these regexes to match more of the message or is this sufficient?
72+
"com.databricks.backend.manager.util.UnknownWorkerEnvironmentException",
73+
"does not have any associated worker environments",
6674
}
6775

6876
// Setup initializes the client
6977
func (c *DBApiClientConfig) Setup() {
7078
if c.TimeoutSeconds == 0 {
7179
c.TimeoutSeconds = 60
7280
}
73-
c.client = http.Client{
74-
Timeout: time.Duration(time.Duration(c.TimeoutSeconds) * time.Second),
75-
Transport: &http.Transport{
76-
TLSClientConfig: &tls.Config{
77-
InsecureSkipVerify: c.InsecureSkipVerify,
81+
// Set up a retryable HTTP Client to handle cases where the service returns
82+
// a transient error on initial creation
83+
retryDelayDuration := 10 * time.Second
84+
retryMaximumDuration := 5 * time.Minute
85+
c.client = &retryablehttp.Client{
86+
HTTPClient: &http.Client{
87+
Timeout: time.Duration(time.Duration(c.TimeoutSeconds) * time.Second),
88+
Transport: &http.Transport{
89+
TLSClientConfig: &tls.Config{
90+
InsecureSkipVerify: c.InsecureSkipVerify,
91+
},
7892
},
7993
},
94+
CheckRetry: checkHTTPRetry,
95+
// Using a linear retry rather than the default exponential retry
96+
// as the creation condition is normally passed after 30-40 seconds
97+
// Setting the retry interval to 10 seconds. Setting RetryWaitMin and RetryWaitMax
98+
// to the same value removes jitter (which would be useful in a high-volume traffic scenario
99+
// but wouldn't add much here)
100+
Backoff: retryablehttp.LinearJitterBackoff,
101+
RetryWaitMin: retryDelayDuration,
102+
RetryWaitMax: retryDelayDuration,
103+
RetryMax: int(retryMaximumDuration / retryDelayDuration),
104+
}
105+
}
106+
107+
// checkHTTPRetry inspects HTTP errors from the Databricks API for known transient errors on Workspace creation
108+
func checkHTTPRetry(ctx context.Context, resp *http.Response, err error) (bool, error) {
109+
if resp == nil {
110+
// If response is nil we can't make retry choices.
111+
// In this case don't retry and return the original error from httpclient
112+
return false, err
113+
}
114+
if resp.StatusCode >= 400 {
115+
log.Printf("Failed request detected. Status Code: %v\n", resp.StatusCode)
116+
// reading the body means that the caller cannot read it themselves
117+
// But that's ok because we've hit an error case
118+
// Our job now is to
119+
// - capture the error and return it
120+
// - determine if the error is retryable
121+
122+
body, err := ioutil.ReadAll(resp.Body)
123+
if err != nil {
124+
return false, err
125+
}
126+
127+
var errorBody DBApiErrorBody
128+
err = json.Unmarshal(body, &errorBody)
129+
if err != nil {
130+
return false, fmt.Errorf("Response from server (%d) %s: %v", resp.StatusCode, string(body), err)
131+
}
132+
dbAPIError := DBApiError{
133+
ErrorBody: &errorBody,
134+
StatusCode: resp.StatusCode,
135+
Err: fmt.Errorf("Response from server %s", string(body)),
136+
}
137+
for _, substring := range transientErrorStringMatches {
138+
if strings.Contains(errorBody.Message, substring) {
139+
log.Println("Failed request detected: Retryable type found. Attempting retry...")
140+
return true, dbAPIError
141+
}
142+
}
143+
return false, dbAPIError
80144
}
145+
return false, nil
81146
}
82147

83-
func (c DBApiClientConfig) getAuthHeader() map[string]string {
148+
func (c *DBApiClientConfig) getAuthHeader() map[string]string {
84149
auth := make(map[string]string)
85150
if c.AuthType == BasicAuth {
86151
auth["Authorization"] = "Basic " + c.Token
@@ -91,7 +156,7 @@ func (c DBApiClientConfig) getAuthHeader() map[string]string {
91156
return auth
92157
}
93158

94-
func (c DBApiClientConfig) getUserAgentHeader() map[string]string {
159+
func (c *DBApiClientConfig) getUserAgentHeader() map[string]string {
95160
if reflect.ValueOf(c.UserAgent).IsZero() {
96161
return map[string]string{
97162
"User-Agent": "databricks-go-client-sdk",
@@ -102,7 +167,7 @@ func (c DBApiClientConfig) getUserAgentHeader() map[string]string {
102167
}
103168
}
104169

105-
func (c DBApiClientConfig) getDefaultHeaders() map[string]string {
170+
func (c *DBApiClientConfig) getDefaultHeaders() map[string]string {
106171
auth := c.getAuthHeader()
107172
userAgent := c.getUserAgentHeader()
108173

@@ -119,7 +184,7 @@ func (c DBApiClientConfig) getDefaultHeaders() map[string]string {
119184
return defaultHeaders
120185
}
121186

122-
func (c DBApiClientConfig) getRequestURI(path string, apiVersion string) (string, error) {
187+
func (c *DBApiClientConfig) getRequestURI(path string, apiVersion string) (string, error) {
123188
var apiVersionString string
124189
if apiVersion == "" {
125190
apiVersionString = "2.0"
@@ -189,6 +254,9 @@ func PerformQuery(config *DBApiClientConfig, method, path string, apiVersion str
189254
}
190255
}
191256
requestHeaders := config.getDefaultHeaders()
257+
if config.client == nil {
258+
config.Setup()
259+
}
192260

193261
if len(headers) > 0 {
194262
for k, v := range headers {
@@ -221,7 +289,7 @@ func PerformQuery(config *DBApiClientConfig, method, path string, apiVersion str
221289
auditNonGetPayload(method, requestURL, data, secretsMask)
222290
}
223291

224-
request, err := http.NewRequest(method, requestURL, bytes.NewBuffer(requestBody))
292+
request, err := retryablehttp.NewRequest(method, requestURL, bytes.NewBuffer(requestBody))
225293
if err != nil {
226294
return nil, err
227295
}
@@ -244,19 +312,8 @@ func PerformQuery(config *DBApiClientConfig, method, path string, apiVersion str
244312
if err != nil {
245313
return nil, err
246314
}
247-
248-
if resp.StatusCode >= 400 {
249-
var errorBody DBApiErrorBody
250-
err = json.Unmarshal(body, &errorBody)
251-
if err != nil {
252-
return nil, fmt.Errorf("Response from server (%d) %s", resp.StatusCode, string(body))
253-
}
254-
return nil, DBApiError{
255-
ErrorBody: &errorBody,
256-
StatusCode: resp.StatusCode,
257-
Err: fmt.Errorf("Response from server %s", string(body)),
258-
}
259-
}
315+
// Don't need to check the status code here as the RetryCheck for
316+
// retryablehttp.Client is doing that and returning an error
260317

261318
return body, nil
262319
}

client/service/main_test.go

+3
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ func GetIntegrationDBAPIClient() *DBApiClient {
4444
var config DBApiClientConfig
4545
config.Token = os.Getenv("DATABRICKS_TOKEN")
4646
config.Host = os.Getenv("DATABRICKS_HOST")
47+
config.Setup()
4748

4849
var c DBApiClient
4950
c.SetConfig(&config)
@@ -77,6 +78,7 @@ func AssertRequestWithMockServer(t *testing.T, rawPayloadArgs interface{}, reque
7778
defer server.Close()
7879
var config DBApiClientConfig
7980
config.Host = server.URL
81+
config.Setup()
8082

8183
var dbClient DBApiClient
8284
dbClient.SetConfig(&config)
@@ -116,6 +118,7 @@ func AssertMultipleRequestsWithMockServer(t *testing.T, rawPayloadArgs interface
116118
defer server.Close()
117119
var config DBApiClientConfig
118120
config.Host = server.URL
121+
config.Setup()
119122

120123
var dbClient DBApiClient
121124
dbClient.SetConfig(&config)

databricks/azure_auth.go

+9-52
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,13 @@ package databricks
33
import (
44
"encoding/json"
55
"fmt"
6-
"github.com/Azure/go-autorest/autorest/adal"
7-
"github.com/Azure/go-autorest/autorest/azure"
8-
"github.com/databrickslabs/databricks-terraform/client/service"
96
"log"
107
"net/http"
118
urlParse "net/url"
12-
"strings"
13-
"time"
9+
10+
"github.com/Azure/go-autorest/autorest/adal"
11+
"github.com/Azure/go-autorest/autorest/azure"
12+
"github.com/databrickslabs/databricks-terraform/client/service"
1413
)
1514

1615
// List of management information
@@ -213,53 +212,11 @@ func (a *AzureAuth) initWorkspaceAndGetClient(config *service.DBApiClientConfig)
213212
//}
214213
dbClient.SetConfig(&newOption)
215214

216-
// So when the workspace is initially created sometimes it fails to perform api calls so this is a simple test
217-
// to verify that the workspace has been created successfully. The retry is intentional as sometimes after workspace
218-
// creation the API's will not work correctly. This may also
219-
err = validateWorkspaceApis(10, 30, func(attempt int) error {
220-
_, err = dbClient.Clusters().ListNodeTypes()
221-
return err
222-
})
215+
// Spin for a while while the workspace comes up and starts behaving.
216+
_, err = dbClient.Clusters().ListNodeTypes()
217+
if err != nil {
218+
return dbClient, err
219+
}
223220

224221
return dbClient, err
225222
}
226-
227-
func validateWorkspaceApis(sleepDurationSeconds time.Duration, timeoutDurationMinutes time.Duration, do func(attempt int) error) error {
228-
errChan := make(chan error, 1)
229-
var timeoutBool = false
230-
var attempts int
231-
var expectedError error
232-
apisAreNotYetReadErr := "com.databricks.backend.manager.util.UnknownWorkerEnvironmentException: Unknown worker environment WorkerEnvId"
233-
go func(attempts *int, expectedError *error, timeout *bool) {
234-
for {
235-
err := do(*attempts)
236-
// Timeout and terminate go routine so it does not leak
237-
if *timeout {
238-
errChan <- err
239-
return
240-
}
241-
if err == nil {
242-
errChan <- err
243-
return
244-
}
245-
if !strings.Contains(err.Error(), apisAreNotYetReadErr) {
246-
errChan <- err
247-
return
248-
}
249-
log.Println(fmt.Sprintf("Waiting for cluster apis to not throw error: %v", err))
250-
*attempts++
251-
*expectedError = err
252-
time.Sleep(sleepDurationSeconds * time.Second)
253-
}
254-
}(&attempts, &expectedError, &timeoutBool)
255-
select {
256-
case err := <-errChan:
257-
if err == nil {
258-
log.Printf("Returned nil error after %v attempts\n", attempts)
259-
}
260-
return err
261-
case <-time.After(timeoutDurationMinutes * time.Minute):
262-
timeoutBool = true
263-
return fmt.Errorf("timed out waiting for ready state after %v attempts with error %v", attempts, expectedError)
264-
}
265-
}

databricks/azure_auth_test.go

+3-36
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
package databricks
22

33
import (
4-
"errors"
4+
"os"
5+
"testing"
6+
57
"github.com/databrickslabs/databricks-terraform/client/model"
68
"github.com/databrickslabs/databricks-terraform/client/service"
79
"github.com/stretchr/testify/assert"
8-
"log"
9-
"os"
10-
"testing"
1110
)
1211

1312
func GetIntegrationDBClientOptions() *service.DBApiClientConfig {
@@ -58,35 +57,3 @@ func TestAzureAuthCreateApiToken(t *testing.T) {
5857

5958
assert.NoError(t, instancePoolErr, instancePoolErr)
6059
}
61-
62-
func TestValidateWorkspaceApis(t *testing.T) {
63-
64-
// Eventually will pass after a few attempts
65-
err := validateWorkspaceApis(0, 1, func(attempt int) error {
66-
log.Printf("Attempt: %v hello world\n", attempt)
67-
if attempt == 2 {
68-
return nil
69-
}
70-
return errors.New("com.databricks.backend.manager.util.UnknownWorkerEnvironmentException: Unknown worker environment WorkerEnvId")
71-
})
72-
assert.NoError(t, err, err)
73-
74-
// Eventually will fail with an error after a few attempts
75-
expectedError := errors.New("failing valid error")
76-
err = validateWorkspaceApis(0, 1, func(attempt int) error {
77-
log.Printf("Attempt: %v hello world\n", attempt)
78-
if attempt == 2 {
79-
return expectedError
80-
}
81-
return errors.New("com.databricks.backend.manager.util.UnknownWorkerEnvironmentException: Unknown worker environment WorkerEnvId")
82-
})
83-
assert.Error(t, err, err)
84-
85-
// Eventually will timeout with an error after 0 attempts
86-
//expectedError := errors.New("failing valid error")
87-
err = validateWorkspaceApis(0, 0, func(attempt int) error {
88-
log.Printf("Attempt: %v hello world\n", attempt)
89-
return errors.New("com.databricks.backend.manager.util.UnknownWorkerEnvironmentException: Unknown worker environment WorkerEnvId")
90-
})
91-
assert.Error(t, err, err)
92-
}

databricks/provider.go

+3
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,9 @@ func providerConfigureAzureClient(d *schema.ResourceData, providerVersion string
177177

178178
func providerConfigure(d *schema.ResourceData, providerVersion string) (interface{}, error) {
179179
var config service.DBApiClientConfig
180+
// Call setup to configure retryable httpclient
181+
config.Setup()
182+
180183
if _, ok := d.GetOk("azure_auth"); !ok {
181184
if host, ok := d.GetOk("host"); ok {
182185
config.Host = host.(string)

go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ require (
66
github.com/Azure/go-autorest/autorest v0.10.0
77
github.com/Azure/go-autorest/autorest/adal v0.8.3
88
github.com/google/go-querystring v1.0.0
9+
github.com/hashicorp/go-retryablehttp v0.6.6
910
github.com/hashicorp/hcl v1.0.0 // indirect
1011
github.com/hashicorp/terraform-plugin-sdk v1.12.0
1112
github.com/joho/godotenv v1.3.0

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ github.com/hashicorp/go-multierror v1.0.0 h1:iVjPR7a6H0tWELX5NxNe7bYopibicUzc7uP
101101
github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk=
102102
github.com/hashicorp/go-plugin v1.0.1 h1:4OtAfUGbnKC6yS48p0CtMX2oFYtzFZVv6rok3cRWgnE=
103103
github.com/hashicorp/go-plugin v1.0.1/go.mod h1:++UyYGoz3o5w9ZzAdZxtQKrWWP+iqPBn3cQptSMzBuY=
104+
github.com/hashicorp/go-retryablehttp v0.6.6 h1:HJunrbHTDDbBb/ay4kxa1n+dLmttUlnP3V9oNE4hmsM=
105+
github.com/hashicorp/go-retryablehttp v0.6.6/go.mod h1:vAew36LZh98gCBJNLH42IQ1ER/9wtLZZ8meHqQvEYWY=
104106
github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhEyExpmo=
105107
github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I=
106108
github.com/hashicorp/go-uuid v1.0.1 h1:fv1ep09latC32wFoVwnqcnKJGnMSdBanPczbHAYm1BE=

integration-environment-azure/run.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
set -e
33
cd $(dirname "$0")
44

5-
# export SKIP_CLEANUP=true
6-
75
function cleanup()
86
{
97
echo -e "----> Destroy prereqs \n\n"
@@ -42,5 +40,7 @@ export TEST_LOCATION=$(terraform output location)
4240

4341

4442
echo -e "----> Running Azure Acceptance Tests \n\n"
43+
# Output debug log to file while tests run
44+
export TF_LOG_PATH=$PWD/tf.log
4545
# Run all Azure integration tests
4646
TF_LOG=debug TF_ACC=1 gotestsum --format short-verbose --raw-command go test -v -json -short -coverprofile=coverage.out -test.timeout 35m -run 'TestAccAzure' ./../...

0 commit comments

Comments
 (0)