From a135db34b5a0d1e3e0dc774d890532008ef5756f Mon Sep 17 00:00:00 2001 From: Matt Dale <9760375+matthewdale@users.noreply.github.com> Date: Tue, 9 Apr 2024 10:20:40 -0700 Subject: [PATCH 1/2] GODRIVER-3145 Don't retry on context timeout or cancellation. --- mongo/integration/client_test.go | 82 ++++++++++++++++++++++++++++++++ x/mongo/driver/operation.go | 7 +++ 2 files changed, 89 insertions(+) diff --git a/mongo/integration/client_test.go b/mongo/integration/client_test.go index 76a6d5121a..80635d179a 100644 --- a/mongo/integration/client_test.go +++ b/mongo/integration/client_test.go @@ -768,6 +768,88 @@ func TestClient(t *testing.T) { "expected 'OP_MSG' OpCode in wire message, got %q", pair.Sent.OpCode.String()) } }) + + opts := mtest.NewOptions(). + // Blocking failpoints don't work on pre-4.2 and sharded clusters. + Topologies(mtest.Single, mtest.ReplicaSet). + MinServerVersion("4.2"). + // Expliticly enable retryable reads and retryable writes. + ClientOptions(options.Client().SetRetryReads(true).SetRetryWrites(true)) + mt.RunOpts("operations don't retry after a context timeout", opts, func(mt *mtest.T) { + testCases := []struct { + desc string + failPoint mtest.FailPoint + operation func(context.Context, *mongo.Collection) error + }{ + { + desc: "read op", + failPoint: mtest.FailPoint{ + ConfigureFailPoint: "failCommand", + Mode: "alwaysOn", + Data: mtest.FailPointData{ + FailCommands: []string{"find"}, + BlockConnection: true, + BlockTimeMS: 500, + }, + }, + operation: func(ctx context.Context, coll *mongo.Collection) error { + return coll.FindOne(ctx, bson.D{}).Err() + }, + }, + { + desc: "write op", + failPoint: mtest.FailPoint{ + ConfigureFailPoint: "failCommand", + Mode: "alwaysOn", + Data: mtest.FailPointData{ + FailCommands: []string{"insert"}, + BlockConnection: true, + BlockTimeMS: 500, + }, + }, + operation: func(ctx context.Context, coll *mongo.Collection) error { + _, err := coll.InsertOne(ctx, bson.D{}) + return err + }, + }, + } + + for _, tc := range testCases { + mt.Run(tc.desc, func(t *mtest.T) { + _, err := mt.Coll.InsertOne(context.Background(), bson.D{}) + require.NoError(mt, err) + + mt.SetFailPoint(tc.failPoint) + + mt.ClearEvents() + + for i := 0; i < 50; i++ { + // Run 50 operations, each with a timeout of 50ms. Expect + // them to all return a timeout error because the failpoint + // blocks find operations for 500ms. Run 50 to increase the + // probability that an operation will time out in a way that + // can cause a retry. + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + err = tc.operation(ctx, mt.Coll) + assert.ErrorIs(mt, err, context.DeadlineExceeded) + assert.True(mt, mongo.IsTimeout(err), "expected mongo.IsTimeout(err) to be true") + + // Assert that each operation reported exactly one command + // started events, which means the operation did not retry + // after the context timeout. + evts := mt.GetAllStartedEvents() + require.Len(mt, + mt.GetAllStartedEvents(), + 1, + "expected exactly 1 command started event per operation, but got %d after %d iterations", + len(evts), + i) + mt.ClearEvents() + } + }) + } + }) } func TestClient_BSONOptions(t *testing.T) { diff --git a/x/mongo/driver/operation.go b/x/mongo/driver/operation.go index 8f87c21d3f..eb1acec88f 100644 --- a/x/mongo/driver/operation.go +++ b/x/mongo/driver/operation.go @@ -622,6 +622,13 @@ func (op Operation) Execute(ctx context.Context) error { } }() for { + // If we're starting a retry and the the error from the previous try was + // a context canceled or deadline exceeded error, stop retrying and + // return that error. + if errors.Is(prevErr, context.Canceled) || errors.Is(prevErr, context.DeadlineExceeded) { + return prevErr + } + requestID := wiremessage.NextRequestID() // If the server or connection are nil, try to select a new server and get a new connection. From 4cfcd127a1fd66fb8d4e2d9d6780e880e4b8c720 Mon Sep 17 00:00:00 2001 From: Matt Dale <9760375+matthewdale@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:55:01 -0700 Subject: [PATCH 2/2] Simplify context timeout retry test. --- mongo/integration/client_test.go | 33 +++++++++++--------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/mongo/integration/client_test.go b/mongo/integration/client_test.go index 80635d179a..0139d273da 100644 --- a/mongo/integration/client_test.go +++ b/mongo/integration/client_test.go @@ -778,35 +778,16 @@ func TestClient(t *testing.T) { mt.RunOpts("operations don't retry after a context timeout", opts, func(mt *mtest.T) { testCases := []struct { desc string - failPoint mtest.FailPoint operation func(context.Context, *mongo.Collection) error }{ { desc: "read op", - failPoint: mtest.FailPoint{ - ConfigureFailPoint: "failCommand", - Mode: "alwaysOn", - Data: mtest.FailPointData{ - FailCommands: []string{"find"}, - BlockConnection: true, - BlockTimeMS: 500, - }, - }, operation: func(ctx context.Context, coll *mongo.Collection) error { return coll.FindOne(ctx, bson.D{}).Err() }, }, { desc: "write op", - failPoint: mtest.FailPoint{ - ConfigureFailPoint: "failCommand", - Mode: "alwaysOn", - Data: mtest.FailPointData{ - FailCommands: []string{"insert"}, - BlockConnection: true, - BlockTimeMS: 500, - }, - }, operation: func(ctx context.Context, coll *mongo.Collection) error { _, err := coll.InsertOne(ctx, bson.D{}) return err @@ -815,11 +796,19 @@ func TestClient(t *testing.T) { } for _, tc := range testCases { - mt.Run(tc.desc, func(t *mtest.T) { + mt.Run(tc.desc, func(mt *mtest.T) { _, err := mt.Coll.InsertOne(context.Background(), bson.D{}) require.NoError(mt, err) - mt.SetFailPoint(tc.failPoint) + mt.SetFailPoint(mtest.FailPoint{ + ConfigureFailPoint: "failCommand", + Mode: "alwaysOn", + Data: mtest.FailPointData{ + FailCommands: []string{"find", "insert"}, + BlockConnection: true, + BlockTimeMS: 500, + }, + }) mt.ClearEvents() @@ -830,8 +819,8 @@ func TestClient(t *testing.T) { // probability that an operation will time out in a way that // can cause a retry. ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) - defer cancel() err = tc.operation(ctx, mt.Coll) + cancel() assert.ErrorIs(mt, err, context.DeadlineExceeded) assert.True(mt, mongo.IsTimeout(err), "expected mongo.IsTimeout(err) to be true")