Skip to content

Commit b45e5d9

Browse files
GODRIVER-3156 Detect and discard closed idle connections. (#1815) [release/1.17] (#1841)
Co-authored-by: Matt Dale <[email protected]>
1 parent b473d1b commit b45e5d9

File tree

7 files changed

+364
-169
lines changed

7 files changed

+364
-169
lines changed

x/mongo/driver/topology/connection.go

+51-10
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"fmt"
1515
"io"
1616
"net"
17+
"os"
1718
"strings"
1819
"sync"
1920
"sync/atomic"
@@ -55,7 +56,7 @@ type connection struct {
5556
nc net.Conn // When nil, the connection is closed.
5657
addr address.Address
5758
idleTimeout time.Duration
58-
idleDeadline atomic.Value // Stores a time.Time
59+
idleStart atomic.Value // Stores a time.Time
5960
readTimeout time.Duration
6061
writeTimeout time.Duration
6162
desc description.Server
@@ -561,25 +562,65 @@ func (c *connection) close() error {
561562
return err
562563
}
563564

565+
// closed returns true if the connection has been closed by the driver.
564566
func (c *connection) closed() bool {
565567
return atomic.LoadInt64(&c.state) == connDisconnected
566568
}
567569

570+
// isAlive returns true if the connection is alive and ready to be used for an
571+
// operation.
572+
//
573+
// Note that the liveness check can be slow (at least 1ms), so isAlive only
574+
// checks the liveness of the connection if it's been idle for at least 10
575+
// seconds. For frequently in-use connections, a network error during an
576+
// operation will be the first indication of a dead connection.
577+
func (c *connection) isAlive() bool {
578+
if c.nc == nil {
579+
return false
580+
}
581+
582+
// If the connection has been idle for less than 10 seconds, skip the
583+
// liveness check.
584+
//
585+
// The 10-seconds idle bypass is based on the liveness check implementation
586+
// in the Python Driver. That implementation uses 1 second as the idle
587+
// threshold, but we chose to be more conservative in the Go Driver because
588+
// this is new behavior with unknown side-effects. See
589+
// https://github.com/mongodb/mongo-python-driver/blob/e6b95f65953e01e435004af069a6976473eaf841/pymongo/synchronous/pool.py#L983-L985
590+
idleStart, ok := c.idleStart.Load().(time.Time)
591+
if !ok || idleStart.Add(10*time.Second).After(time.Now()) {
592+
return true
593+
}
594+
595+
// Set a 1ms read deadline and attempt to read 1 byte from the connection.
596+
// Expect it to block for 1ms then return a deadline exceeded error. If it
597+
// returns any other error, the connection is not usable, so return false.
598+
// If it doesn't return an error and actually reads data, the connection is
599+
// also not usable, so return false.
600+
//
601+
// Note that we don't need to un-set the read deadline because the "read"
602+
// and "write" methods always reset the deadlines.
603+
err := c.nc.SetReadDeadline(time.Now().Add(1 * time.Millisecond))
604+
if err != nil {
605+
return false
606+
}
607+
var b [1]byte
608+
_, err = c.nc.Read(b[:])
609+
return errors.Is(err, os.ErrDeadlineExceeded)
610+
}
611+
568612
func (c *connection) idleTimeoutExpired() bool {
569-
now := time.Now()
570-
if c.idleTimeout > 0 {
571-
idleDeadline, ok := c.idleDeadline.Load().(time.Time)
572-
if ok && now.After(idleDeadline) {
573-
return true
574-
}
613+
if c.idleTimeout == 0 {
614+
return false
575615
}
576616

577-
return false
617+
idleStart, ok := c.idleStart.Load().(time.Time)
618+
return ok && idleStart.Add(c.idleTimeout).Before(time.Now())
578619
}
579620

580-
func (c *connection) bumpIdleDeadline() {
621+
func (c *connection) bumpIdleStart() {
581622
if c.idleTimeout > 0 {
582-
c.idleDeadline.Store(time.Now().Add(c.idleTimeout))
623+
c.idleStart.Store(time.Now())
583624
}
584625
}
585626

x/mongo/driver/topology/connection_test.go

+85-2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919

2020
"github.com/google/go-cmp/cmp"
2121
"go.mongodb.org/mongo-driver/internal/assert"
22+
"go.mongodb.org/mongo-driver/internal/require"
2223
"go.mongodb.org/mongo-driver/mongo/address"
2324
"go.mongodb.org/mongo-driver/mongo/description"
2425
"go.mongodb.org/mongo-driver/x/mongo/driver"
@@ -427,7 +428,7 @@ func TestConnection(t *testing.T) {
427428

428429
want := []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A}
429430
err := conn.writeWireMessage(context.Background(), want)
430-
noerr(t, err)
431+
require.NoError(t, err)
431432
got := tnc.buf
432433
if !cmp.Equal(got, want) {
433434
t.Errorf("writeWireMessage did not write the proper bytes. got %v; want %v", got, want)
@@ -624,7 +625,7 @@ func TestConnection(t *testing.T) {
624625
conn.cancellationListener = listener
625626

626627
got, err := conn.readWireMessage(context.Background())
627-
noerr(t, err)
628+
require.NoError(t, err)
628629
if !cmp.Equal(got, want) {
629630
t.Errorf("did not read full wire message. got %v; want %v", got, want)
630631
}
@@ -1251,3 +1252,85 @@ func (tcl *testCancellationListener) assertCalledOnce(t *testing.T) {
12511252
assert.Equal(t, 1, tcl.numListen, "expected Listen to be called once, got %d", tcl.numListen)
12521253
assert.Equal(t, 1, tcl.numStopListening, "expected StopListening to be called once, got %d", tcl.numListen)
12531254
}
1255+
1256+
func TestConnection_IsAlive(t *testing.T) {
1257+
t.Parallel()
1258+
1259+
t.Run("uninitialized", func(t *testing.T) {
1260+
t.Parallel()
1261+
1262+
conn := newConnection("")
1263+
assert.False(t,
1264+
conn.isAlive(),
1265+
"expected isAlive for an uninitialized connection to always return false")
1266+
})
1267+
1268+
t.Run("connection open", func(t *testing.T) {
1269+
t.Parallel()
1270+
1271+
cleanup := make(chan struct{})
1272+
defer close(cleanup)
1273+
addr := bootstrapConnections(t, 1, func(nc net.Conn) {
1274+
// Keep the connection open until the end of the test.
1275+
<-cleanup
1276+
_ = nc.Close()
1277+
})
1278+
1279+
conn := newConnection(address.Address(addr.String()))
1280+
err := conn.connect(context.Background())
1281+
require.NoError(t, err)
1282+
1283+
conn.idleStart.Store(time.Now().Add(-11 * time.Second))
1284+
assert.True(t,
1285+
conn.isAlive(),
1286+
"expected isAlive for an open connection to return true")
1287+
})
1288+
1289+
t.Run("connection closed", func(t *testing.T) {
1290+
t.Parallel()
1291+
1292+
conns := make(chan net.Conn)
1293+
addr := bootstrapConnections(t, 1, func(nc net.Conn) {
1294+
conns <- nc
1295+
})
1296+
1297+
conn := newConnection(address.Address(addr.String()))
1298+
err := conn.connect(context.Background())
1299+
require.NoError(t, err)
1300+
1301+
// Close the connection before calling isAlive.
1302+
nc := <-conns
1303+
err = nc.Close()
1304+
require.NoError(t, err)
1305+
1306+
conn.idleStart.Store(time.Now().Add(-11 * time.Second))
1307+
assert.False(t,
1308+
conn.isAlive(),
1309+
"expected isAlive for a closed connection to return false")
1310+
})
1311+
1312+
t.Run("connection reads data", func(t *testing.T) {
1313+
t.Parallel()
1314+
1315+
cleanup := make(chan struct{})
1316+
defer close(cleanup)
1317+
addr := bootstrapConnections(t, 1, func(nc net.Conn) {
1318+
// Write some data to the connection before calling isAlive.
1319+
_, err := nc.Write([]byte{5, 0, 0, 0, 0})
1320+
require.NoError(t, err)
1321+
1322+
// Keep the connection open until the end of the test.
1323+
<-cleanup
1324+
_ = nc.Close()
1325+
})
1326+
1327+
conn := newConnection(address.Address(addr.String()))
1328+
err := conn.connect(context.Background())
1329+
require.NoError(t, err)
1330+
1331+
conn.idleStart.Store(time.Now().Add(-11 * time.Second))
1332+
assert.False(t,
1333+
conn.isAlive(),
1334+
"expected isAlive for an open connection that reads data to return false")
1335+
})
1336+
}

x/mongo/driver/topology/pool.go

+14-9
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,11 @@ type reason struct {
167167
// connectionPerished checks if a given connection is perished and should be removed from the pool.
168168
func connectionPerished(conn *connection) (reason, bool) {
169169
switch {
170-
case conn.closed():
171-
// A connection would only be closed if it encountered a network error during an operation and closed itself.
170+
case conn.closed() || !conn.isAlive():
171+
// A connection would only be closed if it encountered a network error
172+
// during an operation and closed itself. If a connection is not alive
173+
// (e.g. the connection was closed by the server-side), it's also
174+
// considered a network error.
172175
return reason{
173176
loggerConn: logger.ReasonConnClosedError,
174177
event: event.ReasonError,
@@ -898,13 +901,15 @@ func (p *pool) checkInNoEvent(conn *connection) error {
898901
return nil
899902
}
900903

901-
// Bump the connection idle deadline here because we're about to make the connection "available".
902-
// The idle deadline is used to determine when a connection has reached its max idle time and
903-
// should be closed. A connection reaches its max idle time when it has been "available" in the
904-
// idle connections stack for more than the configured duration (maxIdleTimeMS). Set it before
905-
// we call connectionPerished(), which checks the idle deadline, because a newly "available"
906-
// connection should never be perished due to max idle time.
907-
conn.bumpIdleDeadline()
904+
// Bump the connection idle start time here because we're about to make the
905+
// connection "available". The idle start time is used to determine how long
906+
// a connection has been idle and when it has reached its max idle time and
907+
// should be closed. A connection reaches its max idle time when it has been
908+
// "available" in the idle connections stack for more than the configured
909+
// duration (maxIdleTimeMS). Set it before we call connectionPerished(),
910+
// which checks the idle deadline, because a newly "available" connection
911+
// should never be perished due to max idle time.
912+
conn.bumpIdleStart()
908913

909914
r, perished := connectionPerished(conn)
910915
if !perished && conn.pool.getState() == poolClosed {

0 commit comments

Comments
 (0)