Skip to content

Commit 793958e

Browse files
authored
Merge pull request #3798 from telepresenceio/thallgren/port-forward-stability
Port forward stability and race conditions
2 parents 443fce3 + 4c3c0ad commit 793958e

File tree

26 files changed

+496
-482
lines changed

26 files changed

+496
-482
lines changed

.golangci.yml

-6
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,6 @@ linters-settings:
5353
- gotest.tools: { recommendations: ['github.com/stretchr/testify', 'github.com/google/go-cmp/cmp'] }
5454
- gotest.tools/v2: { recommendations: ['github.com/stretchr/testify', 'github.com/google/go-cmp/cmp'] }
5555
- gotest.tools/v3: { recommendations: ['github.com/stretchr/testify', 'github.com/google/go-cmp/cmp'] }
56-
forbidigo:
57-
forbid:
58-
- '^os\.(DirEntry|FileInfo|FileMode|PathError)$' # deprecated in Go 1.16, import them from 'io/fs' instead
59-
- '\.Readdir$' # deprecated in Go 1.16, use ReadDir instead
60-
exclude_godoc_examples: false
6156

6257
gocyclo:
6358
min-complexity: 35
@@ -71,7 +66,6 @@ linters-settings:
7166

7267
nolintlint:
7368
allow-unused: true # Needed because there's some linters disabled for 1.18
74-
allow-leading-space: false
7569
require-explanation: true
7670
require-specific: true
7771
allow-no-explanation:

build-aux/docker/images/Dockerfile.client

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ ARG TARGETARCH
3333
RUN \
3434
--mount=type=cache,target=/root/.cache/go-build \
3535
--mount=type=cache,target=/go/pkg/mod \
36-
GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /usr/local/bin/ --buildmode pie -trimpath -tags docker -ldflags=-X=$(go list ./pkg/version).Version=$(cat version.txt) ./cmd/telepresence/...
36+
GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /usr/local/bin/ -trimpath -tags docker -ldflags=-X=$(go list ./pkg/version).Version=$(cat version.txt) ./cmd/telepresence/...
3737

3838
# setcap is necessary because the process will listen to privileged ports
3939
RUN setcap 'cap_net_bind_service+ep' /usr/local/bin/telepresence

build-aux/docker/images/Dockerfile.traffic

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ ARG TARGETARCH
3232
RUN \
3333
--mount=type=cache,target=/root/.cache/go-build \
3434
--mount=type=cache,target=/go/pkg/mod \
35-
GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /usr/local/bin/ --buildmode pie -trimpath -ldflags=-X=$(go list ./pkg/version).Version=$(cat version.txt) ./cmd/traffic/...
35+
GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /usr/local/bin/ -trimpath -ldflags=-X=$(go list ./pkg/version).Version=$(cat version.txt) ./cmd/traffic/...
3636

3737
# setcap is necessary because the process will listen to privileged ports
3838
RUN setcap 'cap_net_bind_service+ep' /usr/local/bin/traffic

build-aux/genversion/main.go

+5-3
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ func Main() error {
129129
gitDescVer.Patch++
130130
}
131131

132+
out := os.Stdout
133+
132134
// If an additional arg has been used, we include it in the tag
133135
if len(os.Args) >= 2 {
134136
// gitDescVer.Pre[0] contains the number of commits since the last tag and the
@@ -140,7 +142,7 @@ func Main() error {
140142
if err != nil {
141143
return fmt.Errorf("unable to git rev-parse: %w", err)
142144
}
143-
if _, err := fmt.Printf("v%d.%d.%d-%s-%s\n", gitDescVer.Major, gitDescVer.Minor, gitDescVer.Patch, os.Args[1], shortHash); err != nil {
145+
if _, err = fmt.Fprintf(out, "v%d.%d.%d-%s-%s\n", gitDescVer.Major, gitDescVer.Minor, gitDescVer.Patch, os.Args[1], shortHash); err != nil {
144146
return fmt.Errorf("unable to printf: %w", err)
145147
}
146148
return nil
@@ -163,9 +165,9 @@ func Main() error {
163165
b64 := base64.RawURLEncoding.EncodeToString(md5Out)
164166
b64 = strings.ReplaceAll(b64, "_", "Z")
165167
b64 = strings.ReplaceAll(b64, "-", "z")
166-
_, err = fmt.Printf("v%s-%s\n", gitDescVer, b64)
168+
_, err = fmt.Fprintf(out, "v%s-%s\n", gitDescVer, b64)
167169
} else {
168-
_, err = fmt.Printf("v%s\n", gitDescVer)
170+
_, err = fmt.Fprintf(out, "v%s\n", gitDescVer)
169171
}
170172
return err
171173
}

build-aux/main.mk

+2-2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ bindir ?= $(or $(shell go env GOBIN),$(shell go env GOPATH|cut -d: -f1)/bin)
3131
# https://github.com/moby/buildkit/blob/master/frontend/dockerfile/docs/syntax.md.
3232
export DOCKER_BUILDKIT := 1
3333

34+
GOLANGCI_VERSION:=v1.64.5
35+
3436
.PHONY: FORCE
3537
FORCE:
3638

@@ -393,8 +395,6 @@ shellscripts += ./packaging/windows-package.sh
393395

394396
lint: lint-rpc lint-go
395397

396-
GOLANGCI_VERSION:=v1.63.4
397-
398398
lint-go: lint-deps ## (QA) Run the golangci-lint
399399
$(eval badimports = $(shell find cmd integration_test pkg -name '*.go' | grep -v '/mocks/' | xargs $(tools/gosimports) --local github.com/datawire/,github.com/telepresenceio/ -l))
400400
$(if $(strip $(badimports)), echo "The following files have bad import ordering (use make format to fix): " $(badimports) && false)

cmd/traffic/cmd/manager/service.go

+4-3
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,7 @@ func (s *service) WatchAgentPods(session *rpc.SessionInfo, stream rpc.Manager_Wa
324324
}
325325
ap := &rpc.AgentPodInfo{
326326
WorkloadName: a.Name,
327+
PodId: a.PodUid,
327328
PodName: a.PodName,
328329
Namespace: a.Namespace,
329330
PodIp: aip.AsSlice(),
@@ -687,9 +688,9 @@ func (s *service) ReviewIntercept(ctx context.Context, rIReq *rpc.ReviewIntercep
687688
return
688689
}
689690

690-
// Only update intercepts in the waiting state. Agents race to review an intercept, but we
691-
// expect they will always compatible answers.
692-
if intercept.Disposition == rpc.InterceptDispositionType_WAITING {
691+
// Only update intercepts in the waiting or no agent states. Agents race to review an intercept, but we
692+
// expect they will always produce compatible answers.
693+
if intercept.Disposition == rpc.InterceptDispositionType_NO_AGENT || intercept.Disposition == rpc.InterceptDispositionType_WAITING {
693694
intercept.Disposition = rIReq.Disposition
694695
intercept.Message = rIReq.Message
695696
intercept.PodIp = rIReq.PodIp

cmd/traffic/cmd/manager/state/assert_test.go

-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ func includeElement(list any, element proto.Message) (ok, found bool) {
5959
listKind := reflect.TypeOf(list).Kind()
6060
defer func() {
6161
if e := recover(); e != nil {
62-
fmt.Println("ERR", e)
6362
ok = false
6463
found = false
6564
}

cmd/traffic/cmd/manager/state/state.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -342,14 +342,16 @@ func (s *state) consolidateAgentSessionIntercepts(ctx context.Context, agent *Ag
342342

343343
if errCode, errMsg := s.checkAgentsForIntercept(intercept); errCode != rpc.InterceptDispositionType_UNSPECIFIED {
344344
// No agents matching this intercept are available, so the intercept is now dormant or in error.
345-
dlog.Debugf(ctx, "Intercept %q no longer has available agents. Setting it disposition to %s", interceptID, errCode)
345+
dlog.Debugf(ctx, "Intercept %q no longer has available agents. Setting its disposition to %s", interceptID, errCode)
346346
s.UpdateIntercept(interceptID, func(intercept *Intercept) {
347+
intercept.PodIp = ""
348+
intercept.PodName = ""
347349
intercept.Disposition = errCode
348350
intercept.Message = errMsg
349351
})
350352
} else if agent.PodIp == intercept.PodIp {
351353
// The agent is about to die, but apparently more agents are present. Let some other agent pick it up then.
352-
dlog.Debugf(ctx, "Intercept %q lost its agent pod %s(%s). Setting it disposition to WAITING", interceptID, agent.PodName, agent.PodIp)
354+
dlog.Debugf(ctx, "Intercept %q lost its agent pod %s(%s). Setting its disposition to WAITING", interceptID, agent.PodName, agent.PodIp)
353355
s.UpdateIntercept(interceptID, func(intercept *Intercept) {
354356
intercept.PodIp = ""
355357
intercept.PodName = ""

cmd/traffic/main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func main() {
4343
os.Exit(1)
4444
}
4545
} else {
46-
fmt.Println("traffic: unknown command:", name)
46+
_, _ = fmt.Fprintln(os.Stderr, "traffic: unknown command:", name)
4747
os.Exit(127)
4848
}
4949
}

integration_test/docker_run_test.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,6 @@ func (s *dockerDaemonSuite) Test_DockerRun_DockerDaemon() {
170170
}
171171
if err != nil {
172172
dlog.Error(ctx, err.Error())
173-
} else {
174-
s.CapturePodLogs(ctx, svc, "traffic-agent", s.AppNamespace())
175173
}
176174
}
177175

@@ -213,6 +211,7 @@ func (s *dockerDaemonSuite) Test_DockerRun_DockerDaemon() {
213211
soft, softCancel := context.WithCancel(dcontext.WithSoftness(ctx))
214212
wch := make(chan struct{})
215213
go runDockerRun(soft, wch)
214+
s.CapturePodLogs(ctx, svc, "traffic-agent", s.AppNamespace())
216215
assertInterceptResponse(ctx)
217216
softCancel()
218217
assertNotIntercepted(ctx)
@@ -223,6 +222,7 @@ func (s *dockerDaemonSuite) Test_DockerRun_DockerDaemon() {
223222
ctx := s.Context()
224223
wch := make(chan struct{})
225224
go runDockerRun(ctx, wch)
225+
s.CapturePodLogs(ctx, svc, "traffic-agent", s.AppNamespace())
226226
assertInterceptResponse(ctx)
227227
itest.TelepresenceOk(ctx, "leave", svc)
228228
select {
@@ -238,6 +238,7 @@ func (s *dockerDaemonSuite) Test_DockerRun_DockerDaemon() {
238238
ctx := s.Context()
239239
wch := make(chan struct{})
240240
go runDockerRun(ctx, wch)
241+
s.CapturePodLogs(ctx, svc, "traffic-agent", s.AppNamespace())
241242
assertInterceptResponse(ctx)
242243
itest.TelepresenceDisconnectOk(ctx)
243244
select {

integration_test/gather_logs_test.go

+10-9
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"strings"
1111
"time"
1212

13+
"github.com/datawire/dlib/dlog"
1314
"github.com/telepresenceio/telepresence/v2/integration_test/itest"
1415
"github.com/telepresenceio/telepresence/v2/pkg/labels"
1516
)
@@ -95,14 +96,6 @@ func (s *multipleInterceptsSuite) TestGatherLogs_NoK8sLogs() {
9596

9697
func (s *connectedSuite) TestGatherLogs_OnlyMappedLogs() {
9798
const svc = "echo"
98-
require := s.Require()
99-
defer func() {
100-
ctx := s.Context()
101-
itest.TelepresenceDisconnectOk(ctx)
102-
stdout := s.TelepresenceConnect(ctx)
103-
require.Contains(stdout, "Connected to context")
104-
}()
105-
10699
ctx := s.Context()
107100
itest.TelepresenceDisconnectOk(ctx)
108101

@@ -118,7 +111,15 @@ func (s *connectedSuite) TestGatherLogs_OnlyMappedLogs() {
118111
Namespace: s.ManagerNamespace(),
119112
Selector: labels.SelectorFromNames(otherOne, otherTwo),
120113
}), true)
121-
defer s.RollbackTM(ctx)
114+
115+
require := s.Require()
116+
defer func() {
117+
so, se, err := itest.Telepresence(ctx, "quit")
118+
dlog.Debug(ctx, so, se, err)
119+
s.RollbackTM(ctx)
120+
stdout := s.TelepresenceConnect(ctx)
121+
require.Contains(stdout, "Connected to context")
122+
}()
122123

123124
itest.TelepresenceDisconnectOk(ctx)
124125
itest.ApplyEchoService(ctx, svc, otherOne, 8083)

pkg/client/agentpf/clients.go

+30-22
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"google.golang.org/grpc"
1616
"google.golang.org/grpc/codes"
1717
"google.golang.org/grpc/status"
18+
"k8s.io/apimachinery/pkg/types"
1819

1920
"github.com/datawire/dlib/dlog"
2021
"github.com/datawire/dlib/dtime"
@@ -48,7 +49,7 @@ func (ac *client) String() string {
4849
return "<nil>"
4950
}
5051
ai := ac.info
51-
return fmt.Sprintf("%s.%s:%d", ai.PodName, ai.Namespace, ai.ApiPort)
52+
return fmt.Sprintf("%s(%s), port %d, dormant %t", ai.PodName, net.IP(ai.PodIp), ai.ApiPort, ac.dormant())
5253
}
5354

5455
func (ac *client) ensureConnect(ctx context.Context) (err error) {
@@ -78,8 +79,10 @@ func (ac *client) Tunnel(ctx context.Context, opts ...grpc.CallOption) (tunnel.C
7879
// Client was closed.
7980
return nil, io.EOF
8081
}
82+
dlog.Tracef(ctx, "%s(%s) creating Tunnel over gRPC", ac, net.IP(ac.info.PodIp))
8183
tc, err := cli.Tunnel(ctx, opts...)
8284
if err != nil {
85+
dlog.Tracef(ctx, "%s(%s) failed to create Tunnel over gRPC: %v", ac, net.IP(ac.info.PodIp), err)
8386
return nil, err
8487
}
8588
atomic.AddInt32(&ac.tunnelCount, 1)
@@ -107,7 +110,8 @@ func (ac *client) connect(ctx context.Context, deleteMe func()) {
107110
var conn *grpc.ClientConn
108111
var cli agent.AgentClient
109112

110-
conn, cli, _, err = k8sclient.ConnectToAgent(dialCtx, ac.info.PodName, ac.info.Namespace, uint16(ac.info.ApiPort))
113+
ai := ac.info
114+
conn, cli, _, err = k8sclient.ConnectToAgent(dialCtx, ai.PodName, ai.Namespace, uint16(ac.info.ApiPort), types.UID(ai.PodId))
111115
if err != nil {
112116
return
113117
}
@@ -117,18 +121,16 @@ func (ac *client) connect(ctx context.Context, deleteMe func()) {
117121
ac.cancelClient = func() {
118122
// Need to run this in a separate thread to avoid deadlock.
119123
go func() {
120-
ac.Lock()
124+
// Need to invalidate the pod connection cache here, because StatefulSets reuse the same pod name.
121125
conn.Close()
126+
ac.Lock()
122127
ac.cancelClient = nil
123128
ac.cli = nil
124129
ac.infant.Store(true)
125-
for len(ac.ready) > 0 {
126-
<-ac.ready
127-
}
128130
ac.Unlock()
129131
}()
130132
}
131-
intercepted := ac.info.Intercepted
133+
intercepted := ai.Intercepted
132134
ac.Unlock()
133135
if intercepted {
134136
err = ac.startDialWatcherReady(ctx)
@@ -169,26 +171,29 @@ func (ac *client) cancel() bool {
169171
return didCancel
170172
}
171173

172-
func (ac *client) setIntercepted(ctx context.Context, k string, status bool) {
173-
ac.RLock()
174-
aci := ac.info.Intercepted
174+
func (ac *client) refresh(ctx context.Context, ai *manager.AgentPodInfo) {
175+
ac.Lock()
176+
oldStatus := ac.info.Intercepted
177+
ac.info = ai
175178
cdw := ac.cancelDialWatch
176-
ac.RUnlock()
177-
if status {
178-
if aci {
179-
return
180-
}
181-
dlog.Debugf(ctx, "Agent %s changed to intercepted", k)
179+
ac.Unlock()
180+
if ai.Intercepted == oldStatus {
181+
return
182+
}
183+
if ai.Intercepted {
184+
dlog.Debugf(ctx, "Agent %s(%s) changed to intercepted", ai.PodName, net.IP(ai.PodIp))
182185
go func() {
183186
if err := ac.startDialWatcher(ctx); err != nil {
184-
dlog.Errorf(ctx, "failed to start client watcher for %s: %v", k, err)
187+
dlog.Errorf(ctx, "failed to start client watcher for %s(%s): %v", ai.PodName, net.IP(ai.PodIp), err)
185188
}
186189
}()
187190
// This agent is now intercepting. Start a dial watcher.
188-
} else if aci && cdw != nil {
191+
} else {
189192
// This agent is no longer intercepting. Stop the dial watcher
190-
dlog.Debugf(ctx, "Agent %s changed to not intercepted", k)
191-
cdw()
193+
dlog.Debugf(ctx, "Agent %s(%s) changed to not intercepted", ai.PodName, net.IP(ai.PodIp))
194+
if cdw != nil {
195+
cdw()
196+
}
192197
}
193198
}
194199

@@ -203,7 +208,11 @@ func (ac *client) startDialWatcher(ctx context.Context) error {
203208
func (ac *client) startDialWatcherReady(ctx context.Context) error {
204209
ac.RLock()
205210
cli := ac.cli
211+
running := ac.cancelDialWatch != nil
206212
ac.RUnlock()
213+
if running {
214+
return nil
215+
}
207216
if cli == nil {
208217
return fmt.Errorf("agent connection closed")
209218
}
@@ -218,7 +227,6 @@ func (ac *client) startDialWatcherReady(ctx context.Context) error {
218227
}
219228

220229
ac.Lock()
221-
ac.info.Intercepted = true
222230
ac.cancelDialWatch = func() {
223231
ac.Lock()
224232
ac.info.Intercepted = false
@@ -549,7 +557,7 @@ func (s *clients) updateClients(ctx context.Context, ais []*manager.AgentPodInfo
549557
// Refresh current clients
550558
for k, ai := range aim {
551559
if ac, ok := s.clients.Load(k); ok {
552-
ac.setIntercepted(ctx, k, ai.Intercepted)
560+
ac.refresh(ctx, ai)
553561
}
554562
}
555563

pkg/client/config.go

+10
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,8 @@ type Timeouts struct {
354354
// PrivateTrafficManagerConnect is how long to wait for the initial port-forwards to the traffic-manager
355355
PrivateTrafficManagerConnect time.Duration `json:"trafficManagerConnect"`
356356
// PrivateFtpReadWrite read/write timeout used by the fuseftp client.
357+
PrivateTrafficAgentArrival time.Duration `json:"trafficAgentArrival"`
358+
// PrivateFtpReadWrite read/write timeout used by the fuseftp client.
357359
PrivateFtpReadWrite time.Duration `json:"ftpReadWrite"`
358360
// PrivateFtpShutdown max time to wait for the fuseftp client to complete pending operations before forcing termination.
359361
PrivateFtpShutdown time.Duration `json:"ftpShutdown"`
@@ -373,6 +375,7 @@ const (
373375
TimeoutRoundtripLatency
374376
TimeoutTrafficManagerAPI
375377
TimeoutTrafficManagerConnect
378+
TimeoutTrafficAgentArrival
376379
TimeoutFtpReadWrite
377380
TimeoutFtpShutdown
378381
TimeoutContainerShutdown
@@ -418,6 +421,8 @@ func (t *Timeouts) Get(timeoutID TimeoutID) time.Duration {
418421
timeoutVal = t.PrivateTrafficManagerAPI
419422
case TimeoutTrafficManagerConnect:
420423
timeoutVal = t.PrivateTrafficManagerConnect
424+
case TimeoutTrafficAgentArrival:
425+
timeoutVal = t.PrivateTrafficAgentArrival
421426
case TimeoutFtpReadWrite:
422427
timeoutVal = t.PrivateFtpReadWrite
423428
case TimeoutFtpShutdown:
@@ -478,6 +483,9 @@ func (e timeoutError) Error() string {
478483
case TimeoutTrafficManagerConnect:
479484
yamlName = "trafficManagerConnect"
480485
humanName = "port-forward connection to the traffic manager"
486+
case TimeoutTrafficAgentArrival:
487+
yamlName = "trafficAgentArrival"
488+
humanName = "waiting for traffic agent arrival"
481489
case TimeoutFtpReadWrite:
482490
yamlName = "ftpReadWrite"
483491
humanName = "FTP client read/write"
@@ -515,6 +523,7 @@ const (
515523
defaultTimeoutsRoundtripLatency = 2 * time.Second
516524
defaultTimeoutsTrafficManagerAPI = 15 * time.Second
517525
defaultTimeoutsTrafficManagerConnect = 60 * time.Second
526+
defaultTimeoutsTrafficAgentArrival = 60 * time.Second
518527
defaultTimeoutsFtpReadWrite = 1 * time.Minute
519528
defaultTimeoutsFtpShutdown = 2 * time.Minute
520529
defaultTimeoutsContainerShutdown = 0
@@ -531,6 +540,7 @@ var defaultTimeouts = Timeouts{ //nolint:gochecknoglobals // constant
531540
PrivateRoundtripLatency: defaultTimeoutsRoundtripLatency,
532541
PrivateTrafficManagerAPI: defaultTimeoutsTrafficManagerAPI,
533542
PrivateTrafficManagerConnect: defaultTimeoutsTrafficManagerConnect,
543+
PrivateTrafficAgentArrival: defaultTimeoutsTrafficAgentArrival,
534544
PrivateFtpReadWrite: defaultTimeoutsFtpReadWrite,
535545
PrivateFtpShutdown: defaultTimeoutsFtpShutdown,
536546
PrivateContainerShutdown: defaultTimeoutsContainerShutdown,

0 commit comments

Comments
 (0)