5
5
"errors"
6
6
"fmt"
7
7
"net/http"
8
+ "os/exec"
8
9
"strings"
9
10
"time"
10
11
@@ -15,6 +16,7 @@ import (
15
16
core "k8s.io/api/core/v1"
16
17
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
17
18
"k8s.io/apimachinery/pkg/types"
19
+ ctlr "sigs.k8s.io/controller-runtime"
18
20
"sigs.k8s.io/controller-runtime/pkg/client"
19
21
"sigs.k8s.io/yaml"
20
22
@@ -28,7 +30,7 @@ const (
28
30
29
31
// Since checkContainerLogsForErrors may experience interference from previous tests (as explained in the function
30
32
// documentation), this test is recommended to be run separately from other tests.
31
- var _ = Describe ("Graceful Recovery test" , Ordered , Label ("functional" , " graceful-recovery" ), func () {
33
+ var _ = Describe ("Graceful Recovery test" , Ordered , Label ("graceful-recovery" ), func () {
32
34
files := []string {
33
35
"graceful-recovery/cafe.yaml" ,
34
36
"graceful-recovery/cafe-secret.yaml" ,
@@ -45,10 +47,11 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu
45
47
46
48
var ngfPodName string
47
49
48
- BeforeAll (func () {
50
+ BeforeEach (func () {
49
51
// this test is unique in that it will check the entire log of both ngf and nginx containers
50
52
// for any errors, so in order to avoid errors generated in previous tests we will uninstall
51
- // NGF installed at the suite level, then re-deploy our own
53
+ // NGF installed at the suite level, then re-deploy our own. We will also uninstall and re-install
54
+ // NGF between each graceful-recovery test for the same reason.
52
55
teardown (releaseName )
53
56
54
57
setup (getDefaultSetupCfg ())
@@ -64,9 +67,7 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu
64
67
if portFwdHTTPSPort != 0 {
65
68
teaURL = fmt .Sprintf ("%s:%d/tea" , baseHTTPSURL , portFwdHTTPSPort )
66
69
}
67
- })
68
70
69
- BeforeEach (func () {
70
71
ns = core.Namespace {
71
72
ObjectMeta : metav1.ObjectMeta {
72
73
Name : "graceful-recovery" ,
@@ -98,8 +99,97 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("functional", "gracefu
98
99
It ("recovers when nginx container is restarted" , func () {
99
100
runRecoveryTest (teaURL , coffeeURL , ngfPodName , nginxContainerName , files , & ns )
100
101
})
102
+
103
+ It ("recovers when drained node is restarted" , func () {
104
+ runRestartNodeWithDrainingTest (teaURL , coffeeURL , files , & ns )
105
+ })
106
+
107
+ It ("recovers when node is restarted abruptly" , func () {
108
+ runRestartNodeAbruptlyTest (teaURL , coffeeURL , files , & ns )
109
+ })
101
110
})
102
111
112
+ func runRestartNodeWithDrainingTest (teaURL , coffeeURL string , files []string , ns * core.Namespace ) {
113
+ runRestartNodeTest (teaURL , coffeeURL , files , ns , true )
114
+ }
115
+
116
+ func runRestartNodeAbruptlyTest (teaURL , coffeeURL string , files []string , ns * core.Namespace ) {
117
+ runRestartNodeTest (teaURL , coffeeURL , files , ns , false )
118
+ }
119
+
120
+ func runRestartNodeTest (teaURL , coffeeURL string , files []string , ns * core.Namespace , drain bool ) {
121
+ nodeNames , err := getNodeNames ()
122
+ Expect (err ).ToNot (HaveOccurred ())
123
+ Expect (nodeNames ).To (HaveLen (1 ))
124
+
125
+ kindNodeName := nodeNames [0 ]
126
+
127
+ Expect (clusterName ).ToNot (BeNil (), "clusterName variable not set" )
128
+ Expect (* clusterName ).ToNot (BeEmpty ())
129
+ containerName := * clusterName + "-control-plane"
130
+
131
+ if portFwdPort != 0 {
132
+ close (portForwardStopCh )
133
+ }
134
+
135
+ if drain {
136
+ _ , err := exec .Command (
137
+ "kubectl" ,
138
+ "drain" ,
139
+ kindNodeName ,
140
+ "--ignore-daemonsets" ,
141
+ "--delete-local-data" ,
142
+ ).CombinedOutput ()
143
+ Expect (err ).ToNot (HaveOccurred ())
144
+
145
+ _ , err = exec .Command ("kubectl" , "delete" , "node" , kindNodeName ).CombinedOutput ()
146
+ Expect (err ).ToNot (HaveOccurred ())
147
+ }
148
+
149
+ _ , err = exec .Command ("docker" , "restart" , containerName ).CombinedOutput ()
150
+ Expect (err ).ToNot (HaveOccurred ())
151
+
152
+ // need to wait for docker container to restart and be running before polling for ready NGF Pods or else we will error
153
+ Eventually (
154
+ func () bool {
155
+ output , err := exec .Command (
156
+ "docker" ,
157
+ "inspect" ,
158
+ "-f" ,
159
+ "{{.State.Running}}" ,
160
+ containerName ,
161
+ ).CombinedOutput ()
162
+ return strings .TrimSpace (string (output )) == "true" && err == nil
163
+ }).
164
+ WithTimeout (timeoutConfig .CreateTimeout ).
165
+ WithPolling (500 * time .Millisecond ).
166
+ Should (BeTrue ())
167
+
168
+ // ngf can often oscillate between ready and error, so we wait for a stable readiness in ngf
169
+ var podNames []string
170
+ Eventually (
171
+ func () bool {
172
+ podNames , err = framework .GetReadyNGFPodNames (k8sClient , ngfNamespace , releaseName , timeoutConfig .GetStatusTimeout )
173
+ return len (podNames ) == 1 && err == nil
174
+ }).
175
+ WithTimeout (timeoutConfig .CreateTimeout * 2 ).
176
+ WithPolling (500 * time .Millisecond ).
177
+ MustPassRepeatedly (20 ).
178
+ Should (BeTrue ())
179
+
180
+ ngfPodName := podNames [0 ]
181
+ Expect (ngfPodName ).ToNot (BeEmpty ())
182
+
183
+ if portFwdPort != 0 {
184
+ ports := []string {fmt .Sprintf ("%d:80" , ngfHTTPForwardedPort ), fmt .Sprintf ("%d:443" , ngfHTTPSForwardedPort )}
185
+ portForwardStopCh = make (chan struct {})
186
+ err = framework .PortForward (ctlr .GetConfigOrDie (), ngfNamespace , ngfPodName , ports , portForwardStopCh )
187
+ Expect (err ).ToNot (HaveOccurred ())
188
+ }
189
+
190
+ checkNGFFunctionality (teaURL , coffeeURL , ngfPodName , "" , files , ns )
191
+ }
192
+
103
193
func runRecoveryTest (teaURL , coffeeURL , ngfPodName , containerName string , files []string , ns * core.Namespace ) {
104
194
var (
105
195
err error
@@ -127,36 +217,7 @@ func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files
127
217
Should (Succeed ())
128
218
}
129
219
130
- Eventually (
131
- func () error {
132
- return checkForWorkingTraffic (teaURL , coffeeURL )
133
- }).
134
- WithTimeout (timeoutConfig .RequestTimeout ).
135
- WithPolling (500 * time .Millisecond ).
136
- Should (Succeed ())
137
-
138
- Expect (resourceManager .DeleteFromFiles (files , ns .Name )).To (Succeed ())
139
-
140
- Eventually (
141
- func () error {
142
- return checkForFailingTraffic (teaURL , coffeeURL )
143
- }).
144
- WithTimeout (timeoutConfig .RequestTimeout ).
145
- WithPolling (500 * time .Millisecond ).
146
- Should (Succeed ())
147
-
148
- Expect (resourceManager .ApplyFromFiles (files , ns .Name )).To (Succeed ())
149
- Expect (resourceManager .WaitForAppsToBeReadyWithPodCount (ns .Name , 2 )).To (Succeed ())
150
-
151
- Eventually (
152
- func () error {
153
- return checkForWorkingTraffic (teaURL , coffeeURL )
154
- }).
155
- WithTimeout (timeoutConfig .RequestTimeout * 2 ).
156
- WithPolling (500 * time .Millisecond ).
157
- Should (Succeed ())
158
-
159
- checkContainerLogsForErrors (ngfPodName , containerName == nginxContainerName )
220
+ checkNGFFunctionality (teaURL , coffeeURL , ngfPodName , containerName , files , ns )
160
221
}
161
222
162
223
func restartContainer (ngfPodName , containerName string ) {
@@ -254,11 +315,41 @@ func expectRequestToFail(appURL, address string) error {
254
315
return nil
255
316
}
256
317
257
- // checkContainerLogsForErrors checks both nginx and ngf container's logs for any possible errors.
258
- // Since this function retrieves all the logs from both containers and the NGF pod may be shared between tests,
259
- // the logs retrieved may contain log messages from previous tests, thus any errors in the logs from previous tests
260
- // may cause an interference with this test and cause this test to fail.
261
- // Additionally, when the NGINX process is killed, some errors are expected in the NGF logs while we wait for the
318
+ func checkNGFFunctionality (teaURL , coffeeURL , ngfPodName , containerName string , files []string , ns * core.Namespace ) {
319
+ Eventually (
320
+ func () error {
321
+ return checkForWorkingTraffic (teaURL , coffeeURL )
322
+ }).
323
+ WithTimeout (timeoutConfig .RequestTimeout * 2 ).
324
+ WithPolling (500 * time .Millisecond ).
325
+ Should (Succeed ())
326
+
327
+ Expect (resourceManager .DeleteFromFiles (files , ns .Name )).To (Succeed ())
328
+
329
+ Eventually (
330
+ func () error {
331
+ return checkForFailingTraffic (teaURL , coffeeURL )
332
+ }).
333
+ WithTimeout (timeoutConfig .RequestTimeout ).
334
+ WithPolling (500 * time .Millisecond ).
335
+ Should (Succeed ())
336
+
337
+ Expect (resourceManager .ApplyFromFiles (files , ns .Name )).To (Succeed ())
338
+ Expect (resourceManager .WaitForAppsToBeReadyWithPodCount (ns .Name , 2 )).To (Succeed ())
339
+
340
+ Eventually (
341
+ func () error {
342
+ return checkForWorkingTraffic (teaURL , coffeeURL )
343
+ }).
344
+ WithTimeout (timeoutConfig .RequestTimeout * 2 ).
345
+ WithPolling (500 * time .Millisecond ).
346
+ Should (Succeed ())
347
+
348
+ checkContainerLogsForErrors (ngfPodName , containerName == nginxContainerName )
349
+ }
350
+
351
+ // checkContainerLogsForErrors checks both the nginx and NGF containers' logs for any possible errors.
352
+ // When the NGINX process is killed, some errors are expected in the NGF logs while we wait for the
262
353
// NGINX container to be restarted.
263
354
func checkContainerLogsForErrors (ngfPodName string , checkNginxLogsOnly bool ) {
264
355
nginxLogs , err := resourceManager .GetPodLogs (
@@ -349,6 +440,24 @@ func getContainerRestartCount(ngfPodName, containerName string) (int, error) {
349
440
return restartCount , nil
350
441
}
351
442
443
+ func getNodeNames () ([]string , error ) {
444
+ ctx , cancel := context .WithTimeout (context .Background (), timeoutConfig .GetTimeout )
445
+ defer cancel ()
446
+ var nodes core.NodeList
447
+
448
+ if err := k8sClient .List (ctx , & nodes ); err != nil {
449
+ return nil , fmt .Errorf ("error listing nodes: %w" , err )
450
+ }
451
+
452
+ names := make ([]string , 0 , len (nodes .Items ))
453
+
454
+ for _ , node := range nodes .Items {
455
+ names = append (names , node .Name )
456
+ }
457
+
458
+ return names , nil
459
+ }
460
+
352
461
func runNodeDebuggerJob (ngfPodName , jobScript string ) (* v1.Job , error ) {
353
462
ctx , cancel := context .WithTimeout (context .Background (), timeoutConfig .GetTimeout )
354
463
defer cancel ()
0 commit comments