Skip to content

Commit 4c6c249

Browse files
kate-osbornlucacome
authored andcommitted
Improve usage reporting in automated tests (#2971)
Problem: Pipelines are failing intermittently due to r33 reporting errors. Solutions: - In the graceful recovery and reconfig tests, do not fail if there are errors in the NGINX logs. Instead, log the errors and continue. In the reconfig tests, all NGINX error logs are captured in the results file. Since the graceful recovery tests are part of the functional tests that run on every push to a PR and merge to main, it is not desirable to write the error logs to a file. Instead, if an unexpected error is seen in the NGINX logs, the test is skipped and the error is logged in the test output. Since we are sending traffic through NGINX during the graceful recovery tests, we will still catch any unrecoverable NGINX errors. - Reporting for conformance tests is disabled.
1 parent dc74260 commit 4c6c249

File tree

5 files changed

+85
-34
lines changed

5 files changed

+85
-34
lines changed

.github/workflows/conformance.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ jobs:
142142
- name: Setup license file for plus
143143
if: ${{ inputs.image == 'plus' }}
144144
env:
145-
PLUS_LICENSE: ${{ secrets.JWT_PLUS_REPORTING }}
145+
PLUS_LICENSE: ${{ secrets.JWT_PLUS_EXCEPTION_REPORTING }}
146146
run: echo "${PLUS_LICENSE}" > license.jwt
147147

148148
- name: Setup conformance tests

tests/framework/nginx_log_levels.go

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package framework
2+
3+
// NGINX Log Prefixes for various log levels.
4+
const (
5+
CritNGINXLog = "[crit]"
6+
ErrorNGINXLog = "[error]"
7+
WarnNGINXLog = "[warn]"
8+
AlertNGINXLog = "[alert]"
9+
EmergNGINXLog = "[emerg]"
10+
)

tests/suite/graceful_recovery_test.go

+66-31
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"net/http"
88
"os/exec"
9+
"slices"
910
"strings"
1011
"time"
1112

@@ -28,7 +29,7 @@ const (
2829
ngfContainerName = "nginx-gateway"
2930
)
3031

31-
// Since checkContainerLogsForErrors may experience interference from previous tests (as explained in the function
32+
// Since checkNGFContainerLogsForErrors may experience interference from previous tests (as explained in the function
3233
// documentation), this test is recommended to be run separate from other tests.
3334
var _ = Describe("Graceful Recovery test", Ordered, Label("graceful-recovery"), func() {
3435
files := []string{
@@ -189,6 +190,9 @@ func runRestartNodeTest(teaURL, coffeeURL string, files []string, ns *core.Names
189190
}
190191

191192
checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, "", files, ns)
193+
if errorLogs := getUnexpectedNginxErrorLogs(ngfPodName); errorLogs != "" {
194+
Skip(fmt.Sprintf("NGINX has unexpected error logs: \n%s", errorLogs))
195+
}
192196
}
193197

194198
func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files []string, ns *core.Namespace) {
@@ -219,6 +223,9 @@ func runRecoveryTest(teaURL, coffeeURL, ngfPodName, containerName string, files
219223
}
220224

221225
checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName, files, ns)
226+
if errorLogs := getUnexpectedNginxErrorLogs(ngfPodName); errorLogs != "" {
227+
Skip(fmt.Sprintf("NGINX has unexpected error logs: \n%s", errorLogs))
228+
}
222229
}
223230

224231
func restartContainer(ngfPodName, containerName string) {
@@ -347,52 +354,80 @@ func checkNGFFunctionality(teaURL, coffeeURL, ngfPodName, containerName string,
347354
WithPolling(500 * time.Millisecond).
348355
Should(Succeed())
349356

350-
checkContainerLogsForErrors(ngfPodName, containerName == nginxContainerName)
357+
// When the NGINX process is killed, some errors are expected in the NGF logs while we wait for the
358+
// NGINX container to be restarted. Therefore, we don't want to check the NGF logs for errors when the container
359+
// we restarted was NGINX.
360+
if containerName != nginxContainerName {
361+
checkNGFContainerLogsForErrors(ngfPodName)
362+
}
351363
}
352364

353-
// checkContainerLogsForErrors checks both nginx and NGF container's logs for any possible errors.
354-
// When the NGINX process is killed, some errors are expected in the NGF logs while we wait for the
355-
// NGINX container to be restarted.
356-
func checkContainerLogsForErrors(ngfPodName string, checkNginxLogsOnly bool) {
365+
func getNginxErrorLogs(ngfPodName string) string {
357366
nginxLogs, err := resourceManager.GetPodLogs(
358367
ngfNamespace,
359368
ngfPodName,
360369
&core.PodLogOptions{Container: nginxContainerName},
361370
)
362371
Expect(err).ToNot(HaveOccurred())
363372

373+
errPrefixes := []string{
374+
framework.CritNGINXLog,
375+
framework.ErrorNGINXLog,
376+
framework.WarnNGINXLog,
377+
framework.AlertNGINXLog,
378+
framework.EmergNGINXLog,
379+
}
380+
errorLogs := ""
381+
364382
for _, line := range strings.Split(nginxLogs, "\n") {
365-
Expect(line).ToNot(ContainSubstring("[crit]"), line)
366-
Expect(line).ToNot(ContainSubstring("[alert]"), line)
367-
Expect(line).ToNot(ContainSubstring("[emerg]"), line)
368-
if strings.Contains(line, "[error]") {
369-
expectedError1 := "connect() failed (111: Connection refused)"
370-
expectedError2 := "could not be resolved (host not found) during usage report"
371-
expectedError3 := "server returned 429"
372-
// FIXME(salonichf5) remove this error message check
373-
// when https://github.com/nginxinc/nginx-gateway-fabric/issues/2090 is completed.
374-
expectedError4 := "no live upstreams while connecting to upstream"
375-
Expect(line).To(Or(
376-
ContainSubstring(expectedError1),
377-
ContainSubstring(expectedError2),
378-
ContainSubstring(expectedError3),
379-
ContainSubstring(expectedError4),
380-
))
383+
for _, prefix := range errPrefixes {
384+
if strings.Contains(line, prefix) {
385+
errorLogs += line + "\n"
386+
break
387+
}
381388
}
382389
}
383390

384-
if !checkNginxLogsOnly {
385-
ngfLogs, err := resourceManager.GetPodLogs(
386-
ngfNamespace,
387-
ngfPodName,
388-
&core.PodLogOptions{Container: ngfContainerName},
389-
)
390-
Expect(err).ToNot(HaveOccurred())
391+
return errorLogs
392+
}
393+
394+
func getUnexpectedNginxErrorLogs(ngfPodName string) string {
395+
expectedErrStrings := []string{
396+
"connect() failed (111: Connection refused)",
397+
"could not be resolved (host not found) during usage report",
398+
"server returned 429",
399+
// FIXME(salonichf5) remove this error message check
400+
// when https://github.com/nginxinc/nginx-gateway-fabric/issues/2090 is completed.
401+
"no live upstreams while connecting to upstream",
402+
}
391403

392-
for _, line := range strings.Split(ngfLogs, "\n") {
393-
Expect(line).ToNot(ContainSubstring("\"level\":\"error\""), line)
404+
unexpectedErrors := ""
405+
406+
errorLogs := getNginxErrorLogs(ngfPodName)
407+
408+
for _, line := range strings.Split(errorLogs, "\n") {
409+
if !slices.ContainsFunc(expectedErrStrings, func(s string) bool {
410+
return strings.Contains(line, s)
411+
}) {
412+
unexpectedErrors += line
394413
}
395414
}
415+
416+
return unexpectedErrors
417+
}
418+
419+
// checkNGFContainerLogsForErrors checks NGF container's logs for any possible errors.
420+
func checkNGFContainerLogsForErrors(ngfPodName string) {
421+
ngfLogs, err := resourceManager.GetPodLogs(
422+
ngfNamespace,
423+
ngfPodName,
424+
&core.PodLogOptions{Container: ngfContainerName},
425+
)
426+
Expect(err).ToNot(HaveOccurred())
427+
428+
for _, line := range strings.Split(ngfLogs, "\n") {
429+
Expect(line).ToNot(ContainSubstring("\"level\":\"error\""), line)
430+
}
396431
}
397432

398433
func checkLeaderLeaseChange(originalLeaseName string) error {

tests/suite/reconfig_test.go

+7-1
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,8 @@ var _ = Describe("Reconfiguration Performance Testing", Ordered, Label("nfr", "r
405405
).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed())
406406
}
407407

408-
checkContainerLogsForErrors(ngfPodName, false)
408+
checkNGFContainerLogsForErrors(ngfPodName)
409+
nginxErrorLogs := getNginxErrorLogs(ngfPodName)
409410

410411
reloadCount, err := framework.GetReloadCount(promInstance, ngfPodName)
411412
Expect(err).ToNot(HaveOccurred())
@@ -447,6 +448,7 @@ var _ = Describe("Reconfiguration Performance Testing", Ordered, Label("nfr", "r
447448
TimeToReadyAvgSingle: timeToReadyAvgSingle,
448449
NGINXReloads: int(reloadCount),
449450
NGINXReloadAvgTime: int(reloadAvgTime),
451+
NGINXErrorLogs: nginxErrorLogs,
450452
EventsCount: int(eventsCount),
451453
EventsAvgTime: int(eventsAvgTime),
452454
}
@@ -596,6 +598,7 @@ type reconfigTestResults struct {
596598
TestDescription string
597599
TimeToReadyTotal string
598600
TimeToReadyAvgSingle string
601+
NGINXErrorLogs string
599602
EventsBuckets []framework.Bucket
600603
ReloadBuckets []framework.Bucket
601604
NumResources int
@@ -627,6 +630,9 @@ const reconfigResultTemplate = `
627630
{{- range .EventsBuckets }}
628631
- {{ .Le }}ms: {{ .Val }}
629632
{{- end }}
633+
634+
### NGINX Error Logs
635+
{{ .NGINXErrorLogs }}
630636
`
631637

632638
func writeReconfigResults(dest io.Writer, results reconfigTestResults) error {

tests/suite/scale_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ The logs are attached only if there are errors.
368368
)
369369
nginxErrors := checkLogErrors(
370370
"nginx",
371-
[]string{"[error]", "[emerg]", "[crit]", "[alert]"},
371+
[]string{framework.ErrorNGINXLog, framework.EmergNGINXLog, framework.CritNGINXLog, framework.AlertNGINXLog},
372372
nil,
373373
filepath.Join(testResultsDir, framework.CreateResultsFilename("log", "nginx", *plusEnabled)),
374374
)

0 commit comments

Comments
 (0)