Skip to content

Commit 9c1d3e9

Browse files
authored
Test automation: zero-downtime upgrades (#1438)
Problem: Running system tests manually is a time-consuming and arduous process, and can lead to inconsistent results. Solution: Add automation for the zero-downtime upgrades test. This test must be run on GKE. The test deploys the previously released version of NGF, then begins sending HTTP and HTTPS traffic while performing an upgrade to NGF. Results are rendered and written. CSV files are getting too large to commit to the repository, so they will not be included going forward.
1 parent be8f0eb commit 9c1d3e9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+743
-492
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ build/.out
2929
build/out
3030
dist/
3131

32+
# Test artifacts
33+
tests/**/*.csv
34+
3235
# Node modules
3336
node_modules/
3437

.markdownlint-cli2.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ config:
1818
# Define glob expressions to ignore
1919
ignores:
2020
- ".github/"
21+
- "tests/results/"

.pre-commit-config.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ repos:
55
rev: v4.5.0
66
hooks:
77
- id: trailing-whitespace
8+
exclude: (^tests/results/)
89
- id: end-of-file-fixer
910
- id: check-yaml
1011
args: [--allow-multiple-documents]

tests/Makefile

+18-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ TAG = edge
22
PREFIX = nginx-gateway-fabric
33
NGINX_PREFIX = $(PREFIX)/nginx
44
PULL_POLICY=Never
5-
GW_API_VERSION ?= 1.0.0
5+
GW_API_PREV_VERSION ?= 1.0.0 ## Supported Gateway API version from previous NGF release
6+
GW_API_VERSION ?= 1.0.0 ## Supported Gateway API version for NGF under test
67
K8S_VERSION ?= latest ## Expected format: 1.24 (major.minor) or latest
78
GW_SERVICE_TYPE=NodePort
89
GW_SVC_GKE_INTERNAL=false
@@ -30,7 +31,8 @@ load-images: ## Load NGF and NGINX images on configured kind cluster
3031
kind load docker-image $(PREFIX):$(TAG) $(NGINX_PREFIX):$(TAG)
3132

3233
test: ## Run the system tests against your default k8s cluster
33-
go test -v ./suite $(GINKGO_FLAGS) -args --gateway-api-version=$(GW_API_VERSION) --image-tag=$(TAG) \
34+
go test -v ./suite $(GINKGO_FLAGS) -args --gateway-api-version=$(GW_API_VERSION) \
35+
--gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) \
3436
--ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --pull-policy=$(PULL_POLICY) \
3537
--k8s-version=$(K8S_VERSION) --service-type=$(GW_SERVICE_TYPE) --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL)
3638

@@ -46,9 +48,20 @@ run-tests-on-vm: ## Run the tests on a GCP VM
4648
create-and-setup-vm: ## Create and setup a GCP VM for tests
4749
bash scripts/create-and-setup-gcp-vm.sh
4850

49-
.PHONY: create-vm-and-run-tests
50-
create-vm-and-run-tests: create-and-setup-vm run-tests-on-vm ## Create and setup a GCP VM for tests and run the tests
51-
5251
.PHONY: cleanup-vm
5352
cleanup-vm: ## Delete the test GCP VM and delete the firewall rule
5453
bash scripts/cleanup-vm.sh
54+
55+
.PHONY: create-gke-router
56+
create-gke-router: ## Create a GKE router to allow egress traffic from private nodes (allows for external image pulls)
57+
bash scripts/create-gke-router.sh
58+
59+
.PHONY: cleanup-router
60+
cleanup-router: ## Delete the GKE router
61+
bash scripts/cleanup-router.sh
62+
63+
.PHONY: setup-gcp-and-run-tests
64+
setup-gcp-and-run-tests: create-gke-router create-and-setup-vm run-tests-on-vm ## Create and setup a GKE router and GCP VM for tests and run the tests
65+
66+
.PHONY: cleanup-gcp
67+
cleanup-gcp: cleanup-router cleanup-vm ## Cleanup all GCP resources

tests/README.md

+31-9
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,17 @@ make
3838

3939
```text
4040
build-images Build NGF and NGINX images
41-
cleanup-vm Delete the test GCP VM and the firewall rule
41+
cleanup-gcp Cleanup all GCP resources
42+
cleanup-router Delete the GKE router
43+
cleanup-vm Delete the test GCP VM and delete the firewall rule
4244
create-and-setup-vm Create and setup a GCP VM for tests
45+
create-gke-router Create a GKE router to allow egress traffic from private nodes (allows for external image pulls)
4346
create-kind-cluster Create a kind cluster
44-
create-vm-and-run-tests Create and setup a GCP VM for tests and run the tests
4547
delete-kind-cluster Delete kind cluster
4648
help Display this help
4749
load-images Load NGF and NGINX images on configured kind cluster
4850
run-tests-on-vm Run the tests on a GCP VM
51+
setup-gcp-and-run-tests Create and setup a GKE router and GCP VM for tests and run the tests
4952
test Run the system tests against your default k8s cluster
5053
```
5154

@@ -101,15 +104,24 @@ make test TAG=$(whoami)
101104
This step only applies if you would like to run the tests from a GCP based VM.
102105

103106
Before running the below `make` command, copy the `scripts/vars.env-example` file to `scripts/vars.env` and populate the
104-
required env vars. The `GKE_CLUSTER_ZONE` needs to be the zone of your GKE cluster, and `GKE_SVC_ACCOUNT` needs to be
105-
the name of a service account that has Kubernetes admin permissions.
107+
required env vars. `GKE_SVC_ACCOUNT` needs to be the name of a service account that has Kubernetes admin permissions.
106108

107-
To create and setup the VM (including creating a firewall rule allowing SSH access from your local machine, and
108-
optionally adding the VM IP to the `master-authorized-networks` list of your GKE cluster if
109-
`ADD_VM_IP_AUTH_NETWORKS` is set to `true`) and run the tests, run the following
109+
In order to run the tests in GCP, you need a few things:
110+
111+
- GKE router to allow egress traffic (used by upgrade tests for pulling images from GitHub)
112+
- this assumes that your GKE cluster is using private nodes. If using public nodes, you don't need this.
113+
- GCP VM and firewall rule to send ingress traffic to GKE
114+
115+
To set up the GCP environment with the router and VM and then run the tests, run the following command:
110116

111117
```makefile
112-
make create-vm-and-run-tests
118+
make setup-gcp-and-run-tests
119+
```
120+
121+
If you just need a VM and no router (this will not run the tests):
122+
123+
```makefile
124+
make create-and-setup-vm
113125
```
114126

115127
To use an existing VM to run the tests, run the following
@@ -179,7 +191,17 @@ For more information of filtering specs, see [the docs here](https://onsi.github
179191
make delete-kind-cluster
180192
```
181193

182-
2. Delete the cloud VM and cleanup the firewall rule, if required
194+
2. Delete the GCP components (GKE router, VM, and firewall rule), if required
195+
196+
```makefile
197+
make cleanup-gcp
198+
```
199+
200+
or
201+
202+
```makefile
203+
make cleanup-router
204+
```
183205

184206
```makefile
185207
make cleanup-vm

tests/framework/common.go

-28
This file was deleted.

tests/framework/load.go

+45-26
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
package framework
22

33
import (
4-
"fmt"
4+
"context"
5+
"crypto/tls"
6+
"net"
57
"net/http"
6-
"net/url"
7-
"os"
88
"time"
99

1010
vegeta "github.com/tsenart/vegeta/v12/lib"
@@ -31,38 +31,57 @@ func convertTargetToVegetaTarget(targets []Target) []vegeta.Target {
3131
return vegTargets
3232
}
3333

34+
// LoadTestConfig is the configuration to run a load test.
// All fields are read by RunLoadTest.
35+
type LoadTestConfig struct {
36+
Description string // passed to the Vegeta attacker's Attack call alongside the duration
37+
Proxy string // address every connection is dialed to; the address of the target itself is ignored
38+
ServerName string // ServerName set in the client's TLS config
39+
Targets []Target // requests to send, converted to Vegeta targets
40+
Rate int // number of requests per second
41+
Duration time.Duration // how long the attack runs
42+
}
43+
44+
// Metrics is a wrapper around the vegeta Metrics.
// It embeds vegeta.Metrics, so the embedded type's fields and methods are available directly on Metrics.
45+
type Metrics struct {
46+
vegeta.Metrics
47+
}
48+
3449
// RunLoadTest uses Vegeta to send traffic to the provided Targets at the given rate for the given duration and returns
3550
// the individual results along with the aggregated metrics.
36-
func RunLoadTest(
37-
targets []Target,
38-
rate int,
39-
duration time.Duration,
40-
desc string,
41-
outFile *os.File,
42-
proxy string,
43-
) error {
44-
vegTargets := convertTargetToVegetaTarget(targets)
51+
func RunLoadTest(cfg LoadTestConfig) (vegeta.Results, Metrics) {
52+
vegTargets := convertTargetToVegetaTarget(cfg.Targets)
4553
targeter := vegeta.NewStaticTargeter(vegTargets...)
46-
proxyURL, err := url.Parse(proxy)
47-
if err != nil {
48-
return fmt.Errorf("error getting proxy URL: %w", err)
54+
55+
dialer := &net.Dialer{
56+
LocalAddr: &net.TCPAddr{IP: vegeta.DefaultLocalAddr.IP, Zone: vegeta.DefaultLocalAddr.Zone},
57+
KeepAlive: 30 * time.Second,
4958
}
5059

51-
attacker := vegeta.NewAttacker(
52-
vegeta.Proxy(http.ProxyURL(proxyURL)),
53-
)
60+
httpClient := http.Client{
61+
Timeout: vegeta.DefaultTimeout,
62+
Transport: &http.Transport{
63+
DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
64+
return dialer.DialContext(ctx, network, cfg.Proxy)
65+
},
66+
TLSClientConfig: &tls.Config{
67+
InsecureSkipVerify: true, //nolint:gosec // self-signed cert for testing
68+
ServerName: cfg.ServerName,
69+
},
70+
MaxIdleConnsPerHost: vegeta.DefaultConnections,
71+
MaxConnsPerHost: vegeta.DefaultMaxConnections,
72+
},
73+
}
74+
75+
attacker := vegeta.NewAttacker(vegeta.Client(&httpClient))
5476

55-
r := vegeta.Rate{Freq: rate, Per: time.Second}
77+
r := vegeta.Rate{Freq: cfg.Rate, Per: time.Second}
78+
var results vegeta.Results
5679
var metrics vegeta.Metrics
57-
for res := range attacker.Attack(targeter, r, duration, desc) {
80+
for res := range attacker.Attack(targeter, r, cfg.Duration, cfg.Description) {
81+
results = append(results, *res)
5882
metrics.Add(res)
5983
}
6084
metrics.Close()
6185

62-
reporter := vegeta.NewTextReporter(&metrics)
63-
64-
if err = reporter.Report(outFile); err != nil {
65-
return fmt.Errorf("error reporting results: %w", err)
66-
}
67-
return nil
86+
return results, Metrics{metrics}
6887
}

tests/framework/ngf.go

+61-31
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"os/exec"
7+
"path/filepath"
78
"strings"
89
"time"
910

@@ -83,39 +84,50 @@ func InstallGatewayAPI(
8384
return nil, nil
8485
}
8586

87+
// UninstallGatewayAPI uninstalls the specified version of the Gateway API resources.
// It runs `kubectl delete -f` on the standard-install manifest for apiVersion and, when
// webhookRequired(k8sVersion) is true, on the webhook-install manifest first.
// On failure it returns kubectl's combined output and the error; on success it returns nil, nil.
88+
func UninstallGatewayAPI(apiVersion, k8sVersion string) ([]byte, error) {
89+
apiPath := fmt.Sprintf("%s/v%s/standard-install.yaml", gwInstallBasePath, apiVersion)
90+
91+
if webhookRequired(k8sVersion) {
92+
webhookPath := fmt.Sprintf("%s/v%s/webhook-install.yaml", gwInstallBasePath, apiVersion)
93+
94+
// Any error from deleting the webhook manifest is returned as-is; unlike the delete below,
// output containing "not found" is not tolerated here.
if output, err := exec.Command("kubectl", "delete", "-f", webhookPath).CombinedOutput(); err != nil {
95+
return output, err
96+
}
97+
}
98+
99+
output, err := exec.Command("kubectl", "delete", "-f", apiPath).CombinedOutput()
100+
// Errors whose combined output contains "not found" are ignored.
if err != nil && !strings.Contains(string(output), "not found") {
101+
return output, err
102+
}
103+
104+
return nil, nil
105+
}
106+
86107
// InstallNGF installs NGF.
87108
func InstallNGF(cfg InstallationConfig, extraArgs ...string) ([]byte, error) {
88109
args := []string{
89110
"install", cfg.ReleaseName, cfg.ChartPath, "--create-namespace", "--namespace", cfg.Namespace, "--wait",
90111
}
91112

92-
if cfg.NgfImageRepository != "" {
93-
args = append(args, formatValueSet("nginxGateway.image.repository", cfg.NgfImageRepository)...)
94-
if cfg.ImageTag != "" {
95-
args = append(args, formatValueSet("nginxGateway.image.tag", cfg.ImageTag)...)
96-
}
97-
if cfg.ImagePullPolicy != "" {
98-
args = append(args, formatValueSet("nginxGateway.image.pullPolicy", cfg.ImagePullPolicy)...)
99-
}
100-
}
113+
args = append(args, setImageArgs(cfg)...)
114+
fullArgs := append(args, extraArgs...)
101115

102-
if cfg.NginxImageRepository != "" {
103-
args = append(args, formatValueSet("nginx.image.repository", cfg.NginxImageRepository)...)
104-
if cfg.ImageTag != "" {
105-
args = append(args, formatValueSet("nginx.image.tag", cfg.ImageTag)...)
106-
}
107-
if cfg.ImagePullPolicy != "" {
108-
args = append(args, formatValueSet("nginx.image.pullPolicy", cfg.ImagePullPolicy)...)
109-
}
116+
return exec.Command("helm", fullArgs...).CombinedOutput()
117+
}
118+
119+
// UpgradeNGF upgrades NGF. CRD upgrades assume the chart is local.
120+
func UpgradeNGF(cfg InstallationConfig, extraArgs ...string) ([]byte, error) {
121+
crdPath := filepath.Join(cfg.ChartPath, "crds")
122+
if output, err := exec.Command("kubectl", "apply", "-f", crdPath).CombinedOutput(); err != nil {
123+
return output, err
110124
}
111125

112-
if cfg.ServiceType != "" {
113-
args = append(args, formatValueSet("service.type", cfg.ServiceType)...)
114-
if cfg.ServiceType == "LoadBalancer" && cfg.IsGKEInternalLB {
115-
args = append(args, formatValueSet(`service.annotations.networking\.gke\.io\/load-balancer-type`, "Internal")...)
116-
}
126+
args := []string{
127+
"upgrade", cfg.ReleaseName, cfg.ChartPath, "--namespace", cfg.Namespace, "--wait",
117128
}
118129

130+
args = append(args, setImageArgs(cfg)...)
119131
fullArgs := append(args, extraArgs...)
120132

121133
return exec.Command("helm", fullArgs...).CombinedOutput()
@@ -128,7 +140,7 @@ func UninstallNGF(cfg InstallationConfig, k8sClient client.Client) ([]byte, erro
128140
}
129141

130142
output, err := exec.Command("helm", args...).CombinedOutput()
131-
if err != nil {
143+
if err != nil && !strings.Contains(string(output), "release: not found") {
132144
return output, err
133145
}
134146

@@ -157,19 +169,37 @@ func UninstallNGF(cfg InstallationConfig, k8sClient client.Client) ([]byte, erro
157169
return nil, nil
158170
}
159171

160-
// UninstallGatewayAPI uninstalls the specified version of the Gateway API resources.
161-
func UninstallGatewayAPI(apiVersion, k8sVersion string) ([]byte, error) {
162-
apiPath := fmt.Sprintf("%s/v%s/standard-install.yaml", gwInstallBasePath, apiVersion)
172+
func setImageArgs(cfg InstallationConfig) []string {
173+
var args []string
163174

164-
if webhookRequired(k8sVersion) {
165-
webhookPath := fmt.Sprintf("%s/v%s/webhook-install.yaml", gwInstallBasePath, apiVersion)
175+
if cfg.NgfImageRepository != "" {
176+
args = append(args, formatValueSet("nginxGateway.image.repository", cfg.NgfImageRepository)...)
177+
if cfg.ImageTag != "" {
178+
args = append(args, formatValueSet("nginxGateway.image.tag", cfg.ImageTag)...)
179+
}
180+
if cfg.ImagePullPolicy != "" {
181+
args = append(args, formatValueSet("nginxGateway.image.pullPolicy", cfg.ImagePullPolicy)...)
182+
}
183+
}
166184

167-
if output, err := exec.Command("kubectl", "delete", "-f", webhookPath).CombinedOutput(); err != nil {
168-
return output, err
185+
if cfg.NginxImageRepository != "" {
186+
args = append(args, formatValueSet("nginx.image.repository", cfg.NginxImageRepository)...)
187+
if cfg.ImageTag != "" {
188+
args = append(args, formatValueSet("nginx.image.tag", cfg.ImageTag)...)
189+
}
190+
if cfg.ImagePullPolicy != "" {
191+
args = append(args, formatValueSet("nginx.image.pullPolicy", cfg.ImagePullPolicy)...)
192+
}
193+
}
194+
195+
if cfg.ServiceType != "" {
196+
args = append(args, formatValueSet("service.type", cfg.ServiceType)...)
197+
if cfg.ServiceType == "LoadBalancer" && cfg.IsGKEInternalLB {
198+
args = append(args, formatValueSet(`service.annotations.networking\.gke\.io\/load-balancer-type`, "Internal")...)
169199
}
170200
}
171201

172-
return exec.Command("kubectl", "delete", "-f", apiPath).CombinedOutput()
202+
return args
173203
}
174204

175205
func formatValueSet(key, value string) []string {

0 commit comments

Comments
 (0)