Skip to content

Commit 9e4d9a4

Browse files
committed
fix: fl use case fixes
WIP
1 parent 90cc365 commit 9e4d9a4

File tree

6 files changed

+24
-119
lines changed

6 files changed

+24
-119
lines changed

platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/README.md

+5-25
Original file line numberDiff line numberDiff line change
@@ -135,13 +135,6 @@ ones that the Federated learning reference architecture provisions.
135135
--workload "server1"
136136
```
137137

138-
1. Take note of the NVIDIA FLARE server IP address from the output of the
139-
last command. The output is similar to the following:
140-
141-
```text
142-
NVFLARE server1 IP address: 1.2.3.4
143-
```
144-
145138
For simplicity, the
146139
`"platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/deploy.sh"`
147140
convenience deployment script uploads the entire NVIDIA FLARE workspace to Cloud
@@ -202,14 +195,15 @@ To run NVIDIA FLARE clients, you do the following:
202195
```bash
203196
source "platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/setup-environment.sh"
204197
load_fl_terraform_outputs
198+
load_kubernetes_outputs
205199
```
206200

207201
1. Run `client1`:
208202

209203
```bash
210204
docker run --rm -it --entrypoint /usr/local/bin/python3 \
211205
--detach \
212-
--add-host=server1:<NVIDIA_FLARE_SERVER_IP_ADDRESS> \
206+
--add-host=server1:${CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS} \
213207
--name "nvflare-client1" \
214208
-v "$(pwd)/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/nvflare-workspace":/workspace/nvfl/ \
215209
-u 10000:10000 \
@@ -224,11 +218,6 @@ To run NVIDIA FLARE clients, you do the following:
224218
org=nvidia
225219
```
226220

227-
Where:
228-
229-
- `<NVIDIA_FLARE_SERVER_IP_ADDRESS>` is the IP address of the NVIDIA FLARE
230-
server.
231-
232221
1. Confirm that `client1` is running:
233222

234223
```bash
@@ -257,7 +246,7 @@ To run NVIDIA FLARE clients, you do the following:
257246
```bash
258247
docker run --rm -it --entrypoint /usr/local/bin/python3 \
259248
--detach \
260-
--add-host=server1:<NVIDIA_FLARE_SERVER_IP_ADDRESS> \
249+
--add-host=server1:${CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS} \
261250
--name "nvflare-client2" \
262251
-v "$(pwd)/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/nvflare-workspace":/workspace/nvfl/ \
263252
-u 10000:10000 \
@@ -272,11 +261,6 @@ To run NVIDIA FLARE clients, you do the following:
272261
org=nvidia
273262
```
274263

275-
Where:
276-
277-
- `<NVIDIA_FLARE_SERVER_IP_ADDRESS>` is the IP address of the NVIDIA FLARE
278-
server.
279-
280264
1. Confirm that `client2` is running:
281265

282266
```bash
@@ -319,6 +303,7 @@ In this section, you check the status of the registered NVIDIA FLARE clients:
319303
```bash
320304
source "platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/setup-environment.sh"
321305
load_fl_terraform_outputs
306+
load_kubernetes_outputs
322307
```
323308

324309
1. Configure `kubectl` to access the cluster:
@@ -330,14 +315,9 @@ In this section, you check the status of the registered NVIDIA FLARE clients:
330315
1. Open a shell in the NVIDIA FLARE server pod:
331316

332317
```bash
333-
kubectl exec --stdin --tty --namespace fl-1 <NVIDIA_FLARE_SERVER_POD_NAME> -- /bin/bash
318+
kubectl exec --stdin --tty --namespace "${NVFLARE_EXAMPLE_TENANT_NAME}" "${NVFLARE_EXAMPLE_WORKLOAD_POD_NAME}" -- /bin/bash
334319
```
335320

336-
Where:
337-
338-
- `<NVIDIA_FLARE_SERVER_POD_NAME>` is the name of the NVIDIA FLARE server
339-
pod.
340-
341321
1. Connect to NVIDIA FLARE:
342322

343323
```bash

platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/deploy.sh

+1-8
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ for terraservice in "${nvflare_example_terraservices[@]}"; do
139139
done
140140

141141
provision_terraservice "config_management"
142+
provision_terraservice "cloud_storage"
142143

143144
echo "Building the NVIDIA FLARE container image"
144145
"${FEDERATED_LEARNING_USE_CASE_DIR}/examples/nvflare-tff/build-container-image.sh"
@@ -175,14 +176,6 @@ gcloud container clusters get-credentials "${cluster_name}" \
175176
--project "${cluster_project_id}" \
176177
--dns-endpoint
177178

178-
INGRESS_GATEWAY_IP_ADDRESS=
179-
get_kubernetes_load_balancer_service_external_ip_address_or_wait "istio-ingressgateway-nvflare" "istio-ingress" "INGRESS_GATEWAY_IP_ADDRESS"
180-
echo "Cloud Service Mesh ingress gateway IP address: ${INGRESS_GATEWAY_IP_ADDRESS}"
181-
182-
NVFLARE_SERVER_IP_ADDRESS=
183-
get_kubernetes_load_balancer_service_external_ip_address_or_wait "nvflare-${WORKLOAD_NAME}-lb" "${NVFLARE_EXAMPLE_TENANT_NAME}" "NVFLARE_SERVER_IP_ADDRESS"
184-
echo "NVFLARE ${WORKLOAD_NAME} external IP address: ${NVFLARE_SERVER_IP_ADDRESS}"
185-
186179
end_timestamp_federated_learning=$(date +%s)
187180
total_runtime_value_federated_learning=$((end_timestamp_federated_learning - start_timestamp_federated_learning))
188181
echo "Total runtime (Federated learning NVIDIA FLARE example deployment): $(date -d@${total_runtime_value_federated_learning} -u +%H:%M:%S)"

platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/setup-environment.sh

+16
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,19 @@ load_fl_terraform_outputs() {
9494
# shellcheck disable=SC2034 # Variable is used in other scripts
9595
NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID_WITH_TAG=${NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID}:${NVFLARE_EXAMPLE_CONTAINER_IMAGE_TAG}
9696
}
97+
98+
#######################################
# Load values that are only known once the Kubernetes resources exist and
# export them for use by the commands documented in the example README.
#
# Globals:
#   NVFLARE_EXAMPLE_TENANT_NAME (read)  - namespace passed to kubectl.
#   WORKLOAD_NAME (read)                - used to build the "run=nvflare-<name>"
#                                         pod label selector.
#   CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS (written, exported)
#   NVFLARE_EXAMPLE_WORKLOAD_POD_NAME (written, exported)
# Outputs:
#   Prints the ingress gateway IP address to stdout; errors go to stderr.
# Returns:
#   0 on success, 1 if the workload pod name cannot be determined.
#######################################
load_kubernetes_outputs() {
  CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS=
  # The helper is defined outside this function; it is called with the NAME of
  # the variable as third argument and is presumably expected to store the
  # external IP address there (verify against the helper's definition).
  get_kubernetes_load_balancer_service_external_ip_address_or_wait "istio-ingressgateway-nvflare" "istio-ingress" "CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS"
  echo "Cloud Service Mesh ingress gateway IP address: ${CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS}"
  export CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS

  NVFLARE_EXAMPLE_WORKLOAD_POD_NAME=
  # Capture the kubectl output in the variable; without the command
  # substitution the pod name is only printed and the variable stays empty.
  # Testing the assignment directly in the `if` keeps the check working even
  # when the sourcing shell runs with `set -e`.
  if ! NVFLARE_EXAMPLE_WORKLOAD_POD_NAME="$(kubectl get pods -n "${NVFLARE_EXAMPLE_TENANT_NAME}" -l "run=nvflare-${WORKLOAD_NAME}" -o jsonpath='{.items[0].metadata.name}')"; then
    echo "Error while initializing NVFLARE_EXAMPLE_WORKLOAD_POD_NAME" >&2
    return 1
  fi
  # A successful call that yields no name is as unusable as a failed one.
  if [[ -z "${NVFLARE_EXAMPLE_WORKLOAD_POD_NAME}" ]]; then
    echo "Error while initializing NVFLARE_EXAMPLE_WORKLOAD_POD_NAME" >&2
    return 1
  fi
  export NVFLARE_EXAMPLE_WORKLOAD_POD_NAME
}

platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_virtual_services_server.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ spec:
3030
port: 8002
3131
route:
3232
- destination:
33-
host: ${nvidia_flare_tff_example_workload_name}.${namespace_name}
33+
host: ${nvidia_flare_tff_example_workload_name}.${namespace_name}.svc.cluster.local
3434
port:
3535
number: 8002
3636
weight: 100
@@ -40,7 +40,7 @@ spec:
4040
port: 8003
4141
route:
4242
- destination:
43-
host: ${nvidia_flare_tff_example_workload_name}.${namespace_name}
43+
host: ${nvidia_flare_tff_example_workload_name}.${namespace_name}.svc.cluster.local
4444
port:
4545
number: 8003
4646
weight: 100

platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_workaround_listener_issue_server.yaml

-77
This file was deleted.

platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_workload.yaml

-7
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,6 @@ spec:
6161
annotations:
6262
gke-gcsfuse/volumes: "true"
6363
proxy.istio.io/config: '{ "holdApplicationUntilProxyStarts": true }'
64-
# Don't intercept these ports using the Cloud Service Mesh sidecar
65-
# (istio-proxy) because NVIDIA FLARE handles mTLS between NVIDIA FLARE
66-
# clients and servers. If this annotation is omitted, istio-proxy
67-
# intercepts traffic as expected, and presents its own TLS certificate,
68-
# causing NVIDIA FLARE clients to fail validation because they expect a
69-
# TLS certificate generated when creating the NVFLARE workspace.
70-
traffic.sidecar.istio.io/excludeInboundPorts: "8002,8003"
7164
spec:
7265
containers:
7366
- name: nvflare

0 commit comments

Comments
 (0)