fix: fl use case fixes

ferrarimarco · ferrarimarco · commit 9e4d9a4584fd · 2025-04-03T09:52:05.000Z
WIP
diff --git a/platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/README.md b/platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/README.md
@@ -135,13 +135,6 @@ ones that the Federated learning reference architecture provisions.
       --workload "server1"
    ```
 
-   1. Take note of the NVIDIA FLARE server IP address from the output of the
-      last command. The output is similar to the following:
-
-   ```text
-   NVFLARE server1 IP address: 1.2.3.4
-   ```
-
 For simplicity, the
 `"platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/deploy.sh"`
 convenience deployment script uploads the entire NVIDIA FLARE workspace to Cloud
@@ -202,14 +195,15 @@ To run NVIDIA FLARE clients, you do the following:
    ```bash
    source "platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/setup-environment.sh"
    load_fl_terraform_outputs
+   load_kubernetes_outputs
    ```
 
 1. Run `client1`:
 
    ```bash
    docker run --rm -it --entrypoint /usr/local/bin/python3 \
      --detach \
-     --add-host=server1:<NVIDIA_FLARE_SERVER_IP_ADDRESS> \
+     --add-host=server1:${CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS} \
      --name "nvflare-client1" \
      -v "$(pwd)/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/nvflare-workspace":/workspace/nvfl/ \
      -u 10000:10000 \
@@ -224,11 +218,6 @@ To run NVIDIA FLARE clients, you do the following:
      org=nvidia
    ```
 
-   Where:
-
-   - `<NVIDIA_FLARE_SERVER_IP_ADDRESS>` is the IP address of the NVIDIA FLARE
-     server.
-
 1. Confirm that the `client1` is running:
 
    ```bash
@@ -257,7 +246,7 @@ To run NVIDIA FLARE clients, you do the following:
    ```bash
    docker run --rm -it --entrypoint /usr/local/bin/python3 \
      --detach \
-     --add-host=server1:<NVIDIA_FLARE_SERVER_IP_ADDRESS> \
+     --add-host=server1:${CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS} \
      --name "nvflare-client2" \
      -v "$(pwd)/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/nvflare-workspace":/workspace/nvfl/ \
      -u 10000:10000 \
@@ -272,11 +261,6 @@ To run NVIDIA FLARE clients, you do the following:
      org=nvidia
    ```
 
-   Where:
-
-   - `<NVIDIA_FLARE_SERVER_IP_ADDRESS>` is the IP address of the NVIDIA FLARE
-     server.
-
 1. Confirm that the `client2` is running:
 
    ```bash
@@ -319,6 +303,7 @@ In this section, you check the status of the registered NVIDIA FLARE clients:
    ```bash
    source "platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/setup-environment.sh"
    load_fl_terraform_outputs
+   load_kubernetes_outputs
    ```
 
 1. Configure `kubectl` to access the cluster:
@@ -330,14 +315,9 @@ In this section, you check the status of the registered NVIDIA FLARE clients:
 1. Open a shell in the NVIDIA FLARE server pod:
 
    ```bash
-   kubectl exec --stdin --tty --namespace fl-1 <NVIDIA_FLARE_SERVER_POD_NAME> -- /bin/bash
+   kubectl exec --stdin --tty --namespace "${NVFLARE_EXAMPLE_TENANT_NAME}" "${NVFLARE_EXAMPLE_WORKLOAD_POD_NAME}" -- /bin/bash
    ```
 
-   Where:
-
-   - `<NVIDIA_FLARE_SERVER_POD_NAME>` is the name of the NVIDIA FLARE server
-     pod.
-
 1. Connect to NVIDIA FLARE:
 
    ```bash
diff --git a/platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/deploy.sh b/platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/deploy.sh
@@ -139,6 +139,7 @@ for terraservice in "${nvflare_example_terraservices[@]}"; do
 done
 
 provision_terraservice "config_management"
+provision_terraservice "cloud_storage"
 
 echo "Building the NVIDIA FLARE container image"
 "${FEDERATED_LEARNING_USE_CASE_DIR}/examples/nvflare-tff/build-container-image.sh"
@@ -175,14 +176,6 @@ gcloud container clusters get-credentials "${cluster_name}" \
   --project "${cluster_project_id}" \
   --dns-endpoint
 
-INGRESS_GATEWAY_IP_ADDRESS=
-get_kubernetes_load_balancer_service_external_ip_address_or_wait "istio-ingressgateway-nvflare" "istio-ingress" "INGRESS_GATEWAY_IP_ADDRESS"
-echo "Cloud Service Mesh ingress gateway IP address: ${INGRESS_GATEWAY_IP_ADDRESS}"
-
-NVFLARE_SERVER_IP_ADDRESS=
-get_kubernetes_load_balancer_service_external_ip_address_or_wait "nvflare-${WORKLOAD_NAME}-lb" "${NVFLARE_EXAMPLE_TENANT_NAME}" "NVFLARE_SERVER_IP_ADDRESS"
-echo "NVFLARE ${WORKLOAD_NAME} external IP address: ${NVFLARE_SERVER_IP_ADDRESS}"
-
 end_timestamp_federated_learning=$(date +%s)
 total_runtime_value_federated_learning=$((end_timestamp_federated_learning - start_timestamp_federated_learning))
 echo "Total runtime (Federated learning NVIDIA FLARE example deployment): $(date -d@${total_runtime_value_federated_learning} -u +%H:%M:%S)"
diff --git a/platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/setup-environment.sh b/platforms/gke/base/use-cases/federated-learning/examples/nvflare-tff/setup-environment.sh
@@ -94,3 +94,19 @@ load_fl_terraform_outputs() {
   # shellcheck disable=SC2034 # Variable is used in other scripts
   NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID_WITH_TAG=${NVFLARE_EXAMPLE_CONTAINER_IMAGE_LOCALIZED_ID}:${NVFLARE_EXAMPLE_CONTAINER_IMAGE_TAG}
 }
+
+# Exports the Cloud Service Mesh ingress gateway IP address and the NVIDIA FLARE workload pod name; returns 1 if the pod name cannot be resolved.
+load_kubernetes_outputs() {
+  CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS=
+  get_kubernetes_load_balancer_service_external_ip_address_or_wait "istio-ingressgateway-nvflare" "istio-ingress" "CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS"
+  echo "Cloud Service Mesh ingress gateway IP address: ${CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS}"
+  export CLOUD_SERVICE_MESH_INGRESS_GATEWAY_IP_ADDRESS
+
+  # Capture kubectl's output: a bare kubectl call only prints the pod name and leaves the variable empty. An empty result is also an error.
+  if ! NVFLARE_EXAMPLE_WORKLOAD_POD_NAME="$(kubectl get pods -n "${NVFLARE_EXAMPLE_TENANT_NAME}" -l "run=nvflare-${WORKLOAD_NAME}" -o jsonpath='{.items[0].metadata.name}')" ||
+    [[ -z "${NVFLARE_EXAMPLE_WORKLOAD_POD_NAME}" ]]; then
+    echo "Error while initializing NVFLARE_EXAMPLE_WORKLOAD_POD_NAME" >&2
+    return 1
+  fi
+  export NVFLARE_EXAMPLE_WORKLOAD_POD_NAME
+}
diff --git a/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_virtual_services_server.yaml b/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_virtual_services_server.yaml
@@ -30,7 +30,7 @@ spec:
           port: 8002
       route:
         - destination:
-            host: ${nvidia_flare_tff_example_workload_name}.${namespace_name}
+            host: ${nvidia_flare_tff_example_workload_name}.${namespace_name}.svc.cluster.local
             port:
               number: 8002
           weight: 100
@@ -40,7 +40,7 @@ spec:
           port: 8003
       route:
         - destination:
-            host: ${nvidia_flare_tff_example_workload_name}.${namespace_name}
+            host: ${nvidia_flare_tff_example_workload_name}.${namespace_name}.svc.cluster.local
             port:
               number: 8003
           weight: 100
diff --git a/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_workaround_listener_issue_server.yaml b/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_workaround_listener_issue_server.yaml
diff --git a/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_workload.yaml b/platforms/gke/base/use-cases/federated-learning/terraform/example_nvidia_flare_tff/templates/nvidia-flare-tff-example/nvidia_flare_tff_example_workload.yaml
@@ -61,13 +61,6 @@ spec:
       annotations:
         gke-gcsfuse/volumes: "true"
         proxy.istio.io/config: '{ "holdApplicationUntilProxyStarts": true }'
-        # Don't intercept these ports using the Cloud Service Mesh sidecar
-        # (istio-proxy) because NVIDIA FLARE handles mTLS between NVIDIA FLARE
-        # clients and servers. If this annotation is omitted, istio-proxy
-        # intercepts traffic as expected, and presents its own TLS certificate,
-        # causing NVIDIA FLARE clients to fail validation because they expect a
-        # TLS certificate generated when creating the NVFLARE workspace.
-        traffic.sidecar.istio.io/excludeInboundPorts: "8002,8003"
     spec:
       containers:
         - name: nvflare