E2E: Replace outdated images with latest ones (kubeflow#2083)

tenzen-y · johnugeorge · commit 3beb7edbdd41 · 2024-04-28T21:22:17.000+05:30
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
diff --git a/.github/workflows/build-and-publish-images.yaml b/.github/workflows/build-and-publish-images.yaml
@@ -31,6 +31,9 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
 
+      - name: Free-Up Disk Space
+        uses: ./.github/workflows/free-up-disk-space
+
       - name: Docker Login
         # Trigger workflow only for kubeflow/training-operator repository with specific branch (master, v.*-branch) or tag (v.*).
         if: >-
diff --git a/.github/workflows/free-up-disk-space/action.yaml b/.github/workflows/free-up-disk-space/action.yaml
@@ -0,0 +1,49 @@
+name: Free-Up Disk Space
+description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker
+
+runs:
+  using: composite
+  steps:
+    # This step is a Workaround to avoid the "No space left on device" error.
+    # ref: https://github.com/actions/runner-images/issues/2840
+    - name: Remove unnecessary files
+      shell: bash
+      run: |
+        echo "Disk usage before cleanup:"
+        df -hT
+
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf /usr/local/share/boost
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+        sudo rm -rf /usr/local/lib/android
+        sudo rm -rf /usr/local/share/powershell
+        sudo rm -rf /usr/share/swift
+
+        echo "Disk usage after cleanup:"
+        df -hT
+
+    - name: Prune docker images
+      shell: bash
+      run: |
+        docker image prune -a -f
+        docker system df
+        df -hT
+
+    - name: Move docker data directory
+      shell: bash
+      run: |
+        echo "Stopping docker service ..."
+        sudo systemctl stop docker
+        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
+        DOCKER_ROOT_DIR=/mnt/docker
+        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+        sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
+        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
+        sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
+        echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
+        echo "Starting docker service ..."
+        sudo systemctl daemon-reload
+        sudo systemctl start docker
+        echo "Docker service status:"
+        sudo systemctl --no-pager -l -o short status docker
diff --git a/.github/workflows/integration-tests.yaml b/.github/workflows/integration-tests.yaml
@@ -55,25 +55,12 @@ jobs:
             python-version: "3.10"
 
     steps:
-      # This step is a Workaround to avoid the "No space left on device" error.
-      # ref: https://github.com/actions/runner-images/issues/2840
-      - name: Remove unnecessary files
-        shell: bash
-        run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf "/usr/local/share/boost"
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/local/share/powershell
-          sudo rm -rf /usr/share/swift
-
-          echo "Disk usage after cleanup:"
-          df -h
-
       - name: Checkout
         uses: actions/checkout@v3
 
+      - name: Free-Up Disk Space
+        uses: ./.github/workflows/free-up-disk-space
+
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
diff --git a/.github/workflows/template-publish-image/action.yaml b/.github/workflows/template-publish-image/action.yaml
@@ -23,50 +23,6 @@ inputs:
 runs:
   using: composite
   steps:
-    # This step is a Workaround to avoid the "No space left on device" error.
-    # ref: https://github.com/actions/runner-images/issues/2840
-    - name: Remove unnecessary files
-      shell: bash
-      run: |
-        echo "Disk usage before cleanup:"
-        df -hT
-
-        sudo rm -rf /usr/share/dotnet
-        sudo rm -rf /opt/ghc
-        sudo rm -rf /usr/local/share/boost
-        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-        sudo rm -rf /usr/local/lib/android
-        sudo rm -rf /usr/local/share/powershell
-        sudo rm -rf /usr/share/swift
-
-        echo "Disk usage after cleanup:"
-        df -hT
-
-    - name: Prune docker images
-      shell: bash
-      run: |
-        docker image prune -a -f
-        docker system df
-        df -hT
-
-    - name: Move docker data directory
-      shell: bash
-      run: |
-        echo "Stopping docker service ..."
-        sudo systemctl stop docker
-        DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
-        DOCKER_ROOT_DIR=/mnt/docker
-        echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
-        sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
-        echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
-        sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
-        echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
-        echo "Starting docker service ..."
-        sudo systemctl daemon-reload
-        sudo systemctl start docker
-        echo "Docker service status:"
-        sudo systemctl --no-pager -l -o short status docker
-
     - name: Setup QEMU
       uses: docker/setup-qemu-action@v2
       with:
diff --git a/examples/mpi/tensorflow-mnist.yaml b/examples/mpi/tensorflow-mnist.yaml
@@ -12,7 +12,7 @@ spec:
       template:
         spec:
           containers:
-          - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
+          - image: horovod/horovod:0.28.1
             name: mpi
             command:
             - mpirun
@@ -35,7 +35,7 @@ spec:
             - btl
             - ^openib
             - python
-            - /examples/tensorflow2_mnist.py
+            - /horovod/examples/tensorflow2/tensorflow2_mnist.py
             resources:
               limits:
                 cpu: 1
@@ -45,7 +45,7 @@ spec:
       template:
         spec:
           containers:
-          - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
+          - image: horovod/horovod:0.28.1
             name: mpi
             resources:
               limits:
diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml
@@ -11,10 +11,10 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "gloo"]
               # Comment out the below resources to use the CPU.
-              resources: 
+              resources:
                 limits:
                   nvidia.com/gpu: 1
     Worker:
@@ -24,9 +24,9 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "gloo"]
               # Comment out the below resources to use the CPU.
-              resources: 
+              resources:
                 limits:
                   nvidia.com/gpu: 1
diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml
@@ -11,7 +11,7 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "mpi"]
               # Comment out the below resources to use the CPU.
               resources: 
@@ -24,7 +24,7 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "mpi"]
               # Comment out the below resources to use the CPU.
               resources: 
diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml
@@ -11,7 +11,7 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "nccl"]
               resources: 
                 limits:
@@ -23,7 +23,7 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "nccl"]
               resources: 
                 limits:
diff --git a/sdk/python/test/e2e/test_e2e_mpijob.py b/sdk/python/test/e2e/test_e2e_mpijob.py
@@ -39,7 +39,7 @@
 logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)
 
 TRAINING_CLIENT = TrainingClient(job_kind=constants.MPIJOB_KIND)
-JOB_NAME = "mpijob-mxnet-ci-test"
+JOB_NAME = "mpijob-pytorch-ci-test"
 CONTAINER_NAME = "mpi"
 GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "")
 
@@ -182,7 +182,7 @@ def generate_mpijob(
 def generate_containers() -> Tuple[V1Container, V1Container]:
     launcher_container = V1Container(
         name=CONTAINER_NAME,
-        image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
+        image="horovod/horovod:0.28.1",
         command=["mpirun"],
         args=[
             "-np",
@@ -202,19 +202,18 @@ def generate_containers() -> Tuple[V1Container, V1Container]:
             "-mca",
             "btl",
             "^openib",
-            # "python", "/examples/tensorflow2_mnist.py"]
             "python",
-            "/examples/pytorch_mnist.py",
+            "/horovod/examples/pytorch/pytorch_mnist.py",
             "--epochs",
             "1",
         ],
         resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
     )
 
     worker_container = V1Container(
-        name="mpi",
-        image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        name=CONTAINER_NAME,
+        image="horovod/horovod:0.28.1",
+        resources=V1ResourceRequirements(limits={"memory": "3Gi", "cpu": "1.2"}),
     )
 
     return launcher_container, worker_container
diff --git a/sdk/python/test/e2e/test_e2e_mxjob.py b/sdk/python/test/e2e/test_e2e_mxjob.py
@@ -233,21 +233,21 @@ def generate_containers() -> Tuple[V1Container, V1Container, V1Container]:
             "dist_sync",
         ],
         ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
     )
 
     server_container = V1Container(
         name=CONTAINER_NAME,
         image="docker.io/kubeflow/mxnet-gpu:latest",
         ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
+        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
     )
 
     scheduler_container = V1Container(
         name=CONTAINER_NAME,
         image="docker.io/kubeflow/mxnet-gpu:latest",
         ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
+        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
     )
 
     return worker_container, server_container, scheduler_container
diff --git a/sdk/python/test/e2e/test_e2e_paddlejob.py b/sdk/python/test/e2e/test_e2e_paddlejob.py
@@ -158,5 +158,5 @@ def generate_container() -> V1Container:
         image="docker.io/paddlepaddle/paddle:2.4.0rc0-cpu",
         command=["python"],
         args=["-m", "paddle.distributed.launch", "run_check"],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
     )
diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py
@@ -264,7 +264,7 @@ def generate_pytorchjob(
 def generate_container() -> V1Container:
     return V1Container(
         name=CONTAINER_NAME,
-        image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
-        args=["--backend", "gloo"],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        image="kubeflow/pytorch-dist-mnist:latest",
+        args=["--backend", "gloo", "--epochs", "1"],
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
     )
diff --git a/sdk/python/test/e2e/test_e2e_tfjob.py b/sdk/python/test/e2e/test_e2e_tfjob.py
@@ -164,5 +164,5 @@ def generate_container() -> V1Container:
             "--learning_rate=0.01",
             "--batch_size=150",
         ],
-        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.75"}),
+        resources=V1ResourceRequirements(limits={"memory": "4Gi", "cpu": "1.6"}),
     )
diff --git a/sdk/python/test/e2e/test_e2e_xgboostjob.py b/sdk/python/test/e2e/test_e2e_xgboostjob.py
@@ -190,5 +190,5 @@ def generate_container() -> V1Container:
             "--model_path=/tmp/xgboost-model",
             "--model_storage_type=local",
         ],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
     )

Original file line number	Diff line number	Diff line change
`@@ -158,5 +158,5 @@ def generate_container() -> V1Container:`
`158`	`158`	`image="docker.io/paddlepaddle/paddle:2.4.0rc0-cpu",`
`159`	`159`	`command=["python"],`
`160`	`160`	`args=["-m", "paddle.distributed.launch", "run_check"],`
`161`		`- resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),`
	`161`	`+ resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),`
`162`	`162`	`)`
Original file line number	Diff line number	Diff line change
`@@ -164,5 +164,5 @@ def generate_container() -> V1Container:`
`164`	`164`	`"--learning_rate=0.01",`
`165`	`165`	`"--batch_size=150",`
`166`	`166`	`],`
`167`		`- resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.75"}),`
	`167`	`+ resources=V1ResourceRequirements(limits={"memory": "4Gi", "cpu": "1.6"}),`
`168`	`168`	`)`
Original file line number	Diff line number	Diff line change
`@@ -190,5 +190,5 @@ def generate_container() -> V1Container:`
`190`	`190`	`"--model_path=/tmp/xgboost-model",`
`191`	`191`	`"--model_storage_type=local",`
`192`	`192`	`],`
`193`		`- resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),`
	`193`	`+ resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),`
`194`	`194`	`)`