Skip to content

Commit 3beb7ed

Browse files
tenzen-y authored and johnugeorge committed
E2E: Replace outdated images with latest ones (kubeflow#2083)
Signed-off-by: Yuki Iwai <[email protected]>
1 parent 9e31bdf commit 3beb7ed

14 files changed

+81
-87
lines changed

.github/workflows/build-and-publish-images.yaml

+3
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,9 @@ jobs:
3131
- name: Checkout
3232
uses: actions/checkout@v3
3333

34+
- name: Free-Up Disk Space
35+
uses: ./.github/workflows/free-up-disk-space
36+
3437
- name: Docker Login
3538
# Trigger workflow only for kubeflow/training-operator repository with specific branch (master, v.*-branch) or tag (v.*).
3639
if: >-
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
name: Free-Up Disk Space
2+
description: Remove Non-Essential Tools And Move Docker Data Directory to /mnt/docker
3+
4+
runs:
5+
using: composite
6+
steps:
7+
# This step is a Workaround to avoid the "No space left on device" error.
8+
# ref: https://github.com/actions/runner-images/issues/2840
9+
- name: Remove unnecessary files
10+
shell: bash
11+
run: |
12+
echo "Disk usage before cleanup:"
13+
df -hT
14+
15+
sudo rm -rf /usr/share/dotnet
16+
sudo rm -rf /opt/ghc
17+
sudo rm -rf /usr/local/share/boost
18+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
19+
sudo rm -rf /usr/local/lib/android
20+
sudo rm -rf /usr/local/share/powershell
21+
sudo rm -rf /usr/share/swift
22+
23+
echo "Disk usage after cleanup:"
24+
df -hT
25+
26+
- name: Prune docker images
27+
shell: bash
28+
run: |
29+
docker image prune -a -f
30+
docker system df
31+
df -hT
32+
33+
- name: Move docker data directory
34+
shell: bash
35+
run: |
36+
echo "Stopping docker service ..."
37+
sudo systemctl stop docker
38+
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
39+
DOCKER_ROOT_DIR=/mnt/docker
40+
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
41+
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
42+
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
43+
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
44+
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
45+
echo "Starting docker service ..."
46+
sudo systemctl daemon-reload
47+
sudo systemctl start docker
48+
echo "Docker service status:"
49+
sudo systemctl --no-pager -l -o short status docker

.github/workflows/integration-tests.yaml

+3 -16
Original file line number | Diff line number | Diff line change
@@ -55,25 +55,12 @@ jobs:
5555
python-version: "3.10"
5656

5757
steps:
58-
# This step is a Workaround to avoid the "No space left on device" error.
59-
# ref: https://github.com/actions/runner-images/issues/2840
60-
- name: Remove unnecessary files
61-
shell: bash
62-
run: |
63-
sudo rm -rf /usr/share/dotnet
64-
sudo rm -rf /opt/ghc
65-
sudo rm -rf "/usr/local/share/boost"
66-
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
67-
sudo rm -rf /usr/local/lib/android
68-
sudo rm -rf /usr/local/share/powershell
69-
sudo rm -rf /usr/share/swift
70-
71-
echo "Disk usage after cleanup:"
72-
df -h
73-
7458
- name: Checkout
7559
uses: actions/checkout@v3
7660

61+
- name: Free-Up Disk Space
62+
uses: ./.github/workflows/free-up-disk-space
63+
7764
- name: Setup Python
7865
uses: actions/setup-python@v4
7966
with:

.github/workflows/template-publish-image/action.yaml

-44
Original file line number | Diff line number | Diff line change
@@ -23,50 +23,6 @@ inputs:
2323
runs:
2424
using: composite
2525
steps:
26-
# This step is a Workaround to avoid the "No space left on device" error.
27-
# ref: https://github.com/actions/runner-images/issues/2840
28-
- name: Remove unnecessary files
29-
shell: bash
30-
run: |
31-
echo "Disk usage before cleanup:"
32-
df -hT
33-
34-
sudo rm -rf /usr/share/dotnet
35-
sudo rm -rf /opt/ghc
36-
sudo rm -rf /usr/local/share/boost
37-
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
38-
sudo rm -rf /usr/local/lib/android
39-
sudo rm -rf /usr/local/share/powershell
40-
sudo rm -rf /usr/share/swift
41-
42-
echo "Disk usage after cleanup:"
43-
df -hT
44-
45-
- name: Prune docker images
46-
shell: bash
47-
run: |
48-
docker image prune -a -f
49-
docker system df
50-
df -hT
51-
52-
- name: Move docker data directory
53-
shell: bash
54-
run: |
55-
echo "Stopping docker service ..."
56-
sudo systemctl stop docker
57-
DOCKER_DEFAULT_ROOT_DIR=/var/lib/docker
58-
DOCKER_ROOT_DIR=/mnt/docker
59-
echo "Moving ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
60-
sudo mv ${DOCKER_DEFAULT_ROOT_DIR} ${DOCKER_ROOT_DIR}
61-
echo "Creating symlink ${DOCKER_DEFAULT_ROOT_DIR} -> ${DOCKER_ROOT_DIR}"
62-
sudo ln -s ${DOCKER_ROOT_DIR} ${DOCKER_DEFAULT_ROOT_DIR}
63-
echo "$(sudo ls -l ${DOCKER_DEFAULT_ROOT_DIR})"
64-
echo "Starting docker service ..."
65-
sudo systemctl daemon-reload
66-
sudo systemctl start docker
67-
echo "Docker service status:"
68-
sudo systemctl --no-pager -l -o short status docker
69-
7026
- name: Setup QEMU
7127
uses: docker/setup-qemu-action@v2
7228
with:

examples/mpi/tensorflow-mnist.yaml

+3 -3
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ spec:
1212
template:
1313
spec:
1414
containers:
15-
- image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
15+
- image: horovod/horovod:0.28.1
1616
name: mpi
1717
command:
1818
- mpirun
@@ -35,7 +35,7 @@ spec:
3535
- btl
3636
- ^openib
3737
- python
38-
- /examples/tensorflow2_mnist.py
38+
- /horovod/examples/tensorflow2/tensorflow2_mnist.py
3939
resources:
4040
limits:
4141
cpu: 1
@@ -45,7 +45,7 @@ spec:
4545
template:
4646
spec:
4747
containers:
48-
- image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
48+
- image: horovod/horovod:0.28.1
4949
name: mpi
5050
resources:
5151
limits:

examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml

+4 -4
Original file line number | Diff line number | Diff line change
@@ -11,10 +11,10 @@ spec:
1111
spec:
1212
containers:
1313
- name: pytorch
14-
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
14+
image: kubeflow/pytorch-dist-mnist:latest
1515
args: ["--backend", "gloo"]
1616
# Comment out the below resources to use the CPU.
17-
resources:
17+
resources:
1818
limits:
1919
nvidia.com/gpu: 1
2020
Worker:
@@ -24,9 +24,9 @@ spec:
2424
spec:
2525
containers:
2626
- name: pytorch
27-
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
27+
image: kubeflow/pytorch-dist-mnist:latest
2828
args: ["--backend", "gloo"]
2929
# Comment out the below resources to use the CPU.
30-
resources:
30+
resources:
3131
limits:
3232
nvidia.com/gpu: 1

examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml

+2 -2
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ spec:
1111
spec:
1212
containers:
1313
- name: pytorch
14-
image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
14+
image: kubeflow/pytorch-dist-mnist:latest
1515
args: ["--backend", "mpi"]
1616
# Comment out the below resources to use the CPU.
1717
resources:
@@ -24,7 +24,7 @@ spec:
2424
spec:
2525
containers:
2626
- name: pytorch
27-
image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
27+
image: kubeflow/pytorch-dist-mnist:latest
2828
args: ["--backend", "mpi"]
2929
# Comment out the below resources to use the CPU.
3030
resources:

examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml

+2 -2
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ spec:
1111
spec:
1212
containers:
1313
- name: pytorch
14-
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
14+
image: kubeflow/pytorch-dist-mnist:latest
1515
args: ["--backend", "nccl"]
1616
resources:
1717
limits:
@@ -23,7 +23,7 @@ spec:
2323
spec:
2424
containers:
2525
- name: pytorch
26-
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
26+
image: kubeflow/pytorch-dist-mnist:latest
2727
args: ["--backend", "nccl"]
2828
resources:
2929
limits:

sdk/python/test/e2e/test_e2e_mpijob.py

+6 -7
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@
3939
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)
4040

4141
TRAINING_CLIENT = TrainingClient(job_kind=constants.MPIJOB_KIND)
42-
JOB_NAME = "mpijob-mxnet-ci-test"
42+
JOB_NAME = "mpijob-pytorch-ci-test"
4343
CONTAINER_NAME = "mpi"
4444
GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "")
4545

@@ -182,7 +182,7 @@ def generate_mpijob(
182182
def generate_containers() -> Tuple[V1Container, V1Container]:
183183
launcher_container = V1Container(
184184
name=CONTAINER_NAME,
185-
image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
185+
image="horovod/horovod:0.28.1",
186186
command=["mpirun"],
187187
args=[
188188
"-np",
@@ -202,19 +202,18 @@ def generate_containers() -> Tuple[V1Container, V1Container]:
202202
"-mca",
203203
"btl",
204204
"^openib",
205-
# "python", "/examples/tensorflow2_mnist.py"]
206205
"python",
207-
"/examples/pytorch_mnist.py",
206+
"/horovod/examples/pytorch/pytorch_mnist.py",
208207
"--epochs",
209208
"1",
210209
],
211210
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
212211
)
213212

214213
worker_container = V1Container(
215-
name="mpi",
216-
image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
217-
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
214+
name=CONTAINER_NAME,
215+
image="horovod/horovod:0.28.1",
216+
resources=V1ResourceRequirements(limits={"memory": "3Gi", "cpu": "1.2"}),
218217
)
219218

220219
return launcher_container, worker_container

sdk/python/test/e2e/test_e2e_mxjob.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -233,21 +233,21 @@ def generate_containers() -> Tuple[V1Container, V1Container, V1Container]:
233233
"dist_sync",
234234
],
235235
ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
236-
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
236+
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
237237
)
238238

239239
server_container = V1Container(
240240
name=CONTAINER_NAME,
241241
image="docker.io/kubeflow/mxnet-gpu:latest",
242242
ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
243-
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
243+
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
244244
)
245245

246246
scheduler_container = V1Container(
247247
name=CONTAINER_NAME,
248248
image="docker.io/kubeflow/mxnet-gpu:latest",
249249
ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
250-
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
250+
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
251251
)
252252

253253
return worker_container, server_container, scheduler_container

sdk/python/test/e2e/test_e2e_paddlejob.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -158,5 +158,5 @@ def generate_container() -> V1Container:
158158
image="docker.io/paddlepaddle/paddle:2.4.0rc0-cpu",
159159
command=["python"],
160160
args=["-m", "paddle.distributed.launch", "run_check"],
161-
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
161+
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
162162
)

sdk/python/test/e2e/test_e2e_pytorchjob.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -264,7 +264,7 @@ def generate_pytorchjob(
264264
def generate_container() -> V1Container:
265265
return V1Container(
266266
name=CONTAINER_NAME,
267-
image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
268-
args=["--backend", "gloo"],
269-
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
267+
image="kubeflow/pytorch-dist-mnist:latest",
268+
args=["--backend", "gloo", "--epochs", "1"],
269+
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
270270
)

sdk/python/test/e2e/test_e2e_tfjob.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -164,5 +164,5 @@ def generate_container() -> V1Container:
164164
"--learning_rate=0.01",
165165
"--batch_size=150",
166166
],
167-
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.75"}),
167+
resources=V1ResourceRequirements(limits={"memory": "4Gi", "cpu": "1.6"}),
168168
)

sdk/python/test/e2e/test_e2e_xgboostjob.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -190,5 +190,5 @@ def generate_container() -> V1Container:
190190
"--model_path=/tmp/xgboost-model",
191191
"--model_storage_type=local",
192192
],
193-
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
193+
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "0.8"}),
194194
)

0 commit comments

Comments
 (0)