39
39
logging .getLogger ("kubeflow.training.api.training_client" ).setLevel (logging .DEBUG )
40
40
41
41
TRAINING_CLIENT = TrainingClient (job_kind = constants .MPIJOB_KIND )
42
- JOB_NAME = "mpijob-mxnet-ci-test"
42
+ JOB_NAME = "mpijob-pytorch-ci-test"
43
43
CONTAINER_NAME = "mpi"
44
44
GANG_SCHEDULER_NAME = os .getenv (TEST_GANG_SCHEDULER_NAME_ENV_KEY , "" )
45
45
@@ -182,7 +182,7 @@ def generate_mpijob(
182
182
def generate_containers () -> Tuple [V1Container , V1Container ]:
183
183
launcher_container = V1Container (
184
184
name = CONTAINER_NAME ,
185
- image = "horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu" ,
185
+ image = "horovod/horovod:0.28.1" ,
186
186
command = ["mpirun" ],
187
187
args = [
188
188
"-np" ,
@@ -202,19 +202,18 @@ def generate_containers() -> Tuple[V1Container, V1Container]:
202
202
"-mca" ,
203
203
"btl" ,
204
204
"^openib" ,
205
- # "python", "/examples/tensorflow2_mnist.py"]
206
205
"python" ,
207
- "/examples/pytorch_mnist.py" ,
206
+ "/horovod/examples/pytorch/pytorch_mnist.py" ,
208
207
"--epochs" ,
209
208
"1" ,
210
209
],
211
210
resources = V1ResourceRequirements (limits = {"memory" : "1Gi" , "cpu" : "0.4" }),
212
211
)
213
212
214
213
worker_container = V1Container (
215
- name = "mpi" ,
216
- image = "horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu" ,
217
- resources = V1ResourceRequirements (limits = {"memory" : "1Gi" , "cpu" : "0.4" }),
214
+ name = CONTAINER_NAME ,
215
+ image = "horovod/horovod:0.28.1" ,
216
+ resources = V1ResourceRequirements (limits = {"memory" : "3Gi" , "cpu" : "1.2" }),
218
217
)
219
218
220
219
return launcher_container , worker_container
0 commit comments