Skip to content

Commit b4c84f6

Browse files
author
Klaus Ma
authored
Merge pull request #51 from volcano-sh/feature/support-mpi
Add MPI example and tests
2 parents 8c275bb + 185c54a commit b4c84f6

File tree

6 files changed

+166
-5
lines changed

6 files changed

+166
-5
lines changed

example/integrations/mpi/Dockerfile

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
FROM ubuntu:16.04
2+
MAINTAINER volcano <[email protected]>
3+
RUN apt-get update --fix-missing \
4+
&& apt-get install -y libopenmpi-dev openmpi-bin \
5+
&& apt-get install -y git \
6+
&& apt-get install -y build-essential \
7+
&& apt-get install -y ssh \
8+
&& apt-get clean \
9+
&& rm -rf /var/lib/apt/lists/*
10+
RUN git clone https://github.com/wesleykendall/mpitutorial \
11+
&& cd mpitutorial/tutorials/mpi-hello-world/code \
12+
&& make \
13+
&& cp mpi_hello_world /home/ \
14+
&& apt-get autoremove -y git \
15+
&& apt-get autoremove -y build-essential \
16+
&& rm -rf "/mpitutorial"
17+
CMD mkdir -p /var/run/sshd; /usr/sbin/sshd;
+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
apiVersion: batch.volcano.sh/v1alpha1
2+
kind: Job
3+
metadata:
4+
name: lm-mpi-job
5+
spec:
6+
minAvailable: 2
7+
schedulerName: kube-batch
8+
plugins:
9+
ssh: []
10+
env: []
11+
tasks:
12+
- replicas: 1
13+
name: mpimaster
14+
policies:
15+
- event: TaskCompleted
16+
action: CompleteJob
17+
template:
18+
spec:
19+
containers:
20+
- command:
21+
- /bin/sh
22+
- -c
23+
- |
24+
MPI_HOST=`cat /etc/volcano/mpiworker.host | tr "\n" ","`;
25+
mkdir -p /var/run/sshd; /usr/sbin/sshd;
26+
mpiexec --allow-run-as-root --host ${MPI_HOST} -np 2 mpi_hello_world > /home/re;
27+
#TODO: use volcano repo instead in the future.
28+
image: tommylike/volcano-example-mpi:0.0.1
29+
name: mpimaster
30+
ports:
31+
- containerPort: 22
32+
name: mpijob-port
33+
workingDir: /home
34+
restartPolicy: OnFailure
35+
- replicas: 2
36+
name: mpiworker
37+
template:
38+
spec:
39+
containers:
40+
- command:
41+
- /bin/sh
42+
- -c
43+
- |
44+
mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
45+
image: tommylike/volcano-example-mpi:0.0.1
46+
name: mpiworker
47+
ports:
48+
- containerPort: 22
49+
name: mpijob-port
50+
workingDir: /home
51+
restartPolicy: OnFailure
52+
---

hack/run-e2e-kind.sh

+6
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ export VK_BIN=${VK_ROOT}/_output/bin
55
export LOG_LEVEL=3
66
export SHOW_VOLCANO_LOGS=${SHOW_VOLCANO_LOGS:-1}
77
export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-1}
8+
#TODO: Use an image from the volcano repo instead in the future
9+
export MPI_EXAMPLE_IMAGE=${MPI_EXAMPLE_IMAGE:-"tommylike/volcano-example-mpi:0.0.1"}
810

911
if [[ "${CLUSTER_NAME}xxx" != "xxx" ]];then
1012
export CLUSTER_CONTEXT="--name ${CLUSTER_NAME}"
@@ -52,10 +54,14 @@ function install-volcano {
5254
chmod 700 get_helm.sh && ./get_helm.sh --version v2.13.0
5355
helm init --service-account tiller --kubeconfig ${KUBECONFIG} --wait
5456

57+
echo "Pulling required docker images"
58+
docker pull ${MPI_EXAMPLE_IMAGE}
59+
5560
echo "Loading docker images into kind cluster"
5661
kind load docker-image ${IMAGE}-controllers:${TAG} ${CLUSTER_CONTEXT}
5762
kind load docker-image ${IMAGE}-scheduler:${TAG} ${CLUSTER_CONTEXT}
5863
kind load docker-image ${IMAGE}-admission:${TAG} ${CLUSTER_CONTEXT}
64+
kind load docker-image ${MPI_EXAMPLE_IMAGE} ${CLUSTER_CONTEXT}
5965

6066
echo "Install volcano plugin into cluster...."
6167
helm plugin install --kubeconfig ${KUBECONFIG} installer/chart/volcano/plugins/gen-admission-secret

pkg/apis/batch/v1alpha1/job.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ type JobStatus struct {
239239
//Current version of job
240240
Version int32 `json:"version,omitempty" protobuf:"bytes,8,opt,name=version"`
241241
// The resources that are controlled by this job, e.g. Service, ConfigMap
242-
ControlledResources map[string]string
242+
// Uses protobuf field number 9; number 8 is already assigned to Version above.
ControlledResources map[string]string `json:"controlledResources,omitempty" protobuf:"bytes,9,opt,name=controlledResources"`
243243
}
244244

245245
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

test/e2e/mpi.go

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
Copyright 2019 The Volcano Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
package e2e
17+
18+
import (
19+
. "github.com/onsi/ginkgo"
20+
. "github.com/onsi/gomega"
21+
22+
vkv1 "volcano.sh/volcano/pkg/apis/batch/v1alpha1"
23+
)
24+
25+
var _ = Describe("MPI E2E Test", func() {
26+
It("will run and complete finally", func() {
27+
context := initTestContext()
28+
defer cleanupTestContext(context)
29+
30+
slot := oneCPU
31+
32+
spec := &jobSpec{
33+
name: "mpi",
34+
policies: []vkv1.LifecyclePolicy{
35+
{
36+
Action: vkv1.CompleteJobAction,
37+
Event: vkv1.TaskCompletedEvent,
38+
},
39+
},
40+
plugins: map[string][]string{
41+
"ssh": {},
42+
"env": {},
43+
},
44+
tasks: []taskSpec{
45+
{
46+
name: "mpimaster",
47+
img: defaultMPIImage,
48+
req: slot,
49+
min: 1,
50+
rep: 1,
51+
workingDir: "/home",
52+
// Need to wait some time for the worker nodes to become ready
53+
command: `sleep 5;
54+
MPI_HOST=` + "`" + `cat /etc/volcano/mpiworker.host | tr "\n" ","` + "`" + `;
55+
mkdir -p /var/run/sshd; /usr/sbin/sshd;
56+
mpiexec --allow-run-as-root --host ${MPI_HOST} -np 2 mpi_hello_world > /home/re`,
57+
},
58+
{
59+
name: "mpiworker",
60+
img: defaultMPIImage,
61+
req: slot,
62+
min: 2,
63+
rep: 2,
64+
workingDir: "/home",
65+
command: "mkdir -p /var/run/sshd; /usr/sbin/sshd -D;",
66+
},
67+
},
68+
}
69+
70+
job := createJob(context, spec)
71+
72+
err := waitJobStates(context, job, []vkv1.JobPhase{
73+
vkv1.Pending, vkv1.Running, vkv1.Completing, vkv1.Completed})
74+
Expect(err).NotTo(HaveOccurred())
75+
})
76+
77+
})

test/e2e/util.go

+13-4
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ const (
5959
masterPriority = "master-pri"
6060
defaultNginxImage = "nginx:1.14"
6161
defaultBusyBoxImage = "busybox:1.24"
62+
//TODO: Use an image from the volcano repo instead in the future
63+
defaultMPIImage = "tommylike/volcano-example-mpi:0.0.1"
6264
)
6365

6466
func cpuResource(request string) v1.ResourceList {
@@ -279,6 +281,7 @@ type taskSpec struct {
279281
min, rep int32
280282
img string
281283
command string
284+
workingDir string
282285
hostport int32
283286
req v1.ResourceList
284287
affinity *v1.Affinity
@@ -359,7 +362,7 @@ func createJobInner(context *context, jobSpec *jobSpec) (*vkv1.Job, error) {
359362
Spec: v1.PodSpec{
360363
SchedulerName: "kube-batch",
361364
RestartPolicy: restartPolicy,
362-
Containers: createContainers(task.img, task.command, task.req, task.hostport),
365+
Containers: createContainers(task.img, task.command, task.workingDir, task.req, task.hostport),
363366
Affinity: task.affinity,
364367
},
365368
},
@@ -553,7 +556,8 @@ func waitJobUnschedulable(ctx *context, job *vkv1.Job) error {
553556
return wait.Poll(10*time.Second, oneMinute, jobUnschedulable(ctx, job, now))
554557
}
555558

556-
func createContainers(img, command string, req v1.ResourceList, hostport int32) []v1.Container {
559+
func createContainers(img, command, workingDir string, req v1.ResourceList, hostport int32) []v1.Container {
560+
var imageRepo []string
557561
container := v1.Container{
558562
Image: img,
559563
ImagePullPolicy: v1.PullIfNotPresent,
@@ -562,10 +566,11 @@ func createContainers(img, command string, req v1.ResourceList, hostport int32)
562566
},
563567
}
564568
if strings.Index(img, ":") < 0 {
565-
container.Name = img
569+
imageRepo = strings.Split(img, "/")
566570
} else {
567-
container.Name = img[:strings.Index(img, ":")]
571+
imageRepo = strings.Split(img[:strings.Index(img, ":")], "/")
568572
}
573+
container.Name = imageRepo[len(imageRepo)-1]
569574

570575
if len(command) > 0 {
571576
container.Command = []string{"/bin/sh"}
@@ -581,6 +586,10 @@ func createContainers(img, command string, req v1.ResourceList, hostport int32)
581586
}
582587
}
583588

589+
if len(workingDir) > 0 {
590+
container.WorkingDir = workingDir
591+
}
592+
584593
return []v1.Container{container}
585594
}
586595

0 commit comments

Comments
 (0)