Skip to content

Commit 992ffc2

Browse files
author
Klaus Ma
authored
Merge pull request volcano-sh#98 from TommyLike/feature/support-tf-example
2 parents 76ad1fe + b770297 commit 992ffc2

File tree

2 files changed

+101
-0
lines changed

2 files changed

+101
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# NOTE: the build process may change while this example is under development.
FROM python:2.7

# MAINTAINER is deprecated; record the maintainer as image metadata instead.
LABEL maintainer="volcano <[email protected]>"

# git is needed to fetch the benchmark sources below; ca-certificates is listed
# explicitly so the HTTPS clone keeps working with --no-install-recommends.
# Update, install and clean up in one layer so apt package lists never persist.
RUN apt-get update --fix-missing \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install TensorFlow (nightly build, not pinned to a version) and check out the
# TF benchmark scripts into /opt/tf-benchmarks.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir tf-nightly-gpu \
    && git clone https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
################################################
2+
# #
3+
# Demo for running TF tasks on Volcano #
4+
# #
5+
################################################
6+
#
7+
# This YAML demonstrates how to run a TF task via a Volcano Job;
8+
# the sample program being run comes from the TF benchmarks repository
9+
# (https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
10+
# The equivalent command when running locally:
11+
#
12+
# python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
13+
# --local_parameter_device=cpu --device=cpu --data_format=NHWC
14+
#
15+
# The output from the ps or worker pod can be used to verify that the TF cluster
16+
# has been correctly configured:
17+
#
18+
# (log from worker pod....)
19+
# 2019-04-23 11:10:25.554248: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
20+
# Initialize GrpcChannelCache for job ps -> {0 -> tensorflow-benchmark-ps-0.tensorflow-benchmark:2222}
21+
# 2019-04-23 11:10:25.554308: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
22+
# Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}
23+
#
24+
# (log from ps pod....)
25+
# 2019-04-23 11:10:25.552827: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
26+
# Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
27+
# 2019-04-23 11:10:25.552861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
28+
# Initialize GrpcChannelCache for job worker -> {0 -> tensorflow-benchmark-worker-0.tensorflow-benchmark:2222}
29+
#
30+
# **NOTES**: This example may take about an hour to finish.
31+
32+
# Volcano Job that runs tf_cnn_benchmarks as one parameter-server ("ps") task
# and one "worker" task, both using the same image and listening on port 2222.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: tensorflow-benchmark
spec:
  # Equals the total replica count of the tasks below (1 ps + 1 worker).
  minAvailable: 2
  schedulerName: kube-batch
  plugins:
    #TODO: Upgrade this into `svc` plugin once it's supported in volcano
    env: []
  # Job-level policy: a PodEvicted event triggers the RestartJob action.
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    - replicas: 1
      name: ps
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - command:
                - sh
                - -c
                # Builds comma-separated "<host>:2222" lists from the host files
                # under /etc/volcano, then starts the benchmark as the ps job.
                # NOTE(review): VK_TASK_INDEX is not set in this manifest;
                # presumably it is injected by the `env` plugin - confirm.
                - |
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              image: volcanosh/example-tf:0.0.1
              name: tensorflow
              ports:
                - containerPort: 2222
                  name: tfjob-port
              resources: {}
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
          restartPolicy: OnFailure
    - replicas: 1
      name: worker
      # Task-level policy: a TaskCompleted event on the worker triggers the
      # CompleteJob action.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - command:
                - sh
                - -c
                # Same host-list construction as the ps task; only --job_name
                # differs (worker instead of ps).
                - |
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              image: volcanosh/example-tf:0.0.1
              name: tensorflow
              ports:
                - containerPort: 2222
                  name: tfjob-port
              resources: {}
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
          restartPolicy: OnFailure

0 commit comments

Comments
 (0)