################################################
# #
# Demo for running TF tasks on Volcano #
# #
################################################
#
# This YAML demonstrates how to run a TF task as a Volcano Job.
# The sample program being run is from the TF benchmarks
# (https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks).
# The equivalent command when running locally is:
#
# python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
# --local_parameter_device=cpu --device=cpu --data_format=NHWC
#
# The output from the ps or worker pod can be used to verify that the TF cluster
# has been configured correctly:
#
# (log from the worker pod ...)
# 2019-04-23 11:10:25.554248: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
# Initialize GrpcChannelCache for job ps -> {0 -> tensorflow-benchmark-ps-0.tensorflow-benchmark:2222}
# 2019-04-23 11:10:25.554308: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
# Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}
#
# (log from the ps pod ...)
# 2019-04-23 11:10:25.552827: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
# Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
# 2019-04-23 11:10:25.552861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
# Initialize GrpcChannelCache for job worker -> {0 -> tensorflow-benchmark-worker-0.tensorflow-benchmark:2222}
#
# **NOTES**: This example may take about an hour to finish.
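#
# A quick way to try it out (a sketch only; the exact pod names follow the
# <job-name>-<task-name>-<index> pattern seen in the logs above):
#
#     kubectl apply -f tf-example.yaml
#     kubectl get pods | grep tensorflow-benchmark
#     kubectl logs -f tensorflow-benchmark-worker-0
#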
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: tensorflow-benchmark
spec:
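  # minAvailable below equals the total replica count (1 ps + 1 worker), so the
  # job is gang-scheduled: it starts only once both pods can be placed.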
  minAvailable: 2
  schedulerName: kube-batch
  plugins:
    # TODO: Upgrade this into `svc` plugin once it's supported in volcano
    env: []
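  # Note: the container commands below rely on the per-task host files
  # (/etc/volcano/ps.host, /etc/volcano/worker.host) and the VK_TASK_INDEX
  # environment variable being available inside the pods.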
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    - replicas: 1
      name: ps
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - command:
                - sh
                - -c
                - |
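                  # Build comma-separated host:port lists from the task host files
                  # under /etc/volcano (one hostname per line): sed appends the TF
                  # port :2222 to each line and tr joins the lines with commas.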
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              image: volcanosh/example-tf:0.0.1
              name: tensorflow
              ports:
                - containerPort: 2222
                  name: tfjob-port
              resources: {}
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
          restartPolicy: OnFailure
    - replicas: 1
      name: worker
      policies:
        - event: TaskCompleted
          action: CompleteJob
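      # Completing the worker task completes the whole job; without this policy
      # the long-running ps replica would keep the job from ever finishing.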
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - command:
                - sh
                - -c
                - |
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              image: volcanosh/example-tf:0.0.1
              name: tensorflow
              ports:
                - containerPort: 2222
                  name: tfjob-port
              resources: {}
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
          restartPolicy: OnFailure