volcano-sh · k82cn · Apr 26, 2019 · Apr 22, 2019 · Apr 23, 2019
diff --git a/example/integrations/tensorflow/Dockerfile b/example/integrations/tensorflow/Dockerfile
@@ -0,0 +1,15 @@
+# NOTE: the build process may change during development.
+# Image for the Volcano TensorFlow example: TensorFlow (nightly GPU build) plus
+# the tensorflow/benchmarks scripts checked out under /opt/tf-benchmarks.
+FROM python:2.7
+# LABEL replaces the deprecated MAINTAINER instruction.
+LABEL maintainer="volcano <[email protected]>"
+# git is needed only for the clone below; skip recommended packages and remove
+# the apt package lists in the same layer so they do not end up in the image.
+RUN  apt-get update --fix-missing \
+&& apt-get install -y --no-install-recommends git \
+&& apt-get clean \
+&& rm -rf /var/lib/apt/lists/*
+# --no-cache-dir keeps pip's download cache out of the layer.
+RUN pip install --no-cache-dir tf-nightly-gpu \
+&& git clone https://github.com/tensorflow/benchmarks.git /opt/tf-benchmarks
diff --git a/example/integrations/tensorflow/tf-example.yaml b/example/integrations/tensorflow/tf-example.yaml
@@ -0,0 +1,92 @@
+################################################
+#                                              #
+#    Demo for running TF tasks on Volcano      #
+#                                              #
+################################################
+#
+# This YAML demonstrates how to run a TF task via a Volcano Job;
+# the sample program is from the TF benchmarks
+# (https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+# The equivalent command when running locally:
+#
+#   python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
+#   --local_parameter_device=cpu --device=cpu --data_format=NHWC
+#
+# The output from ps or worker pod can be used to identify whether the TF cluster
+# has been correctly configured:
+#
+#    (log from worker pod....)
+#    2019-04-23 11:10:25.554248: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
+#    Initialize GrpcChannelCache for job ps -> {0 -> tensorflow-benchmark-ps-0.tensorflow-benchmark:2222}
+#    2019-04-23 11:10:25.554308: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
+#    Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}
+#
+#    (log from ps pod....)
+#    2019-04-23 11:10:25.552827: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
+#    Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
+#    2019-04-23 11:10:25.552861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
+#    Initialize GrpcChannelCache for job worker -> {0 -> tensorflow-benchmark-worker-0.tensorflow-benchmark:2222}
+#
+# **NOTES**: This example may take about an hour to finish.
+
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  name: tensorflow-benchmark
+spec:
+  schedulerName: kube-batch
+  minAvailable: 2  # equals the 1 ps + 1 worker replicas declared under tasks
+  plugins:
+    # TODO: switch to the `svc` plugin once Volcano supports it.
+    env: []
+  policies:
+    - action: RestartJob
+      event: PodEvicted
+  tasks:
+    - name: ps
+      replicas: 1
+      template:
+        spec:
+          restartPolicy: OnFailure
+          imagePullSecrets:
+            - name: default-secret
+          containers:
+            - name: tensorflow
+              image: volcanosh/example-tf:0.0.1
+              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
+              command:
+                - sh
+                - -c
+                - |
+                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
+                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
+                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
+              ports:
+                - name: tfjob-port
+                  containerPort: 2222  # same port the command appends to every host entry
+              resources: {}
+    - name: worker
+      replicas: 1
+      policies:
+        - action: CompleteJob  # applied on this task's TaskCompleted event
+          event: TaskCompleted
+      template:
+        spec:
+          restartPolicy: OnFailure
+          imagePullSecrets:
+            - name: default-secret
+          containers:
+            - name: tensorflow
+              image: volcanosh/example-tf:0.0.1
+              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
+              command:
+                - sh
+                - -c
+                - |
+                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
+                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
+                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
+              ports:
+                - name: tfjob-port
+                  containerPort: 2222  # same port the command appends to every host entry
+              resources: {}