################################################
# #
# Demo for running TF tasks on Volcano #
# #
################################################
#
# This YAML demonstrates how to run a TF task as a Volcano Job.
# The sample program being run is from the TF benchmarks
# (https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks).
# The equivalent command when running locally is:
#
# python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
# --local_parameter_device=cpu --device=cpu --data_format=NHWC
#
# The output from the ps or worker pod can be used to verify that the TF cluster
# has been configured correctly:
#
# (log from the worker pod ...)
# 2019-04-23 11:10:25.554248: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
# Initialize GrpcChannelCache for job ps -> {0 -> tensorflow-benchmark-ps-0.tensorflow-benchmark:2222}
# 2019-04-23 11:10:25.554308: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
# Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}
#
# (log from the ps pod ...)
# 2019-04-23 11:10:25.552827: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
# Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
# 2019-04-23 11:10:25.552861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
# Initialize GrpcChannelCache for job worker -> {0 -> tensorflow-benchmark-worker-0.tensorflow-benchmark:2222}
#
# **NOTES**: This example may take about an hour to finish.
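#
# A quick way to try it out (a sketch only; the exact pod names follow the
# <job-name>-<task-name>-<index> pattern seen in the logs above):
#
#     kubectl apply -f tf-example.yaml
#     kubectl get pods | grep tensorflow-benchmark
#     kubectl logs -f tensorflow-benchmark-worker-0
#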
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: tensorflow-benchmark
spec:
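  # minAvailable below equals the total replica count (1 ps + 1 worker), so the
  # job is gang-scheduled: it starts only once both pods can be placed.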
  minAvailable: 2
  schedulerName: kube-batch
  plugins:
    # TODO: Upgrade this into `svc` plugin once it's supported in volcano
    env: []
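  # Note: the container commands below rely on the per-task host files
  # (/etc/volcano/ps.host, /etc/volcano/worker.host) and the VK_TASK_INDEX
  # environment variable being available inside the pods.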
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    - replicas: 1
      name: ps
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - command:
                - sh
                - -c
                - |
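                  # Build comma-separated host:port lists from the task host files
                  # under /etc/volcano (one hostname per line): sed appends the TF
                  # port :2222 to each line and tr joins the lines with commas.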
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              image: volcanosh/example-tf:0.0.1
              name: tensorflow
              ports:
                - containerPort: 2222
                  name: tfjob-port
              resources: {}
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
          restartPolicy: OnFailure
    - replicas: 1
      name: worker
      policies:
        - event: TaskCompleted
          action: CompleteJob
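      # Completing the worker task completes the whole job; without this policy
      # the long-running ps replica would keep the job from ever finishing.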
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - command:
                - sh
                - -c
                - |
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              image: volcanosh/example-tf:0.0.1
              name: tensorflow
              ports:
                - containerPort: 2222
                  name: tfjob-port
              resources: {}
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
          restartPolicy: OnFailure