Extend the gRPC server's max unrequested time from the default 30s to 1h for the TF gRPC servers.

tensorflower-gardener · tensorflower-gardener · commit 7e5b21421874 · 2024-10-10T19:24:50.000-07:00
PiperOrigin-RevId: 684657048
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
@@ -32,6 +32,13 @@ GrpcEagerServiceImpl::GrpcEagerServiceImpl(
       local_impl_(env),
       enqueue_streaming_thread_(env_->env, "enqueue_streaming_thread", 1) {
   server_builder->RegisterService(&service_);
+  // gRPC by default will cancel requests that sit in a completion queue for
+  // more than 30s. See
+  // https://github.com/grpc/grpc/blob/e52e48b7ef83feeff56ed0894ce39841ea8bd483/include/grpc/impl/channel_arg_names.h#L106-L111
+  // Extending this to 1 hour for TensorFlow since some graphs may have periods
+  // of heavy load which may cause the server to run into these cancellations.
+  server_builder->AddChannelArgument(
+      "grpc.server_max_unrequested_time_in_server", 3600);
   cq_ = server_builder->AddCompletionQueue();
 }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -54,6 +54,14 @@ class GrpcMasterService : public tsl::AsyncServiceInterface {
         is_shutdown_(false),
         default_session_config_(default_session_config) {
     builder->RegisterService(&master_service_);
+    // gRPC by default will cancel requests that sit in a completion queue for
+    // more than 30s. See
+    // https://github.com/grpc/grpc/blob/e52e48b7ef83feeff56ed0894ce39841ea8bd483/include/grpc/impl/channel_arg_names.h#L106-L111
+    // Extending this to 1 hour for TensorFlow since some graphs may have
+    // periods of heavy load which may cause the server to run into these
+    // cancellations.
+    builder->AddChannelArgument("grpc.server_max_unrequested_time_in_server",
+                                3600);
     cq_ = builder->AddCompletionQueue();
   }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -371,6 +371,14 @@ class GrpcWorkerService : public tsl::AsyncServiceInterface {
                     GrpcWorkerServiceOptions options)
       : is_shutdown_(false) {
     builder->RegisterService(&worker_service_);
+    // gRPC by default will cancel requests that sit in a completion queue for
+    // more than 30s. See
+    // https://github.com/grpc/grpc/blob/e52e48b7ef83feeff56ed0894ce39841ea8bd483/include/grpc/impl/channel_arg_names.h#L106-L111
+    // Extending this to 1 hour for TensorFlow since some graphs may have
+    // periods of heavy load which may cause the server to run into these
+    // cancellations.
+    builder->AddChannelArgument("grpc.server_max_unrequested_time_in_server",
+                                3600);
 
     for (int i = 0; i < options.num_serving_threads; i++) {
       threads_.emplace_back(new GrpcWorkerServiceThread(