Skip to content

Commit 7e5b214

Browse files
Extend the gRPC unrequested time-out to 1h for the TF gRPC server.
PiperOrigin-RevId: 684657048
1 parent 92a84a3 commit 7e5b214

File tree

3 files changed

+23
-0
lines changed

3 files changed

+23
-0
lines changed

tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,13 @@ GrpcEagerServiceImpl::GrpcEagerServiceImpl(
3232
local_impl_(env),
3333
enqueue_streaming_thread_(env_->env, "enqueue_streaming_thread", 1) {
3434
server_builder->RegisterService(&service_);
35+
// gRPC by default will cancel requests that sit in a completion queue for
36+
// more than 30s. See
37+
// https://github.com/grpc/grpc/blob/e52e48b7ef83feeff56ed0894ce39841ea8bd483/include/grpc/impl/channel_arg_names.h#L106-L111
38+
// Extending this to 1 hour for TensorFlow since some graphs may have periods
39+
// of heavy load which may cause the server to run into these cancellations.
40+
server_builder->AddChannelArgument(
41+
"grpc.server_max_unrequested_time_in_server", 3600);
3542
cq_ = server_builder->AddCompletionQueue();
3643
}
3744

tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,14 @@ class GrpcMasterService : public tsl::AsyncServiceInterface {
5454
is_shutdown_(false),
5555
default_session_config_(default_session_config) {
5656
builder->RegisterService(&master_service_);
57+
// gRPC by default will cancel requests that sit in a completion queue for
58+
// more than 30s. See
59+
// https://github.com/grpc/grpc/blob/e52e48b7ef83feeff56ed0894ce39841ea8bd483/include/grpc/impl/channel_arg_names.h#L106-L111
60+
// Extending this to 1 hour for TensorFlow since some graphs may have
61+
// periods of heavy load which may cause the server to run into these
62+
// cancellations.
63+
builder->AddChannelArgument("grpc.server_max_unrequested_time_in_server",
64+
3600);
5765
cq_ = builder->AddCompletionQueue();
5866
}
5967

tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,14 @@ class GrpcWorkerService : public tsl::AsyncServiceInterface {
371371
GrpcWorkerServiceOptions options)
372372
: is_shutdown_(false) {
373373
builder->RegisterService(&worker_service_);
374+
// gRPC by default will cancel requests that sit in a completion queue for
375+
// more than 30s. See
376+
// https://github.com/grpc/grpc/blob/e52e48b7ef83feeff56ed0894ce39841ea8bd483/include/grpc/impl/channel_arg_names.h#L106-L111
377+
// Extending this to 1 hour for TensorFlow since some graphs may have
378+
// periods of heavy load which may cause the server to run into these
379+
// cancellations.
380+
builder->AddChannelArgument("grpc.server_max_unrequested_time_in_server",
381+
3600);
374382

375383
for (int i = 0; i < options.num_serving_threads; i++) {
376384
threads_.emplace_back(new GrpcWorkerServiceThread(

0 commit comments

Comments
 (0)