Commit 414bd11

Use failure_rate instead of failure count for circuit breaker
Continuation of bazelbuild#18359. I ran multiple experiments to find an optimal failure threshold and failure window interval with different values of remote_timeout, against a healthy remote cache, a semi-healthy (overloaded) remote cache, and an unhealthy remote cache. As described [here](bazelbuild#18359 (comment)), even with a healthy remote cache the circuit tripped in 5-10% of builds, so we were not getting the best results.

Issues with the failure count approach:

1. When the remote cache is healthy, builds are fast and Bazel makes a large number of calls to the buildfarm. As a result, even with a moderate failure rate, the failure count can exceed the threshold.
2. Write calls, which have a higher probability of failure than other calls, are batched immediately after an action finishes building. This further increases the chance of breaching the failure threshold within the defined window interval.
3. On the other hand, when the remote cache is unhealthy or semi-healthy, builds slow down significantly and Bazel makes far fewer calls to the remote cache.

Finding a configuration that worked well for both healthy and unhealthy remote caches was not feasible. Therefore, the approach was changed to use the failure rate instead, and a configuration that works effectively in both scenarios was easy to find.

Closes bazelbuild#18539.

PiperOrigin-RevId: 538588379
Change-Id: I64a49eeeb32846d41d54ca3b637ded3085588528
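
To illustrate the problem with an absolute failure count, here is a minimal back-of-the-envelope sketch. The call and failure numbers are invented for illustration; they are not measurements from the experiments above.

// Illustrative numbers only; not measurements from the experiments above.
public class FailureCountVsRateSketch {
  public static void main(String[] args) {
    // Healthy cache: fast builds issue many remote calls, so even a low
    // error rate accumulates a large absolute failure count.
    int healthyCalls = 10_000;
    int healthyFailures = 200; // 2% failure rate

    // Unhealthy cache: slow builds issue far fewer calls.
    int unhealthyCalls = 500;
    int unhealthyFailures = 150; // 30% failure rate

    int countThreshold = 100; // old default: trips in both cases
    int rateThreshold = 10; // new default (percent): trips only the unhealthy case

    report("healthy", healthyCalls, healthyFailures, countThreshold, rateThreshold);
    report("unhealthy", unhealthyCalls, unhealthyFailures, countThreshold, rateThreshold);
  }

  static void report(String label, int calls, int failures, int countThreshold, int rateThreshold) {
    double rate = failures * 100.0 / calls;
    System.out.printf(
        "%s: %d failures (count trips: %b), %.1f%% failure rate (rate trips: %b)%n",
        label, failures, failures > countThreshold, rate, rate > rateThreshold);
  }
}

With a count threshold the healthy build trips the breaker despite a 2% error rate, while with a rate threshold only the genuinely unhealthy build does.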

7 files changed: 109 additions & 96 deletions


src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 public class RemoteRetrier extends Retrier {
 
   @Nullable
-  public static Status fromException(Exception e) {
+  private static Status fromException(Exception e) {
     for (Throwable cause = e; cause != null; cause = cause.getCause()) {
       if (cause instanceof StatusRuntimeException) {
         return ((StatusRuntimeException) cause).getStatus();

src/main/java/com/google/devtools/build/lib/remote/Retrier.java

Lines changed: 14 additions & 9 deletions
@@ -100,7 +100,7 @@ enum State {
     State state();
 
     /** Called after an execution failed. */
-    void recordFailure(Exception e);
+    void recordFailure();
 
     /** Called after an execution succeeded. */
     void recordSuccess();
@@ -130,7 +130,7 @@ public State state() {
         }
 
         @Override
-        public void recordFailure(Exception e) {}
+        public void recordFailure() {}
 
         @Override
         public void recordSuccess() {}
@@ -245,12 +245,14 @@ public <T> T execute(Callable<T> call, Backoff backoff) throws Exception {
         circuitBreaker.recordSuccess();
         return r;
       } catch (Exception e) {
-        circuitBreaker.recordFailure(e);
         Throwables.throwIfInstanceOf(e, InterruptedException.class);
-        if (State.TRIAL_CALL.equals(circuitState)) {
+        if (!shouldRetry.test(e)) {
+          // A non-retriable error doesn't represent server failure.
+          circuitBreaker.recordSuccess();
           throw e;
         }
-        if (!shouldRetry.test(e)) {
+        circuitBreaker.recordFailure();
+        if (State.TRIAL_CALL.equals(circuitState)) {
           throw e;
         }
         final long delayMillis = backoff.nextDelayMillis(e);
@@ -297,11 +299,11 @@ public <T> ListenableFuture<T> executeAsync(AsyncCallable<T> call, Backoff backo
 
   private <T> ListenableFuture<T> onExecuteAsyncFailure(
       Exception t, AsyncCallable<T> call, Backoff backoff, State circuitState) {
-    circuitBreaker.recordFailure(t);
-    if (circuitState.equals(State.TRIAL_CALL)) {
-      return Futures.immediateFailedFuture(t);
-    }
     if (isRetriable(t)) {
+      circuitBreaker.recordFailure();
+      if (circuitState.equals(State.TRIAL_CALL)) {
+        return Futures.immediateFailedFuture(t);
+      }
       long waitMillis = backoff.nextDelayMillis(t);
       if (waitMillis >= 0) {
         try {
@@ -315,6 +317,9 @@ private <T> ListenableFuture<T> onExecuteAsyncFailure(
           return Futures.immediateFailedFuture(t);
         }
       } else {
+        // gRPC Errors NOT_FOUND, OUT_OF_RANGE, ALREADY_EXISTS etc. are non-retriable error, and they don't represent an
+        // issue in Server. So treating these errors as successful api call.
+        circuitBreaker.recordSuccess();
         return Futures.immediateFailedFuture(t);
       }
     }
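
The net effect of the Retrier changes is that only retriable errors count against the circuit breaker, while non-retriable errors are recorded as successes. A simplified, self-contained sketch of that ordering follows; the types and names are illustrative stand-ins, not the actual Bazel classes.

import java.util.concurrent.Callable;
import java.util.function.Predicate;

// Illustrative stand-in for the recording half of Retrier.CircuitBreaker.
interface BreakerSketch {
  void recordSuccess();

  void recordFailure();
}

class RetryAccountingSketch {
  // Mirrors the new ordering in Retrier.execute(): classify the error first,
  // record it second, and only then decide whether a trial call must give up.
  static <T> T callOnce(Callable<T> call, Predicate<Exception> shouldRetry, BreakerSketch breaker)
      throws Exception {
    try {
      T result = call.call();
      breaker.recordSuccess();
      return result;
    } catch (Exception e) {
      if (!shouldRetry.test(e)) {
        // A non-retriable error (e.g. NOT_FOUND) doesn't indicate a server
        // problem, so it counts as a success for the failure rate.
        breaker.recordSuccess();
        throw e;
      }
      // Only retriable errors contribute to the failure rate.
      breaker.recordFailure();
      throw e; // the real Retrier would back off and retry here
    }
  }
}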

src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java

Lines changed: 2 additions & 22 deletions
@@ -14,31 +14,11 @@
 package com.google.devtools.build.lib.remote.circuitbreaker;
 
 import com.google.devtools.build.lib.remote.Retrier;
-import com.google.devtools.build.lib.remote.common.CacheNotFoundException;
 import com.google.devtools.build.lib.remote.options.RemoteOptions;
-import io.grpc.Status;
-import java.util.function.Predicate;
-
-import static com.google.devtools.build.lib.remote.RemoteRetrier.fromException;
-
 
 /** Factory for {@link Retrier.CircuitBreaker} */
 public class CircuitBreakerFactory {
-  public static final Predicate<? super Exception> DEFAULT_IGNORED_ERRORS =
-      e -> {
-        Status s = fromException(e);
-        if (s == null) {
-          return e.getClass() == CacheNotFoundException.class;
-        }
-        switch (s.getCode()) {
-          case NOT_FOUND:
-          case OUT_OF_RANGE:
-          case ALREADY_EXISTS:
-            return true;
-          default:
-            return false;
-        }
-      };
+  public static final int DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE = 100;
 
   private CircuitBreakerFactory() {}
 
@@ -53,7 +33,7 @@ private CircuitBreakerFactory() {}
   public static Retrier.CircuitBreaker createCircuitBreaker(final RemoteOptions remoteOptions) {
     if (remoteOptions.circuitBreakerStrategy == RemoteOptions.CircuitBreakerStrategy.FAILURE) {
       return new FailureCircuitBreaker(
-          remoteOptions.remoteFailureThreshold,
+          remoteOptions.remoteFailureRateThreshold,
           (int) remoteOptions.remoteFailureWindowInterval.toMillis());
     }
     return Retrier.ALLOW_ALL_CALLS;
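
A hedged sketch of how the factory might be driven from the options: the field names below match this diff, but the Options.getDefaults(...) setup and the java.time.Duration type of the window interval are assumptions about the surrounding Bazel code rather than facts from this commit.

import com.google.devtools.build.lib.remote.Retrier;
import com.google.devtools.build.lib.remote.circuitbreaker.CircuitBreakerFactory;
import com.google.devtools.build.lib.remote.options.RemoteOptions;
import com.google.devtools.common.options.Options;
import java.time.Duration;

class CircuitBreakerFactorySketch {
  static Retrier.CircuitBreaker build() {
    // Assumption: defaults can be obtained through Bazel's options machinery.
    RemoteOptions options = Options.getDefaults(RemoteOptions.class);
    options.circuitBreakerStrategy = RemoteOptions.CircuitBreakerStrategy.FAILURE;
    options.remoteFailureRateThreshold = 10; // percent, per the new flag default
    // Assumption: the window interval is a java.time.Duration (the diff calls toMillis() on it).
    options.remoteFailureWindowInterval = Duration.ofSeconds(60);
    return CircuitBreakerFactory.createCircuitBreaker(options);
  }
}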

src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java

Lines changed: 35 additions & 26 deletions
@@ -18,41 +18,43 @@
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.function.Predicate;
 
 /**
  * The {@link FailureCircuitBreaker} implementation of the {@link Retrier.CircuitBreaker} prevents
- * further calls to a remote cache once the number of failures within a given window exceeds a
- * specified threshold for a build. In the context of Bazel, a new instance of {@link
- * Retrier.CircuitBreaker} is created for each build. Therefore, if the circuit breaker trips during
- * a build, the remote cache will be disabled for that build. However, it will be enabled again for
- * the next build as a new instance of {@link Retrier.CircuitBreaker} will be created.
+ * further calls to a remote cache once the failures rate within a given window exceeds a specified
+ * threshold for a build. In the context of Bazel, a new instance of {@link Retrier.CircuitBreaker}
+ * is created for each build. Therefore, if the circuit breaker trips during a build, the remote
+ * cache will be disabled for that build. However, it will be enabled again for the next build as a
+ * new instance of {@link Retrier.CircuitBreaker} will be created.
 */
 public class FailureCircuitBreaker implements Retrier.CircuitBreaker {
 
   private State state;
+  private final AtomicInteger successes;
   private final AtomicInteger failures;
-  private final int failureThreshold;
+  private final int failureRateThreshold;
   private final int slidingWindowSize;
+  private final int minCallCountToComputeFailureRate;
   private final ScheduledExecutorService scheduledExecutor;
-  private final Predicate<? super Exception> ignoredErrors;
 
   /**
    * Creates a {@link FailureCircuitBreaker}.
    *
-   * @param failureThreshold is used to set the number of failures required to trip the circuit
-   *     breaker in given time window.
+   * @param failureRateThreshold is used to set the min percentage of failure required to trip the
+   *     circuit breaker in given time window.
   * @param slidingWindowSize the size of the sliding window in milliseconds to calculate the number
   *     of failures.
   */
-  public FailureCircuitBreaker(int failureThreshold, int slidingWindowSize) {
-    this.failureThreshold = failureThreshold;
+  public FailureCircuitBreaker(int failureRateThreshold, int slidingWindowSize) {
     this.failures = new AtomicInteger(0);
+    this.successes = new AtomicInteger(0);
+    this.failureRateThreshold = failureRateThreshold;
     this.slidingWindowSize = slidingWindowSize;
+    this.minCallCountToComputeFailureRate =
+        CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE;
     this.state = State.ACCEPT_CALLS;
     this.scheduledExecutor =
         slidingWindowSize > 0 ? Executors.newSingleThreadScheduledExecutor() : null;
-    this.ignoredErrors = CircuitBreakerFactory.DEFAULT_IGNORED_ERRORS;
   }
 
   @Override
@@ -61,23 +63,30 @@ public State state() {
   }
 
   @Override
-  public void recordFailure(Exception e) {
-    if (!ignoredErrors.test(e)) {
-      int failureCount = failures.incrementAndGet();
-      if (slidingWindowSize > 0) {
-        var unused =
-            scheduledExecutor.schedule(
-                failures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
-      }
-      // Since the state can only be changed to the open state, synchronization is not required.
-      if (failureCount > this.failureThreshold) {
-        this.state = State.REJECT_CALLS;
-      }
+  public void recordFailure() {
+    int failureCount = failures.incrementAndGet();
+    int totalCallCount = successes.get() + failureCount;
+    if (slidingWindowSize > 0) {
+      var unused = scheduledExecutor.schedule(failures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
+    }
+
+    if (totalCallCount < minCallCountToComputeFailureRate) {
+      // The remote call count is below the threshold required to calculate the failure rate.
+      return;
+    }
+    double failureRate = (failureCount * 100.0) / totalCallCount;
+
+    // Since the state can only be changed to the open state, synchronization is not required.
+    if (failureRate > this.failureRateThreshold) {
+      this.state = State.REJECT_CALLS;
     }
   }
 
   @Override
   public void recordSuccess() {
-    // do nothing, implement if we need to set threshold on failure rate instead of count.
+    successes.incrementAndGet();
+    if (slidingWindowSize > 0) {
+      var unused = scheduledExecutor.schedule(successes::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
+    }
   }
 }
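
To make the new thresholds concrete, here is a small sketch of the breaker's behavior with the constructor arguments shown above: a 10% rate threshold and a sliding window of 0, which disables the expiry timer. The 100-call minimum comes from DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE.

import com.google.devtools.build.lib.remote.circuitbreaker.FailureCircuitBreaker;

class FailureRateTripSketch {
  public static void main(String[] args) {
    // 10% failure-rate threshold; sliding window of 0 means recorded calls never expire.
    FailureCircuitBreaker breaker = new FailureCircuitBreaker(10, 0);

    for (int i = 0; i < 89; i++) {
      breaker.recordSuccess();
    }
    for (int i = 0; i < 11; i++) {
      // The first ten failures change nothing: fewer than 100 calls have been recorded.
      breaker.recordFailure();
    }

    // 11 failures out of 100 calls is an 11% rate, above the 10% threshold,
    // and the 100-call minimum is met, so further calls are rejected.
    System.out.println(breaker.state()); // expected: REJECT_CALLS
  }
}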

src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java

Lines changed: 8 additions & 7 deletions
@@ -666,15 +666,16 @@ public RemoteOutputsStrategyConverter() {
   public CircuitBreakerStrategy circuitBreakerStrategy;
 
   @Option(
-      name = "experimental_remote_failure_threshold",
-      defaultValue = "100",
+      name = "experimental_remote_failure_rate_threshold",
+      defaultValue = "10",
       documentationCategory = OptionDocumentationCategory.REMOTE,
       effectTags = {OptionEffectTag.EXECUTION},
+      converter = Converters.PercentageConverter.class,
       help =
-          "Sets the allowed number of failures in a specific time window after which it stops"
-              + " calling to the remote cache/executor. By default the value is 100. Setting this"
-              + " to 0 or negative means no limitation.")
-  public int remoteFailureThreshold;
+          "Sets the allowed number of failure rate in percentage for a specific time window after"
+              + " which it stops calling to the remote cache/executor. By default the value is 10."
+              + " Setting this to 0 means no limitation.")
+  public int remoteFailureRateThreshold;
 
   @Option(
       name = "experimental_remote_failure_window_interval",
@@ -683,7 +684,7 @@ public RemoteOutputsStrategyConverter() {
       effectTags = {OptionEffectTag.EXECUTION},
       converter = RemoteDurationConverter.class,
       help =
-          "The interval in which the failure count of the remote requests are computed. On zero or"
+          "The interval in which the failure rate of the remote requests are computed. On zero or"
              + " negative value the failure duration is computed the whole duration of the"
              + " execution.Following units can be used: Days (d), hours (h), minutes (m), seconds"
              + " (s), and milliseconds (ms). If the unit is omitted, the value is interpreted as"

src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java

Lines changed: 8 additions & 5 deletions
@@ -38,6 +38,9 @@
 import java.util.function.Predicate;
 import java.util.function.Supplier;
 import javax.annotation.concurrent.ThreadSafe;
+
+import io.grpc.Status;
+import io.grpc.StatusRuntimeException;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -94,7 +97,7 @@ public void retryShouldWork_failure() throws Exception {
     assertThat(e).hasMessageThat().isEqualTo("call failed");
 
     assertThat(numCalls.get()).isEqualTo(3);
-    verify(alwaysOpen, times(3)).recordFailure(any(Exception.class));
+    verify(alwaysOpen, times(3)).recordFailure();
     verify(alwaysOpen, never()).recordSuccess();
   }
 
@@ -118,8 +121,8 @@ public void retryShouldWorkNoRetries_failure() throws Exception {
     assertThat(e).hasMessageThat().isEqualTo("call failed");
 
     assertThat(numCalls.get()).isEqualTo(1);
-    verify(alwaysOpen, times(1)).recordFailure(e);
-    verify(alwaysOpen, never()).recordSuccess();
+    verify(alwaysOpen, never()).recordFailure();
+    verify(alwaysOpen, times(1)).recordSuccess();
   }
 
   @Test
@@ -139,7 +142,7 @@ public void retryShouldWork_success() throws Exception {
         });
     assertThat(val).isEqualTo(1);
 
-    verify(alwaysOpen, times(2)).recordFailure(any(Exception.class));
+    verify(alwaysOpen, times(2)).recordFailure();
     verify(alwaysOpen, times(1)).recordSuccess();
   }
 
@@ -351,7 +354,7 @@ public synchronized State state() {
     }
 
     @Override
-    public synchronized void recordFailure(Exception e) {
+    public synchronized void recordFailure() {
      consecutiveFailures++;
      if (consecutiveFailures >= maxConsecutiveFailures) {
        state = State.REJECT_CALLS;
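
The new io.grpc imports above suggest the tests now construct gRPC status errors directly. As a hedged illustration (not the actual test code), a non-retriable error such as NOT_FOUND can be built like this:

import io.grpc.Status;
import io.grpc.StatusRuntimeException;

class NonRetriableErrorSketch {
  // NOT_FOUND is one of the non-retriable codes the new Retrier logic records
  // as a success rather than a failure.
  static StatusRuntimeException notFound() {
    return new StatusRuntimeException(Status.NOT_FOUND);
  }
}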
