Skip to content

Commit 10fb5f6

Browse files
amishra-ucopybara-github
authored andcommitted
Use failure_rate instead of failure count for circuit breaker
Continuation of #18359 I ran multiple experiment and tried to find optimal failure threshold and failure window interval with different remote_timeout, for healthy remote cache, semi-healthy (overloaded) remote cache and unhealthy remote cache. As I described [here](#18359 (comment)) even with healthy remote cache there was 5-10% circuit trip and we were not getting the best result. Issue related to the failure count: 1. When the remote cache is healthy, builds are fast, and Bazel makes a high number of calls to the buildfarm. As a result, even with a moderate failure rate, the failure count may exceed the threshold. 2. Additionally, write calls, which have a higher probability of failure compared to other calls, are batched immediately after the completion of an action's build. This further increases the chances of breaching the failure threshold within the defined window interval. 3. On the other hand, when the remote cache is unhealthy or semi-healthy, builds are significantly slowed down, and Bazel makes fewer calls to the remote cache. Finding a configuration that works well for both healthy and unhealthy remote caches was not feasible. Therefore, changed the approach to use the failure rate, and easily found a configuration that worked effectively in both scenarios. Closes #18539. PiperOrigin-RevId: 538588379 Change-Id: I64a49eeeb32846d41d54ca3b637ded3085588528
1 parent d289229 commit 10fb5f6

File tree

4 files changed

+80
-25
lines changed

4 files changed

+80
-25
lines changed

src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ public class CircuitBreakerFactory {
2323

2424
public static final ImmutableSet<Class<? extends Exception>> DEFAULT_IGNORED_ERRORS =
2525
ImmutableSet.of(CacheNotFoundException.class);
26+
public static final int DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE = 100;
2627

2728
private CircuitBreakerFactory() {}
2829

@@ -37,7 +38,7 @@ private CircuitBreakerFactory() {}
3738
public static Retrier.CircuitBreaker createCircuitBreaker(final RemoteOptions remoteOptions) {
3839
if (remoteOptions.circuitBreakerStrategy == RemoteOptions.CircuitBreakerStrategy.FAILURE) {
3940
return new FailureCircuitBreaker(
40-
remoteOptions.remoteFailureThreshold,
41+
remoteOptions.remoteFailureRateThreshold,
4142
(int) remoteOptions.remoteFailureWindowInterval.toMillis());
4243
}
4344
return Retrier.ALLOW_ALL_CALLS;

src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,33 +22,40 @@
2222

2323
/**
2424
* The {@link FailureCircuitBreaker} implementation of the {@link Retrier.CircuitBreaker} prevents
25-
* further calls to a remote cache once the number of failures within a given window exceeds a
26-
* specified threshold for a build. In the context of Bazel, a new instance of {@link
27-
* Retrier.CircuitBreaker} is created for each build. Therefore, if the circuit breaker trips during
28-
* a build, the remote cache will be disabled for that build. However, it will be enabled again for
29-
* the next build as a new instance of {@link Retrier.CircuitBreaker} will be created.
25+
* further calls to a remote cache once the failures rate within a given window exceeds a specified
26+
* threshold for a build. In the context of Bazel, a new instance of {@link Retrier.CircuitBreaker}
27+
* is created for each build. Therefore, if the circuit breaker trips during a build, the remote
28+
* cache will be disabled for that build. However, it will be enabled again for the next build as a
29+
* new instance of {@link Retrier.CircuitBreaker} will be created.
3030
*/
3131
public class FailureCircuitBreaker implements Retrier.CircuitBreaker {
3232

3333
private State state;
34+
private final AtomicInteger successes;
3435
private final AtomicInteger failures;
35-
private final int failureThreshold;
36+
private final AtomicInteger ignoredFailures;
37+
private final int failureRateThreshold;
3638
private final int slidingWindowSize;
39+
private final int minCallCountToComputeFailureRate;
3740
private final ScheduledExecutorService scheduledExecutor;
3841
private final ImmutableSet<Class<? extends Exception>> ignoredErrors;
3942

4043
/**
4144
* Creates a {@link FailureCircuitBreaker}.
4245
*
43-
* @param failureThreshold is used to set the number of failures required to trip the circuit
44-
* breaker in given time window.
46+
* @param failureRateThreshold is used to set the min percentage of failure required to trip the
47+
* circuit breaker in given time window.
4548
* @param slidingWindowSize the size of the sliding window in milliseconds to calculate the number
4649
* of failures.
4750
*/
48-
public FailureCircuitBreaker(int failureThreshold, int slidingWindowSize) {
49-
this.failureThreshold = failureThreshold;
51+
public FailureCircuitBreaker(int failureRateThreshold, int slidingWindowSize) {
5052
this.failures = new AtomicInteger(0);
53+
this.successes = new AtomicInteger(0);
54+
this.ignoredFailures = new AtomicInteger(0);
55+
this.failureRateThreshold = failureRateThreshold;
5156
this.slidingWindowSize = slidingWindowSize;
57+
this.minCallCountToComputeFailureRate =
58+
CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE;
5259
this.state = State.ACCEPT_CALLS;
5360
this.scheduledExecutor =
5461
slidingWindowSize > 0 ? Executors.newSingleThreadScheduledExecutor() : null;
@@ -64,20 +71,40 @@ public State state() {
6471
public void recordFailure(Exception e) {
6572
if (!ignoredErrors.contains(e.getClass())) {
6673
int failureCount = failures.incrementAndGet();
74+
int totalCallCount = successes.get() + failureCount + ignoredFailures.get();
6775
if (slidingWindowSize > 0) {
6876
var unused =
6977
scheduledExecutor.schedule(
7078
failures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
7179
}
80+
81+
if (totalCallCount < minCallCountToComputeFailureRate) {
82+
// The remote call count is below the threshold required to calculate the failure rate.
83+
return;
84+
}
85+
double failureRate = (failureCount * 100.0) / totalCallCount;
86+
7287
// Since the state can only be changed to the open state, synchronization is not required.
73-
if (failureCount > this.failureThreshold) {
88+
if (failureRate > this.failureRateThreshold) {
7489
this.state = State.REJECT_CALLS;
7590
}
91+
} else {
92+
ignoredFailures.incrementAndGet();
93+
if (slidingWindowSize > 0) {
94+
var unused =
95+
scheduledExecutor.schedule(
96+
ignoredFailures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
97+
}
7698
}
7799
}
78100

79101
@Override
80102
public void recordSuccess() {
81-
// do nothing, implement if we need to set threshold on failure rate instead of count.
103+
successes.incrementAndGet();
104+
if (slidingWindowSize > 0) {
105+
var unused =
106+
scheduledExecutor.schedule(
107+
successes::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
108+
}
82109
}
83110
}

src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -666,15 +666,16 @@ public RemoteOutputsStrategyConverter() {
666666
public CircuitBreakerStrategy circuitBreakerStrategy;
667667

668668
@Option(
669-
name = "experimental_remote_failure_threshold",
670-
defaultValue = "100",
669+
name = "experimental_remote_failure_rate_threshold",
670+
defaultValue = "10",
671671
documentationCategory = OptionDocumentationCategory.REMOTE,
672672
effectTags = {OptionEffectTag.EXECUTION},
673+
converter = Converters.PercentageConverter.class,
673674
help =
674-
"Sets the allowed number of failures in a specific time window after which it stops"
675-
+ " calling to the remote cache/executor. By default the value is 100. Setting this"
676-
+ " to 0 or negative means no limitation.")
677-
public int remoteFailureThreshold;
675+
"Sets the allowed number of failure rate in percentage for a specific time window after"
676+
+ " which it stops calling to the remote cache/executor. By default the value is 10."
677+
+ " Setting this to 0 means no limitation.")
678+
public int remoteFailureRateThreshold;
678679

679680
@Option(
680681
name = "experimental_remote_failure_window_interval",
@@ -683,7 +684,7 @@ public RemoteOutputsStrategyConverter() {
683684
effectTags = {OptionEffectTag.EXECUTION},
684685
converter = RemoteDurationConverter.class,
685686
help =
686-
"The interval in which the failure count of the remote requests are computed. On zero or"
687+
"The interval in which the failure rate of the remote requests are computed. On zero or"
687688
+ " negative value the failure duration is computed the whole duration of the"
688689
+ " execution.Following units can be used: Days (d), hours (h), minutes (m), seconds"
689690
+ " (s), and milliseconds (ms). If the unit is omitted, the value is interpreted as"

src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.ArrayList;
2222
import java.util.Collections;
2323
import java.util.List;
24+
import java.util.stream.IntStream;
2425
import org.junit.Test;
2526
import org.junit.runner.RunWith;
2627
import org.junit.runners.JUnit4;
@@ -29,17 +30,17 @@
2930
public class FailureCircuitBreakerTest {
3031

3132
@Test
32-
public void testRecordFailure() throws InterruptedException {
33-
final int failureThreshold = 10;
33+
public void testRecordFailure_withIgnoredErrors() throws InterruptedException {
34+
final int failureRateThreshold = 10;
3435
final int windowInterval = 100;
3536
FailureCircuitBreaker failureCircuitBreaker =
36-
new FailureCircuitBreaker(failureThreshold, windowInterval);
37+
new FailureCircuitBreaker(failureRateThreshold, windowInterval);
3738

3839
List<Exception> listOfExceptionThrownOnFailure = new ArrayList<>();
39-
for (int index = 0; index < failureThreshold; index++) {
40+
for (int index = 0; index < failureRateThreshold; index++) {
4041
listOfExceptionThrownOnFailure.add(new Exception());
4142
}
42-
for (int index = 0; index < failureThreshold * 9; index++) {
43+
for (int index = 0; index < failureRateThreshold * 9; index++) {
4344
listOfExceptionThrownOnFailure.add(new CacheNotFoundException(Digest.newBuilder().build()));
4445
}
4546

@@ -65,4 +66,29 @@ public void testRecordFailure() throws InterruptedException {
6566
failureCircuitBreaker.recordFailure(new Exception());
6667
assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS);
6768
}
69+
70+
@Test
71+
public void testRecordFailure_minCallCriteriaNotMet() throws InterruptedException {
72+
final int failureRateThreshold = 10;
73+
final int windowInterval = 100;
74+
final int minCallToComputeFailure =
75+
CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE;
76+
FailureCircuitBreaker failureCircuitBreaker =
77+
new FailureCircuitBreaker(failureRateThreshold, windowInterval);
78+
79+
// make half failure call, half success call and number of total call less than
80+
// minCallToComputeFailure.
81+
IntStream.range(0, minCallToComputeFailure >> 1)
82+
.parallel()
83+
.forEach(i -> failureCircuitBreaker.recordFailure(new Exception()));
84+
IntStream.range(0, minCallToComputeFailure >> 1)
85+
.parallel()
86+
.forEach(i -> failureCircuitBreaker.recordSuccess());
87+
assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS);
88+
89+
// Sleep for less than windowInterval.
90+
Thread.sleep(windowInterval - 20);
91+
failureCircuitBreaker.recordFailure(new Exception());
92+
assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS);
93+
}
6894
}

0 commit comments

Comments
 (0)