
Commit e802842

[6.3.0] Use failure_rate instead of failure count for circuit breaker (#18559)
* feat: Implement failure circuit breaker

  Copy of #18120: I accidentally closed #18120 during a rebase and don't have permission to reopen it.

  We have noticed that any problem with the remote cache has a detrimental effect on build times. On investigation we found that the circuit breaker interface was left unimplemented. To address this, this change implements a failure circuit breaker, adding three new Bazel flags: 1) experimental_circuitbreaker_strategy, 2) experimental_remote_failure_threshold, and 3) experimental_remote_failure_window.

  This implementation provides the failure strategy for the circuit breaker and uses the failure count to trip the circuit.

  Reasoning behind using failure count instead of failure rate: measuring the failure rate also requires the success count, and both counts would need to be AtomicIntegers since they are modified concurrently by multiple threads. Even though getAndIncrement is a very lightweight operation, at very high request rates it might contribute to latency.

  Reasoning behind using a failure circuit breaker: a new instance of Retrier.CircuitBreaker is created for each build. Therefore, if the circuit breaker trips during a build, the remote cache is disabled for that build; it is enabled again for the next build, when a new Retrier.CircuitBreaker instance is created. If needed, a cool-down strategy could be added later, e.g. failure_and_cool_down_strategy.

  Closes #18136. Closes #18359.

  PiperOrigin-RevId: 536349954
  Change-Id: I5e1c57d4ad0ce07ddc4808bf1f327bc5df6ce704

* remove target included in cherry-pick by mistake

* Use failure_rate instead of failure count for circuit breaker

---------

Co-authored-by: Ian (Hee) Cha <[email protected]>
1 parent ea4ad30 commit e802842
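
The gist of the change: recordSuccess() and recordFailure() now both count calls, and the breaker trips only once a minimum number of calls has been observed and the failure percentage exceeds the configured threshold. Below is a minimal standalone sketch of that trip condition, using the defaults introduced in the diffs that follow (10% threshold, 100-call minimum). The class and method names are illustrative, not Bazel's, and ignored error types are omitted for brevity (the real implementation also counts them toward the total).

// Illustrative sketch only -- not the Bazel class.
final class FailureRateTripSketch {
  static final int MIN_CALLS_TO_COMPUTE_RATE = 100; // mirrors DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE
  static final int FAILURE_RATE_THRESHOLD_PERCENT = 10; // mirrors the new flag's default

  static boolean shouldTrip(int failures, int successes) {
    int totalCalls = failures + successes;
    if (totalCalls < MIN_CALLS_TO_COMPUTE_RATE) {
      return false; // too few calls to compute a meaningful rate
    }
    double failureRatePercent = (failures * 100.0) / totalCalls;
    return failureRatePercent > FAILURE_RATE_THRESHOLD_PERCENT;
  }

  public static void main(String[] args) {
    System.out.println(shouldTrip(50, 40)); // false: only 90 calls, below the minimum
    System.out.println(shouldTrip(9, 91));  // false: 9% is not above the 10% threshold
    System.out.println(shouldTrip(11, 95)); // true: 11 of 106 calls is roughly 10.4%
  }
}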

File tree

4 files changed: +76 -34 lines changed

src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java

Lines changed: 2 additions & 1 deletion
@@ -23,6 +23,7 @@ public class CircuitBreakerFactory {
 
   public static final ImmutableSet<Class<? extends Exception>> DEFAULT_IGNORED_ERRORS =
       ImmutableSet.of(CacheNotFoundException.class);
+  public static final int DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE = 100;
 
   private CircuitBreakerFactory() {}
 
@@ -37,7 +38,7 @@ private CircuitBreakerFactory() {}
   public static Retrier.CircuitBreaker createCircuitBreaker(final RemoteOptions remoteOptions) {
     if (remoteOptions.circuitBreakerStrategy == RemoteOptions.CircuitBreakerStrategy.FAILURE) {
       return new FailureCircuitBreaker(
-          remoteOptions.remoteFailureThreshold,
+          remoteOptions.remoteFailureRateThreshold,
           (int) remoteOptions.remoteFailureWindowInterval.toMillis());
     }
     return Retrier.ALLOW_ALL_CALLS;
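
For context, a hedged sketch of how a caller might consult the breaker this factory returns. It assumes the Bazel classes are on the classpath; only createCircuitBreaker(), state(), recordSuccess(), and recordFailure() appear in this commit, and the wrapper itself is illustrative glue, not how Bazel's Retrier is actually wired.

// Hypothetical helper -- illustrative glue, not part of this commit.
final class RemoteCallGuardSketch {
  static <T> T callRemote(Retrier.CircuitBreaker breaker, java.util.concurrent.Callable<T> remoteCall)
      throws Exception {
    if (breaker.state() == Retrier.CircuitBreaker.State.REJECT_CALLS) {
      // The breaker tripped earlier in this build; skip the remote cache/executor.
      throw new IllegalStateException("circuit breaker open for this build");
    }
    try {
      T result = remoteCall.call();
      breaker.recordSuccess();
      return result;
    } catch (Exception e) {
      breaker.recordFailure(e);
      throw e;
    }
  }
}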

src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java

Lines changed: 35 additions & 15 deletions
@@ -21,34 +21,38 @@
 import java.util.concurrent.atomic.AtomicInteger;
 
 /**
- * The {@link FailureCircuitBreaker} implementation of the {@link Retrier.CircuitBreaker} prevents
- * further calls to a remote cache once the number of failures within a given window exceeds a
- * specified threshold for a build. In the context of Bazel, a new instance of {@link
- * Retrier.CircuitBreaker} is created for each build. Therefore, if the circuit breaker trips during
- * a build, the remote cache will be disabled for that build. However, it will be enabled again for
- * the next build as a new instance of {@link Retrier.CircuitBreaker} will be created.
+ * The {@link FailureCircuitBreaker} implementation of the {@link Retrier.CircuitBreaker} prevents further calls to
+ * a remote cache once the failures rate within a given window exceeds a specified threshold for a build.
+ * In the context of Bazel, a new instance of {@link Retrier.CircuitBreaker} is created for each build. Therefore, if
+ * the circuit breaker trips during a build, the remote cache will be disabled for that build. However, it will be
+ * enabled again for the next build as a new instance of {@link Retrier.CircuitBreaker} will be created.
  */
 public class FailureCircuitBreaker implements Retrier.CircuitBreaker {
 
   private State state;
+  private final AtomicInteger successes;
   private final AtomicInteger failures;
-  private final int failureThreshold;
+  private final AtomicInteger ignoredFailures;
+  private final int failureRateThreshold;
   private final int slidingWindowSize;
+  private final int minCallCountToComputeFailureRate;
   private final ScheduledExecutorService scheduledExecutor;
   private final ImmutableSet<Class<? extends Exception>> ignoredErrors;
 
   /**
    * Creates a {@link FailureCircuitBreaker}.
    *
-   * @param failureThreshold is used to set the number of failures required to trip the circuit
-   *     breaker in given time window.
-   * @param slidingWindowSize the size of the sliding window in milliseconds to calculate the number
-   *     of failures.
+   * @param failureRateThreshold is used to set the min percentage of failure required to trip the circuit breaker in
+   *     given time window.
+   * @param slidingWindowSize the size of the sliding window in milliseconds to calculate the number of failures.
    */
-  public FailureCircuitBreaker(int failureThreshold, int slidingWindowSize) {
-    this.failureThreshold = failureThreshold;
+  public FailureCircuitBreaker(int failureRateThreshold, int slidingWindowSize) {
     this.failures = new AtomicInteger(0);
+    this.successes = new AtomicInteger(0);
+    this.ignoredFailures = new AtomicInteger(0);
+    this.failureRateThreshold = failureRateThreshold;
     this.slidingWindowSize = slidingWindowSize;
+    this.minCallCountToComputeFailureRate = CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE;
     this.state = State.ACCEPT_CALLS;
     this.scheduledExecutor =
         slidingWindowSize > 0 ? Executors.newSingleThreadScheduledExecutor() : null;

@@ -64,20 +68,36 @@ public State state() {
   public void recordFailure(Exception e) {
     if (!ignoredErrors.contains(e.getClass())) {
       int failureCount = failures.incrementAndGet();
+      int totalCallCount = successes.get() + failureCount + ignoredFailures.get();
       if (slidingWindowSize > 0) {
         var unused =
             scheduledExecutor.schedule(
                 failures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
       }
+
+      if (totalCallCount < minCallCountToComputeFailureRate) {
+        // The remote call count is below the threshold required to calculate the failure rate.
+        return;
+      }
+      double failureRate = (failureCount * 100.0) / totalCallCount;
+
       // Since the state can only be changed to the open state, synchronization is not required.
-      if (failureCount > this.failureThreshold) {
+      if (failureRate > this.failureRateThreshold) {
         this.state = State.REJECT_CALLS;
       }
+    } else {
+      ignoredFailures.incrementAndGet();
+      if (slidingWindowSize > 0) {
+        scheduledExecutor.schedule(ignoredFailures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
+      }
     }
   }
 
   @Override
   public void recordSuccess() {
-    // do nothing, implement if we need to set threshold on failure rate instead of count.
+    successes.incrementAndGet();
+    if (slidingWindowSize > 0) {
+      scheduledExecutor.schedule(successes::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS);
+    }
   }
 }
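
To make the new trip condition concrete, here is a short walk-through under the defaults above (threshold 10, a window long enough that no scheduled decrements fire during the example). This is not a test from the commit, just illustrative arithmetic placed in a scratch method inside the Bazel tree.

void walkThroughTripCondition() {
  FailureCircuitBreaker breaker = new FailureCircuitBreaker(10, 60_000);

  for (int i = 0; i < 95; i++) {
    breaker.recordSuccess();
  }
  for (int i = 0; i < 10; i++) {
    breaker.recordFailure(new Exception());
  }
  // 10 failures out of 105 calls is roughly 9.5%, not above the 10% threshold:
  // state() is still ACCEPT_CALLS.

  breaker.recordFailure(new Exception());
  // 11 failures out of 106 calls is roughly 10.4%, strictly above 10%:
  // state() is now REJECT_CALLS.
}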

src/main/java/com/google/devtools/build/lib/remote/options/RemoteOptions.java

Lines changed: 8 additions & 7 deletions
@@ -695,15 +695,16 @@ public RemoteOutputsStrategyConverter() {
   public CircuitBreakerStrategy circuitBreakerStrategy;
 
   @Option(
-      name = "experimental_remote_failure_threshold",
-      defaultValue = "100",
+      name = "experimental_remote_failure_rate_threshold",
+      defaultValue = "10",
       documentationCategory = OptionDocumentationCategory.REMOTE,
       effectTags = {OptionEffectTag.EXECUTION},
+      converter = Converters.PercentageConverter.class,
       help =
-          "Sets the allowed number of failures in a specific time window after which it stops"
-              + " calling to the remote cache/executor. By default the value is 100. Setting this"
-              + " to 0 or negative means no limitation.")
-  public int remoteFailureThreshold;
+          "Sets the allowed number of failure rate in percentage for a specific time window after which it stops "
+              + "calling to the remote cache/executor. By default the value is 10. Setting this to 0 means no "
+              + "limitation.")
+  public int remoteFailureRateThreshold;
 
   @Option(
       name = "experimental_remote_failure_window_interval",

@@ -712,7 +713,7 @@ public RemoteOutputsStrategyConverter() {
       effectTags = {OptionEffectTag.EXECUTION},
       converter = RemoteDurationConverter.class,
       help =
-          "The interval in which the failure count of the remote requests are computed. On zero or"
+          "The interval in which the failure rate of the remote requests are computed. On zero or"
               + " negative value the failure duration is computed the whole duration of the"
               + " execution.Following units can be used: Days (d), hours (h), minutes (m), seconds"
              + " (s), and milliseconds (ms). If the unit is omitted, the value is interpreted as"

src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java

Lines changed: 31 additions & 11 deletions
@@ -18,51 +18,71 @@
 import build.bazel.remote.execution.v2.Digest;
 import com.google.devtools.build.lib.remote.Retrier.CircuitBreaker.State;
 import com.google.devtools.build.lib.remote.common.CacheNotFoundException;
+
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
 
+import java.util.stream.IntStream;
+
 @RunWith(JUnit4.class)
 public class FailureCircuitBreakerTest {
 
   @Test
-  public void testRecordFailure() throws InterruptedException {
-    final int failureThreshold = 10;
+  public void testRecordFailure_withIgnoredErrors() throws InterruptedException {
+    final int failureRateThreshold = 10;
     final int windowInterval = 100;
     FailureCircuitBreaker failureCircuitBreaker =
-        new FailureCircuitBreaker(failureThreshold, windowInterval);
+        new FailureCircuitBreaker(failureRateThreshold, windowInterval);
 
     List<Exception> listOfExceptionThrownOnFailure = new ArrayList<>();
-    for (int index = 0; index < failureThreshold; index++) {
+    for (int index = 0; index < failureRateThreshold; index++) {
       listOfExceptionThrownOnFailure.add(new Exception());
     }
-    for (int index = 0; index < failureThreshold * 9; index++) {
+    for (int index = 0; index < failureRateThreshold * 9; index++) {
       listOfExceptionThrownOnFailure.add(new CacheNotFoundException(Digest.newBuilder().build()));
     }
 
     Collections.shuffle(listOfExceptionThrownOnFailure);
 
     // make calls equals to threshold number of not ignored failure calls in parallel.
-    listOfExceptionThrownOnFailure.stream()
-        .parallel()
-        .forEach(failureCircuitBreaker::recordFailure);
+    listOfExceptionThrownOnFailure.stream().parallel().forEach(failureCircuitBreaker::recordFailure);
     assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS);
 
     // Sleep for windowInterval + 1ms.
     Thread.sleep(windowInterval + 1 /*to compensate any delay*/);
 
     // make calls equals to threshold number of not ignored failure calls in parallel.
-    listOfExceptionThrownOnFailure.stream()
-        .parallel()
-        .forEach(failureCircuitBreaker::recordFailure);
+    listOfExceptionThrownOnFailure.stream().parallel().forEach(failureCircuitBreaker::recordFailure);
     assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS);
 
     // Sleep for less than windowInterval.
     Thread.sleep(windowInterval - 5);
     failureCircuitBreaker.recordFailure(new Exception());
     assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS);
   }
+
+  @Test
+  public void testRecordFailure_minCallCriteriaNotMet() throws InterruptedException {
+    final int failureRateThreshold = 10;
+    final int windowInterval = 100;
+    final int minCallToComputeFailure = CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE;
+    FailureCircuitBreaker failureCircuitBreaker =
+        new FailureCircuitBreaker(failureRateThreshold, windowInterval);
+
+    // make half failure call, half success call and number of total call less than minCallToComputeFailure.
+    IntStream.range(0, minCallToComputeFailure >> 1).parallel()
+        .forEach(i -> failureCircuitBreaker.recordFailure(new Exception()));
+    IntStream.range(0, minCallToComputeFailure >> 1).parallel().forEach(i -> failureCircuitBreaker.recordSuccess());
+    assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS);
+
+    // Sleep for less than windowInterval.
+    Thread.sleep(windowInterval - 20);
+    failureCircuitBreaker.recordFailure(new Exception());
+    assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS);
+  }
 }
