Skip to content

Commit b3564b2

Browse files
authored
Don't rewind the build if invocation id stays the same (#18670)
Invocation id (`BUILD_ID` in code) is expected to be different for each command. When retrying the build, we rely on Bazel generating a new invocation id for a new attempt. However, if flag `--invocation_id` is set, Bazel just uses the provided value instead of generating a new one. In this case, invocation id stays the same among multiple attempts which could cause issues like #18547. This PR fixes that by not retrying the build if the invocation id is same to previous attempt. Also updated the doc to point this requirement out. Closes #18591. PiperOrigin-RevId: 539946840 Change-Id: I6ae85ea923b0fdbff97fe2e44e36995f0205f8a1
1 parent ed7da61 commit b3564b2

File tree

3 files changed

+110
-9
lines changed

3 files changed

+110
-9
lines changed

src/main/java/com/google/devtools/build/lib/exec/ExecutionOptions.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,11 @@ public boolean usingLocalTestJobs() {
502502
help =
503503
"The maximum number of attempts to retry if the build encountered remote cache eviction"
504504
+ " error. A non-zero value will implicitly set"
505-
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true.")
505+
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true. A new invocation"
506+
+ " id will be generated for each attempt. If you generate invocation id and provide"
507+
+ " it to Bazel with --invocation_id, you should not use this flag. Instead, set flag"
508+
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs and check for the exit"
509+
+ " code 39.")
506510
public int remoteRetryOnCacheEviction;
507511

508512
/** An enum for specifying different formats of test output. */

src/main/java/com/google/devtools/build/lib/runtime/BlazeCommandDispatcher.java

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,14 @@
7474
import java.io.OutputStream;
7575
import java.time.Duration;
7676
import java.util.ArrayList;
77+
import java.util.HashSet;
7778
import java.util.List;
7879
import java.util.Optional;
80+
import java.util.Set;
81+
import java.util.UUID;
7982
import java.util.concurrent.atomic.AtomicReference;
8083
import java.util.logging.Level;
84+
import javax.annotation.Nullable;
8185
import net.starlark.java.eval.Starlark;
8286

8387
/**
@@ -233,7 +237,8 @@ public BlazeCommandResult exec(
233237
retrievedShutdownReason, FailureDetails.Command.Code.PREVIOUSLY_SHUTDOWN);
234238
}
235239
BlazeCommandResult result;
236-
int attempt = 0;
240+
Set<UUID> attemptedCommandIds = new HashSet<>();
241+
BlazeCommandResult lastResult = null;
237242
while (true) {
238243
try {
239244
result =
@@ -248,11 +253,12 @@ public BlazeCommandResult exec(
248253
waitTimeInMs,
249254
startupOptionsTaggedWithBazelRc,
250255
commandExtensions,
251-
attempt);
256+
attemptedCommandIds,
257+
lastResult);
252258
break;
253259
} catch (RemoteCacheEvictedException e) {
254-
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
255-
attempt += 1;
260+
attemptedCommandIds.add(e.getCommandId());
261+
lastResult = e.getResult();
256262
}
257263
}
258264
if (result.shutdown()) {
@@ -303,7 +309,8 @@ private BlazeCommandResult execExclusively(
303309
long waitTimeInMs,
304310
Optional<List<Pair<String, String>>> startupOptionsTaggedWithBazelRc,
305311
List<Any> commandExtensions,
306-
int attempt)
312+
Set<UUID> attemptedCommandIds,
313+
@Nullable BlazeCommandResult lastResult)
307314
throws RemoteCacheEvictedException {
308315
// Record the start time for the profiler. Do not put anything before this!
309316
long execStartTimeNanos = runtime.getClock().nanoTime();
@@ -331,6 +338,19 @@ private BlazeCommandResult execExclusively(
331338
firstContactTime,
332339
commandExtensions,
333340
this::setShutdownReason);
341+
342+
if (!attemptedCommandIds.isEmpty()) {
343+
if (attemptedCommandIds.contains(env.getCommandId())) {
344+
outErr.printErrLn(
345+
String.format(
346+
"Failed to retry the build: invocation id `%s` has already been used.",
347+
env.getCommandId()));
348+
return Preconditions.checkNotNull(lastResult);
349+
} else {
350+
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
351+
}
352+
}
353+
334354
CommonCommandOptions commonOptions = options.getOptions(CommonCommandOptions.class);
335355
boolean tracerEnabled = false;
336356
if (commonOptions.enableTracer == TriState.YES) {
@@ -650,8 +670,8 @@ private BlazeCommandResult execExclusively(
650670
if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) {
651671
var executionOptions =
652672
Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class));
653-
if (attempt < executionOptions.remoteRetryOnCacheEviction) {
654-
throw new RemoteCacheEvictedException();
673+
if (attemptedCommandIds.size() < executionOptions.remoteRetryOnCacheEviction) {
674+
throw new RemoteCacheEvictedException(env.getCommandId(), newResult);
655675
}
656676
}
657677

@@ -691,7 +711,23 @@ private BlazeCommandResult execExclusively(
691711
}
692712
}
693713

694-
private static class RemoteCacheEvictedException extends IOException {}
714+
private static class RemoteCacheEvictedException extends IOException {
715+
private final UUID commandId;
716+
private final BlazeCommandResult result;
717+
718+
private RemoteCacheEvictedException(UUID commandId, BlazeCommandResult result) {
719+
this.commandId = commandId;
720+
this.result = result;
721+
}
722+
723+
public UUID getCommandId() {
724+
return commandId;
725+
}
726+
727+
public BlazeCommandResult getResult() {
728+
return result;
729+
}
730+
}
695731

696732
private static void replayEarlyExitEvents(
697733
OutErr outErr,

src/test/shell/bazel/remote/build_without_the_bytes_test.sh

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1718,7 +1718,68 @@ EOF
17181718
--experimental_remote_cache_eviction_retries=5 \
17191719
//a:bar >& $TEST_log || fail "Failed to build"
17201720

1721+
expect_log 'Failed to fetch blobs because they do not exist remotely.'
17211722
expect_log "Found remote cache eviction error, retrying the build..."
17221723
}
17231724

1725+
function test_remote_cache_eviction_retries_with_fixed_invocation_id() {
1726+
mkdir -p a
1727+
1728+
cat > a/BUILD <<'EOF'
1729+
genrule(
1730+
name = 'foo',
1731+
srcs = ['foo.in'],
1732+
outs = ['foo.out'],
1733+
cmd = 'cat $(SRCS) > $@',
1734+
)
1735+
1736+
genrule(
1737+
name = 'bar',
1738+
srcs = ['foo.out', 'bar.in'],
1739+
outs = ['bar.out'],
1740+
cmd = 'cat $(SRCS) > $@',
1741+
tags = ['no-remote-exec'],
1742+
)
1743+
EOF
1744+
1745+
echo foo > a/foo.in
1746+
echo bar > a/bar.in
1747+
1748+
# Populate remote cache
1749+
bazel build \
1750+
--remote_executor=grpc://localhost:${worker_port} \
1751+
--remote_download_minimal \
1752+
//a:bar >& $TEST_log || fail "Failed to build"
1753+
1754+
bazel clean
1755+
1756+
# Clean build, foo.out isn't downloaded
1757+
bazel build \
1758+
--remote_executor=grpc://localhost:${worker_port} \
1759+
--remote_download_minimal \
1760+
//a:bar >& $TEST_log || fail "Failed to build"
1761+
1762+
if [[ -f bazel-bin/a/foo.out ]]; then
1763+
fail "Expected intermediate output bazel-bin/a/foo.out to not be downloaded"
1764+
fi
1765+
1766+
# Evict blobs from remote cache
1767+
stop_worker
1768+
start_worker
1769+
1770+
echo "updated bar" > a/bar.in
1771+
1772+
# Incremental build triggers remote cache eviction error and Bazel tries to
1773+
# retry the build but failed because the invocation id is the same.
1774+
bazel build \
1775+
--invocation_id=91648f28-6081-4af7-9374-cdfd3cd36ef2 \
1776+
--remote_executor=grpc://localhost:${worker_port} \
1777+
--remote_download_minimal \
1778+
--experimental_remote_cache_eviction_retries=5 \
1779+
//a:bar >& $TEST_log && fail "Expected build to fail"
1780+
1781+
expect_log 'Failed to fetch blobs because they do not exist remotely.'
1782+
expect_log 'Failed to retry the build: invocation id `91648f28-6081-4af7-9374-cdfd3cd36ef2` has already been used.'
1783+
}
1784+
17241785
run_suite "Build without the Bytes tests"

0 commit comments

Comments
 (0)