Skip to content

Commit 8d1db5b

Browse files
larsrc-google and copybara-github
authored and committed
Add --experimental_worker_memory_limit_mb flag that kills (-9) workers using more than a given amount of memory.
This flag works without cgroups and on all OSes, with or without sandboxing, but relies on polling memory info. Note: Can't reliably integration test this without having more control over the timing. PiperOrigin-RevId: 529347598 Change-Id: Iabeb8a45850a619dcb6ef8be9369fdc221f952ef
1 parent 4073bcd commit 8d1db5b

File tree

3 files changed

+85
-5
lines changed

3 files changed

+85
-5
lines changed

src/main/java/com/google/devtools/build/lib/worker/WorkerLifecycleManager.java

Lines changed: 44 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@
1313
// limitations under the License.
1414
package com.google.devtools.build.lib.worker;
1515

16+
import static com.google.common.collect.ImmutableList.toImmutableList;
17+
1618
import com.google.common.annotations.VisibleForTesting;
1719
import com.google.common.collect.ImmutableList;
1820
import com.google.common.collect.ImmutableSet;
@@ -22,6 +24,7 @@
2224
import java.util.Comparator;
2325
import java.util.HashSet;
2426
import java.util.List;
27+
import java.util.Optional;
2528
import java.util.Set;
2629
import java.util.stream.Collectors;
2730
import org.apache.commons.pool2.PooledObject;
@@ -53,7 +56,7 @@ public void setReporter(Reporter reporter) {
5356

5457
@Override
5558
public void run() {
56-
if (options.totalWorkerMemoryLimitMb == 0) {
59+
if (options.totalWorkerMemoryLimitMb == 0 && options.workerMemoryLimitMb == 0) {
5760
return;
5861
}
5962

@@ -69,10 +72,17 @@ public void run() {
6972

7073
ImmutableList<WorkerMetric> workerMetrics =
7174
WorkerMetricsCollector.instance().collectMetrics();
72-
try {
73-
evictWorkers(workerMetrics);
74-
} catch (InterruptedException e) {
75-
break;
75+
76+
if (options.totalWorkerMemoryLimitMb > 0) {
77+
try {
78+
evictWorkers(workerMetrics);
79+
} catch (InterruptedException e) {
80+
break;
81+
}
82+
}
83+
84+
if (options.workerMemoryLimitMb > 0) {
85+
killLargeWorkers(workerMetrics, options.workerMemoryLimitMb);
7686
}
7787
}
7888

@@ -83,6 +93,35 @@ void stopProcessing() {
8393
isWorking = false;
8494
}
8595

96+
/** Kills any worker that uses more than {@code limitMb} MB of memory. */
97+
void killLargeWorkers(ImmutableList<WorkerMetric> workerMetrics, int limitMb) {
98+
ImmutableList<WorkerMetric> large =
99+
workerMetrics.stream()
100+
.filter(m -> m.getWorkerStat().getUsedMemoryInKB() / 1000 > limitMb)
101+
.collect(toImmutableList());
102+
103+
for (WorkerMetric l : large) {
104+
String msg;
105+
106+
ImmutableList<Integer> workerIds = l.getWorkerProperties().getWorkerIds();
107+
Optional<ProcessHandle> ph = ProcessHandle.of(l.getWorkerProperties().getProcessId());
108+
if (ph.isPresent()) {
109+
msg =
110+
String.format(
111+
"Killing %s worker %s (pid %d) taking %dMB",
112+
l.getWorkerProperties().getMnemonic(),
113+
workerIds.size() == 1 ? workerIds.get(0) : workerIds,
114+
l.getWorkerProperties().getProcessId(),
115+
l.getWorkerStat().getUsedMemoryInKB() / 1000);
116+
ph.get().destroyForcibly();
117+
logger.atInfo().log("%s", msg);
118+
if (reporter != null) {
119+
reporter.handle(Event.info(msg));
120+
}
121+
}
122+
}
123+
}
124+
86125
@VisibleForTesting // productionVisibility = Visibility.PRIVATE
87126
void evictWorkers(ImmutableList<WorkerMetric> workerMetrics) throws InterruptedException {
88127

src/main/java/com/google/devtools/build/lib/worker/WorkerOptions.java

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -225,4 +225,16 @@ public String getTypeDescription() {
225225
"The interval between collecting worker metrics and possibly attempting evictions. "
226226
+ "Cannot effectively be less than 1s for performance reasons.")
227227
public Duration workerMetricsPollInterval;
228+
229+
/**
 * Value of {@code --experimental_worker_memory_limit_mb}: the per-worker memory limit. The
 * default of 0 means no limit is applied; see the help text below for the caveat about builds
 * crashing when a worker is killed.
 */
@Option(
230+
name = "experimental_worker_memory_limit_mb",
231+
converter = RamResourceConverter.class,
232+
defaultValue = "0",
233+
documentationCategory = OptionDocumentationCategory.EXECUTION_STRATEGY,
234+
effectTags = {OptionEffectTag.EXECUTION, OptionEffectTag.HOST_MACHINE_RESOURCE_OPTIMIZATIONS},
235+
help =
236+
"If this limit is greater than zero, workers might be killed if the memory usage of the "
237+
+ "worker exceeds the limit. If not used together with dynamic execution and "
238+
+ "`--experimental_dynamic_ignore_local_signals=9`, this may crash your build.")
239+
public int workerMemoryLimitMb;
228240
}

src/test/shell/integration/bazel_worker_test.sh

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -728,6 +728,35 @@ EOF
728728
expect_log "^---8<---8<--- End of log ---8<---8<---"
729729
}
730730

731+
# Exercises --experimental_worker_memory_limit_mb with a worker action that
# works for 3s: a build with a 1000 MB limit must succeed, and a build with a
# 1 MB limit must fail and leave the worker-kill message in the log.
function test_worker_memory_limit() {
732+
prepare_example_worker
733+
cat >>BUILD <<EOF
734+
work(
735+
name = "hello_world",
736+
worker = ":worker",
737+
worker_args = [
738+
"--worker_protocol=${WORKER_PROTOCOL}",
739+
],
740+
args = [
741+
"--work_time=3s",
742+
]
743+
)
744+
EOF
745+
746+
# Generous limit with a 1s metrics poll interval: the build is expected to pass.
bazel build --experimental_worker_memory_limit_mb=1000 \
747+
--experimental_worker_metrics_poll_interval=1s :hello_world &> "$TEST_log" \
748+
|| fail "build failed"
749+
# NOTE(review): presumably cleans so the second build has to rerun the worker
# action instead of reusing the earlier result -- confirm.
bazel clean
750+
# 1 MB limit: the build is expected to fail. A successful build triggers
# `fail`; the trailing `|| true` absorbs the expected non-zero exit status.
bazel build --experimental_worker_memory_limit_mb=1 \
751+
--experimental_worker_metrics_poll_interval=1s :hello_world &> "$TEST_log" \
752+
&& fail "expected build to fail" || true
753+
754+
# The captured output must contain the log-excerpt delimiters, the
# missing-WorkResponse error, and the "Killing ... worker" message.
expect_log "^---8<---8<--- Start of log, file at /"
755+
expect_log "Worker process did not return a WorkResponse:"
756+
expect_log "Killing [a-zA-Z]\+ worker [0-9]\+ (pid [0-9]\+) taking [0-9]\+MB"
757+
expect_log "^---8<---8<--- End of log ---8<---8<---"
758+
}
759+
731760
function test_worker_metrics_collection() {
732761
prepare_example_worker
733762
cat >>BUILD <<EOF

0 commit comments

Comments
 (0)