34
34
import java .util .concurrent .ConcurrentHashMap ;
35
35
import java .util .concurrent .ConcurrentMap ;
36
36
import java .util .concurrent .Semaphore ;
37
+ import javax .annotation .Nullable ;
37
38
38
39
/**
39
40
* An intermediate worker that sends requests and receives responses from the worker processes.
@@ -48,29 +49,23 @@ public class WorkerMultiplexer extends Thread {
48
49
* A map of {@code WorkResponse}s received from the worker process. They are stored in this map
49
50
* keyed by the request id until the corresponding {@code WorkerProxy} picks them up.
50
51
*/
51
- private final ConcurrentMap <Integer , WorkResponse > workerProcessResponse ;
52
+ private final ConcurrentMap <Integer , WorkResponse > workerProcessResponse =
53
+ new ConcurrentHashMap <>();
52
54
/**
53
55
* A map of semaphores corresponding to {@code WorkRequest}s. After sending the {@code
54
56
* WorkRequest}, {@code WorkerProxy} will wait on a semaphore to be released. {@code
55
57
* WorkerMultiplexer} is responsible for releasing the corresponding semaphore in order to signal
56
58
* {@code WorkerProxy} that the {@code WorkerResponse} has been received.
57
59
*/
58
- private final ConcurrentMap <Integer , Semaphore > responseChecker ;
59
- /** The worker process that this WorkerMultiplexer should be talking to. */
60
- private Subprocess process ;
60
+ private final ConcurrentMap <Integer , Semaphore > responseChecker = new ConcurrentHashMap <>();
61
61
/**
62
- * Set to true if one of the worker processes returns an unparseable response, or for other
63
- * reasons we can't properly handle the remaining responses. We then discard all the responses
64
- * from other work requests and abort .
62
+ * The worker process that this WorkerMultiplexer should be talking to. This should only be set
63
+ * once, when creating a new process. If the process dies or its stdio streams get corrupted, the
64
+ * {@code WorkerMultiplexer} gets discarded as well and a new one gets created as needed .
65
65
*/
66
- private boolean isWorkerStreamCorrupted ;
66
+ private Subprocess process ;
67
67
/** InputStream from the worker process. */
68
68
private RecordingInputStream recordingStream ;
69
- /**
70
- * True if we have received EOF on the stream from the worker process. We then stop processing,
71
- * and all workers still waiting for responses will fail.
72
- */
73
- private boolean isWorkerStreamClosed ;
74
69
/** True if this multiplexer was explicitly destroyed. */
75
70
private boolean wasDestroyed ;
76
71
/**
@@ -89,25 +84,20 @@ public class WorkerMultiplexer extends Thread {
89
84
* The active Reporter object, non-null if {@code --worker_verbose} is set. This must be cleared
90
85
* at the end of a command execution.
91
86
*/
92
- public EventHandler reporter ;
87
+ private EventHandler reporter ;
93
88
94
89
WorkerMultiplexer (Path logFile , WorkerKey workerKey ) {
95
90
this .logFile = logFile ;
96
91
this .workerKey = workerKey ;
97
- responseChecker = new ConcurrentHashMap <>();
98
- workerProcessResponse = new ConcurrentHashMap <>();
99
- isWorkerStreamCorrupted = false ;
100
- isWorkerStreamClosed = false ;
101
- wasDestroyed = false ;
102
92
}
103
93
104
94
/** Sets or clears the reporter for outputting verbose info. */
105
- void setReporter (EventHandler reporter ) {
95
+ synchronized void setReporter (@ Nullable EventHandler reporter ) {
106
96
this .reporter = reporter ;
107
97
}
108
98
109
99
/** Reports a string to the user if reporting is enabled. */
110
- private void report (String s ) {
100
+ private synchronized void report (String s ) {
111
101
EventHandler r = this .reporter ; // Protect against race condition with setReporter().
112
102
if (r != null && s != null ) {
113
103
r .handle (Event .info (s ));
@@ -119,17 +109,17 @@ private void report(String s) {
119
109
* exist. Also makes sure this {@code WorkerMultiplexer} runs as a separate thread.
120
110
*/
121
111
public synchronized void createProcess (Path workDir ) throws IOException {
122
- // The process may have died in the meanwhile (e.g. between builds).
123
- if (this .process == null || !this .process .isAlive ()) {
112
+ if (this .process == null ) {
113
+ if (this .wasDestroyed ) {
114
+ throw new IOException ("Multiplexer destroyed before created process" );
115
+ }
124
116
ImmutableList <String > args = workerKey .getArgs ();
125
117
File executable = new File (args .get (0 ));
126
118
if (!executable .isAbsolute () && executable .getParent () != null ) {
127
119
List <String > newArgs = new ArrayList <>(args );
128
120
newArgs .set (0 , new File (workDir .getPathFile (), newArgs .get (0 )).getAbsolutePath ());
129
121
args = ImmutableList .copyOf (newArgs );
130
122
}
131
- isWorkerStreamCorrupted = false ;
132
- isWorkerStreamClosed = false ;
133
123
SubprocessBuilder processBuilder =
134
124
subprocessFactory != null
135
125
? new SubprocessBuilder (subprocessFactory )
@@ -139,6 +129,8 @@ public synchronized void createProcess(Path workDir) throws IOException {
139
129
processBuilder .setStderr (logFile .getPathFile ());
140
130
processBuilder .setEnv (workerKey .getEnv ());
141
131
this .process = processBuilder .start ();
132
+ } else if (!this .process .isAlive ()) {
133
+ throw new IOException ("Process is dead" );
142
134
}
143
135
if (!this .isAlive ()) {
144
136
this .start ();
@@ -155,24 +147,24 @@ public Path getLogFile() {
155
147
156
148
/**
157
149
* Signals this object to destroy itself, including the worker process. The object might not be
158
- * fully destroyed at the end of this call, but will terminate soon.
150
+ * fully destroyed at the end of this call, but will terminate soon. This is considered a
151
+ * deliberate destruction.
159
152
*/
160
153
public synchronized void destroyMultiplexer () {
161
154
if (this .process != null ) {
162
- destroyProcess (this .process );
163
- this .process = null ;
155
+ destroyProcess ();
164
156
}
165
157
wasDestroyed = true ;
166
158
}
167
159
168
160
/** Destroys the worker subprocess. This might block forever if the subprocess refuses to die. */
169
- private void destroyProcess (Subprocess process ) {
161
+ private synchronized void destroyProcess () {
170
162
boolean wasInterrupted = false ;
171
163
try {
172
- process .destroy ();
164
+ this . process .destroy ();
173
165
while (true ) {
174
166
try {
175
- process .waitFor ();
167
+ this . process .waitFor ();
176
168
return ;
177
169
} catch (InterruptedException ie ) {
178
170
wasInterrupted = true ;
@@ -183,7 +175,6 @@ private void destroyProcess(Subprocess process) {
183
175
if (wasInterrupted ) {
184
176
Thread .currentThread ().interrupt (); // preserve interrupted status
185
177
}
186
- isWorkerStreamClosed = true ;
187
178
}
188
179
}
189
180
@@ -200,10 +191,6 @@ public synchronized void putRequest(WorkRequest request) throws IOException {
200
191
// We can't know how much of the request was sent, so we have to assume the worker's input
201
192
// now contains garbage.
202
193
// TODO(b/151767359): Avoid causing garbage! Maybe by sending in a separate thread?
203
- isWorkerStreamCorrupted = true ;
204
- if (e instanceof InterruptedIOException ) {
205
- Thread .currentThread ().interrupt ();
206
- }
207
194
responseChecker .remove (request .getRequestId ());
208
195
throw e ;
209
196
}
@@ -228,10 +215,8 @@ public WorkResponse getResponse(Integer requestId) throws InterruptedException {
228
215
// Wait for the multiplexer to get our response and release this semaphore. The semaphore will
229
216
// throw {@code InterruptedException} when the multiplexer is terminated.
230
217
waitForResponse .acquire ();
231
- report ("Acquired response semaphore for " + requestId );
232
218
233
219
WorkResponse workResponse = workerProcessResponse .get (requestId );
234
- report ("Response for " + requestId + " is " + workResponse );
235
220
return workResponse ;
236
221
} finally {
237
222
responseChecker .remove (requestId );
@@ -247,25 +232,25 @@ public WorkResponse getResponse(Integer requestId) throws InterruptedException {
247
232
* execution cancellation.
248
233
*/
249
234
private void waitResponse () throws InterruptedException , IOException {
250
- Subprocess p = this .process ;
251
- if (p == null || !p .isAlive ()) {
252
- // Avoid busy-wait for a new process.
235
+ recordingStream = new RecordingInputStream (this .process .getInputStream ());
236
+ recordingStream .startRecording (4096 );
237
+ // TODO(larsrc): Turn this into a loop that also sends requests.
238
+ // Allow interrupts while waiting for responses, without conflating it with I/O errors.
239
+ while (recordingStream .available () == 0 ) {
240
+ if (!this .process .isAlive ()) {
241
+ throw new IOException (
242
+ String .format ("Multiplexer process for %s is dead" , workerKey .getMnemonic ()));
243
+ }
253
244
Thread .sleep (1 );
254
- return ;
255
245
}
256
- recordingStream = new RecordingInputStream (p .getInputStream ());
257
- recordingStream .startRecording (4096 );
258
246
WorkResponse parsedResponse = WorkResponse .parseDelimitedFrom (recordingStream );
259
247
260
248
// A null parsedResponse can only happen if the input stream is closed, in which case we
261
249
// drop everything.
262
250
if (parsedResponse == null ) {
263
- isWorkerStreamClosed = true ;
264
- report (
251
+ throw new IOException (
265
252
String .format (
266
- "Multiplexer process for %s has closed its output, aborting multiplexer" ,
267
- workerKey .getMnemonic ()));
268
- return ;
253
+ "Multiplexer process for %s died while reading response" , workerKey .getMnemonic ()));
269
254
}
270
255
271
256
int requestId = parsedResponse .getRequestId ();
@@ -287,13 +272,15 @@ private void waitResponse() throws InterruptedException, IOException {
287
272
/** The multiplexer thread that listens to the WorkResponse from worker process. */
288
273
@ Override
289
274
public void run () {
290
- while (! isWorkerStreamClosed && ! isWorkerStreamCorrupted ) {
275
+ while (this . process . isAlive () ) {
291
276
try {
292
277
waitResponse ();
293
278
} catch (IOException e ) {
294
279
// We got this exception while reading from the worker's stdout. We can't trust the
295
280
// output any more at that point.
296
- isWorkerStreamCorrupted = true ;
281
+ if (this .process .isAlive ()) {
282
+ destroyProcess ();
283
+ }
297
284
if (e instanceof InterruptedIOException ) {
298
285
report (
299
286
String .format (
@@ -315,17 +302,12 @@ public void run() {
315
302
// will let fall on the floor, but we still want to leave the process running for the next
316
303
// build.
317
304
// TODO(b/151767359): Cancel all outstanding requests when cancellation is implemented.
318
- releaseAllSemaphores ();
305
+ for (Semaphore semaphore : responseChecker .values ()) {
306
+ semaphore .release ();
307
+ }
319
308
}
320
309
}
321
- // If we get here, the worker process is either dead or corrupted. We could attempt to restart
322
- // it, but the outstanding requests will have failed already. Until we have a way to signal
323
- // transient failures, we have to just reject all further requests and make sure the process
324
- // is really dead
325
310
synchronized (this ) {
326
- if (process != null && process .isAlive ()) {
327
- destroyMultiplexer ();
328
- }
329
311
releaseAllSemaphores ();
330
312
}
331
313
}
@@ -350,14 +332,14 @@ String getRecordingStreamMessage() {
350
332
351
333
/** Returns true if this process has died for other reasons than a call to {@code #destroy()}. */
352
334
boolean diedUnexpectedly () {
353
- Subprocess p = this .process ; // Protects against this.process getting null.
354
- return p != null && !p .isAlive () && !wasDestroyed ;
335
+ return this .process != null && !this .process .isAlive () && !wasDestroyed ;
355
336
}
356
337
357
338
/** Returns the exit value of multiplexer's process, if it has exited. */
358
339
Optional <Integer > getExitValue () {
359
- Subprocess p = this .process ; // Protects against this.process getting null.
360
- return p != null && !p .isAlive () ? Optional .of (p .exitValue ()) : Optional .empty ();
340
+ return this .process != null && !this .process .isAlive ()
341
+ ? Optional .of (this .process .exitValue ())
342
+ : Optional .empty ();
361
343
}
362
344
363
345
/** For testing only, to verify that maps are cleared after responses are reaped. */
0 commit comments