Skip to content

Commit 5dfae70

Browse files
work around new Popen.send_signal behavior in Python 3.9
In Python 3.9, Popen.send_signal (which is called by Popen.kill, which we call in SharedChild.kill) now calls Popen.poll first, as a best-effort check to make sure the child's PID hasn't already been freed for reuse. If the child has exited, this may reap the child. Previously we didn't check for this, and we called os.waitid after kill assuming the child was still avilable. Now in Python 3.9 this raises an uncaught "no child found" exception. It's a race condition that only triggers if the child has exited before kill is called. We got lucky and caught this crash in a doctest in CI. An example failing run: https://github.com/oconnor663/duct.py/runs/1488376578 The upstream bug tracking the send_signal change is https://bugs.python.org/issue38630. This commit does a few things: 1. Add a test that reliably triggers the crash. 2. Read Popen.returncode directly, instead of tracking our own SharedChild._status. This makes the test pass, but it's unclear whether it's robust to all possible race conditions. 3. Stop calling Popen.kill, and instead reimplement its previous non-polling behavior. Technically this means we don't need the returncode fix above, but the code is simpler that way anyway, and it's some insurance against more changes like this in future Python versions.
1 parent 07010e2 commit 5dfae70

File tree

2 files changed

+73
-24
lines changed

2 files changed

+73
-24
lines changed

duct.py

+46-24
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
import io
5757
import os
5858
import shutil
59+
import signal
5960
import subprocess
6061
import threading
6162

@@ -1118,6 +1119,10 @@ def maybe_canonicalize_exe_path(exe_name, iocontext):
11181119
popen_lock = threading.Lock()
11191120

11201121

1122+
def is_windows():
1123+
return os.name == "nt"
1124+
1125+
11211126
# This wrapper works around two major deadlock issues to do with pipes. The
11221127
# first is that, before Python 3.2 on POSIX systems, os.pipe() creates
11231128
# inheritable file descriptors, which leak to all child processes and prevent
@@ -1139,7 +1144,7 @@ def maybe_canonicalize_exe_path(exe_name, iocontext):
11391144
# subprocess.Popen. That type works around another race condition to do with
11401145
# signaling children.
11411146
def safe_popen(*args, **kwargs):
1142-
close_fds = (os.name != 'nt')
1147+
close_fds = not is_windows()
11431148
with popen_lock:
11441149
return SharedChild(*args, close_fds=close_fds, **kwargs)
11451150

@@ -1164,7 +1169,7 @@ def encode_with_universal_newlines(s):
11641169
# removals in that copy won't interact properly with the inherited parent
11651170
# environment.
11661171
def convert_env_var_name(var):
1167-
if os.name == 'nt':
1172+
if is_windows():
11681173
return var.upper()
11691174
return var
11701175

@@ -1194,26 +1199,24 @@ def convert_env_var_name(var):
11941199
class SharedChild:
11951200
def __init__(self, *args, **kwargs):
11961201
self._child = subprocess.Popen(*args, **kwargs)
1197-
self._status = None
1198-
# The status lock is only held long enough to read or write the status,
1199-
# or to make non-blocking calls like Popen.poll(). Threads making a
1200-
# blocking call to os.waitid() release the status lock first. This
1202+
# The child lock is only held for non-blocking calls. Threads making a
1203+
# blocking call to os.waitid() release the child lock first. This
12011204
# ensures that one thread can call try_wait() while another thread is
12021205
# blocked on wait().
1203-
self._status_lock = threading.Lock()
1206+
self._child_lock = threading.Lock()
12041207
self._wait_lock = threading.Lock()
12051208

12061209
def wait(self):
12071210
with self._wait_lock:
12081211
# See if another thread already waited. If so, return the status we
1209-
# got before. If not, immediately release the status lock, and move
1212+
# got before. If not, immediately release the child lock, and move
12101213
# on to call wait ourselves.
1211-
with self._status_lock:
1212-
if self._status is not None:
1213-
return self._status
1214+
with self._child_lock:
1215+
if self._child.returncode is not None:
1216+
return self._child.returncode
12141217

12151218
# No other thread has waited, we're holding the wait lock, and
1216-
# we've released the status lock. It's now our job to wait. As
1219+
# we've released the child lock. It's now our job to wait. As
12171220
# documented above, if os.waitid is defined, use that function to
12181221
# await the child without reaping it. Otherwise we do an ordinary
12191222
# Popen.wait and accept the race condition on some platforms.
@@ -1225,20 +1228,20 @@ def wait(self):
12251228
# races with kill(), which is what all of this is about.
12261229
self._child.wait()
12271230

1228-
# Finally, while still holding the wait lock, re-acquire the status
1231+
# Finally, while still holding the wait lock, re-acquire the child
12291232
# lock to reap the child and write the result. Since we know the
12301233
# child has already exited, this won't block. Any other waiting
12311234
# threads that were blocked on us will see our result.
1232-
with self._status_lock:
1235+
with self._child_lock:
12331236
# If the child was already reaped above in the !HAS_WAITID
1234-
# branch, this will just return the same status again.
1235-
self._status = self._child.wait()
1236-
return self._status
1237+
# branch, this second wait will be a no-op with a cached
1238+
# returncode.
1239+
return self._child.wait()
12371240

12381241
def try_wait(self):
1239-
with self._status_lock:
1240-
if self._status is not None:
1241-
return self._status
1242+
with self._child_lock:
1243+
if self._child.returncode is not None:
1244+
return self._child.returncode
12421245

12431246
# The child hasn't been waited on yet, so we need to do a
12441247
# non-blocking check to see if it's still running. The Popen type
@@ -1256,16 +1259,35 @@ def try_wait(self):
12561259

12571260
# If either of the poll approaches above returned non-None, do a full
12581261
# wait to reap the child, which will not block. Note that we've
1259-
# released the status lock here, because wait() will re-acquire it.
1262+
# released the child lock here, because wait() will re-acquire it.
12601263
if poll_result is not None:
12611264
return self.wait()
12621265
else:
12631266
return None
12641267

12651268
def kill(self):
1266-
with self._status_lock:
1267-
if self._status is None:
1268-
self._child.kill()
1269+
with self._child_lock:
1270+
if self._child.returncode is None:
1271+
# Previously we just used Popen.kill here. However, as of Python 3.9,
1272+
# Popen.send_signal (which is called by Popen.kill) calls Popen.poll first, as a
1273+
# best-effort check for the same PID race that this class is designed around. That
1274+
# means that if the child has already exited, Popen.kill will reap it. Now that we
1275+
# check Popen.returncode throughout this class (as of the same commit that adds this
1276+
# comment), we'll see the non-None exit status there as a side effect if reaping has
1277+
# happened. That *might* mean we could still call Popen.kill here safely. However,
1278+
# there's also the question of how Popen.poll's call to os.waitpid would interact
1279+
# with our own blocking call to os.waitid from another thread. The worry is that the
1280+
# waitpid call might take effect first, causing waitid to return a "no child found"
1281+
# error. I can confirm that happens on Linux when both calls are blocking. Here
1282+
# though, the waitpid call is non-blocking, which *might* mean it can't happen
1283+
# first, but that's going to depend on the OS. We could assume that it can happen
1284+
# and try to catch the error from waitid, but that codepath would be impossible to
1285+
# test. So what we actually do here is reimplement the documented behavior of
1286+
# Popen.kill: os.kill(pid, SIGKILL) on Unix, and Popen.terminate on Windows.
1287+
if is_windows():
1288+
self._child.terminate()
1289+
else:
1290+
os.kill(self._child.pid, signal.SIGKILL)
12691291

12701292
def pid(self):
12711293
return self._child.pid

test_duct.py

+27
Original file line numberDiff line numberDiff line change
@@ -694,3 +694,30 @@ def test_pids():
694694
assert type(reader.pids()[0]) is int
695695
assert type(reader.pids()[1]) is int
696696
reader.read()
697+
698+
699+
# This test was added after the release of Python 3.9, which included a
700+
# behavior change that caused a crash in this case. There wasn't previously a
701+
# explicit test for this, but I got lucky and one of the doctests hit it.
702+
# (Example run: https://github.com/oconnor663/duct.py/runs/1488376578)
703+
#
704+
# The behavior change in Python 3.9 is that Popen.send_signal (which is called
705+
# by Popen.kill, which we call in SharedChild.kill) now calls Popen.poll first,
706+
# as a best-effort check to make sure the child's PID hasn't already been freed
707+
# for reuse. If the child has not yet exited, this is effectively no different
708+
# from before. However, if the child has exited, this may reap the child, which
709+
# was not previously possible. This test guarantees that the child has exited
710+
# before kill, and then makes sure kill doesn't crash.
711+
def test_kill_after_child_exit():
712+
# Create a child process and wait for it to exit, without actually waiting
713+
# on it and reaping it, by reading its output. We can't use the .read()
714+
# method for this, because that would actually wait on it and reap it, so
715+
# we create our own pipe manually.
716+
pipe_reader, pipe_writer = os.pipe()
717+
handle = echo_cmd("hi").stdout_file(pipe_writer).start()
718+
os.close(pipe_writer)
719+
reader_file = os.fdopen(pipe_reader)
720+
assert reader_file.read() == "hi\n"
721+
722+
# The child has exited. Now just test that kill doesn't crash.
723+
handle.kill()

0 commit comments

Comments
 (0)