libct: move killing logic to container.Signal

kolyshkin · kolyshkin · commit 9583b3d1c297 · 2023-06-08T09:29:25.000-07:00
By default, the container has its own PID namespace, and killing (with
SIGKILL) its init process from the parent PID namespace also kills all
the other processes.

Obviously, it does not work that way when the container is sharing its
PID namespace with the host or another container, since init is no
longer special (it's not PID 1). In this case, killing container's init
will result in a bunch of other processes left running (and thus the
inability to remove the cgroup).

The solution to the above problem is killing all the container
processes, not just init.

The problem with the current implementation is, the killing logic is
implemented in libcontainer's initProcess.wait, and thus only available
to libcontainer users, but not the runc kill command (which uses
nonChildProcess.kill and does not use wait at all). So, some workarounds
exist:
 - func destroy(c *Container) calls signalAllProcesses;
 - runc kill implements -a flag.

This code became very tangled over time. Let's simplify things by moving
the killing all processes from initProcess.wait to container.Signal,
and documents the new behavior.

In essence, this also makes `runc kill` to automatically kill all container
processes when the container does not have its own PID namespace.
Document that as well.

Signed-off-by: Kir Kolyshkin &lt;kolyshkin@gmail.com&gt;
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
@@ -358,9 +358,9 @@ func (c *Container) start(process *Process) (retErr error) {
 }
 
 // Signal sends a specified signal to container's init, or, if all is true,
-// true, to all container's processes (as determined by container's cgroup).
+// to all container's processes (as determined by container's cgroup).
 //
-// Setting all=true is useful when s is SIGKILL and the container does not have
+// Note all=true is implied when s is SIGKILL and the container does not have
 // its own PID namespace. In this scenario, the libcontainer user may be required
 // to implement a proper child reaper.
 func (c *Container) Signal(s os.Signal, all bool) error {
@@ -382,22 +382,36 @@ func (c *Container) Signal(s os.Signal, all bool) error {
 		}
 		return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, sig))
 	}
-	// to avoid a PID reuse attack
-	if status == Running || status == Created || status == Paused {
-		if err := c.initProcess.signal(s); err != nil {
-			return fmt.Errorf("unable to signal init: %w", err)
-		}
-		if status == Paused {
-			// For cgroup v1, killing a process in a frozen cgroup
-			// does nothing until it's thawed. Only thaw the cgroup
-			// for SIGKILL.
-			if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
-				_ = c.cgroupManager.Freeze(configs.Thawed)
-			}
-		}
-		return nil
+
+	// To avoid a PID reuse attack, don't kill non-running container.
+	switch status {
+	case Running, Created, Paused:
+	default:
+		return ErrNotRunning
 	}
-	return ErrNotRunning
+
+	// When a container has its own PID namespace, inside it the init PID
+	// is 1, and thus it is handled specially by the kernel. In particular,
+	// killing init with SIGKILL from an ancestor namespace will also kill
+	// all other processes in that PID namespace (see pid_namespaces(7)).
+	//
+	// OTOH, if PID namespace is shared, we should kill all pids to avoid
+	// leftover processes.
+	if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
+		err = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
+	} else {
+		err = c.initProcess.signal(s)
+	}
+	if err != nil {
+		return fmt.Errorf("unable to signal init: %w", err)
+	}
+	if status == Paused && s == unix.SIGKILL {
+		// For cgroup v1, killing a process in a frozen cgroup
+		// does nothing until it's thawed. Only thaw the cgroup
+		// for SIGKILL.
+		_ = c.cgroupManager.Freeze(configs.Thawed)
+	}
+	return nil
 }
 
 func (c *Container) createExecFifo() error {
@@ -593,7 +607,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
 		container:       c,
 		process:         p,
 		bootstrapData:   data,
-		sharePidns:      !c.config.Namespaces.IsPrivate(configs.NEWPID),
 	}
 	c.initProcess = init
 	return init, nil
@@ -696,8 +709,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
 	return cfg
 }
 
-// Destroy destroys the container, if its in a valid state, after killing any
-// remaining running processes.
+// Destroy destroys the container, if its in a valid state.
 //
 // Any event registrations are removed before the container is destroyed.
 // No error is returned if the container is already destroyed.
diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go
@@ -1399,8 +1399,8 @@ func testPidnsInitKill(t *testing.T, config *configs.Config) {
 	err = container.Run(process2)
 	ok(t, err)
 
-	// Kill the init process and Wait for it.
-	err = process1.Signal(syscall.SIGKILL)
+	// Kill the container.
+	err = container.Signal(syscall.SIGKILL, false)
 	ok(t, err)
 	_, err = process1.Wait()
 	if err == nil {
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
@@ -304,7 +304,6 @@ type initProcess struct {
 	fds             []string
 	process         *Process
 	bootstrapData   io.Reader
-	sharePidns      bool
 }
 
 func (p *initProcess) pid() int {
@@ -616,10 +615,6 @@ func (p *initProcess) start() (retErr error) {
 
 func (p *initProcess) wait() (*os.ProcessState, error) {
 	err := p.cmd.Wait()
-	// we should kill all processes in cgroup when init is died if we use host PID namespace
-	if p.sharePidns {
-		_ = signalAllProcesses(p.manager, unix.SIGKILL)
-	}
 	return p.cmd.ProcessState, err
 }
 
diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go
@@ -7,7 +7,6 @@ import (
 
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -36,12 +35,6 @@ type containerState interface {
 }
 
 func destroy(c *Container) error {
-	if !c.config.Namespaces.Contains(configs.NEWPID) ||
-		c.config.Namespaces.PathOf(configs.NEWPID) != "" {
-		if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
-			logrus.Warn(err)
-		}
-	}
 	err := c.cgroupManager.Destroy()
 	if c.intelRdtManager != nil {
 		if ierr := c.intelRdtManager.Destroy(); err == nil {
diff --git a/man/runc-kill.8.md b/man/runc-kill.8.md
@@ -18,8 +18,8 @@ to list available signals.
 # OPTIONS
 **--all**|**-a**
 : Send the signal to all processes inside the container, rather than
-the container's init only. This option is useful when the container does not
-have its own PID namespace.
+the container's init only. This option is implied when the _signal_ is **KILL**
+and the container does not have its own PID namespace.
 
 : When this option is set, no error is returned if the container is stopped
 or does not exist.