Skip to content

Commit 9583b3d

Browse files
committed
libct: move killing logic to container.Signal
By default, the container has its own PID namespace, and killing (with SIGKILL) its init process from the parent PID namespace also kills all the other processes. Obviously, it does not work that way when the container is sharing its PID namespace with the host or another container, since init is no longer special (it's not PID 1). In this case, killing container's init will result in a bunch of other processes left running (and thus the inability to remove the cgroup). The solution to the above problem is killing all the container processes, not just init. The problem with the current implementation is, the killing logic is implemented in libcontainer's initProcess.wait, and thus only available to libcontainer users, but not the runc kill command (which uses nonChildProcess.kill and does not use wait at all). So, some workarounds exist: - func destroy(c *Container) calls signalAllProcesses; - runc kill implements -a flag. This code became very tangled over time. Let's simplify things by moving the killing all processes from initProcess.wait to container.Signal, and documents the new behavior. In essence, this also makes `runc kill` to automatically kill all container processes when the container does not have its own PID namespace. Document that as well. Signed-off-by: Kir Kolyshkin <[email protected]>
1 parent 2a7dcbb commit 9583b3d

File tree

5 files changed

+36
-36
lines changed

5 files changed

+36
-36
lines changed

libcontainer/container_linux.go

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -358,9 +358,9 @@ func (c *Container) start(process *Process) (retErr error) {
358358
}
359359

360360
// Signal sends a specified signal to container's init, or, if all is true,
361-
// true, to all container's processes (as determined by container's cgroup).
361+
// to all container's processes (as determined by container's cgroup).
362362
//
363-
// Setting all=true is useful when s is SIGKILL and the container does not have
363+
// Note all=true is implied when s is SIGKILL and the container does not have
364364
// its own PID namespace. In this scenario, the libcontainer user may be required
365365
// to implement a proper child reaper.
366366
func (c *Container) Signal(s os.Signal, all bool) error {
@@ -382,22 +382,36 @@ func (c *Container) Signal(s os.Signal, all bool) error {
382382
}
383383
return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, sig))
384384
}
385-
// to avoid a PID reuse attack
386-
if status == Running || status == Created || status == Paused {
387-
if err := c.initProcess.signal(s); err != nil {
388-
return fmt.Errorf("unable to signal init: %w", err)
389-
}
390-
if status == Paused {
391-
// For cgroup v1, killing a process in a frozen cgroup
392-
// does nothing until it's thawed. Only thaw the cgroup
393-
// for SIGKILL.
394-
if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
395-
_ = c.cgroupManager.Freeze(configs.Thawed)
396-
}
397-
}
398-
return nil
385+
386+
// To avoid a PID reuse attack, don't kill non-running container.
387+
switch status {
388+
case Running, Created, Paused:
389+
default:
390+
return ErrNotRunning
399391
}
400-
return ErrNotRunning
392+
393+
// When a container has its own PID namespace, inside it the init PID
394+
// is 1, and thus it is handled specially by the kernel. In particular,
395+
// killing init with SIGKILL from an ancestor namespace will also kill
396+
// all other processes in that PID namespace (see pid_namespaces(7)).
397+
//
398+
// OTOH, if PID namespace is shared, we should kill all pids to avoid
399+
// leftover processes.
400+
if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
401+
err = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
402+
} else {
403+
err = c.initProcess.signal(s)
404+
}
405+
if err != nil {
406+
return fmt.Errorf("unable to signal init: %w", err)
407+
}
408+
if status == Paused && s == unix.SIGKILL {
409+
// For cgroup v1, killing a process in a frozen cgroup
410+
// does nothing until it's thawed. Only thaw the cgroup
411+
// for SIGKILL.
412+
_ = c.cgroupManager.Freeze(configs.Thawed)
413+
}
414+
return nil
401415
}
402416

403417
func (c *Container) createExecFifo() error {
@@ -593,7 +607,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
593607
container: c,
594608
process: p,
595609
bootstrapData: data,
596-
sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID),
597610
}
598611
c.initProcess = init
599612
return init, nil
@@ -696,8 +709,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig {
696709
return cfg
697710
}
698711

699-
// Destroy destroys the container, if its in a valid state, after killing any
700-
// remaining running processes.
712+
// Destroy destroys the container, if its in a valid state.
701713
//
702714
// Any event registrations are removed before the container is destroyed.
703715
// No error is returned if the container is already destroyed.

libcontainer/integration/exec_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1399,8 +1399,8 @@ func testPidnsInitKill(t *testing.T, config *configs.Config) {
13991399
err = container.Run(process2)
14001400
ok(t, err)
14011401

1402-
// Kill the init process and Wait for it.
1403-
err = process1.Signal(syscall.SIGKILL)
1402+
// Kill the container.
1403+
err = container.Signal(syscall.SIGKILL, false)
14041404
ok(t, err)
14051405
_, err = process1.Wait()
14061406
if err == nil {

libcontainer/process_linux.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,6 @@ type initProcess struct {
304304
fds []string
305305
process *Process
306306
bootstrapData io.Reader
307-
sharePidns bool
308307
}
309308

310309
func (p *initProcess) pid() int {
@@ -616,10 +615,6 @@ func (p *initProcess) start() (retErr error) {
616615

617616
func (p *initProcess) wait() (*os.ProcessState, error) {
618617
err := p.cmd.Wait()
619-
// we should kill all processes in cgroup when init is died if we use host PID namespace
620-
if p.sharePidns {
621-
_ = signalAllProcesses(p.manager, unix.SIGKILL)
622-
}
623618
return p.cmd.ProcessState, err
624619
}
625620

libcontainer/state_linux.go

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77

88
"github.com/opencontainers/runc/libcontainer/configs"
99
"github.com/opencontainers/runtime-spec/specs-go"
10-
"github.com/sirupsen/logrus"
1110
"golang.org/x/sys/unix"
1211
)
1312

@@ -36,12 +35,6 @@ type containerState interface {
3635
}
3736

3837
func destroy(c *Container) error {
39-
if !c.config.Namespaces.Contains(configs.NEWPID) ||
40-
c.config.Namespaces.PathOf(configs.NEWPID) != "" {
41-
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
42-
logrus.Warn(err)
43-
}
44-
}
4538
err := c.cgroupManager.Destroy()
4639
if c.intelRdtManager != nil {
4740
if ierr := c.intelRdtManager.Destroy(); err == nil {

man/runc-kill.8.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ to list available signals.
1818
# OPTIONS
1919
**--all**|**-a**
2020
: Send the signal to all processes inside the container, rather than
21-
the container's init only. This option is useful when the container does not
22-
have its own PID namespace.
21+
the container's init only. This option is implied when the _signal_ is **KILL**
22+
and the container does not have its own PID namespace.
2323

2424
: When this option is set, no error is returned if the container is stopped
2525
or does not exist.

0 commit comments

Comments
 (0)