Skip to content

Commit ca8ca3c

Browse files
authored
Merge pull request #4448 from cyphar/cloned-binary-overlayfs
dmz: use overlayfs to write-protect /proc/self/exe if possible
2 parents 08faf15 + 515f09f commit ca8ca3c

File tree

5 files changed

+201
-1
lines changed

5 files changed

+201
-1
lines changed

libcontainer/dmz/cloned_binary_linux.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,23 @@ func IsCloned(exe *os.File) bool {
212212
// make sure the container process can never resolve the original runc binary.
213213
// For more details on why this is necessary, see CVE-2019-5736.
214214
func CloneSelfExe(tmpDir string) (*os.File, error) {
215+
// Try to create a temporary overlayfs to produce a readonly version of
216+
// /proc/self/exe that cannot be "unwrapped" by the container. In contrast
217+
// to CloneBinary, this technique does not require any extra memory usage
218+
// and does not have the (fairly noticeable) performance impact of copying
219+
// a large binary file into a memfd.
220+
//
221+
// Based on some basic performance testing, the overlayfs approach has
222+
// effectively no performance overhead (it is on par with both
223+
// MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
224+
// roughly 60% overhead during container startup.
225+
overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
226+
if err == nil {
227+
logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests
228+
return overlayFile, nil
229+
}
230+
logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
231+
215232
selfExe, err := os.Open("/proc/self/exe")
216233
if err != nil {
217234
return nil, fmt.Errorf("opening current binary: %w", err)

libcontainer/dmz/overlayfs_linux.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
package dmz
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"path/filepath"
7+
"runtime"
8+
"strings"
9+
10+
"golang.org/x/sys/unix"
11+
12+
"github.com/opencontainers/runc/libcontainer/utils"
13+
)
14+
15+
func fsopen(fsName string, flags int) (*os.File, error) {
16+
// Make sure we always set O_CLOEXEC.
17+
flags |= unix.FSOPEN_CLOEXEC
18+
fd, err := unix.Fsopen(fsName, flags)
19+
if err != nil {
20+
return nil, os.NewSyscallError("fsopen "+fsName, err)
21+
}
22+
return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
23+
}
24+
25+
func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
26+
// Make sure we always set O_CLOEXEC.
27+
flags |= unix.FSMOUNT_CLOEXEC
28+
fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
29+
if err != nil {
30+
return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
31+
}
32+
runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used
33+
return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
34+
}
35+
36+
// escapeOverlayLowerDir returns path in a form that can be embedded in an
// overlayfs "lowerdir" option, where ":" separates the individual layers.
// Every "\" becomes "\\" and every ":" becomes "\:". Both substitutions are
// applied to the original characters only, so the backslash inserted in front
// of a ":" is never escaped a second time.
func escapeOverlayLowerDir(path string) string {
	escaper := strings.NewReplacer(
		`\`, `\\`,
		`:`, `\:`,
	)
	return escaper.Replace(path)
}
41+
42+
// sealedOverlayfs will create an internal overlayfs mount using fsopen() that
43+
// uses the directory containing the binary as a lowerdir and a temporary tmpfs
44+
// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY)
45+
// and so we can create a safe zero-copy sealed version of /proc/self/exe.
46+
// This only works for privileged users and on kernels with overlayfs and
47+
// fsopen() enabled.
48+
//
49+
// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so
50+
// it is technically possible to create an overlayfs even for rootless
51+
// containers. Unfortunately, this would require some ugly manual CGo+fork
52+
// magic so we can do this later if we feel it's really needed.
53+
func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) {
54+
// Try to do the superblock creation first to bail out early if we can't
55+
// use this method.
56+
overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC)
57+
if err != nil {
58+
return nil, err
59+
}
60+
defer overlayCtx.Close()
61+
62+
// binPath is going to be /proc/self/exe, so do a readlink to get the real
63+
// path. overlayfs needs the real underlying directory for this protection
64+
// mode to work properly.
65+
if realPath, err := os.Readlink(binPath); err == nil {
66+
binPath = realPath
67+
}
68+
binLowerDirPath, binName := filepath.Split(binPath)
69+
// Escape any ":"s or "\"s in the path.
70+
binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath)
71+
72+
// Overlayfs requires two lowerdirs in order to run in "lower-only" mode,
73+
// where writes are completely blocked. Ideally we would create a dummy
74+
// tmpfs for this, but it turns out that overlayfs doesn't allow for
75+
// anonymous mountns paths.
76+
// NOTE: I'm working on a patch to fix this but it won't be backported.
77+
dummyLowerDirPath := escapeOverlayLowerDir(tmpDir)
78+
79+
// Configure the lowerdirs. The binary lowerdir needs to be on the top to
80+
// ensure that a file called "runc" (binName) in the dummy lowerdir doesn't
81+
// mask the binary.
82+
lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath
83+
if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil {
84+
return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err)
85+
}
86+
87+
// Get an actual handle to the overlayfs.
88+
if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil {
89+
return nil, os.NewSyscallError("fsconfig create overlayfs", err)
90+
}
91+
overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID)
92+
if err != nil {
93+
return nil, err
94+
}
95+
defer overlayFd.Close()
96+
97+
// Grab a handle to the binary through overlayfs.
98+
exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
99+
if err != nil {
100+
return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err)
101+
}
102+
// NOTE: We would like to check that exeFile is the same as /proc/self/exe,
103+
// except this is a little difficult. Depending on what filesystems the
104+
// layers are on, overlayfs can remap the inode numbers (and it always
105+
// creates its own device numbers -- see ovl_map_dev_ino) so we can't do a
106+
// basic stat-based check. The only reasonable option would be to hash both
107+
// files and compare them, but this would require fully reading both files
108+
// which would produce a similar performance overhead to memfd cloning.
109+
//
110+
// Ultimately, there isn't a real attack to be worried about here. An
111+
// attacker would need to be able to modify files in /usr/sbin (or wherever
112+
// runc lives), at which point they could just replace the runc binary with
113+
// something malicious anyway.
114+
return exeFile, nil
115+
}

libcontainer/utils/utils_unix.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error {
346346
}
347347
return err
348348
}
349+
350+
// Openat is a Go-friendly openat(2) wrapper.
351+
func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
352+
dirFd := unix.AT_FDCWD
353+
if dir != nil {
354+
dirFd = int(dir.Fd())
355+
}
356+
flags |= unix.O_CLOEXEC
357+
358+
fd, err := unix.Openat(dirFd, path, flags, mode)
359+
if err != nil {
360+
return nil, &os.PathError{Op: "openat", Path: path, Err: err}
361+
}
362+
return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil
363+
}

tests/integration/helpers.bash

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,55 @@ function rootless_cgroup() {
368368
[[ "$ROOTLESS_FEATURES" == *"cgroup"* || -v RUNC_USE_SYSTEMD ]]
369369
}
370370

371+
# Succeeds (returns 0) when the current process is NOT in the initial user
# namespace, judged by the target of the /proc/self/ns/user magic-link.
function in_userns() {
	# The kernel guarantees that the initial userns always has the same inode
	# number (PROC_USER_INIT_INO), and therefore the same magic-link value.
	local init_userns="user:[$((0xEFFFFFFD))]"

	[[ "$(readlink /proc/self/ns/user)" != "$init_userns" ]]
}
376+
377+
# Check whether the current user can be expected to create a filesystem of
# the given type with fsopen(2).
#
# Usage: can_fsopen <fstype>
#
# Return values:
#   0 - fsopen(<fstype>) should work.
#   1 - the kernel is too old for fsopen(), the filesystem is not listed in
#       /proc/filesystems, or we are not root.
#   2 - we are root inside a user namespace and the kernel is too old to allow
#       mounting this filesystem from a userns.
# An unknown filesystem type inside a user namespace is reported via fail.
function can_fsopen() {
	# Keep this local so the helper does not leak a global "fstype" variable
	# into the tests that call it.
	local fstype="$1"

	# At the very least you need 5.1 for fsopen() and the filesystem needs to
	# be supported by the running kernel.
	if ! is_kernel_gte 5.1 || ! grep -qFw "$fstype" /proc/filesystems; then
		return 1
	fi

	# You need to be root to use fsopen.
	if [ "$EUID" -ne 0 ]; then
		return 1
	fi

	# If we're root in the initial userns, we're done.
	if ! in_userns; then
		return 0
	fi

	# If we are running in a userns, then the filesystem needs to support
	# FS_USERNS_MOUNT, which is a per-filesystem flag that depends on the
	# kernel version.
	case "$fstype" in
	overlay)
		# 459c7c565ac3 ("ovl: unprivieged mounts")
		is_kernel_gte 5.11 || return 2
		;;
	fuse)
		# 4ad769f3c346 ("fuse: Allow fully unprivileged mounts")
		is_kernel_gte 4.18 || return 2
		;;
	ramfs | tmpfs)
		# b3c6761d9b5c ("userns: Allow the userns root to mount ramfs.")
		# 2b8576cb09a7 ("userns: Allow the userns root to mount tmpfs.")
		is_kernel_gte 3.9 || return 2
		;;
	*)
		# If we don't know about the filesystem, return an error.
		fail "can_fsopen: unknown filesystem $fstype"
		;;
	esac
}
419+
371420
# Check if criu is available and working.
372421
function have_criu() {
373422
command -v criu &>/dev/null || return 1
@@ -396,7 +445,7 @@ function requires() {
396445
fi
397446
;;
398447
root)
399-
if [ $EUID -ne 0 ]; then
448+
if [ $EUID -ne 0 ] || in_userns; then
400449
skip_me=1
401450
fi
402451
;;

tests/integration/run.bats

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,10 @@ function teardown() {
159159
[ "$status" -eq 0 ]
160160
[[ "$output" = *"Hello World"* ]]
161161
[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
162+
# runc will use fsopen("overlay") if it can.
163+
if can_fsopen overlay; then
164+
[[ "$output" = *"runc-dmz: using overlayfs for sealed /proc/self/exe"* ]]
165+
fi
162166
}
163167

164168
@test "runc run [joining existing container namespaces]" {

0 commit comments

Comments
 (0)