Skip to content

Commit a68529c

Browse files
authored
Merge pull request #3967 from cyphar/remove-mount-fallback-flag
rootfs: remove --no-mount-fallback and finally fix MS_REMOUNT
2 parents 3272edf + 7c71a22 commit a68529c

File tree

10 files changed

+548
-132
lines changed

10 files changed

+548
-132
lines changed

contrib/completions/bash/runc

-3
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,6 @@ _runc_run() {
461461
--no-subreaper
462462
--no-pivot
463463
--no-new-keyring
464-
--no-mount-fallback
465464
"
466465

467466
local options_with_args="
@@ -568,7 +567,6 @@ _runc_create() {
568567
--help
569568
--no-pivot
570569
--no-new-keyring
571-
--no-mount-fallback
572570
"
573571

574572
local options_with_args="
@@ -629,7 +627,6 @@ _runc_restore() {
629627
--no-pivot
630628
--auto-dedup
631629
--lazy-pages
632-
--no-mount-fallback
633630
"
634631

635632
local options_with_args="

create.go

-4
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,6 @@ command(s) that get executed on start, edit the args parameter of the spec. See
5151
Name: "preserve-fds",
5252
Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
5353
},
54-
cli.BoolFlag{
55-
Name: "no-mount-fallback",
56-
Usage: "Do not fallback when the specific configuration is not applicable (e.g., do not try to remount a bind mount again after the first attempt failed on source filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set)",
57-
},
5854
},
5955
Action: func(context *cli.Context) error {
6056
if err := checkArgs(context, 1, exactArgs); err != nil {

libcontainer/configs/config.go

-4
Original file line numberDiff line numberDiff line change
@@ -214,10 +214,6 @@ type Config struct {
214214
// When RootlessCgroups is set, cgroups errors are ignored.
215215
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
216216

217-
// Do not try to remount a bind mount again after the first attempt failed on source
218-
// filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set
219-
NoMountFallback bool `json:"no_mount_fallback,omitempty"`
220-
221217
// TimeOffsets specifies the offset for supporting time namespaces.
222218
TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
223219

libcontainer/configs/mount_linux.go

+4
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ type Mount struct {
1515
// Mount flags.
1616
Flags int `json:"flags"`
1717

18+
// Mount flags that were explicitly cleared in the configuration (meaning
19+
// the user explicitly requested that these flags *not* be set).
20+
ClearedFlags int `json:"cleared_flags"`
21+
1822
// Propagation Flags
1923
PropagationFlags []int `json:"propagation_flags"`
2024

libcontainer/rootfs_linux.go

+136-38
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ type mountConfig struct {
3434
cgroup2Path string
3535
rootlessCgroups bool
3636
cgroupns bool
37-
noMountFallback bool
3837
}
3938

4039
// mountEntry contains mount data specific to a mount point.
@@ -83,7 +82,6 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (er
8382
cgroup2Path: iConfig.Cgroup2Path,
8483
rootlessCgroups: iConfig.RootlessCgroups,
8584
cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
86-
noMountFallback: config.NoMountFallback,
8785
}
8886
for i, m := range config.Mounts {
8987
entry := mountEntry{Mount: m}
@@ -409,6 +407,51 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) {
409407
})
410408
}
411409

410+
const (
411+
// The atime "enum" flags (which are mutually exclusive).
412+
mntAtimeEnumFlags = unix.MS_NOATIME | unix.MS_RELATIME | unix.MS_STRICTATIME
413+
// All atime-related flags.
414+
mntAtimeFlags = mntAtimeEnumFlags | unix.MS_NODIRATIME
415+
// Flags which can be locked when inheriting mounts in a different userns.
416+
// In the kernel, these are the mounts that are locked using MNT_LOCK_*.
417+
mntLockFlags = unix.MS_RDONLY | unix.MS_NODEV | unix.MS_NOEXEC |
418+
unix.MS_NOSUID | mntAtimeFlags
419+
)
420+
421+
func statfsToMountFlags(st unix.Statfs_t) int {
422+
// From <linux/statfs.h>.
423+
const ST_NOSYMFOLLOW = 0x2000 //nolint:revive
424+
425+
var flags int
426+
for _, f := range []struct {
427+
st, ms int
428+
}{
429+
// See calculate_f_flags() in fs/statfs.c.
430+
{unix.ST_RDONLY, unix.MS_RDONLY},
431+
{unix.ST_NOSUID, unix.MS_NOSUID},
432+
{unix.ST_NODEV, unix.MS_NODEV},
433+
{unix.ST_NOEXEC, unix.MS_NOEXEC},
434+
{unix.ST_MANDLOCK, unix.MS_MANDLOCK},
435+
{unix.ST_SYNCHRONOUS, unix.MS_SYNCHRONOUS},
436+
{unix.ST_NOATIME, unix.MS_NOATIME},
437+
{unix.ST_NODIRATIME, unix.MS_NODIRATIME},
438+
{unix.ST_RELATIME, unix.MS_RELATIME},
439+
{ST_NOSYMFOLLOW, unix.MS_NOSYMFOLLOW},
440+
// There is no ST_STRICTATIME -- see below.
441+
} {
442+
if int(st.Flags)&f.st == f.st {
443+
flags |= f.ms
444+
}
445+
}
446+
// MS_STRICTATIME is a "fake" MS_* flag. It isn't stored in mnt->mnt_flags,
447+
// and so it doesn't show up in statfs(2). If none of the other flags in
448+
// atime enum are present, the mount is MS_STRICTATIME.
449+
if flags&mntAtimeEnumFlags == 0 {
450+
flags |= unix.MS_STRICTATIME
451+
}
452+
return flags
453+
}
454+
412455
func mountToRootfs(c *mountConfig, m mountEntry) error {
413456
rootfs := c.root
414457

@@ -509,11 +552,97 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
509552
return err
510553
}
511554
}
512-
// bind mount won't change mount options, we need remount to make mount options effective.
513-
// first check that we have non-default options required before attempting a remount
514-
if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 {
515-
// only remount if unique mount options are set
516-
if err := remount(m, rootfs, c.noMountFallback); err != nil {
555+
556+
// The initial MS_BIND won't change the mount options, we need to do a
557+
// separate MS_BIND|MS_REMOUNT to apply the mount options. We skip
558+
// doing this if the user has not specified any mount flags at all
559+
// (including cleared flags) -- in which case we just keep the original
560+
// mount flags.
561+
//
562+
// Note that the fact we check whether any clearing flags are set is in
563+
// contrast to mount(8)'s current behaviour, but is what users probably
564+
// expect. See <https://github.com/util-linux/util-linux/issues/2433>.
565+
if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 {
566+
if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
567+
flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT
568+
// The runtime-spec says we SHOULD map to the relevant mount(8)
569+
// behaviour. However, it's not clear whether we want the
570+
// "mount --bind -o ..." or "mount --bind -o remount,..."
571+
// behaviour here -- both of which are somewhat broken[1].
572+
//
573+
// So, if the user has passed "remount" as a mount option, we
574+
// implement the "mount --bind -o remount" behaviour, otherwise
575+
// we implement the spiritual intent of the "mount --bind -o"
576+
// behaviour, which should match what users expect. Maybe
577+
// mount(8) will eventually implement this behaviour too..
578+
//
579+
// [1]: https://github.com/util-linux/util-linux/issues/2433
580+
581+
// Initially, we emulate "mount --bind -o ..." where we set
582+
// only the requested flags (clearing any existing flags). The
583+
// only difference from mount(8) is that we do this
584+
// unconditionally, regardless of whether any set-me mount
585+
// options have been requested.
586+
//
587+
// TODO: We are not doing any special handling of the atime
588+
// flags here, which means that the mount will inherit the old
589+
// atime flags if the user didn't explicitly request a
590+
// different set of flags. This also has the mount(8) bug where
591+
// "nodiratime,norelatime" will result in a
592+
// "nodiratime,relatime" mount.
593+
mountErr := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "")
594+
if mountErr == nil {
595+
return nil
596+
}
597+
598+
// If the mount failed, the mount may contain locked mount
599+
// flags. In that case, we emulate "mount --bind -o
600+
// remount,...", where we take the existing mount flags of the
601+
// mount and apply the request flags (including clearing flags)
602+
// on top. The main divergence we have from mount(8) here is
603+
// that we handle atimes correctly to make sure we error out if
604+
// we cannot fulfil the requested mount flags.
605+
606+
var st unix.Statfs_t
607+
if err := unix.Statfs(m.src(), &st); err != nil {
608+
return &os.PathError{Op: "statfs", Path: m.src(), Err: err}
609+
}
610+
srcFlags := statfsToMountFlags(st)
611+
// If the user explicitly request one of the locked flags *not*
612+
// be set, we need to return an error to avoid producing mounts
613+
// that don't match the user's request.
614+
if srcFlags&m.ClearedFlags&mntLockFlags != 0 {
615+
return mountErr
616+
}
617+
618+
// If an MS_*ATIME flag was requested, it must match the
619+
// existing one. This handles two separate kernel bugs, and
620+
// matches the logic of can_change_locked_flags() but without
621+
// these bugs:
622+
//
623+
// * (2.6.30+) Since commit 613cbe3d4870 ("Don't set relatime
624+
// when noatime is specified"), MS_RELATIME is ignored when
625+
// MS_NOATIME is set. This means that us inheriting MS_NOATIME
626+
// from a mount while requesting MS_RELATIME would *silently*
627+
// produce an MS_NOATIME mount.
628+
//
629+
// * (2.6.30+) Since its introduction in commit d0adde574b84
630+
// ("Add a strictatime mount option"), MS_STRICTATIME has
631+
// caused any passed MS_RELATIME and MS_NOATIME flags to be
632+
// ignored which results in us *silently* producing
633+
// MS_STRICTATIME mounts even if the user requested MS_RELATIME
634+
// or MS_NOATIME.
635+
if m.Flags&mntAtimeFlags != 0 && m.Flags&mntAtimeFlags != srcFlags&mntAtimeFlags {
636+
return mountErr
637+
}
638+
639+
// Retry the mount with the existing lockable mount flags
640+
// applied.
641+
flags |= srcFlags & mntLockFlags
642+
mountErr = mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "")
643+
logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr)
644+
return mountErr
645+
}); err != nil {
517646
return err
518647
}
519648
}
@@ -1103,37 +1232,6 @@ func writeSystemProperty(key, value string) error {
11031232
return os.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644)
11041233
}
11051234

1106-
func remount(m mountEntry, rootfs string, noMountFallback bool) error {
1107-
return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
1108-
flags := uintptr(m.Flags | unix.MS_REMOUNT)
1109-
err := mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "")
1110-
if err == nil {
1111-
return nil
1112-
}
1113-
// Check if the source has flags set according to noMountFallback
1114-
src := m.src()
1115-
var s unix.Statfs_t
1116-
if err := unix.Statfs(src, &s); err != nil {
1117-
return &os.PathError{Op: "statfs", Path: src, Err: err}
1118-
}
1119-
var checkflags int
1120-
if noMountFallback {
1121-
// Check for ro only
1122-
checkflags = unix.MS_RDONLY
1123-
} else {
1124-
// Check for ro, nodev, noexec, nosuid, noatime, relatime, strictatime,
1125-
// nodiratime
1126-
checkflags = unix.MS_RDONLY | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NOATIME | unix.MS_RELATIME | unix.MS_STRICTATIME | unix.MS_NODIRATIME
1127-
}
1128-
if int(s.Flags)&checkflags == 0 {
1129-
return err
1130-
}
1131-
// ... and retry the mount with flags found above.
1132-
flags |= uintptr(int(s.Flags) & checkflags)
1133-
return mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "")
1134-
})
1135-
}
1136-
11371235
// Do the mount operation followed by additional mounts required to take care
11381236
// of propagation flags. This will always be scoped inside the container rootfs.
11391237
func mountPropagate(m mountEntry, rootfs string, mountLabel string) error {

libcontainer/specconv/spec_linux.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,6 @@ type CreateOpts struct {
313313
Spec *specs.Spec
314314
RootlessEUID bool
315315
RootlessCgroups bool
316-
NoMountFallback bool
317316
}
318317

319318
// getwd is a wrapper similar to os.Getwd, except it always gets
@@ -359,7 +358,6 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
359358
NoNewKeyring: opts.NoNewKeyring,
360359
RootlessEUID: opts.RootlessEUID,
361360
RootlessCgroups: opts.RootlessCgroups,
362-
NoMountFallback: opts.NoMountFallback,
363361
}
364362

365363
for _, m := range spec.Mounts {
@@ -977,10 +975,15 @@ func parseMountOptions(options []string) *configs.Mount {
977975
// or the flag is not supported on the platform,
978976
// then it is a data value for a specific fs type.
979977
if f, exists := mountFlags[o]; exists && f.flag != 0 {
978+
// FIXME: The *atime flags are special (they are more of an enum
979+
// with quite hairy semantics) and thus arguably setting some of
980+
// them should clear unrelated flags.
980981
if f.clear {
981982
m.Flags &= ^f.flag
983+
m.ClearedFlags |= f.flag
982984
} else {
983985
m.Flags |= f.flag
986+
m.ClearedFlags &= ^f.flag
984987
}
985988
} else if f, exists := mountPropagationMapping[o]; exists && f != 0 {
986989
m.PropagationFlags = append(m.PropagationFlags, f)

restore.go

-4
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,6 @@ using the runc checkpoint command.`,
9898
Value: "",
9999
Usage: "Specify an LSM mount context to be used during restore.",
100100
},
101-
cli.BoolFlag{
102-
Name: "no-mount-fallback",
103-
Usage: "Do not fallback when the specific configuration is not applicable (e.g., do not try to remount a bind mount again after the first attempt failed on source filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set)",
104-
},
105101
},
106102
Action: func(context *cli.Context) error {
107103
if err := checkArgs(context, 1, exactArgs); err != nil {

run.go

-4
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,6 @@ command(s) that get executed on start, edit the args parameter of the spec. See
6464
Name: "preserve-fds",
6565
Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
6666
},
67-
cli.BoolFlag{
68-
Name: "no-mount-fallback",
69-
Usage: "Do not fallback when the specific configuration is not applicable (e.g., do not try to remount a bind mount again after the first attempt failed on source filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set)",
70-
},
7167
},
7268
Action: func(context *cli.Context) error {
7369
if err := checkArgs(context, 1, exactArgs); err != nil {

0 commit comments

Comments
 (0)