Skip to content

Commit 189749a

Browse files
authored
Merge pull request #4492 from cyphar/nsenter-flexible-joining
nsenter: implement a two-stage join for setns
2 parents 396a975 + fffc165 commit 189749a

File tree

3 files changed

+234
-57
lines changed

3 files changed

+234
-57
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ install-man: man
207207
.PHONY: cfmt
208208
cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
209209
cfmt:
210-
indent -linux -l120 -il0 -ppi2 -cp1 -T size_t -T jmp_buf $(C_SRC)
210+
indent -linux -l120 -il0 -ppi2 -cp1 -sar -T size_t -T jmp_buf $(C_SRC)
211211

212212
.PHONY: shellcheck
213213
shellcheck:

libcontainer/nsenter/nsexec.c

+170-55
Original file line numberDiff line numberDiff line change
@@ -322,30 +322,6 @@ static int clone_parent(jmp_buf *env, int jmpval)
322322
return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
323323
}
324324

325-
/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
326-
static int nsflag(char *name)
327-
{
328-
if (!strcmp(name, "cgroup"))
329-
return CLONE_NEWCGROUP;
330-
else if (!strcmp(name, "ipc"))
331-
return CLONE_NEWIPC;
332-
else if (!strcmp(name, "mnt"))
333-
return CLONE_NEWNS;
334-
else if (!strcmp(name, "net"))
335-
return CLONE_NEWNET;
336-
else if (!strcmp(name, "pid"))
337-
return CLONE_NEWPID;
338-
else if (!strcmp(name, "user"))
339-
return CLONE_NEWUSER;
340-
else if (!strcmp(name, "uts"))
341-
return CLONE_NEWUTS;
342-
else if (!strcmp(name, "time"))
343-
return CLONE_NEWTIME;
344-
345-
/* If we don't recognise a name, fallback to 0. */
346-
return 0;
347-
}
348-
349325
static uint32_t readint32(char *buf)
350326
{
351327
return *(uint32_t *) buf;
@@ -444,35 +420,67 @@ void nl_free(struct nlconfig_t *config)
444420
free(config->data);
445421
}
446422

447-
void join_namespaces(char *nslist)
448-
{
449-
int num = 0, i;
450-
char *saveptr = NULL;
451-
char *namespace = strtok_r(nslist, ",", &saveptr);
452-
struct namespace_t {
453-
int fd;
454-
char type[PATH_MAX];
455-
char path[PATH_MAX];
456-
} *namespaces = NULL;
423+
struct namespace_t {
424+
int fd;
425+
char type[PATH_MAX];
426+
char path[PATH_MAX];
427+
};
457428

458-
if (!namespace || !strlen(namespace) || !strlen(nslist))
459-
bail("ns paths are empty");
429+
typedef int nsset_t;
430+
431+
static struct nstype_t {
432+
int type;
433+
char *name;
434+
} all_ns_types[] = {
435+
{ CLONE_NEWCGROUP, "cgroup" },
436+
{ CLONE_NEWIPC, "ipc" },
437+
{ CLONE_NEWNS, "mnt" },
438+
{ CLONE_NEWNET, "net" },
439+
{ CLONE_NEWPID, "pid" },
440+
{ CLONE_NEWTIME, "time" },
441+
{ CLONE_NEWUSER, "user" },
442+
{ CLONE_NEWUTS, "uts" },
443+
{ }, /* null terminator */
444+
};
460445

446+
/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
447+
static int nstype(char *name)
448+
{
449+
for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++)
450+
if (!strcmp(name, ns->name))
451+
return ns->type;
461452
/*
462-
* We have to open the file descriptors first, since after
463-
* we join the mnt namespace we might no longer be able to
464-
* access the paths.
453+
* setns(2) lets us join namespaces without knowing the type, but
454+
* namespaces usually require special handling of some kind (so joining
455+
* a namespace without knowing its type or joining a new namespace type
456+
* without corresponding handling could result in broken behaviour) and
457+
* the rest of runc doesn't allow unknown namespace types anyway.
465458
*/
459+
bail("unknown namespace type %s", name);
460+
}
461+
462+
static nsset_t __open_namespaces(char *nsspec, struct namespace_t **ns_list, size_t *ns_len)
463+
{
464+
int len = 0;
465+
nsset_t ns_to_join = 0;
466+
char *namespace, *saveptr = NULL;
467+
struct namespace_t *namespaces = NULL;
468+
469+
namespace = strtok_r(nsspec, ",", &saveptr);
470+
471+
if (!namespace || !strlen(namespace) || !strlen(nsspec))
472+
bail("ns paths are empty");
473+
466474
do {
467475
int fd;
468476
char *path;
469477
struct namespace_t *ns;
470478

471479
/* Resize the namespace array. */
472-
namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
480+
namespaces = realloc(namespaces, ++len * sizeof(struct namespace_t));
473481
if (!namespaces)
474482
bail("failed to reallocate namespace array");
475-
ns = &namespaces[num - 1];
483+
ns = &namespaces[len - 1];
476484

477485
/* Split 'ns:path'. */
478486
path = strstr(namespace, ":");
@@ -488,38 +496,145 @@ void join_namespaces(char *nslist)
488496
strncpy(ns->type, namespace, PATH_MAX - 1);
489497
strncpy(ns->path, path, PATH_MAX - 1);
490498
ns->path[PATH_MAX - 1] = '\0';
491-
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
492499

493-
/*
494-
* The ordering in which we join namespaces is important. We should
495-
* always join the user namespace *first*. This is all guaranteed
496-
* from the container_linux.go side of this, so we're just going to
497-
* follow the order given to us.
498-
*/
500+
ns_to_join |= nstype(ns->type);
501+
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
499502

500-
for (i = 0; i < num; i++) {
501-
struct namespace_t *ns = &namespaces[i];
502-
int flag = nsflag(ns->type);
503+
*ns_list = namespaces;
504+
*ns_len = len;
505+
return ns_to_join;
506+
}
503507

504-
write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
505-
if (setns(ns->fd, flag) < 0)
508+
/*
509+
* Try to join all namespaces that are in the "allow" nsset, and return the
510+
* set we were able to successfully join. If a permission error is returned
511+
* from setns(2), the namespace is skipped (non-permission errors are fatal).
512+
*/
513+
static nsset_t __join_namespaces(nsset_t allow, struct namespace_t *ns_list, size_t ns_len)
514+
{
515+
nsset_t joined = 0;
516+
517+
for (size_t i = 0; i < ns_len; i++) {
518+
struct namespace_t *ns = &ns_list[i];
519+
int type = nstype(ns->type);
520+
int err, saved_errno;
521+
522+
if (!(type & allow))
523+
continue;
524+
525+
err = setns(ns->fd, type);
526+
saved_errno = errno;
527+
write_log(DEBUG, "setns(%#x) into %s namespace (with path %s): %s",
528+
type, ns->type, ns->path, strerror(errno));
529+
if (err < 0) {
530+
/* Skip permission errors. */
531+
if (saved_errno == EPERM)
532+
continue;
506533
bail("failed to setns into %s namespace", ns->type);
534+
}
535+
joined |= type;
507536

508537
/*
509538
* If we change user namespaces, make sure we switch to root in the
510539
* namespace (this matches the logic for unshare(CLONE_NEWUSER)), lots
511540
* of things can break if we aren't the right user. See
512541
* <https://github.com/opencontainers/runc/issues/4466> for one example.
513542
*/
514-
if (flag == CLONE_NEWUSER) {
543+
if (type == CLONE_NEWUSER) {
515544
if (setresuid(0, 0, 0) < 0)
516545
bail("failed to become root in user namespace");
517546
}
518547

519548
close(ns->fd);
549+
ns->fd = -1;
550+
}
551+
return joined;
552+
}
553+
554+
/*
 * Append src to the heap-allocated string dst, growing the buffer as needed.
 *
 * dst must be NULL or a pointer obtained from malloc(3)/realloc(3)/strdup(3);
 * ownership of dst passes to this function. A NULL dst yields a fresh copy of
 * src, and a NULL src is treated as the empty string.
 *
 * Returns the (possibly relocated) string, which the caller must free(3).
 * On allocation failure the old buffer is freed and NULL is returned, so the
 * usual "str = strappend(str, ...)" pattern neither leaks nor dereferences
 * a NULL pointer.
 */
static char *strappend(char *dst, char *src)
{
	if (!src)
		src = "";

	size_t dst_len = dst ? strlen(dst) : 0;
	size_t src_len = strlen(src);

	/* realloc(NULL, n) behaves like malloc(n), covering the !dst case. */
	char *grown = realloc(dst, dst_len + src_len + 1);
	if (!grown) {
		free(dst);
		return NULL;
	}

	/* Copy src including its NUL terminator right after the old contents. */
	memcpy(grown + dst_len, src, src_len + 1);
	return grown;
}
564+
565+
/*
 * Build a human-readable, ", "-separated list of the namespace names whose
 * type bits are set in nsset, in the order they appear in all_ns_types.
 * An empty set produces an empty string. The result is heap-allocated.
 */
static char *nsset_to_str(nsset_t nsset)
{
	char *out = NULL;
	struct nstype_t *entry = all_ns_types;

	/* The table is terminated by an entry with a NULL name. */
	while (entry->name != NULL) {
		if (nsset & entry->type) {
			/* Only emit a separator once something has been written. */
			if (out)
				out = strappend(out, ", ");
			out = strappend(out, entry->name);
		}
		entry++;
	}

	if (out)
		return out;
	return strdup("");
}
577+
578+
static void __close_namespaces(nsset_t to_join, nsset_t joined, struct namespace_t *ns_list, size_t ns_len)
579+
{
580+
/* We expect to have joined every namespace. */
581+
nsset_t failed_to_join = to_join & ~joined;
582+
583+
/* Double-check that we used up (and thus joined) all of the nsfds. */
584+
for (size_t i = 0; i < ns_len; i++) {
585+
struct namespace_t *ns = &ns_list[i];
586+
int type = nstype(ns->type);
587+
588+
if (ns->fd < 0)
589+
continue;
590+
591+
failed_to_join |= type;
592+
write_log(FATAL, "failed to setns(%#x) into %s namespace (with path %s): %s",
593+
type, ns->type, ns->path, strerror(EPERM));
594+
close(ns->fd);
595+
ns->fd = -1;
520596
}
521597

522-
free(namespaces);
598+
/* Make sure we joined the namespaces we planned to. */
599+
if (failed_to_join)
600+
bail("failed to join {%s} namespaces: %s", nsset_to_str(failed_to_join), strerror(EPERM));
601+
602+
free(ns_list);
603+
}
604+
605+
void join_namespaces(char *nsspec)
606+
{
607+
nsset_t to_join = 0, joined = 0;
608+
struct namespace_t *ns_list;
609+
size_t ns_len;
610+
611+
/*
612+
* We have to open the file descriptors first, since after we join the
613+
* mnt or user namespaces we might no longer be able to access the
614+
* paths.
615+
*/
616+
to_join = __open_namespaces(nsspec, &ns_list, &ns_len);
617+
618+
/*
619+
* We first try to join all non-userns namespaces to join any namespaces
620+
* that we might not be able to join once we switch credentials to the
621+
* container's userns. We then join the user namespace, and then try to
622+
* join any remaining namespaces (this last step is needed for rootless
623+
* containers -- we don't get setns(2) permissions until we join the userns
624+
* and get CAP_SYS_ADMIN).
625+
*
626+
* Splitting the joins this way is necessary for containers that are
627+
* configured to join some externally-created namespace but are also
628+
* configured to join an unrelated user namespace.
629+
*
630+
* This is similar to what nsenter(1) seems to do in practice.
631+
*/
632+
joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len);
633+
joined |= __join_namespaces(CLONE_NEWUSER, ns_list, ns_len);
634+
joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len);
635+
636+
/* Verify that we joined all of the namespaces. */
637+
__close_namespaces(to_join, joined, ns_list, ns_len);
523638
}
524639

525640
static inline int sane_kill(pid_t pid, int signum)

tests/integration/userns.bats

+63-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ function teardown() {
2929
if [ -v to_umount_list ]; then
3030
while read -r mount_path; do
3131
umount -l "$mount_path" || :
32-
rm -f "$mount_path"
32+
rm -rf "$mount_path"
3333
done <"$to_umount_list"
3434
rm -f "$to_umount_list"
3535
unset to_umount_list
@@ -184,3 +184,65 @@ function teardown() {
184184
grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output"
185185
fi
186186
}
187+
188+
# <https://github.com/opencontainers/runc/issues/4390>
189+
@test "userns join external namespaces [wrong userns owner]" {
190+
requires root
191+
192+
# Create an external user namespace for us to join. It seems on some
193+
# operating systems (AlmaLinux in particular) "unshare -U" will
194+
# automatically use an identity mapping (which breaks this test) so we need
195+
# to use runc to create the userns.
196+
update_config '.process.args = ["sleep", "infinity"]'
197+
runc run -d --console-socket "$CONSOLE_SOCKET" target_userns
198+
[ "$status" -eq 0 ]
199+
200+
# Bind-mount the first container's userns nsfd to a different path, to
201+
# exercise the non-fast-path (where runc has to join the userns to get the
202+
# mappings).
203+
userns_pid="$(__runc state target_userns | jq .pid)"
204+
userns_path="$(mktemp "$BATS_RUN_TMPDIR/userns.XXXXXX")"
205+
mount --bind "/proc/$userns_pid/ns/user" "$userns_path"
206+
echo "$userns_path" >>"$to_umount_list"
207+
208+
# Kill the container -- we have the userns bind-mounted.
209+
runc delete -f target_userns
210+
[ "$status" -eq 0 ]
211+
212+
# Configure our container to attach to the external userns.
213+
update_config '.linux.namespaces |= map(if .type == "user" then (.path = "'"$userns_path"'") else . end)
214+
| del(.linux.uidMappings)
215+
| del(.linux.gidMappings)'
216+
217+
# Also create a network namespace that *is not owned* by the above userns.
218+
# NOTE: Having no permissions in a namespace makes it necessary to modify
219+
# the config so that we don't get mount errors (for reference: no netns
220+
# permissions == no sysfs mounts, no pidns permissions == no procfs mounts,
221+
# no utsns permissions == no sethostname(2), no ipc permissions == no
222+
# mqueue mounts, etc).
223+
netns_path="$(mktemp "$BATS_RUN_TMPDIR/netns.XXXXXX")"
224+
unshare -i -- mount --bind "/proc/self/ns/net" "$netns_path"
225+
echo "$netns_path" >>"$to_umount_list"
226+
# Configure our container to attach to the external netns.
227+
update_config '.linux.namespaces |= map(if .type == "network" then (.path = "'"$netns_path"'") else . end)'
228+
229+
# Convert sysfs mounts to a bind-mount from the host, to avoid permission
230+
# issues due to the netns setup we have.
231+
update_config '.mounts |= map((select(.type == "sysfs") | { "source": "/sys", "destination": .destination, "type": "bind", "options": ["rbind"] }) // .)'
232+
233+
# Create a detached container to verify the namespaces are correct.
234+
update_config '.process.args = ["sleep", "infinity"]'
235+
runc --debug run -d --console-socket "$CONSOLE_SOCKET" ctr
236+
[ "$status" -eq 0 ]
237+
238+
userns_id="user:[$(stat -c "%i" "$userns_path")]"
239+
netns_id="net:[$(stat -c "%i" "$netns_path")]"
240+
241+
runc exec ctr readlink /proc/self/ns/user
242+
[ "$status" -eq 0 ]
243+
[[ "$output" == "$userns_id" ]]
244+
245+
runc exec ctr readlink /proc/self/ns/net
246+
[ "$status" -eq 0 ]
247+
[[ "$output" == "$netns_id" ]]
248+
}

0 commit comments

Comments
 (0)