Skip to content

nsenter: implement a two-stage join for setns #4492

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ install-man: man
.PHONY: cfmt
cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
cfmt:
indent -linux -l120 -il0 -ppi2 -cp1 -T size_t -T jmp_buf $(C_SRC)
indent -linux -l120 -il0 -ppi2 -cp1 -sar -T size_t -T jmp_buf $(C_SRC)

.PHONY: shellcheck
shellcheck:
Expand Down
225 changes: 170 additions & 55 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -322,30 +322,6 @@ static int clone_parent(jmp_buf *env, int jmpval)
return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
}

/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
static int nsflag(char *name)
{
if (!strcmp(name, "cgroup"))
return CLONE_NEWCGROUP;
else if (!strcmp(name, "ipc"))
return CLONE_NEWIPC;
else if (!strcmp(name, "mnt"))
return CLONE_NEWNS;
else if (!strcmp(name, "net"))
return CLONE_NEWNET;
else if (!strcmp(name, "pid"))
return CLONE_NEWPID;
else if (!strcmp(name, "user"))
return CLONE_NEWUSER;
else if (!strcmp(name, "uts"))
return CLONE_NEWUTS;
else if (!strcmp(name, "time"))
return CLONE_NEWTIME;

/* If we don't recognise a name, fallback to 0. */
return 0;
}

static uint32_t readint32(char *buf)
{
return *(uint32_t *) buf;
Expand Down Expand Up @@ -444,35 +420,67 @@ void nl_free(struct nlconfig_t *config)
free(config->data);
}

void join_namespaces(char *nslist)
{
int num = 0, i;
char *saveptr = NULL;
char *namespace = strtok_r(nslist, ",", &saveptr);
struct namespace_t {
int fd;
char type[PATH_MAX];
char path[PATH_MAX];
} *namespaces = NULL;
struct namespace_t {
int fd;
char type[PATH_MAX];
char path[PATH_MAX];
};

if (!namespace || !strlen(namespace) || !strlen(nslist))
bail("ns paths are empty");
typedef int nsset_t;

static struct nstype_t {
int type;
char *name;
} all_ns_types[] = {
{ CLONE_NEWCGROUP, "cgroup" },
{ CLONE_NEWIPC, "ipc" },
{ CLONE_NEWNS, "mnt" },
{ CLONE_NEWNET, "net" },
{ CLONE_NEWPID, "pid" },
{ CLONE_NEWTIME, "time" },
{ CLONE_NEWUSER, "user" },
{ CLONE_NEWUTS, "uts" },
{ }, /* null terminator */
};

/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
static int nstype(char *name)
{
for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++)
if (!strcmp(name, ns->name))
return ns->type;
/*
* We have to open the file descriptors first, since after
* we join the mnt namespace we might no longer be able to
* access the paths.
* setns(2) lets us join namespaces without knowing the type, but
* namespaces usually require special handling of some kind (so joining
* a namespace without knowing its type or joining a new namespace type
* without corresponding handling could result in broken behaviour) and
* the rest of runc doesn't allow unknown namespace types anyway.
*/
bail("unknown namespace type %s", name);
}

static nsset_t __open_namespaces(char *nsspec, struct namespace_t **ns_list, size_t *ns_len)
{
int len = 0;
nsset_t ns_to_join = 0;
char *namespace, *saveptr = NULL;
struct namespace_t *namespaces = NULL;

namespace = strtok_r(nsspec, ",", &saveptr);

if (!namespace || !strlen(namespace) || !strlen(nsspec))
bail("ns paths are empty");

do {
int fd;
char *path;
struct namespace_t *ns;

/* Resize the namespace array. */
namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
namespaces = realloc(namespaces, ++len * sizeof(struct namespace_t));
if (!namespaces)
bail("failed to reallocate namespace array");
ns = &namespaces[num - 1];
ns = &namespaces[len - 1];

/* Split 'ns:path'. */
path = strstr(namespace, ":");
Expand All @@ -488,38 +496,145 @@ void join_namespaces(char *nslist)
strncpy(ns->type, namespace, PATH_MAX - 1);
strncpy(ns->path, path, PATH_MAX - 1);
ns->path[PATH_MAX - 1] = '\0';
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);

/*
* The ordering in which we join namespaces is important. We should
* always join the user namespace *first*. This is all guaranteed
* from the container_linux.go side of this, so we're just going to
* follow the order given to us.
*/
ns_to_join |= nstype(ns->type);
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);

for (i = 0; i < num; i++) {
struct namespace_t *ns = &namespaces[i];
int flag = nsflag(ns->type);
*ns_list = namespaces;
*ns_len = len;
return ns_to_join;
}

write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
if (setns(ns->fd, flag) < 0)
/*
* Try to join all namespaces that are in the "allow" nsset, and return the
* set we were able to successfully join. If a permission error is returned
* from nsset(2), the namespace is skipped (non-permission errors are fatal).
*/
static nsset_t __join_namespaces(nsset_t allow, struct namespace_t *ns_list, size_t ns_len)
{
nsset_t joined = 0;

for (size_t i = 0; i < ns_len; i++) {
struct namespace_t *ns = &ns_list[i];
int type = nstype(ns->type);
int err, saved_errno;

if (!(type & allow))
continue;

err = setns(ns->fd, type);
saved_errno = errno;
write_log(DEBUG, "setns(%#x) into %s namespace (with path %s): %s",
type, ns->type, ns->path, strerror(errno));
if (err < 0) {
/* Skip permission errors. */
if (saved_errno == EPERM)
continue;
bail("failed to setns into %s namespace", ns->type);
}
joined |= type;

/*
* If we change user namespaces, make sure we switch to root in the
* namespace (this matches the logic for unshare(CLONE_NEWUSER)), lots
* of things can break if we aren't the right user. See
* <https://github.com/opencontainers/runc/issues/4466> for one example.
*/
if (flag == CLONE_NEWUSER) {
if (type == CLONE_NEWUSER) {
if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace");
}

close(ns->fd);
ns->fd = -1;
}
return joined;
}

static char *strappend(char *dst, char *src)
{
if (!dst)
return strdup(src);

size_t len = strlen(dst) + strlen(src) + 1;
dst = realloc(dst, len);
strncat(dst, src, len);
return dst;
}

static char *nsset_to_str(nsset_t nsset)
{
char *str = NULL;
for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++) {
if (ns->type & nsset) {
if (str)
str = strappend(str, ", ");
str = strappend(str, ns->name);
}
}
return str ? : strdup("");
}

static void __close_namespaces(nsset_t to_join, nsset_t joined, struct namespace_t *ns_list, size_t ns_len)
{
/* We expect to have joined every namespace. */
nsset_t failed_to_join = to_join & ~joined;

/* Double-check that we used up (and thus joined) all of the nsfds. */
for (size_t i = 0; i < ns_len; i++) {
struct namespace_t *ns = &ns_list[i];
int type = nstype(ns->type);

if (ns->fd < 0)
continue;

failed_to_join |= type;
write_log(FATAL, "failed to setns(%#x) into %s namespace (with path %s): %s",
type, ns->type, ns->path, strerror(EPERM));
close(ns->fd);
ns->fd = -1;
}

free(namespaces);
/* Make sure we joined the namespaces we planned to. */
if (failed_to_join)
bail("failed to join {%s} namespaces: %s", nsset_to_str(failed_to_join), strerror(EPERM));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: bail appends : %m to the message printed so maybe instead of : %s", ... strerror(EPERM) you can just assign errno = EPERM before calling it.

Otherwise we'll end up with something like ... namespaces: Operation not permitted: Success which is somewhat confusing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like:

DEBU[0000]libcontainer/logs/logs.go:55 libcontainer/logs.processEntry() nsexec-1[11318]: skipping setns(0x10000000) into user namespace (with path /proc/1/ns/user) 
DEBU[0000]libcontainer/logs/logs.go:55 libcontainer/logs.processEntry() nsexec-1[11318]: setns(0x10000000) into user namespace (with path /proc/1/ns/user): Invalid argument 
FATA[0000]libcontainer/logs/logs.go:55 libcontainer/logs.processEntry() nsexec-1[11318]: failed to setns into user namespace: Invalid argument 
FATA[0000]libcontainer/logs/logs.go:55 libcontainer/logs.processEntry() nsexec-0[11317]: failed to sync with stage-1: next state: Success 
ERRO[0000]utils.go:62 main.fatalWithCode() runc run failed: unable to start container process: can't get final child's PID from pipe: EOF


free(ns_list);
}

void join_namespaces(char *nsspec)
{
nsset_t to_join = 0, joined = 0;
struct namespace_t *ns_list;
size_t ns_len;

/*
* We have to open the file descriptors first, since after we join the
* mnt or user namespaces we might no longer be able to access the
* paths.
*/
to_join = __open_namespaces(nsspec, &ns_list, &ns_len);

/*
* We first try to join all non-userns namespaces to join any namespaces
* that we might not be able to join once we switch credentials to the
* container's userns. We then join the user namespace, and then try to
* join any remaining namespaces (this last step is needed for rootless
* containers -- we don't get setns(2) permissions until we join the userns
* and get CAP_SYS_ADMIN).
*
* Splitting the joins this way is necessary for containers that are
* configured to join some externally-created namespace but are also
* configured to join an unrelated user namespace.
*
* This is similar to what nsenter(1) seems to do in practice.
*/
joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len);
Copy link
Member

@lifubang lifubang Nov 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe for rootless container, all to_join ns paths will be joined fail in the first step? So it will introduce some unnecessary syscalls for rootless container? From this aspect, please look #4491 to check whether it will be more simple to fix the related issue in go code or not?

joined |= __join_namespaces(CLONE_NEWUSER, ns_list, ns_len);
joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In practice, is there any namespaces will be joined failed before we join user namespace, but will join successfully after user ns joined?

Copy link
Member Author

@cyphar cyphar Oct 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we are rootless and there are userns-owned namespaces, we need to be in the userns in order to have the necessary permissions to do setns. This is what 2cd9c31 fixed, and so we need to keep this part of that behaviour. The way we fixed it before was changing the order (so the userns would be first), but by doing it this way the order is irrelevant.

If you remove this line, the rootless docker exec tests all fail.

(This is also what nsenter(1) does afaics.)


/* Verify that we joined all of the namespaces. */
__close_namespaces(to_join, joined, ns_list, ns_len);
}

static inline int sane_kill(pid_t pid, int signum)
Expand Down
64 changes: 63 additions & 1 deletion tests/integration/userns.bats
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ function teardown() {
if [ -v to_umount_list ]; then
while read -r mount_path; do
umount -l "$mount_path" || :
rm -f "$mount_path"
rm -rf "$mount_path"
done <"$to_umount_list"
rm -f "$to_umount_list"
unset to_umount_list
Expand Down Expand Up @@ -184,3 +184,65 @@ function teardown() {
grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output"
fi
}

# <https://github.com/opencontainers/runc/issues/4390>
@test "userns join external namespaces [wrong userns owner]" {
requires root

# Create an external user namespace for us to join. It seems on some
# operating systems (AlmaLinux in particular) "unshare -U" will
# automatically use an identity mapping (which breaks this test) so we need
# to use runc to create the userns.
update_config '.process.args = ["sleep", "infinity"]'
runc run -d --console-socket "$CONSOLE_SOCKET" target_userns
[ "$status" -eq 0 ]

# Bind-mount the first containers userns nsfd to a different path, to
# exercise the non-fast-path (where runc has to join the userns to get the
# mappings).
userns_pid="$(__runc state target_userns | jq .pid)"
userns_path="$(mktemp "$BATS_RUN_TMPDIR/userns.XXXXXX")"
mount --bind "/proc/$userns_pid/ns/user" "$userns_path"
echo "$userns_path" >>"$to_umount_list"

# Kill the container -- we have the userns bind-mounted.
runc delete -f target_userns
[ "$status" -eq 0 ]

# Configure our container to attach to the external userns.
update_config '.linux.namespaces |= map(if .type == "user" then (.path = "'"$userns_path"'") else . end)
| del(.linux.uidMappings)
| del(.linux.gidMappings)'

# Also create a network namespace that *is not owned* by the above userns.
# NOTE: Having no permissions in a namespaces makes it necessary to modify
# the config so that we don't get mount errors (for reference: no netns
# permissions == no sysfs mounts, no pidns permissoins == no procfs mounts,
# no utsns permissions == no sethostname(2), no ipc permissions == no
# mqueue mounts, etc).
netns_path="$(mktemp "$BATS_RUN_TMPDIR/netns.XXXXXX")"
unshare -i -- mount --bind "/proc/self/ns/net" "$netns_path"
echo "$netns_path" >>"$to_umount_list"
# Configure our container to attach to the external netns.
update_config '.linux.namespaces |= map(if .type == "network" then (.path = "'"$netns_path"'") else . end)'

# Convert sysfs mounts to a bind-mount from the host, to avoid permission
# issues due to the netns setup we have.
update_config '.mounts |= map((select(.type == "sysfs") | { "source": "/sys", "destination": .destination, "type": "bind", "options": ["rbind"] }) // .)'

# Create a detached container to verify the namespaces are correct.
update_config '.process.args = ["sleep", "infinity"]'
runc --debug run -d --console-socket "$CONSOLE_SOCKET" ctr
[ "$status" -eq 0 ]

userns_id="user:[$(stat -c "%i" "$userns_path")]"
netns_id="net:[$(stat -c "%i" "$netns_path")]"

runc exec ctr readlink /proc/self/ns/user
[ "$status" -eq 0 ]
[[ "$output" == "$userns_id" ]]

runc exec ctr readlink /proc/self/ns/net
[ "$status" -eq 0 ]
[[ "$output" == "$netns_id" ]]
}
Loading