opencontainers · lifubang · Dec 5, 2024 · Oct 30, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/Makefile b/Makefile
@@ -207,7 +207,7 @@ install-man: man
 .PHONY: cfmt
 cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
 cfmt:
-	indent -linux -l120 -il0 -ppi2 -cp1 -T size_t -T jmp_buf $(C_SRC)
+	indent -linux -l120 -il0 -ppi2 -cp1 -sar -T size_t -T jmp_buf $(C_SRC)
 
 .PHONY: shellcheck
 shellcheck:

diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
@@ -322,30 +322,6 @@ static int clone_parent(jmp_buf *env, int jmpval)
 	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
 }
 
-/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
-static int nsflag(char *name)
-{
-	if (!strcmp(name, "cgroup"))
-		return CLONE_NEWCGROUP;
-	else if (!strcmp(name, "ipc"))
-		return CLONE_NEWIPC;
-	else if (!strcmp(name, "mnt"))
-		return CLONE_NEWNS;
-	else if (!strcmp(name, "net"))
-		return CLONE_NEWNET;
-	else if (!strcmp(name, "pid"))
-		return CLONE_NEWPID;
-	else if (!strcmp(name, "user"))
-		return CLONE_NEWUSER;
-	else if (!strcmp(name, "uts"))
-		return CLONE_NEWUTS;
-	else if (!strcmp(name, "time"))
-		return CLONE_NEWTIME;
-
-	/* If we don't recognise a name, fallback to 0. */
-	return 0;
-}
-
 static uint32_t readint32(char *buf)
 {
 	return *(uint32_t *) buf;
@@ -444,35 +420,67 @@ void nl_free(struct nlconfig_t *config)
 	free(config->data);
 }
 
-void join_namespaces(char *nslist)
-{
-	int num = 0, i;
-	char *saveptr = NULL;
-	char *namespace = strtok_r(nslist, ",", &saveptr);
-	struct namespace_t {
-		int fd;
-		char type[PATH_MAX];
-		char path[PATH_MAX];
-	} *namespaces = NULL;
+struct namespace_t {
+	int fd;
+	char type[PATH_MAX];
+	char path[PATH_MAX];
+};
 
-	if (!namespace || !strlen(namespace) || !strlen(nslist))
-		bail("ns paths are empty");
+typedef int nsset_t;
+
+static struct nstype_t {
+	int type;
+	char *name;
+} all_ns_types[] = {
+	{ CLONE_NEWCGROUP, "cgroup" },
+	{ CLONE_NEWIPC, "ipc" },
+	{ CLONE_NEWNS, "mnt" },
+	{ CLONE_NEWNET, "net" },
+	{ CLONE_NEWPID, "pid" },
+	{ CLONE_NEWTIME, "time" },
+	{ CLONE_NEWUSER, "user" },
+	{ CLONE_NEWUTS, "uts" },
+	{ },			/* null terminator */
+};
 
+/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
+static int nstype(char *name)
+{
+	for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++)
+		if (!strcmp(name, ns->name))
+			return ns->type;
 	/*
-	 * We have to open the file descriptors first, since after
-	 * we join the mnt namespace we might no longer be able to
-	 * access the paths.
+	 * setns(2) lets us join namespaces without knowing the type, but
+	 * namespaces usually require special handling of some kind (so joining
+	 * a namespace without knowing its type or joining a new namespace type
+	 * without corresponding handling could result in broken behaviour) and
+	 * the rest of runc doesn't allow unknown namespace types anyway.
 	 */
+	bail("unknown namespace type %s", name);
+}
+
+static nsset_t __open_namespaces(char *nsspec, struct namespace_t **ns_list, size_t *ns_len)
+{
+	int len = 0;
+	nsset_t ns_to_join = 0;
+	char *namespace, *saveptr = NULL;
+	struct namespace_t *namespaces = NULL;
+
+	namespace = strtok_r(nsspec, ",", &saveptr);
+
+	if (!namespace || !strlen(namespace) || !strlen(nsspec))
+		bail("ns paths are empty");
+
 	do {
 		int fd;
 		char *path;
 		struct namespace_t *ns;
 
 		/* Resize the namespace array. */
-		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
+		namespaces = realloc(namespaces, ++len * sizeof(struct namespace_t));
 		if (!namespaces)
 			bail("failed to reallocate namespace array");
-		ns = &namespaces[num - 1];
+		ns = &namespaces[len - 1];
 
 		/* Split 'ns:path'. */
 		path = strstr(namespace, ":");
@@ -488,38 +496,145 @@ void join_namespaces(char *nslist)
 		strncpy(ns->type, namespace, PATH_MAX - 1);
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
-	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
 
-	/*
-	 * The ordering in which we join namespaces is important. We should
-	 * always join the user namespace *first*. This is all guaranteed
-	 * from the container_linux.go side of this, so we're just going to
-	 * follow the order given to us.
-	 */
+		ns_to_join |= nstype(ns->type);
+	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
 
-	for (i = 0; i < num; i++) {
-		struct namespace_t *ns = &namespaces[i];
-		int flag = nsflag(ns->type);
+	*ns_list = namespaces;
+	*ns_len = len;
+	return ns_to_join;
+}
 
-		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
-		if (setns(ns->fd, flag) < 0)
+/*
+ * Try to join all namespaces that are in the "allow" nsset, and return the
+ * set we were able to successfully join. If a permission error is returned
+ * from nsset(2), the namespace is skipped (non-permission errors are fatal).
+ */
+static nsset_t __join_namespaces(nsset_t allow, struct namespace_t *ns_list, size_t ns_len)
+{
+	nsset_t joined = 0;
+
+	for (size_t i = 0; i < ns_len; i++) {
+		struct namespace_t *ns = &ns_list[i];
+		int type = nstype(ns->type);
+		int err, saved_errno;
+
+		if (!(type & allow))
+			continue;
+
+		err = setns(ns->fd, type);
+		saved_errno = errno;
+		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s): %s",
+			  type, ns->type, ns->path, strerror(errno));
+		if (err < 0) {
+			/* Skip permission errors. */
+			if (saved_errno == EPERM)
+				continue;
 			bail("failed to setns into %s namespace", ns->type);
+		}
+		joined |= type;
 
 		/*
 		 * If we change user namespaces, make sure we switch to root in the
 		 * namespace (this matches the logic for unshare(CLONE_NEWUSER)), lots
 		 * of things can break if we aren't the right user. See
 		 * <https://github.com/opencontainers/runc/issues/4466> for one example.
 		 */
-		if (flag == CLONE_NEWUSER) {
+		if (type == CLONE_NEWUSER) {
 			if (setresuid(0, 0, 0) < 0)
 				bail("failed to become root in user namespace");
 		}
 
 		close(ns->fd);
+		ns->fd = -1;
+	}
+	return joined;
+}
+
+static char *strappend(char *dst, char *src)
+{
+	if (!dst)
+		return strdup(src);
+
+	size_t len = strlen(dst) + strlen(src) + 1;
+	dst = realloc(dst, len);
+	strncat(dst, src, len);
+	return dst;
+}
+
+static char *nsset_to_str(nsset_t nsset)
+{
+	char *str = NULL;
+	for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++) {
+		if (ns->type & nsset) {
+			if (str)
+				str = strappend(str, ", ");
+			str = strappend(str, ns->name);
+		}
+	}
+	return str ? : strdup("");
+}
+
+static void __close_namespaces(nsset_t to_join, nsset_t joined, struct namespace_t *ns_list, size_t ns_len)
+{
+	/* We expect to have joined every namespace. */
+	nsset_t failed_to_join = to_join & ~joined;
+
+	/* Double-check that we used up (and thus joined) all of the nsfds. */
+	for (size_t i = 0; i < ns_len; i++) {
+		struct namespace_t *ns = &ns_list[i];
+		int type = nstype(ns->type);
+
+		if (ns->fd < 0)
+			continue;
+
+		failed_to_join |= type;
+		write_log(FATAL, "failed to setns(%#x) into %s namespace (with path %s): %s",
+			  type, ns->type, ns->path, strerror(EPERM));
+		close(ns->fd);
+		ns->fd = -1;
 	}
 
-	free(namespaces);
+	/* Make sure we joined the namespaces we planned to. */
+	if (failed_to_join)
+		bail("failed to join {%s} namespaces: %s", nsset_to_str(failed_to_join), strerror(EPERM));
+
+	free(ns_list);
+}
+
+void join_namespaces(char *nsspec)
+{
+	nsset_t to_join = 0, joined = 0;
+	struct namespace_t *ns_list;
+	size_t ns_len;
+
+	/*
+	 * We have to open the file descriptors first, since after we join the
+	 * mnt or user namespaces we might no longer be able to access the
+	 * paths.
+	 */
+	to_join = __open_namespaces(nsspec, &ns_list, &ns_len);
+
+	/*
+	 * We first try to join all non-userns namespaces to join any namespaces
+	 * that we might not be able to join once we switch credentials to the
+	 * container's userns. We then join the user namespace, and then try to
+	 * join any remaining namespaces (this last step is needed for rootless
+	 * containers -- we don't get setns(2) permissions until we join the userns
+	 * and get CAP_SYS_ADMIN).
+	 *
+	 * Splitting the joins this way is necessary for containers that are
+	 * configured to join some externally-created namespace but are also
+	 * configured to join an unrelated user namespace.
+	 *
+	 * This is similar to what nsenter(1) seems to do in practice.
+	 */
+	joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len);
+	joined |= __join_namespaces(CLONE_NEWUSER, ns_list, ns_len);
+	joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len);
+
+	/* Verify that we joined all of the namespaces. */
+	__close_namespaces(to_join, joined, ns_list, ns_len);
 }
 
 static inline int sane_kill(pid_t pid, int signum)

diff --git a/tests/integration/userns.bats b/tests/integration/userns.bats
@@ -29,7 +29,7 @@ function teardown() {
 	if [ -v to_umount_list ]; then
 		while read -r mount_path; do
 			umount -l "$mount_path" || :
-			rm -f "$mount_path"
+			rm -rf "$mount_path"
 		done <"$to_umount_list"
 		rm -f "$to_umount_list"
 		unset to_umount_list
@@ -184,3 +184,65 @@ function teardown() {
 		grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output"
 	fi
 }
+
+# <https://github.com/opencontainers/runc/issues/4390>
+@test "userns join external namespaces [wrong userns owner]" {
+	requires root
+
+	# Create an external user namespace for us to join. It seems on some
+	# operating systems (AlmaLinux in particular) "unshare -U" will
+	# automatically use an identity mapping (which breaks this test) so we need
+	# to use runc to create the userns.
+	update_config '.process.args = ["sleep", "infinity"]'
+	runc run -d --console-socket "$CONSOLE_SOCKET" target_userns
+	[ "$status" -eq 0 ]
+
+	# Bind-mount the first containers userns nsfd to a different path, to
+	# exercise the non-fast-path (where runc has to join the userns to get the
+	# mappings).
+	userns_pid="$(__runc state target_userns | jq .pid)"
+	userns_path="$(mktemp "$BATS_RUN_TMPDIR/userns.XXXXXX")"
+	mount --bind "/proc/$userns_pid/ns/user" "$userns_path"
+	echo "$userns_path" >>"$to_umount_list"
+
+	# Kill the container -- we have the userns bind-mounted.
+	runc delete -f target_userns
+	[ "$status" -eq 0 ]
+
+	# Configure our container to attach to the external userns.
+	update_config '.linux.namespaces |= map(if .type == "user" then (.path = "'"$userns_path"'") else . end)
+		| del(.linux.uidMappings)
+		| del(.linux.gidMappings)'
+
+	# Also create a network namespace that *is not owned* by the above userns.
+	# NOTE: Having no permissions in a namespaces makes it necessary to modify
+	# the config so that we don't get mount errors (for reference: no netns
+	# permissions == no sysfs mounts, no pidns permissoins == no procfs mounts,
+	# no utsns permissions == no sethostname(2), no ipc permissions == no
+	# mqueue mounts, etc).
+	netns_path="$(mktemp "$BATS_RUN_TMPDIR/netns.XXXXXX")"
+	unshare -i -- mount --bind "/proc/self/ns/net" "$netns_path"
+	echo "$netns_path" >>"$to_umount_list"
+	# Configure our container to attach to the external netns.
+	update_config '.linux.namespaces |= map(if .type == "network" then (.path = "'"$netns_path"'") else . end)'
+
+	# Convert sysfs mounts to a bind-mount from the host, to avoid permission
+	# issues due to the netns setup we have.
+	update_config '.mounts |= map((select(.type == "sysfs") | { "source": "/sys", "destination": .destination, "type": "bind", "options": ["rbind"] }) // .)'
+
+	# Create a detached container to verify the namespaces are correct.
+	update_config '.process.args = ["sleep", "infinity"]'
+	runc --debug run -d --console-socket "$CONSOLE_SOCKET" ctr
+	[ "$status" -eq 0 ]
+
+	userns_id="user:[$(stat -c "%i" "$userns_path")]"
+	netns_id="net:[$(stat -c "%i" "$netns_path")]"
+
+	runc exec ctr readlink /proc/self/ns/user
+	[ "$status" -eq 0 ]
+	[[ "$output" == "$userns_id" ]]
+
+	runc exec ctr readlink /proc/self/ns/net
+	[ "$status" -eq 0 ]
+	[[ "$output" == "$netns_id" ]]
+}