Merge pull request #307 from elezar/forward-compat-by-folder

elezar · web-flow · commit d26524ab5db9 · 2025-05-13T21:46:35.000+02:00
Add cuda-compat-mode flag to configure command
diff --git a/Makefile b/Makefile
@@ -86,6 +86,7 @@ LIB_RPC_SRCS := $(SRCS_DIR)/nvc_rpc.h \
                 $(SRCS_DIR)/nvc_clt.c
 
 BIN_SRCS     := $(SRCS_DIR)/cli/common.c    \
+                $(SRCS_DIR)/cli/compat_mode.c \
                 $(SRCS_DIR)/cli/configure.c \
                 $(SRCS_DIR)/cli/dsl.c       \
                 $(SRCS_DIR)/cli/info.c      \
diff --git a/src/cli/compat_mode.c b/src/cli/compat_mode.c
@@ -0,0 +1,127 @@
+/**
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+#include <err.h>
+#include <libgen.h>
+#undef basename /* Use the GNU version of basename. */
+#include <stdlib.h>
+
+#include "cli.h"
+#include "compat_mode.h"
+
+static void filter_by_major_version(bool, const struct nvc_driver_info *, char * [], size_t *);
+static int get_compat_library_path(struct error *, const char * [], size_t, char **);
+
+/*
+ * update_compat_libraries narrows cnt->libs to the entries that survive
+ * filter_by_major_version() and, in ldconfig mode, fills in cnt->cuda_compat_dir.
+ * It is a no-op when compat mode is disabled or no libraries are recorded.
+ * Returns 0 on success and -1 on failure.
+ */
+int
+update_compat_libraries(struct nvc_context *ctx, struct nvc_container *cnt, const struct nvc_driver_info *info) {
+        char **filtered;
+        size_t count;
+
+        if ((cnt->flags & OPT_CUDA_COMPAT_MODE_DISABLED) || cnt->libs == NULL || cnt->nlibs == 0) {
+                return (0);
+        }
+        count = cnt->nlibs;
+        if ((filtered = array_copy(&ctx->err, (const char * const *)cnt->libs, count)) == NULL) {
+                return (-1);
+        }
+        /* In mount mode, compat libraries with a LOWER major version are kept as well. */
+        filter_by_major_version((cnt->flags & OPT_CUDA_COMPAT_MODE_MOUNT) != 0, info, filtered, &count);
+
+        /* Swap the filtered copy in for the original list. */
+        free(cnt->libs);
+        cnt->libs = filtered;
+        cnt->nlibs = count;
+
+        /* Only ldconfig mode additionally needs cuda_compat_dir to be set. */
+        if (!(cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
+                return (0);
+        }
+        return ((get_compat_library_path(&ctx->err, (const char **)filtered, count, &cnt->cuda_compat_dir) < 0) ? -1 : 0);
+}
+
+/*
+ * filter_by_major_version clears every entry of paths whose ".so.<major>" suffix
+ * fails the check against info->nvrm_version, then hands the array to
+ * array_pack(), which is expected to compact it and update *size:
+ *  - allow_lower_major_versions: only a major EQUAL to the RM one is removed;
+ *  - otherwise: a major LOWER than or equal to the RM one is removed, to avoid
+ *    unsupported configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
+ * Entries without ".so." are kept. The comparison is a textual strncmp() limited
+ * to the leading digits of <major>.
+ */
+static void
+filter_by_major_version(bool allow_lower_major_versions, const struct nvc_driver_info *info, char * paths[], size_t *size)
+{
+        for (size_t i = 0; i < *size; ++i) {
+                const char *major = strstr(basename(paths[i]), ".so.");
+                int order;
+
+                if (major == NULL) {
+                        continue;
+                }
+                major += strlen(".so.");
+                /* order > 0: RM sorts after the library major; 0: equal; < 0: before. */
+                order = strncmp(info->nvrm_version, major, strspn(major, "0123456789"));
+                if (allow_lower_major_versions ? (order == 0) : (order >= 0)) {
+                        paths[i] = NULL;
+                }
+        }
+
+        array_pack(paths, size);
+}
+
+/*
+ * get_compat_library_path stores in *compat_dir_result a newly allocated copy
+ * of the one directory shared by all entries of paths. Returns 0 on success
+ * (also for size == 0, leaving the result untouched) and -1 if the entries live
+ * in different directories or an allocation fails.
+ * NOTE(review): err is never populated here; confirm no message is expected.
+ */
+static int
+get_compat_library_path(struct error *err, const char * paths[], size_t size, char **compat_dir_result)
+{
+        char *compat_dir = NULL;
+        (void)err;
+        for (size_t i = 0; i < size; ++i) {
+                /* dirname() may modify its argument, so work on a scratch copy. */
+                char *scratch = strdup(paths[i]);
+                if (scratch == NULL) {
+                        goto fail;
+                }
+                const char *dir = dirname(scratch);
+                bool same_dir = (compat_dir == NULL) || (strcmp(dir, compat_dir) == 0);
+                if (compat_dir == NULL) {
+                        compat_dir = strdup(dir);
+                }
+                free(scratch);
+                if (compat_dir == NULL || !same_dir) {
+                        goto fail;
+                }
+        }
+        if (compat_dir != NULL) {
+                *compat_dir_result = compat_dir;
+        }
+        return 0;
+fail:
+        free(compat_dir);
+        return -1;
+}
diff --git a/src/cli/compat_mode.h b/src/cli/compat_mode.h
@@ -0,0 +1,33 @@
+/**
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ **/
+
+#ifndef HEADER_COMPAT_MODE_H
+#define HEADER_COMPAT_MODE_H
+
+// TODO: These are duplicated from options.h to prevent conflicts with the CLI
+// options header, so the values here have to match the ones defined there.
+enum {
+    /* OPT_CUDA_COMPAT_MODE_DISABLED replaced OPT_NO_CNTLIBS. */
+    OPT_CUDA_COMPAT_MODE_DISABLED = 1 << 14, /* compat library handling is skipped */
+    OPT_CUDA_COMPAT_MODE_LDCONFIG = 1 << 15, /* cuda_compat_dir is recorded as well */
+    OPT_CUDA_COMPAT_MODE_MOUNT    = 1 << 16, /* lower-major compat libs are kept too */
+};
+
+int update_compat_libraries(struct nvc_context *, struct nvc_container *, const struct nvc_driver_info *);
+
+
+#endif /* HEADER_COMPAT_MODE_H */
diff --git a/src/cli/configure.c b/src/cli/configure.c
@@ -7,6 +7,7 @@
 
 #include "cli.h"
 #include "dsl.h"
+#include "compat_mode.h"
 
 static error_t configure_parser(int, char *, struct argp_state *);
 static int check_cuda_version(const struct dsl_data *, enum dsl_comparator, const char *);
@@ -36,7 +37,8 @@ const struct argp configure_usage = {
                 {"no-persistenced", 0x86, NULL, 0, "Don't include the NVIDIA persistenced socket", -1},
                 {"no-fabricmanager", 0x87, NULL, 0, "Don't include the NVIDIA fabricmanager socket", -1},
                 {"no-gsp-firmware", 0x88, NULL, 0, "Don't include GSP Firmware", -1},
-                {"no-cntlibs", 0x89, NULL, 0, "Don't overwrite host mounts with CUDA compat libs from the container", -1},
+                {"no-cntlibs", 0x89, NULL, 0, "[Deprecated] Equivalent to --cuda-compat-mode=disabled", -1},
+                {"cuda-compat-mode", 0x90, "MODE", 0, "The mode to use to support CUDA Forward Compatibility. One of [ mount (default) | ldconfig | disabled]", -1},
                 {0},
         },
         configure_parser,
@@ -167,7 +169,15 @@ configure_parser(int key, char *arg, struct argp_state *state)
                         goto fatal;
                 break;
         case 0x89:
-                if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
+                /* The --no-cntlibs command line flag is equivalent to --cuda-compat-mode=disabled. */
+                if (str_join(&err, &ctx->container_flags, "cuda-compat-mode=disabled", " ") < 0)
+                        goto fatal;
+                break;
+        case 0x90:
+                /* We add cuda-compat-mode=$arg to the container_flags. */
+                if (str_join(&err, &ctx->container_flags, "cuda-compat-mode", " ") < 0)
+                        goto fatal;
+                if (str_join(&err, &ctx->container_flags, arg, "=") < 0)
                         goto fatal;
                 break;
         case ARGP_KEY_ARG:
@@ -316,6 +326,15 @@ configure_command(const struct context *ctx)
                 goto fail;
         }
 
+        /*
+         * We now have the driver version and can update the list of compat
+         * libraries discovered above accordingly.
+         */
+        if (update_compat_libraries(nvc, cnt, drv) < 0) {
+                warn("updating compat library settings failed: %s", libnvc.error(nvc));
+                goto fail;
+        }
+
         /* Allocate space for selecting GPU devices and MIG devices */
         if (new_devices(&err, dev, &devices) < 0) {
                 warn("memory allocation failed: %s", err.msg);
diff --git a/src/cli/list.c b/src/cli/list.c
@@ -25,7 +25,6 @@ const struct argp list_usage = {
                 {"no-persistenced", 0x84, NULL, 0, "Don't include the NVIDIA persistenced socket", -1},
                 {"no-fabricmanager", 0x85, NULL, 0, "Don't include the NVIDIA fabricmanager socket", -1},
                 {"no-gsp-firmware", 0x86, NULL, 0, "Don't include GSP Firmware", -1},
-                {"no-cntlibs", 0x87, NULL, 0, "Don't overwrite host mounts with CUDA compat libs from the container", -1},
                 {0},
         },
         list_parser,
@@ -87,10 +86,6 @@ list_parser(int key, char *arg, struct argp_state *state)
                         goto fatal;
                 ctx->list_firmwares = false;
                 break;
-        case 0x87:
-                if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
-                        goto fatal;
-                break;
         case ARGP_KEY_END:
                 if (state->argc == 1 || (state->argc == 2 && ctx->imex_channels != NULL)) {
                         if ((ctx->devices = xstrdup(&err, "all")) == NULL)
diff --git a/src/nvc_container.c b/src/nvc_container.c
@@ -24,6 +24,7 @@ static char *find_namespace_path(struct error *, const struct nvc_container *, c
 static int  find_compat_library_paths(struct error *, struct nvc_container *);
 static int  lookup_owner(struct error *, struct nvc_container *);
 static int  copy_config(struct error *, struct nvc_container *, const struct nvc_container_config *);
+static int  validate_cuda_compat_mode_flags(struct error *, int32_t *);
 
 struct nvc_container_config *
 nvc_container_config_new(pid_t pid, const char *rootfs)
@@ -236,6 +237,9 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
                 error_setx(&ctx->err, "invalid mode of operation");
                 return (NULL);
         }
+        if (validate_cuda_compat_mode_flags(&ctx->err, &flags) < 0) {
+                return (NULL);
+        }
 
         log_infof("configuring container with '%s'", opts);
         if ((cnt = xcalloc(&ctx->err, 1, sizeof(*cnt))) == NULL)
@@ -246,7 +250,7 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
                 goto fail;
         if (lookup_owner(&ctx->err, cnt) < 0)
                 goto fail;
-        if (!(flags & OPT_NO_CNTLIBS)) {
+        if (!(flags & OPT_CUDA_COMPAT_MODE_DISABLED)) {
                 if (find_compat_library_paths(&ctx->err, cnt) < 0)
                         goto fail;
         }
@@ -293,5 +297,41 @@ nvc_container_free(struct nvc_container *cnt)
         free(cnt->mnt_ns);
         free(cnt->dev_cg);
         array_free(cnt->libs, cnt->nlibs);
+        free(cnt->cuda_compat_dir);
         free(cnt);
 }
+
+/*
+ * validate_cuda_compat_mode_flags normalizes the cuda-compat-mode bits in *flags:
+ * - If OPT_CUDA_COMPAT_MODE_DISABLED is set, the other mode bits are cleared.
+ * - If no mode bit is set, OPT_CUDA_COMPAT_MODE_MOUNT is added as the default.
+ * - If both MOUNT and LDCONFIG are set, an error is recorded in err.
+ * All unrelated option bits are left untouched.
+ * Returns 0 on success and -1 when conflicting modes were requested.
+ */
+static int
+validate_cuda_compat_mode_flags(struct error *err, int32_t *flags) {
+        if (*flags & OPT_CUDA_COMPAT_MODE_DISABLED) {
+                /*
+                 * If the OPT_CUDA_COMPAT_MODE_DISABLED flag is specified, we
+                 * explicitly ignore other OPT_CUDA_COMPAT_MODE_* flags.
+                 */
+                *flags &= ~(OPT_CUDA_COMPAT_MODE_MOUNT | OPT_CUDA_COMPAT_MODE_LDCONFIG);
+                return (0);
+        }
+        if (!(*flags & (OPT_CUDA_COMPAT_MODE_LDCONFIG | OPT_CUDA_COMPAT_MODE_MOUNT))) {
+                /*
+                 * If no OPT_CUDA_COMPAT_MODE_* flags are specified, default to
+                 * OPT_CUDA_COMPAT_MODE_MOUNT to maintain backward compatibility.
+                 * The bit is OR-ed in so that all other options are preserved.
+                 */
+                *flags |= OPT_CUDA_COMPAT_MODE_MOUNT;
+                return (0);
+        }
+
+        if ((*flags & OPT_CUDA_COMPAT_MODE_MOUNT) && (*flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
+                error_setx(err, "only one cuda-compat-mode can be specified at a time");
+                return (-1);
+        }
+        return (0);
+}
diff --git a/src/nvc_internal.h b/src/nvc_internal.h
@@ -84,6 +84,7 @@ struct nvc_container {
         char *dev_cg;
         char **libs;
         size_t nlibs;
+        char *cuda_compat_dir;
 };
 
 enum {
diff --git a/src/nvc_ldcache.c b/src/nvc_ldcache.c
@@ -30,6 +30,7 @@
 #include "nvc_internal.h"
 
 #include "error.h"
+#include "options.h"
 #include "utils.h"
 #include "xfuncs.h"
 
@@ -471,7 +472,19 @@ nvc_ldcache_update(struct nvc_context *ctx, const struct nvc_container *cnt)
         if (validate_args(ctx, cnt != NULL) < 0)
                 return (-1);
 
-        argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+        if (cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG && cnt->cuda_compat_dir != NULL) {
+                /*
+                 * We include the cuda_compat_dir directory on the ldconfig
+                 * command line. This ensures that the CUDA Forward compat
+                 * libraries take precedence over the user-mode driver
+                 * libraries in the standard library paths (libs_dir and
+                 * libs32_dir).
+                 * */
+                argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cuda_compat_dir, cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+        } else {
+                argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+        }
+
         if (*argv[0] == '@') {
                 /*
                  * We treat this path specially to be relative to the host filesystem.
diff --git a/src/nvc_mount.c b/src/nvc_mount.c
diff --git a/src/options.h b/src/options.h