Add cuda-compat-mode flag to configure command

elezar · elezar · commit 96ec4e17f69b · 2025-04-30T14:49:07.000+02:00
This change adds a --cuda-compat-mode flag to the configure
CLI. This allows more flexibility than the existing --no-cntlibs flag.

Possible values of the flag are:
* mount (default) - CUDA compat libraries are mounted from /usr/local/cuda/compat to
  the standard library path in the container.
* ldconfig - The folder containing the CUDA compat libraries is added as a command
  line argument to the ldconfig command executed in the container.
* disabled - This is equivalent to specifying the --no-cntlibs flag.

Signed-off-by: Evan Lezar &lt;elezar@nvidia.com&gt;
diff --git a/Makefile b/Makefile
@@ -86,6 +86,7 @@ LIB_RPC_SRCS := $(SRCS_DIR)/nvc_rpc.h \
                 $(SRCS_DIR)/nvc_clt.c
 
 BIN_SRCS     := $(SRCS_DIR)/cli/common.c    \
+				$(SRCS_DIR)/cli/compat_mode.c \
                 $(SRCS_DIR)/cli/configure.c \
                 $(SRCS_DIR)/cli/dsl.c       \
                 $(SRCS_DIR)/cli/info.c      \
diff --git a/src/cli/compat_mode.c b/src/cli/compat_mode.c
@@ -0,0 +1,134 @@
+/**
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+#include <err.h>
+#include <libgen.h>
+#undef basename /* Use the GNU version of basename. */
+#include <stdlib.h>
+
+#include "cli.h"
+#include "compat_mode.h"
+
+static void filter_by_major_version(bool, const struct nvc_driver_info *, char * [], size_t *);
+static int get_compat_library_path(struct error *, const char * [], size_t, char **);
+
+/*
+ * Filter the container's CUDA compat libraries against the driver version and
+ * apply the result according to the cuda-compat-mode bits in cnt->flags:
+ *
+ *   - OPT_CUDA_COMPAT_MODE_LDCONFIG: the common directory of the remaining
+ *     libraries is stored in cnt->cuda_compat_dir; cnt->libs is left empty.
+ *   - OPT_CUDA_COMPAT_MODE_MOUNT: the filtered list replaces cnt->libs and
+ *     cnt->nlibs.
+ *   - neither bit set: cnt->libs is left empty.
+ *
+ * If both bits are set, ldconfig handling is applied and the list is not
+ * handed back to the container.
+ *
+ * Returns 0 on success (including when the container has no libraries) and -1
+ * on failure. On failure after filtering, cnt->libs has already been cleared.
+ */
+int
+update_compat_libraries(struct nvc_context *ctx, struct nvc_container *cnt, const struct nvc_driver_info *info) {
+        if (cnt->libs == NULL || cnt->nlibs == 0) {
+                return (0);
+        }
+
+        int rv = -1;
+        size_t nlibs = cnt->nlibs;
+        char **libs = array_copy(&ctx->err, (const char * const *)cnt->libs, cnt->nlibs);
+        if (libs == NULL) {
+                return (-1);
+        }
+
+        // For cuda-compat-mode=mount, we also allow compat libraries with a LOWER major version.
+        bool allow_lower_major_versions = (cnt->flags & OPT_CUDA_COMPAT_MODE_MOUNT) != 0;
+        filter_by_major_version(allow_lower_major_versions, info, libs, &nlibs);
+
+        // Release the previous array so that cnt->libs can be reassigned below if required.
+        // NOTE(review): only the arrays are freed in this function, as before; whether the
+        // element strings are shared with the copy depends on array_copy -- confirm ownership.
+        free(cnt->libs);
+        cnt->libs = NULL;
+        cnt->nlibs = 0;
+
+        if (cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG) {
+                // For cuda-compat-mode=ldconfig only the compat directory is kept;
+                // the individual library paths are released at the end.
+                if (get_compat_library_path(&ctx->err, (const char **)libs, nlibs, &cnt->cuda_compat_dir) < 0) {
+                        goto out;
+                }
+        } else if (cnt->flags & OPT_CUDA_COMPAT_MODE_MOUNT) {
+                // Mounting is handled later. We only need to update the required
+                // container information. Ownership of the array moves to cnt.
+                cnt->libs = libs;
+                cnt->nlibs = nlibs;
+                libs = NULL;
+        }
+        rv = 0;
+out:
+        // libs is NULL here if it was handed to cnt, so nothing is freed twice and
+        // the array is no longer leaked when no mode flag is set.
+        free(libs);
+        return (rv);
+}
+
+/*
+ * Remove from paths every library whose major version (the digits that follow
+ * ".so." in the file name) is not acceptable for the driver version in
+ * info->nvrm_version, then compact the array and update *size via array_pack.
+ *
+ *   - allow_lower_major_versions == true:  only libraries whose major version
+ *     EQUALS the RM major version are removed.
+ *   - allow_lower_major_versions == false: libraries whose major version is
+ *     LOWER THAN OR EQUAL TO the RM major version are removed.
+ *
+ * Entries without ".so." in their base name are kept. Entries with no digits
+ * after ".so." are removed in both modes.
+ *
+ * XXX This filtering prevents us from running into unsupported configurations
+ * (e.g. CUDA compat on Geforce or non-LTS drivers).
+ *
+ * NOTE(review): removed entries are only set to NULL, not freed -- confirm who
+ * owns the strings.
+ */
+static void
+filter_by_major_version(bool allow_lower_major_versions, const struct nvc_driver_info *info, char * paths[], size_t *size)
+{
+        for (size_t i = 0; i < *size; ++i) {
+                char *lib = basename(paths[i]);
+                char *maj = strstr(lib, ".so.");
+                bool exclude;
+
+                if (maj == NULL) {
+                        continue;
+                }
+                maj += strlen(".so.");
+
+                if (strspn(maj, "0123456789") == 0) {
+                        // No numeric major version to compare against: drop the entry.
+                        exclude = true;
+                } else {
+                        // Compare the major versions as numbers. A textual prefix
+                        // comparison mis-orders majors of different lengths
+                        // (e.g. "570" vs "90") and treats "470" and "4" as equal.
+                        unsigned long rm_major = strtoul(info->nvrm_version, NULL, 10);
+                        unsigned long lib_major = strtoul(maj, NULL, 10);
+
+                        if (allow_lower_major_versions) {
+                                // Only filter out EQUAL RM versions.
+                                exclude = (rm_major == lib_major);
+                        } else {
+                                // Filter out libraries that are not newer than RM.
+                                exclude = (rm_major >= lib_major);
+                        }
+                }
+                if (exclude) {
+                        paths[i] = NULL;
+                }
+        }
+        array_pack(paths, size);
+}
+
+/*
+ * Determine the single directory that contains all of the given library paths.
+ *
+ * On success returns 0. If size is 0, *compat_dir_result is left untouched;
+ * otherwise it receives a newly allocated string that the caller must free().
+ * Returns -1 if copying the path array fails, if memory for the result cannot
+ * be allocated, or if the paths do not all share the same directory.
+ *
+ * NOTE(review): dirname() may modify the string it is given; whether that
+ * touches the caller's strings depends on how array_copy copies -- confirm.
+ * NOTE(review): the strdup and directory-mismatch failures do not record a
+ * message in err.
+ */
+static int
+get_compat_library_path(struct error *err, const char * paths[], size_t size, char **compat_dir_result)
+{
+        char *compat_dir = NULL;
+        char **dirnames = NULL;
+        int rv = -1;
+
+        if (size == 0) {
+                return (0);
+        }
+
+        dirnames = array_copy(err, (const char * const *)paths, size);
+        if (dirnames == NULL) {
+                return (-1);
+        }
+
+        for (size_t i = 0; i < size; ++i) {
+                char *dir = dirname(dirnames[i]);
+
+                if (i == 0) {
+                        // The first entry defines the directory all others must match.
+                        compat_dir = strdup(dir);
+                        if (compat_dir == NULL) {
+                                goto out;
+                        }
+                        continue;
+                }
+                if (strcmp(dir, compat_dir) != 0) {
+                        goto out;
+                }
+        }
+
+        // Hand the result to the caller; clearing the local keeps the cleanup
+        // below from freeing it.
+        *compat_dir_result = compat_dir;
+        compat_dir = NULL;
+        rv = 0;
+out:
+        // Single exit path: the temporary array is released on success too.
+        free(dirnames);
+        free(compat_dir);
+        return (rv);
+}
diff --git a/src/cli/compat_mode.h b/src/cli/compat_mode.h
@@ -0,0 +1,31 @@
+/**
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ **/
+
+#ifndef HEADER_COMPAT_MODE_H
+#define HEADER_COMPAT_MODE_H
+
+// TODO: These values are duplicated from options.h (and must stay identical to
+// them) to prevent conflicts with the CLI options header.
+enum {
+    OPT_CUDA_COMPAT_MODE_MOUNT    = 1 << 14, // cuda-compat-mode=mount
+    OPT_CUDA_COMPAT_MODE_LDCONFIG = 1 << 15, // cuda-compat-mode=ldconfig
+};
+
+/* Forward declarations so that this header does not depend on include order. */
+struct nvc_context;
+struct nvc_container;
+struct nvc_driver_info;
+
+/*
+ * Filter the container's CUDA compat libraries against the driver information
+ * and update the container according to its cuda-compat-mode flags.
+ * Returns 0 on success and -1 on failure.
+ */
+int update_compat_libraries(struct nvc_context *, struct nvc_container *, const struct nvc_driver_info *);
+
+
+#endif /* HEADER_COMPAT_MODE_H */
diff --git a/src/cli/configure.c b/src/cli/configure.c
@@ -7,6 +7,7 @@
 
 #include "cli.h"
 #include "dsl.h"
+#include "compat_mode.h"
 
 static error_t configure_parser(int, char *, struct argp_state *);
 static int check_cuda_version(const struct dsl_data *, enum dsl_comparator, const char *);
@@ -37,6 +38,7 @@ const struct argp configure_usage = {
                 {"no-fabricmanager", 0x87, NULL, 0, "Don't include the NVIDIA fabricmanager socket", -1},
                 {"no-gsp-firmware", 0x88, NULL, 0, "Don't include GSP Firmware", -1},
                 {"no-cntlibs", 0x89, NULL, 0, "Don't overwrite host mounts with CUDA compat libs from the container", -1},
+                {"cuda-compat-mode", 0x90, "MODE", 0, "The mode to use to support CUDA Forward Compatibility. One of [ mount (default) | ldconfig | disabled]", -1},
                 {0},
         },
         configure_parser,
@@ -170,6 +172,19 @@ configure_parser(int key, char *arg, struct argp_state *state)
                 if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
                         goto fatal;
                 break;
+        case 0x90:
+                // cuda-compat-mode=disabled is equivalent to no-cntlibs.
+                if (strcmp(arg, "disabled") == 0) {
+                        if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
+                                goto fatal;
+                } else {
+                        // We add cuda-compat-mode=$arg to the container_flags.
+                        if (str_join(&err, &ctx->container_flags, "cuda-compat-mode", " ") < 0)
+                                goto fatal;
+                        if (str_join(&err, &ctx->container_flags, arg, "=") < 0)
+                                goto fatal;
+                }
+                break;
         case ARGP_KEY_ARG:
                 if (state->arg_num > 0)
                         argp_usage(state);
@@ -316,6 +331,13 @@ configure_command(const struct context *ctx)
                 goto fail;
         }
 
+        /* We now have the driver version and can update the list of compat
+           libraries discovered above accordingly. */
+        if (update_compat_libraries(nvc, cnt, drv) < 0) {
+                warn("updating compat library settings failed: %s", libnvc.error(nvc));
+                goto fail;
+        }
+
         /* Allocate space for selecting GPU devices and MIG devices */
         if (new_devices(&err, dev, &devices) < 0) {
                 warn("memory allocation failed: %s", err.msg);
diff --git a/src/nvc_container.c b/src/nvc_container.c
@@ -236,6 +236,15 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
                 error_setx(&ctx->err, "invalid mode of operation");
                 return (NULL);
         }
+        // The two cuda-compat-mode flags are mutually exclusive. A logical AND is
+        // required here: a bitwise AND of two different flag bits is always zero.
+        if ((flags & OPT_CUDA_COMPAT_MODE_MOUNT) && (flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
+                error_setx(&ctx->err, "only one cuda-compat-mode can be specified at a time");
+                return (NULL);
+        }
+        // If CUDA Compat is enabled and neither cuda-compat-mode=mount nor cuda-compat-mode=ldconfig is specified
+        // default to cuda-compat-mode=mount to maintain backward compatibility.
+        if (!(flags & OPT_NO_CNTLIBS) && !(flags & OPT_CUDA_COMPAT_MODE_MOUNT) && !(flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
+                // Set the bit with OR; masking with AND would clear every other option.
+                flags |= OPT_CUDA_COMPAT_MODE_MOUNT;
+        }
 
         log_infof("configuring container with '%s'", opts);
         if ((cnt = xcalloc(&ctx->err, 1, sizeof(*cnt))) == NULL)
@@ -293,5 +302,6 @@ nvc_container_free(struct nvc_container *cnt)
         free(cnt->mnt_ns);
         free(cnt->dev_cg);
         array_free(cnt->libs, cnt->nlibs);
+        free(cnt->cuda_compat_dir);
         free(cnt);
 }
diff --git a/src/nvc_internal.h b/src/nvc_internal.h
@@ -84,6 +84,7 @@ struct nvc_container {
         char *dev_cg;
         char **libs;
         size_t nlibs;
+        char *cuda_compat_dir;
 };
 
 enum {
diff --git a/src/nvc_ldcache.c b/src/nvc_ldcache.c
@@ -471,7 +471,15 @@ nvc_ldcache_update(struct nvc_context *ctx, const struct nvc_container *cnt)
         if (validate_args(ctx, cnt != NULL) < 0)
                 return (-1);
 
-        argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+        /*
+         * Both argument vectors are written as compound literals in a single
+         * expression at this scope. A compound literal created inside a nested
+         * if/else body ceases to exist at that body's closing brace, which
+         * would leave argv dangling when it is used below.
+         *
+         * If the cuda_compat_dir has been set, this means that the CUDA
+         * Forward compat libraries should take precedence over the user-mode
+         * driver libraries from the host, so that directory is listed first.
+         */
+        argv = (cnt->cuda_compat_dir == NULL)
+            ? (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL}
+            : (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cuda_compat_dir, cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+
         if (*argv[0] == '@') {
                 /*
                  * We treat this path specially to be relative to the host filesystem.
diff --git a/src/nvc_mount.c b/src/nvc_mount.c
@@ -40,7 +40,6 @@ static int  update_app_profile(struct error *, const struct nvc_container *, dev
 static void unmount(const char *);
 static int  symlink_library(struct error *, const char *, const char *, const char *, uid_t, gid_t);
 static int  symlink_libraries(struct error *, const struct nvc_container *, const char * const [], size_t);
-static void filter_libraries(const struct nvc_driver_info *, char * [], size_t *);
 static int  device_mount_dxcore(struct nvc_context *, const struct nvc_container *);
 static int  device_mount_native(struct nvc_context *, const struct nvc_container *, const struct nvc_device *);
 static int  cap_device_mount(struct nvc_context *, const struct nvc_container *, const char *);
@@ -562,27 +561,6 @@ symlink_libraries(struct error *err, const struct nvc_container *cnt, const char
         return (0);
 }
 
-static void
-filter_libraries(const struct nvc_driver_info *info, char * paths[], size_t *size)
-{
-        char *lib, *maj;
-
-        /*
-         * XXX Filter out any library that matches the major version of RM to prevent us from
-         * running into an unsupported configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
-         */
-        for (size_t i = 0; i < *size; ++i) {
-                lib = basename(paths[i]);
-                if ((maj = strstr(lib, ".so.")) != NULL) {
-                        maj += strlen(".so.");
-                        if (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")))
-                                continue;
-                }
-                paths[i] = NULL;
-        }
-        array_pack(paths, size);
-}
-
 static int
 device_mount_dxcore(struct nvc_context *ctx, const struct nvc_container *cnt)
 {
@@ -770,19 +748,11 @@ nvc_driver_mount(struct nvc_context *ctx, const struct nvc_container *cnt, const
 
         /* Container library mounts */
         if (cnt->libs != NULL && cnt->nlibs > 0) {
-                size_t nlibs = cnt->nlibs;
-                char **libs = array_copy(&ctx->err, (const char * const *)cnt->libs, cnt->nlibs);
-                if (libs == NULL)
-                        goto fail;
-
-                filter_libraries(info, libs, &nlibs);
-                if ((tmp = (const char **)mount_files(&ctx->err, cnt->cfg.rootfs, cnt, cnt->cfg.libs_dir, libs, nlibs)) == NULL) {
-                        free(libs);
+                if ((tmp = (const char **)mount_files(&ctx->err, cnt->cfg.rootfs, cnt, cnt->cfg.libs_dir, cnt->libs, cnt->nlibs)) == NULL) {
                         goto fail;
                 }
                 ptr = array_append(ptr, tmp, array_size(tmp));
                 free(tmp);
-                free(libs);
         }
 
         /* Firmware mounts */
diff --git a/src/options.h b/src/options.h
@@ -75,6 +75,8 @@ enum {
 #else
         OPT_COMPAT32      = 1 << 13,
 #endif /* defined(__powerpc64__) */
+        OPT_CUDA_COMPAT_MODE_MOUNT    = 1 << 14,
+        OPT_CUDA_COMPAT_MODE_LDCONFIG = 1 << 15,
 };
 
 static const struct option container_opts[] = {
@@ -90,6 +92,8 @@ static const struct option container_opts[] = {
         {"display", OPT_DISPLAY|OPT_GRAPHICS_LIBS},
         {"ngx", OPT_NGX_LIBS},
         {"compat32", OPT_COMPAT32},
+        {"cuda-compat-mode=mount", OPT_CUDA_COMPAT_MODE_MOUNT},
+        {"cuda-compat-mode=ldconfig", OPT_CUDA_COMPAT_MODE_LDCONFIG},
 };
 
 static const char * const default_container_opts = "standalone no-cgroups no-devbind utility";