Skip to content

Commit 96ec4e1

Browse files
committed
Add cuda-compat-mode flag to configure command
This change adds a --cuda-compat-mode flag to the configure CLI. This allows more flexibility than the existing --no-cntlibs flag. Possible values of the flag are: * mount (default) - CUDA compat libraries are mounted from /usr/local/cuda/compat to the standard library path in the container. * ldconfig - The folder containing the CUDA compat libraries is added as a command line argument to the ldconfig command executed in the container. * disabled - This is equivalent to specifying the --no-cntlibs flag. Signed-off-by: Evan Lezar <[email protected]>
1 parent a198166 commit 96ec4e1

File tree

9 files changed

+213
-32
lines changed

9 files changed

+213
-32
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ LIB_RPC_SRCS := $(SRCS_DIR)/nvc_rpc.h \
8686
$(SRCS_DIR)/nvc_clt.c
8787

8888
BIN_SRCS := $(SRCS_DIR)/cli/common.c \
89+
$(SRCS_DIR)/cli/compat_mode.c \
8990
$(SRCS_DIR)/cli/configure.c \
9091
$(SRCS_DIR)/cli/dsl.c \
9192
$(SRCS_DIR)/cli/info.c \

src/cli/compat_mode.c

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/**
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
**/
17+
#include <err.h>
18+
#include <libgen.h>
19+
#undef basename /* Use the GNU version of basename. */
20+
#include <stdlib.h>
21+
22+
#include "cli.h"
23+
#include "compat_mode.h"
24+
25+
static void filter_by_major_version(bool, const struct nvc_driver_info *, char * [], size_t *);
26+
static int get_compat_library_path(struct error *, const char * [], size_t, char **);
27+
28+
int
29+
update_compat_libraries(struct nvc_context *ctx, struct nvc_container *cnt, const struct nvc_driver_info *info) {
30+
if (cnt->libs == NULL || cnt->nlibs == 0) {
31+
return 0;
32+
}
33+
size_t nlibs = cnt->nlibs;
34+
char **libs = array_copy(&ctx->err, (const char * const *)cnt->libs, cnt->nlibs);
35+
if (libs == NULL) {
36+
return -1;
37+
}
38+
39+
// For cuda-compat-mode=mount, we also allow compat libraries with a LOWER major versions.
40+
bool allow_lower_major_versions = (cnt-> flags & OPT_CUDA_COMPAT_MODE_MOUNT);
41+
filter_by_major_version(allow_lower_major_versions, info, libs, &nlibs);
42+
43+
// We free the previously allocated libs to allow these to be reassigned if required.
44+
free(cnt->libs);
45+
cnt->libs = NULL;
46+
cnt->nlibs = 0;
47+
48+
if (cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG) {
49+
if (get_compat_library_path(&ctx->err, (const char **)libs, nlibs, &cnt->cuda_compat_dir) < 0) {
50+
goto fail;
51+
}
52+
// For cuda-compat-mode=ldconfig we don't require the library
53+
// paths since we already have the compat dir stored.
54+
free(libs);
55+
}
56+
if (cnt-> flags & OPT_CUDA_COMPAT_MODE_MOUNT) {
57+
// Munting is handled later. We only need to update the required
58+
// container information.
59+
cnt->libs = libs;
60+
cnt->nlibs = nlibs;
61+
}
62+
return (0);
63+
fail:
64+
free(libs);
65+
return (-1);
66+
}
67+
68+
static void
69+
filter_by_major_version(bool allow_lower_major_versions, const struct nvc_driver_info *info, char * paths[], size_t *size)
70+
{
71+
char *lib, *maj;
72+
bool exclude;
73+
/*
74+
* XXX Filter out any library that has a lower or equal major version than RM to prevent us from
75+
* running into an unsupported configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
76+
*/
77+
for (size_t i = 0; i < *size; ++i) {
78+
lib = basename(paths[i]);
79+
if ((maj = strstr(lib, ".so.")) != NULL) {
80+
maj += strlen(".so.");
81+
exclude = false;
82+
if (allow_lower_major_versions) {
83+
// Only filter out EQUAL RM versions.
84+
exclude = (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")) == 0);
85+
} else {
86+
// If the major version of RM is greater than or equal to the major version
87+
// of the library that we are considering, we remove the library from the
88+
// list.
89+
exclude = (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")) >= 0);
90+
}
91+
if (exclude) {
92+
paths[i] = NULL;
93+
}
94+
}
95+
}
96+
array_pack(paths, size);
97+
}
98+
99+
/*
 * Determine the single directory that contains every library in paths.
 *
 * On success with size > 0, *compat_dir_result receives a newly allocated
 * string holding that directory; the caller owns it and must free() it.
 * With size == 0 the function returns 0 and leaves *compat_dir_result
 * untouched.
 *
 * Returns 0 on success and -1 when the working copy cannot be allocated or
 * when the libraries do not all live in the same directory. In the failure
 * cases *compat_dir_result is not modified.
 *
 * NOTE(review): the strdup() failure and directory-mismatch paths do not
 * record a message in err.
 */
static int
get_compat_library_path(struct error *err, const char * paths[], size_t size, char **compat_dir_result)
{
        char **dirnames;
        char *compat_dir = NULL;
        int rv = -1;

        if (size == 0) {
                return (0);
        }

        /*
         * dirname() may modify its argument, so it is run on a copy of the list.
         * NOTE(review): if array_copy() copies only the pointers, the strings
         * referenced by paths are still edited in place -- confirm.
         */
        dirnames = array_copy(err, (const char * const *)paths, size);
        if (dirnames == NULL) {
                return (-1);
        }

        for (size_t i = 0; i < size; ++i) {
                const char *dir = dirname(dirnames[i]);

                if (compat_dir == NULL) {
                        /* The first entry defines the directory all others must match. */
                        if ((compat_dir = strdup(dir)) == NULL) {
                                goto out;
                        }
                        continue;
                }
                if (strcmp(dir, compat_dir) != 0) {
                        goto out;
                }
        }

        /* Hand ownership to the caller; clearing the local keeps the cleanup below from freeing it. */
        *compat_dir_result = compat_dir;
        compat_dir = NULL;
        rv = 0;

 out:
        free(compat_dir);
        free(dirnames);
        return (rv);
}

src/cli/compat_mode.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/**
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
**/
17+
18+
#ifndef HEADER_COMPAT_MODE_H
#define HEADER_COMPAT_MODE_H

/*
 * Forward declarations so that the prototype below refers to file-scope
 * struct types regardless of which headers were included before this one.
 */
struct nvc_context;
struct nvc_container;
struct nvc_driver_info;

// TODO: These are duplicated from options.h to prevent conflicts with the CLI
// options header. The values must be kept identical to the ones in options.h.
enum {
        /* cuda-compat-mode=mount */
        OPT_CUDA_COMPAT_MODE_MOUNT    = 1 << 14,
        /* cuda-compat-mode=ldconfig */
        OPT_CUDA_COMPAT_MODE_LDCONFIG = 1 << 15,
};

/*
 * Update the container's CUDA compat library settings for the given driver
 * information, according to the cuda-compat-mode bits in the container flags.
 * Returns 0 on success and -1 on failure.
 */
int update_compat_libraries(struct nvc_context *ctx, struct nvc_container *cnt, const struct nvc_driver_info *info);

#endif /* HEADER_COMPAT_MODE_H */

src/cli/configure.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cli.h"
99
#include "dsl.h"
10+
#include "compat_mode.h"
1011

1112
static error_t configure_parser(int, char *, struct argp_state *);
1213
static int check_cuda_version(const struct dsl_data *, enum dsl_comparator, const char *);
@@ -37,6 +38,7 @@ const struct argp configure_usage = {
3738
{"no-fabricmanager", 0x87, NULL, 0, "Don't include the NVIDIA fabricmanager socket", -1},
3839
{"no-gsp-firmware", 0x88, NULL, 0, "Don't include GSP Firmware", -1},
3940
{"no-cntlibs", 0x89, NULL, 0, "Don't overwrite host mounts with CUDA compat libs from the container", -1},
41+
{"cuda-compat-mode", 0x90, "MODE", 0, "The mode to use to support CUDA Forward Compatibility. One of [ mount (default) | ldconfig | disabled]", -1},
4042
{0},
4143
},
4244
configure_parser,
@@ -170,6 +172,19 @@ configure_parser(int key, char *arg, struct argp_state *state)
170172
if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
171173
goto fatal;
172174
break;
175+
case 0x90:
176+
// cuda-compat-mode=disabled is equivalent to no-cntlibs.
177+
if (strcmp(arg, "disabled") == 0) {
178+
if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
179+
goto fatal;
180+
} else {
181+
// We add cuda-compat-mode=$arg to the container_flags.
182+
if (str_join(&err, &ctx->container_flags, "cuda-compat-mode", " ") < 0)
183+
goto fatal;
184+
if (str_join(&err, &ctx->container_flags, arg, "=") < 0)
185+
goto fatal;
186+
}
187+
break;
173188
case ARGP_KEY_ARG:
174189
if (state->arg_num > 0)
175190
argp_usage(state);
@@ -316,6 +331,13 @@ configure_command(const struct context *ctx)
316331
goto fail;
317332
}
318333

334+
/* We now have the driver version and can update the list of compat
335+
libraries discovered above accordingly. */
336+
if (update_compat_libraries(nvc, cnt, drv) < 0) {
337+
warn("updating compat library settings failed: %s", libnvc.error(nvc));
338+
goto fail;
339+
}
340+
319341
/* Allocate space for selecting GPU devices and MIG devices */
320342
if (new_devices(&err, dev, &devices) < 0) {
321343
warn("memory allocation failed: %s", err.msg);

src/nvc_container.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,15 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
236236
error_setx(&ctx->err, "invalid mode of operation");
237237
return (NULL);
238238
}
239+
/*
 * The two cuda-compat-mode bits are mutually exclusive. The masked values
 * must be combined with a logical AND: a bitwise AND of two different
 * single-bit masks is always zero, so the conflict would never be detected.
 */
if ((flags & OPT_CUDA_COMPAT_MODE_MOUNT) && (flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
        error_setx(&ctx->err, "only one cuda-compat-mode can be specified at a time");
        return (NULL);
}
// If CUDA Compat is enabled and neither cuda-compat-mode=mount nor cuda-compat-mode=ldconfig is specified
// default to cuda-compat-mode=mount to maintain backward compatibility.
if (!(flags & OPT_NO_CNTLIBS) && !(flags & OPT_CUDA_COMPAT_MODE_MOUNT) && !(flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
        /* Set the bit with |=; using &= here would clear every flag, this one included. */
        flags |= OPT_CUDA_COMPAT_MODE_MOUNT;
}
239248

240249
log_infof("configuring container with '%s'", opts);
241250
if ((cnt = xcalloc(&ctx->err, 1, sizeof(*cnt))) == NULL)
@@ -293,5 +302,6 @@ nvc_container_free(struct nvc_container *cnt)
293302
free(cnt->mnt_ns);
294303
free(cnt->dev_cg);
295304
array_free(cnt->libs, cnt->nlibs);
305+
free(cnt->cuda_compat_dir);
296306
free(cnt);
297307
}

src/nvc_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ struct nvc_container {
8484
char *dev_cg;
8585
char **libs;
8686
size_t nlibs;
87+
char *cuda_compat_dir;
8788
};
8889

8990
enum {

src/nvc_ldcache.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,15 @@ nvc_ldcache_update(struct nvc_context *ctx, const struct nvc_container *cnt)
471471
if (validate_args(ctx, cnt != NULL) < 0)
472472
return (-1);
473473

474-
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
474+
if (cnt->cuda_compat_dir == NULL) {
475+
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
476+
} else {
477+
// If the cuda_compat_dir has been set, this means that the CUDA
478+
// Forward compat libraries should take precedence over the
479+
// user-mode driver libraries from the host.
480+
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cuda_compat_dir, cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
481+
}
482+
475483
if (*argv[0] == '@') {
476484
/*
477485
* We treat this path specially to be relative to the host filesystem.

src/nvc_mount.c

Lines changed: 1 addition & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ static int update_app_profile(struct error *, const struct nvc_container *, dev
4040
static void unmount(const char *);
4141
static int symlink_library(struct error *, const char *, const char *, const char *, uid_t, gid_t);
4242
static int symlink_libraries(struct error *, const struct nvc_container *, const char * const [], size_t);
43-
static void filter_libraries(const struct nvc_driver_info *, char * [], size_t *);
4443
static int device_mount_dxcore(struct nvc_context *, const struct nvc_container *);
4544
static int device_mount_native(struct nvc_context *, const struct nvc_container *, const struct nvc_device *);
4645
static int cap_device_mount(struct nvc_context *, const struct nvc_container *, const char *);
@@ -562,27 +561,6 @@ symlink_libraries(struct error *err, const struct nvc_container *cnt, const char
562561
return (0);
563562
}
564563

565-
static void
566-
filter_libraries(const struct nvc_driver_info *info, char * paths[], size_t *size)
567-
{
568-
char *lib, *maj;
569-
570-
/*
571-
* XXX Filter out any library that matches the major version of RM to prevent us from
572-
* running into an unsupported configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
573-
*/
574-
for (size_t i = 0; i < *size; ++i) {
575-
lib = basename(paths[i]);
576-
if ((maj = strstr(lib, ".so.")) != NULL) {
577-
maj += strlen(".so.");
578-
if (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")))
579-
continue;
580-
}
581-
paths[i] = NULL;
582-
}
583-
array_pack(paths, size);
584-
}
585-
586564
static int
587565
device_mount_dxcore(struct nvc_context *ctx, const struct nvc_container *cnt)
588566
{
@@ -770,19 +748,11 @@ nvc_driver_mount(struct nvc_context *ctx, const struct nvc_container *cnt, const
770748

771749
/* Container library mounts */
772750
if (cnt->libs != NULL && cnt->nlibs > 0) {
773-
size_t nlibs = cnt->nlibs;
774-
char **libs = array_copy(&ctx->err, (const char * const *)cnt->libs, cnt->nlibs);
775-
if (libs == NULL)
776-
goto fail;
777-
778-
filter_libraries(info, libs, &nlibs);
779-
if ((tmp = (const char **)mount_files(&ctx->err, cnt->cfg.rootfs, cnt, cnt->cfg.libs_dir, libs, nlibs)) == NULL) {
780-
free(libs);
751+
if ((tmp = (const char **)mount_files(&ctx->err, cnt->cfg.rootfs, cnt, cnt->cfg.libs_dir, cnt->libs, cnt->nlibs)) == NULL) {
781752
goto fail;
782753
}
783754
ptr = array_append(ptr, tmp, array_size(tmp));
784755
free(tmp);
785-
free(libs);
786756
}
787757

788758
/* Firmware mounts */

src/options.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ enum {
7575
#else
7676
OPT_COMPAT32 = 1 << 13,
7777
#endif /* defined(__powerpc64__) */
78+
OPT_CUDA_COMPAT_MODE_MOUNT = 1 << 14,
79+
OPT_CUDA_COMPAT_MODE_LDCONFIG = 1 << 15,
7880
};
7981

8082
static const struct option container_opts[] = {
@@ -90,6 +92,8 @@ static const struct option container_opts[] = {
9092
{"display", OPT_DISPLAY|OPT_GRAPHICS_LIBS},
9193
{"ngx", OPT_NGX_LIBS},
9294
{"compat32", OPT_COMPAT32},
95+
{"cuda-compat-mode=mount", OPT_CUDA_COMPAT_MODE_MOUNT},
96+
{"cuda-compat-mode=ldconfig", OPT_CUDA_COMPAT_MODE_LDCONFIG},
9397
};
9498

9599
static const char * const default_container_opts = "standalone no-cgroups no-devbind utility";

0 commit comments

Comments
 (0)