Skip to content

Commit d26524a

Browse files
authored
Merge pull request #307 from elezar/forward-compat-by-folder
Add cuda-compat-mode flag to configure command
2 parents a198166 + e03b6a8 commit d26524a

File tree

10 files changed

+247
-43
lines changed

10 files changed

+247
-43
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ LIB_RPC_SRCS := $(SRCS_DIR)/nvc_rpc.h \
8686
$(SRCS_DIR)/nvc_clt.c
8787

8888
BIN_SRCS := $(SRCS_DIR)/cli/common.c \
89+
$(SRCS_DIR)/cli/compat_mode.c \
8990
$(SRCS_DIR)/cli/configure.c \
9091
$(SRCS_DIR)/cli/dsl.c \
9192
$(SRCS_DIR)/cli/info.c \

src/cli/compat_mode.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/**
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
**/
17+
#include <err.h>
18+
#include <libgen.h>
19+
#undef basename /* Use the GNU version of basename. */
20+
#include <stdlib.h>
21+
22+
#include "cli.h"
23+
#include "compat_mode.h"
24+
25+
static void filter_by_major_version(bool, const struct nvc_driver_info *, char * [], size_t *);
26+
static int get_compat_library_path(struct error *, const char * [], size_t, char **);
27+
28+
int
29+
update_compat_libraries(struct nvc_context *ctx, struct nvc_container *cnt, const struct nvc_driver_info *info) {
30+
if (cnt->flags & OPT_CUDA_COMPAT_MODE_DISABLED) {
31+
return (0);
32+
}
33+
if (cnt->libs == NULL || cnt->nlibs == 0) {
34+
return (0);
35+
}
36+
size_t nlibs = cnt->nlibs;
37+
char **libs = array_copy(&ctx->err, (const char * const *)cnt->libs, cnt->nlibs);
38+
if (libs == NULL) {
39+
return (-1);
40+
}
41+
42+
/* For cuda-compat-mode=mount, we also allow compat libraries with a LOWER major versions. */
43+
bool allow_lower_major_versions = (cnt-> flags & OPT_CUDA_COMPAT_MODE_MOUNT);
44+
filter_by_major_version(allow_lower_major_versions, info, libs, &nlibs);
45+
46+
/* Use the filtered library list. */
47+
free(cnt->libs);
48+
cnt->libs = libs;
49+
cnt->nlibs = nlibs;
50+
51+
if (!(cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
52+
return (0);
53+
}
54+
/* For cuda-compat-mode=ldconfig we also ensure that cuda_compat_dir is set. */
55+
if (get_compat_library_path(&ctx->err, (const char **)libs, nlibs, &cnt->cuda_compat_dir) < 0) {
56+
return (-1);
57+
}
58+
return (0);
59+
}
60+
61+
static void
62+
filter_by_major_version(bool allow_lower_major_versions, const struct nvc_driver_info *info, char * paths[], size_t *size)
63+
{
64+
char *lib, *maj;
65+
bool exclude;
66+
/*
67+
* XXX Filter out any library that has a lower or equal major version than RM to prevent us from
68+
* running into an unsupported configurations (e.g. CUDA compat on Geforce or non-LTS drivers).
69+
*/
70+
for (size_t i = 0; i < *size; ++i) {
71+
lib = basename(paths[i]);
72+
if ((maj = strstr(lib, ".so.")) != NULL) {
73+
maj += strlen(".so.");
74+
exclude = false;
75+
if (allow_lower_major_versions) {
76+
// Only filter out EQUAL RM versions.
77+
exclude = (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")) == 0);
78+
} else {
79+
// If the major version of RM is greater than or equal to the major version
80+
// of the library that we are considering, we remove the library from the
81+
// list.
82+
exclude = (strncmp(info->nvrm_version, maj, strspn(maj, "0123456789")) >= 0);
83+
}
84+
if (exclude) {
85+
paths[i] = NULL;
86+
}
87+
}
88+
}
89+
array_pack(paths, size);
90+
}
91+
92+
/*
 * get_compat_library_path determines the single directory that contains all
 * of the given library paths.
 *
 * err                - receives a message when the function fails.
 * paths / size       - the library paths to inspect; they are not modified.
 * compat_dir_result  - on success with size > 0, receives a newly allocated
 *                      string holding the common directory; the caller owns
 *                      it and must free() it. Left untouched when size == 0
 *                      or on failure.
 *
 * Returns 0 on success (including size == 0) and -1 if memory allocation
 * fails or the libraries do not all live in the same directory.
 */
static int
get_compat_library_path(struct error *err, const char * paths[], size_t size, char **compat_dir_result)
{
        char **dirnames;
        char *compat_dir = NULL;
        int rv = -1;

        if (size == 0)
                return (0);

        /*
         * dirname(3) may modify the string it is given, so work on a private
         * copy. array_copy is expected to duplicate each string, which is why
         * the copy is released with array_free below.
         */
        if ((dirnames = array_copy(err, (const char * const *)paths, size)) == NULL)
                return (-1);

        for (size_t i = 0; i < size; ++i) {
                const char *dir = dirname(dirnames[i]);

                if (i == 0) {
                        /* The first entry defines the directory all others must match. */
                        if ((compat_dir = strdup(dir)) == NULL) {
                                error_setx(err, "memory allocation failed");
                                goto out;
                        }
                        continue;
                }
                if (strcmp(dir, compat_dir) != 0) {
                        error_setx(err, "cuda compat libraries are not located in a single directory");
                        goto out;
                }
        }

        /* Hand ownership of the directory string to the caller. */
        *compat_dir_result = compat_dir;
        compat_dir = NULL;
        rv = 0;

 out:
        /* Single cleanup path: the temporary copy is released on success and failure alike. */
        free(compat_dir);
        array_free(dirnames, size);
        return (rv);
}

src/cli/compat_mode.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/**
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
**/
17+
18+
#ifndef HEADER_COMPAT_MODE_H
19+
#define HEADER_COMPAT_MODE_H
20+
21+
// TODO: These are duplicated from options.h to prevent conflicts with the CLI
22+
// options header.
23+
/*
 * Bit flags selecting how CUDA Forward Compatibility libraries are handled
 * (the --cuda-compat-mode option of the configure command):
 *  - DISABLED: compat libraries are not looked up or updated at all.
 *  - LDCONFIG: the directory holding the compat libraries is recorded in the
 *    container's cuda_compat_dir and added to the ldconfig command line.
 *  - MOUNT:    the intended default; compat libraries with a lower major
 *    version than the driver are also allowed.
 * DISABLED overrides the other two; MOUNT and LDCONFIG are mutually exclusive.
 */
enum {
24+
/* OPT_CUDA_COMPAT_MODE_DISABLED replaced OPT_NO_CNTLIBS. */
25+
OPT_CUDA_COMPAT_MODE_DISABLED = 1 << 14,
26+
OPT_CUDA_COMPAT_MODE_LDCONFIG = 1 << 15,
27+
OPT_CUDA_COMPAT_MODE_MOUNT = 1 << 16,
28+
};
29+
30+
int update_compat_libraries(struct nvc_context *, struct nvc_container *, const struct nvc_driver_info *);
31+
32+
33+
#endif /* HEADER_COMPAT_MODE_H */

src/cli/configure.c

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "cli.h"
99
#include "dsl.h"
10+
#include "compat_mode.h"
1011

1112
static error_t configure_parser(int, char *, struct argp_state *);
1213
static int check_cuda_version(const struct dsl_data *, enum dsl_comparator, const char *);
@@ -36,7 +37,8 @@ const struct argp configure_usage = {
3637
{"no-persistenced", 0x86, NULL, 0, "Don't include the NVIDIA persistenced socket", -1},
3738
{"no-fabricmanager", 0x87, NULL, 0, "Don't include the NVIDIA fabricmanager socket", -1},
3839
{"no-gsp-firmware", 0x88, NULL, 0, "Don't include GSP Firmware", -1},
39-
{"no-cntlibs", 0x89, NULL, 0, "Don't overwrite host mounts with CUDA compat libs from the container", -1},
40+
{"no-cntlibs", 0x89, NULL, 0, "[Deprecated] Equivalent to --cuda-compat-mode=disabled", -1},
41+
{"cuda-compat-mode", 0x90, "MODE", 0, "The mode to use to support CUDA Forward Compatibility. One of [ mount (default) | ldconfig | disabled]", -1},
4042
{0},
4143
},
4244
configure_parser,
@@ -167,7 +169,15 @@ configure_parser(int key, char *arg, struct argp_state *state)
167169
goto fatal;
168170
break;
169171
case 0x89:
170-
if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
172+
/* The --no-cntlibs command line flag is equivalent to --cuda-compat-mode=disabled. */
173+
if (str_join(&err, &ctx->container_flags, "cuda-compat-mode=disabled", " ") < 0)
174+
goto fatal;
175+
break;
176+
case 0x90:
177+
/* We add cuda-compat-mode=$arg to the container_flags. */
178+
if (str_join(&err, &ctx->container_flags, "cuda-compat-mode", " ") < 0)
179+
goto fatal;
180+
if (str_join(&err, &ctx->container_flags, arg, "=") < 0)
171181
goto fatal;
172182
break;
173183
case ARGP_KEY_ARG:
@@ -316,6 +326,15 @@ configure_command(const struct context *ctx)
316326
goto fail;
317327
}
318328

329+
/*
330+
* We now have the driver version and can update the list of compat
331+
* libraries discovered above accordingly.
332+
*/
333+
if (update_compat_libraries(nvc, cnt, drv) < 0) {
334+
warn("updating compat library settings failed: %s", libnvc.error(nvc));
335+
goto fail;
336+
}
337+
319338
/* Allocate space for selecting GPU devices and MIG devices */
320339
if (new_devices(&err, dev, &devices) < 0) {
321340
warn("memory allocation failed: %s", err.msg);

src/cli/list.c

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ const struct argp list_usage = {
2525
{"no-persistenced", 0x84, NULL, 0, "Don't include the NVIDIA persistenced socket", -1},
2626
{"no-fabricmanager", 0x85, NULL, 0, "Don't include the NVIDIA fabricmanager socket", -1},
2727
{"no-gsp-firmware", 0x86, NULL, 0, "Don't include GSP Firmware", -1},
28-
{"no-cntlibs", 0x87, NULL, 0, "Don't overwrite host mounts with CUDA compat libs from the container", -1},
2928
{0},
3029
},
3130
list_parser,
@@ -87,10 +86,6 @@ list_parser(int key, char *arg, struct argp_state *state)
8786
goto fatal;
8887
ctx->list_firmwares = false;
8988
break;
90-
case 0x87:
91-
if (str_join(&err, &ctx->container_flags, "no-cntlibs", " ") < 0)
92-
goto fatal;
93-
break;
9489
case ARGP_KEY_END:
9590
if (state->argc == 1 || (state->argc == 2 && ctx->imex_channels != NULL)) {
9691
if ((ctx->devices = xstrdup(&err, "all")) == NULL)

src/nvc_container.c

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ static char *find_namespace_path(struct error *, const struct nvc_container *, c
2424
static int find_compat_library_paths(struct error *, struct nvc_container *);
2525
static int lookup_owner(struct error *, struct nvc_container *);
2626
static int copy_config(struct error *, struct nvc_container *, const struct nvc_container_config *);
27+
static int validate_cuda_compat_mode_flags(struct error *, int32_t *);
2728

2829
struct nvc_container_config *
2930
nvc_container_config_new(pid_t pid, const char *rootfs)
@@ -236,6 +237,9 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
236237
error_setx(&ctx->err, "invalid mode of operation");
237238
return (NULL);
238239
}
240+
if (validate_cuda_compat_mode_flags(&ctx->err, &flags) < 0) {
241+
return (NULL);
242+
}
239243

240244
log_infof("configuring container with '%s'", opts);
241245
if ((cnt = xcalloc(&ctx->err, 1, sizeof(*cnt))) == NULL)
@@ -246,7 +250,7 @@ nvc_container_new(struct nvc_context *ctx, const struct nvc_container_config *cf
246250
goto fail;
247251
if (lookup_owner(&ctx->err, cnt) < 0)
248252
goto fail;
249-
if (!(flags & OPT_NO_CNTLIBS)) {
253+
if (!(flags & OPT_CUDA_COMPAT_MODE_DISABLED)) {
250254
if (find_compat_library_paths(&ctx->err, cnt) < 0)
251255
goto fail;
252256
}
@@ -293,5 +297,41 @@ nvc_container_free(struct nvc_container *cnt)
293297
free(cnt->mnt_ns);
294298
free(cnt->dev_cg);
295299
array_free(cnt->libs, cnt->nlibs);
300+
free(cnt->cuda_compat_dir);
296301
free(cnt);
297302
}
303+
304+
/*
305+
* validate_cuda_compat_mode_flags checks the options associated with the
306+
* cuda-compat-mode flags.
307+
* This function does the following:
308+
* - Ensures that if OPT_CUDA_COMPAT_MODE_DISABLED is set, other modes are ignored.
309+
* - Ensures that the mode is set to the default (OPT_CUDA_COMPAT_MODE_MOUNT) if unset.
310+
* - Ensures that only a single mode is set.
311+
*/
312+
static int
313+
validate_cuda_compat_mode_flags(struct error *err, int32_t *flags) {
314+
if (*flags & OPT_CUDA_COMPAT_MODE_DISABLED) {
315+
/*
316+
* If the OPT_CUDA_COMPAT_MODE_DISABLED flag is specified, we
317+
* explicitly ignore other OP_CUDA_COMPAT_MODE_* flags.
318+
*/
319+
*flags &= ~(OPT_CUDA_COMPAT_MODE_MOUNT | OPT_CUDA_COMPAT_MODE_LDCONFIG);
320+
return (0);
321+
}
322+
if (!(*flags & (OPT_CUDA_COMPAT_MODE_LDCONFIG | OPT_CUDA_COMPAT_MODE_MOUNT))) {
323+
/*
324+
* If no OPT_CUDA_COMPAT_MODE_* flags are specified,
325+
* default to OPT_CUDA_COMPAT_MODE_MOUNT to maintain
326+
* backward compatibility.
327+
*/
328+
*flags &= OPT_CUDA_COMPAT_MODE_MOUNT;
329+
return (0);
330+
}
331+
332+
if ((*flags & OPT_CUDA_COMPAT_MODE_MOUNT) && (*flags & OPT_CUDA_COMPAT_MODE_LDCONFIG)) {
333+
error_setx(err, "only one cuda-compat-mode can be specified at a time");
334+
return (-1);
335+
}
336+
return (0);
337+
}

src/nvc_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ struct nvc_container {
8484
char *dev_cg;
8585
char **libs;
8686
size_t nlibs;
87+
char *cuda_compat_dir;
8788
};
8889

8990
enum {

src/nvc_ldcache.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "nvc_internal.h"
3131

3232
#include "error.h"
33+
#include "options.h"
3334
#include "utils.h"
3435
#include "xfuncs.h"
3536

@@ -471,7 +472,19 @@ nvc_ldcache_update(struct nvc_context *ctx, const struct nvc_container *cnt)
471472
if (validate_args(ctx, cnt != NULL) < 0)
472473
return (-1);
473474

474-
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
475+
if (cnt->flags & OPT_CUDA_COMPAT_MODE_LDCONFIG && cnt->cuda_compat_dir != NULL) {
476+
/*
477+
* We include the cuda_compat_dir directory on the ldconfig
478+
* command line. This ensures that the CUDA Forward compat
479+
* libraries take precendence over the user-mode driver
480+
* libraries in the standard library paths (libs_dir and
481+
* libs32_dir).
482+
* */
483+
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cuda_compat_dir, cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
484+
} else {
485+
argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
486+
}
487+
475488
if (*argv[0] == '@') {
476489
/*
477490
* We treat this path specially to be relative to the host filesystem.

0 commit comments

Comments
 (0)