32
32
/* Shared-object names of the CUDA driver library and the NVML library. */
#define SONAME_LIBCUDA  "libcuda.so.1"
#define SONAME_LIBNVML  "libnvidia-ml.so.1"

/*
 * Capacity of the device_handles table: driver_get_device_1_svc() rejects
 * any device index >= MAX_DEVICES with a "too many devices" error.
 */
#define MAX_DEVICES     64

/* NOTE(review): presumably a timeout in milliseconds used by reap_process() -- confirm. */
#define REAP_TIMEOUT_MS 10

/* Forward declarations of helpers private to this file. */
static int reset_cuda_environment(struct error *);
static int setup_rpc_client(struct driver *);
static noreturn void setup_rpc_service(struct driver *, uid_t, gid_t, pid_t);
static int reap_process(struct error *, pid_t, int, bool);
41
42
43
+ static struct driver_device {
44
+ nvmlDevice_t nvml ;
45
+ CUdevice cuda ;
46
+ } device_handles [MAX_DEVICES ];
47
+
42
48
#define call_nvml (ctx , sym , ...) __extension__ ({ \
43
49
union {void *ptr; __typeof__(&sym) fn;} u_; \
44
50
nvmlReturn_t r_; \
@@ -83,7 +89,7 @@ reset_cuda_environment(struct error *err)
83
89
const struct { const char * name , * value ; } env [] = {
84
90
{"CUDA_DISABLE_UNIFIED_MEMORY" , "1" },
85
91
{"CUDA_CACHE_DISABLE" , "1" },
86
- {"CUDA_DEVICE_ORDER" , "FASTEST_FIRST " },
92
+ {"CUDA_DEVICE_ORDER" , "PCI_BUS_ID " },
87
93
{"CUDA_VISIBLE_DEVICES" , NULL },
88
94
{"CUDA_MPS_PIPE_DIRECTORY" , "/dev/null" },
89
95
};
@@ -418,49 +424,46 @@ driver_get_device_count_1_svc(ptr_t ctxptr, driver_get_device_count_res *res, ma
418
424
}
419
425
420
426
int
421
- driver_get_device_handle (struct driver * ctx , unsigned int idx , driver_device_handle * dev , bool pci_order )
427
+ driver_get_device (struct driver * ctx , unsigned int idx , struct driver_device * * dev )
422
428
{
423
- struct driver_get_device_handle_res res = {0 };
429
+ struct driver_get_device_res res = {0 };
424
430
int rv = -1 ;
425
431
426
- if (call_rpc (ctx , & res , driver_get_device_handle_1 , idx , pci_order ) < 0 )
432
+ if (call_rpc (ctx , & res , driver_get_device_1 , idx ) < 0 )
427
433
goto fail ;
428
- * dev = (driver_device_handle )res .driver_get_device_handle_res_u . handle ;
434
+ * dev = (struct driver_device * )res .driver_get_device_res_u . dev ;
429
435
rv = 0 ;
430
436
431
437
fail :
432
- xdr_free ((xdrproc_t )xdr_driver_get_device_handle_res , (caddr_t )& res );
438
+ xdr_free ((xdrproc_t )xdr_driver_get_device_res , (caddr_t )& res );
433
439
return (rv );
434
440
}
435
441
436
442
bool_t
437
- driver_get_device_handle_1_svc (ptr_t ctxptr , u_int idx , bool_t pci_order , driver_get_device_handle_res * res , maybe_unused struct svc_req * req )
443
+ driver_get_device_1_svc (ptr_t ctxptr , u_int idx , driver_get_device_res * res , maybe_unused struct svc_req * req )
438
444
{
439
445
struct driver * ctx = (struct driver * )ctxptr ;
440
- driver_device_handle handle ;
441
- CUdevice cudev ;
442
446
int domainid , deviceid , busid ;
443
447
char buf [NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE ];
444
448
445
449
memset (res , 0 , sizeof (* res ));
446
- if (pci_order ) {
447
- if (call_nvml (ctx , nvmlDeviceGetHandleByIndex , idx , & handle ) < 0 )
448
- goto fail ;
449
- } else {
450
- if (call_cuda (ctx , cuDeviceGet , & cudev , (int )idx ) < 0 )
451
- goto fail ;
452
- if (call_cuda (ctx , cuDeviceGetAttribute , & domainid , CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID , cudev ) < 0 )
453
- goto fail ;
454
- if (call_cuda (ctx , cuDeviceGetAttribute , & busid , CU_DEVICE_ATTRIBUTE_PCI_BUS_ID , cudev ) < 0 )
455
- goto fail ;
456
- if (call_cuda (ctx , cuDeviceGetAttribute , & deviceid , CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID , cudev ) < 0 )
457
- goto fail ;
458
- snprintf (buf , sizeof (buf ), "%04x:%02x:%02x.0" , domainid , busid , deviceid );
459
-
460
- if (call_nvml (ctx , nvmlDeviceGetHandleByPciBusId , buf , & handle ) < 0 )
461
- goto fail ;
450
+ if (idx >= MAX_DEVICES ) {
451
+ error_setx (ctx -> err , "too many devices" );
452
+ goto fail ;
462
453
}
463
- res -> driver_get_device_handle_res_u .handle = (ptr_t )handle ;
454
+ if (call_cuda (ctx , cuDeviceGet , & device_handles [idx ].cuda , (int )idx ) < 0 )
455
+ goto fail ;
456
+ if (call_cuda (ctx , cuDeviceGetAttribute , & domainid , CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID , device_handles [idx ].cuda ) < 0 )
457
+ goto fail ;
458
+ if (call_cuda (ctx , cuDeviceGetAttribute , & busid , CU_DEVICE_ATTRIBUTE_PCI_BUS_ID , device_handles [idx ].cuda ) < 0 )
459
+ goto fail ;
460
+ if (call_cuda (ctx , cuDeviceGetAttribute , & deviceid , CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID , device_handles [idx ].cuda ) < 0 )
461
+ goto fail ;
462
+ snprintf (buf , sizeof (buf ), "%04x:%02x:%02x.0" , domainid , busid , deviceid );
463
+ if (call_nvml (ctx , nvmlDeviceGetHandleByPciBusId , buf , & device_handles [idx ].nvml ) < 0 )
464
+ goto fail ;
465
+
466
+ res -> driver_get_device_res_u .dev = (ptr_t )& device_handles [idx ];
464
467
return (true);
465
468
466
469
fail :
@@ -469,7 +472,7 @@ driver_get_device_handle_1_svc(ptr_t ctxptr, u_int idx, bool_t pci_order, driver
469
472
}
470
473
471
474
int
472
- driver_get_device_minor (struct driver * ctx , driver_device_handle dev , unsigned int * minor )
475
+ driver_get_device_minor (struct driver * ctx , struct driver_device * dev , unsigned int * minor )
473
476
{
474
477
struct driver_get_device_minor_res res = {0 };
475
478
int rv = -1 ;
@@ -488,10 +491,11 @@ bool_t
488
491
driver_get_device_minor_1_svc (ptr_t ctxptr , ptr_t dev , driver_get_device_minor_res * res , maybe_unused struct svc_req * req )
489
492
{
490
493
struct driver * ctx = (struct driver * )ctxptr ;
494
+ struct driver_device * handle = (struct driver_device * )dev ;
491
495
unsigned int minor ;
492
496
493
497
memset (res , 0 , sizeof (* res ));
494
- if (call_nvml (ctx , nvmlDeviceGetMinorNumber , ( nvmlDevice_t ) dev , & minor ) < 0 )
498
+ if (call_nvml (ctx , nvmlDeviceGetMinorNumber , handle -> nvml , & minor ) < 0 )
495
499
goto fail ;
496
500
res -> driver_get_device_minor_res_u .minor = minor ;
497
501
return (true);
@@ -502,7 +506,7 @@ driver_get_device_minor_1_svc(ptr_t ctxptr, ptr_t dev, driver_get_device_minor_r
502
506
}
503
507
504
508
int
505
- driver_get_device_busid (struct driver * ctx , driver_device_handle dev , char * * busid )
509
+ driver_get_device_busid (struct driver * ctx , struct driver_device * dev , char * * busid )
506
510
{
507
511
struct driver_get_device_busid_res res = {0 };
508
512
int rv = -1 ;
@@ -522,10 +526,11 @@ bool_t
522
526
driver_get_device_busid_1_svc (ptr_t ctxptr , ptr_t dev , driver_get_device_busid_res * res , maybe_unused struct svc_req * req )
523
527
{
524
528
struct driver * ctx = (struct driver * )ctxptr ;
529
+ struct driver_device * handle = (struct driver_device * )dev ;
525
530
nvmlPciInfo_t pci ;
526
531
527
532
memset (res , 0 , sizeof (* res ));
528
- if (call_nvml (ctx , nvmlDeviceGetPciInfo_v2 , ( nvmlDevice_t ) dev , & pci ) < 0 )
533
+ if (call_nvml (ctx , nvmlDeviceGetPciInfo_v2 , handle -> nvml , & pci ) < 0 )
529
534
goto fail ;
530
535
if ((res -> driver_get_device_busid_res_u .busid = xstrdup (ctx -> err , pci .busId )) == NULL )
531
536
goto fail ;
@@ -537,7 +542,7 @@ driver_get_device_busid_1_svc(ptr_t ctxptr, ptr_t dev, driver_get_device_busid_r
537
542
}
538
543
539
544
int
540
- driver_get_device_uuid (struct driver * ctx , driver_device_handle dev , char * * uuid )
545
+ driver_get_device_uuid (struct driver * ctx , struct driver_device * dev , char * * uuid )
541
546
{
542
547
struct driver_get_device_uuid_res res = {0 };
543
548
int rv = -1 ;
@@ -557,10 +562,11 @@ bool_t
557
562
driver_get_device_uuid_1_svc (ptr_t ctxptr , ptr_t dev , driver_get_device_uuid_res * res , maybe_unused struct svc_req * req )
558
563
{
559
564
struct driver * ctx = (struct driver * )ctxptr ;
565
+ struct driver_device * handle = (struct driver_device * )dev ;
560
566
char buf [NVML_DEVICE_UUID_BUFFER_SIZE ];
561
567
562
568
memset (res , 0 , sizeof (* res ));
563
- if (call_nvml (ctx , nvmlDeviceGetUUID , ( nvmlDevice_t ) dev , buf , sizeof (buf )) < 0 )
569
+ if (call_nvml (ctx , nvmlDeviceGetUUID , handle -> nvml , buf , sizeof (buf )) < 0 )
564
570
goto fail ;
565
571
if ((res -> driver_get_device_uuid_res_u .uuid = xstrdup (ctx -> err , buf )) == NULL )
566
572
goto fail ;
@@ -570,3 +576,42 @@ driver_get_device_uuid_1_svc(ptr_t ctxptr, ptr_t dev, driver_get_device_uuid_res
570
576
error_to_xdr (ctx -> err , res );
571
577
return (true);
572
578
}
579
+
580
+ int
581
+ driver_get_device_arch (struct driver * ctx , struct driver_device * dev , char * * arch )
582
+ {
583
+ struct driver_get_device_arch_res res = {0 };
584
+ int rv = -1 ;
585
+
586
+ if (call_rpc (ctx , & res , driver_get_device_arch_1 , (ptr_t )dev ) < 0 )
587
+ goto fail ;
588
+ if (xasprintf (ctx -> err , arch , "%u.%u" , res .driver_get_device_arch_res_u .arch .major ,
589
+ res .driver_get_device_arch_res_u .arch .minor ) < 0 )
590
+ goto fail ;
591
+ rv = 0 ;
592
+
593
+ fail :
594
+ xdr_free ((xdrproc_t )xdr_driver_get_device_arch_res , (caddr_t )& res );
595
+ return (rv );
596
+ }
597
+
598
+ bool_t
599
+ driver_get_device_arch_1_svc (ptr_t ctxptr , ptr_t dev , driver_get_device_arch_res * res , maybe_unused struct svc_req * req )
600
+ {
601
+ struct driver * ctx = (struct driver * )ctxptr ;
602
+ struct driver_device * handle = (struct driver_device * )dev ;
603
+ int major , minor ;
604
+
605
+ memset (res , 0 , sizeof (* res ));
606
+ if (call_cuda (ctx , cuDeviceGetAttribute , & major , CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR , handle -> cuda ) < 0 )
607
+ goto fail ;
608
+ if (call_cuda (ctx , cuDeviceGetAttribute , & minor , CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR , handle -> cuda ) < 0 )
609
+ goto fail ;
610
+ res -> driver_get_device_arch_res_u .arch .major = (unsigned int )major ;
611
+ res -> driver_get_device_arch_res_u .arch .minor = (unsigned int )minor ;
612
+ return (true);
613
+
614
+ fail :
615
+ error_to_xdr (ctx -> err , res );
616
+ return (true);
617
+ }
0 commit comments