Device API 2.28 is not compatible with 2.29. Check the compile-time NCCL version against the runtime NCCL version and print an error if they are mismatched.

This commit is contained in:
Katie Gioioso 2025-11-25 19:11:18 +00:00
parent 24874bdaa8
commit 332e61896f
2 changed files with 42 additions and 15 deletions

View File

@ -23,6 +23,7 @@
#pragma weak ncclCommWindowDeregister
#pragma weak ncclDevCommCreate
#pragma weak ncclDevCommDestroy
#pragma weak ncclCommQueryProperties
#define DIVUP(x, y) \
(((x)+(y)-1)/(y))
@ -814,6 +815,13 @@ testResult_t threadInit(struct threadArgs* args) {
/* Create device communicators based on test-specific requirements */
if (deviceImpl) {
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,29,0)
if (test_ncclVersion < NCCL_VERSION(2,29,0)) {
fprintf(stderr,
"Incompatible NCCL versions. nccl-tests was compiled with NCCL %d, but is running with NCCL %d. "
"The %d Device API is not compatible with versions before 2.29.\n",
NCCL_VERSION_CODE, test_ncclVersion, NCCL_VERSION_CODE);
return testInvalidUsage;
}
ncclDevCommRequirements reqs = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER;
if (!ncclTestEngine.getDevCommRequirements) {
fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
@ -823,6 +831,12 @@ testResult_t threadInit(struct threadArgs* args) {
NCCLCHECK(ncclCommQueryProperties(args->comms[0], &commProperties));
TESTCHECK(ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs, &commProperties));
#else
/* This branch is compiled when NCCL_VERSION_CODE is below 2.29. Refuse to
 * continue if the NCCL library found at runtime reports 2.29 or newer, and
 * tell the user why. The wording matches the equivalent check in run(). */
if (test_ncclVersion >= NCCL_VERSION(2,29,0)) {
  fprintf(stderr, "Incompatible NCCL versions. nccl-tests was compiled with NCCL 2.28, but is running with NCCL %d. "
                  "The 2.28 Device API is not compatible with later versions.\n",
          test_ncclVersion);
  return testInvalidUsage;
}
ncclDevCommRequirements reqs = {};
if (!ncclTestEngine.getDevCommRequirements ||
!ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs)) {
@ -1396,21 +1410,33 @@ testResult_t run() {
/* Create device communicators based on test-specific requirements */
if (deviceImpl) {
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,29,0)
ncclDevCommRequirements reqs = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER;
if (!ncclTestEngine.getDevCommRequirements) {
fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
return testNotImplemented;
}
ncclCommProperties commProperties = NCCL_COMM_PROPERTIES_INITIALIZER;
NCCLCHECK(ncclCommQueryProperties(comms[0], &commProperties));
TESTCHECK(ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs, &commProperties));
if (test_ncclVersion < NCCL_VERSION(2,29,0)) {
fprintf(stderr,
"Incompatible NCCL versions. nccl-tests was compiled with NCCL %d, but is running with NCCL %d. "
"The %d Device API is not compatible with versions before 2.29.\n",
NCCL_VERSION_CODE, test_ncclVersion, NCCL_VERSION_CODE);
return testInvalidUsage;
}
ncclDevCommRequirements reqs = NCCL_DEV_COMM_REQUIREMENTS_INITIALIZER;
if (!ncclTestEngine.getDevCommRequirements) {
fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
return testNotImplemented;
}
ncclCommProperties commProperties = NCCL_COMM_PROPERTIES_INITIALIZER;
NCCLCHECK(ncclCommQueryProperties(comms[0], &commProperties));
TESTCHECK(ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs, &commProperties));
#else
ncclDevCommRequirements reqs = {};
if (!ncclTestEngine.getDevCommRequirements ||
!ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs)) {
fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
return testNotImplemented;
}
if (test_ncclVersion >= NCCL_VERSION(2,29,0)) {
fprintf(stderr, "Incompatible NCCL versions. nccl-tests was compiled with NCCL 2.28, but is running with NCCL %d. "
"The 2.28 Device API is not compatible with later versions.\n", test_ncclVersion);
return testInvalidUsage;
}
ncclDevCommRequirements reqs = {};
if (!ncclTestEngine.getDevCommRequirements ||
!ncclTestEngine.getDevCommRequirements(deviceImpl, &reqs)) {
fprintf(stderr, "Device implementation %d is not supported by this test\n", deviceImpl);
return testNotImplemented;
}
#endif
NCCLCHECK(ncclGroupStart());

View File

@ -72,7 +72,8 @@ typedef enum {
testNcclError = 3,
testTimeout = 4,
testNotImplemented = 5,
testNumResults = 6
testInvalidUsage = 6,
testNumResults = 7, // Must be last
} testResult_t;
// Relay errors up and trace