mirror of
https://github.com/NVIDIA/nccl-tests.git
synced 2026-04-23 16:08:20 +08:00
Perftests: Introduce NCCL_TESTS_SPLIT env
`NCCL_TESTS_SPLIT` serves as a new way of computing the color for splitting communicators. It will be overridden by `NCCL_TESTS_SPLIT_MASK` when both are set. Examples: NCCL_TESTS_SPLIT_MASK="0x7" # color = rank & 0x7. What we do today to run on a DGX with one GPU per node. NCCL_TESTS_SPLIT="AND 0x7" # color = rank & 0x7. New way to run on one GPU per node on a DGX, equivalent to NCCL_TESTS_SPLIT_MASK=0x7. NCCL_TESTS_SPLIT="MOD 72" # color = rank % 72. One GPU per NVLink domain on an NVL72 system. NCCL_TESTS_SPLIT="DIV 72" # color = rank / 72. Intra NVLink domain on NVL72. You can also use "%", "&", "|" and "/" as shorthand for MOD, AND, OR and DIV. Extra spaces between the operator and the number are ignored, and the operator is not case sensitive. The following are all equivalent: NCCL_TESTS_SPLIT="&0x7" NCCL_TESTS_SPLIT="&0b111" NCCL_TESTS_SPLIT="AND 7" NCCL_TESTS_SPLIT="and 0x7"
This commit is contained in:
parent
cb6a46fdd6
commit
a89cf07fe8
@ -10,6 +10,8 @@
|
||||
#include <type_traits>
|
||||
#include <getopt.h>
|
||||
#include <libgen.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include "cuda.h"
|
||||
|
||||
#include "../verifiable/verifiable.h"
|
||||
@ -892,6 +894,26 @@ int main(int argc, char* argv[]) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef MPI_SUPPORT
|
||||
// Parse an integer from `s` into `*num`.
//
// Leading whitespace is skipped. A "0b"/"0B" prefix selects base 2; any other
// input is handed to strtoul() with base 0, which accepts decimal, "0x" hex
// and leading-"0" octal. Characters following the number are ignored.
//
// Returns true and stores the value (narrowed to int) in *num when at least
// one digit was consumed. Returns false and leaves *num untouched when either
// pointer is NULL, the string is empty or all whitespace, or no digits follow
// (including a bare "0b" prefix).
static bool parseInt(char *s, int *num) {
  if (!s || !num)
    return false;

  // <ctype.h> classifiers require a value representable as unsigned char;
  // passing a negative plain char is undefined behaviour.
  while (*s && isspace((unsigned char)*s)) ++s;
  if (!*s) return false;

  // Track where the digits are expected to start so that the "nothing was
  // converted" check below compares against the pointer actually given to
  // strtoul(), not the start of the optional "0b" prefix.
  int base = 0;
  char *digits = s;
  if (strncasecmp(s, "0b", 2) == 0) {
    base = 2;
    digits = s + 2;
  }

  char *end = NULL;
  unsigned long value = strtoul(digits, &end, base);
  if (end == digits)
    return false;

  *num = (int)value;
  return true;
}
|
||||
#endif
|
||||
|
||||
testResult_t run() {
|
||||
int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
|
||||
int localRank = 0;
|
||||
@ -909,10 +931,33 @@ testResult_t run() {
|
||||
if (hostHashs[p] == hostHashs[proc]) localRank++;
|
||||
}
|
||||
|
||||
char* str = getenv("NCCL_TESTS_SPLIT_MASK");
|
||||
uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
|
||||
char *splitMaskEnv = NULL;
|
||||
if (splitMaskEnv = getenv("NCCL_TESTS_SPLIT_MASK")) {
|
||||
color = proc & strtoul(splitMaskEnv, NULL, 16);
|
||||
} else if (splitMaskEnv = getenv("NCCL_TESTS_SPLIT")) {
|
||||
if (
|
||||
(strncasecmp(splitMaskEnv, "AND", strlen("AND")) == 0 && parseInt(splitMaskEnv + strlen("AND"), &color)) ||
|
||||
(strncasecmp(splitMaskEnv, "&", strlen("&")) == 0 && parseInt(splitMaskEnv + strlen("&"), &color))
|
||||
)
|
||||
color = proc & color;
|
||||
if (
|
||||
(strncasecmp(splitMaskEnv, "OR", strlen("OR")) == 0 && parseInt(splitMaskEnv + strlen("OR"), &color)) ||
|
||||
(strncasecmp(splitMaskEnv, "|", strlen("|")) == 0 && parseInt(splitMaskEnv + strlen("|"), &color))
|
||||
)
|
||||
color = proc | color;
|
||||
if (
|
||||
(strncasecmp(splitMaskEnv, "MOD", strlen("MOD")) == 0 && parseInt(splitMaskEnv + strlen("MOD"), &color)) ||
|
||||
(strncasecmp(splitMaskEnv, "%", strlen("%")) == 0 && parseInt(splitMaskEnv + strlen("%"), &color))
|
||||
)
|
||||
color = proc % color;
|
||||
if (
|
||||
(strncasecmp(splitMaskEnv, "DIV", strlen("DIV")) == 0 && parseInt(splitMaskEnv + strlen("DIV"), &color)) ||
|
||||
(strncasecmp(splitMaskEnv, "/", strlen("/")) == 0 && parseInt(splitMaskEnv + strlen("/"), &color))
|
||||
)
|
||||
color = proc / color;
|
||||
}
|
||||
|
||||
MPI_Comm mpi_comm;
|
||||
color = proc & mask;
|
||||
MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
|
||||
MPI_Comm_size(mpi_comm, &ncclProcs);
|
||||
MPI_Comm_rank(mpi_comm, &ncclProc);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user