struct ncclTopoGraph {
  // Input / output
  int id;            // ring : 0, tree : 1, collnet : 2
  int pattern;
  int crossNic;
  int collNet;
  int minChannels;
  int maxChannels;
  // Output
  int nChannels;     // number of channels found by the search
  float speedIntra;  // per-channel bandwidth inside a node
  float...
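To see how the two halves of the struct are used, here is a hedged sketch of how NCCL's init path fills in the input fields before the channel search and reads the outputs afterwards. It mirrors the ring-graph setup in init.cc, but the concrete values (minChannels, maxChannels) are illustrative and may differ between NCCL versions:

// Sketch: fill the input fields, run the topology search, read the outputs.
struct ncclTopoGraph ringGraph;
ringGraph.id = 0;                           // 0 selects the ring graph
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
ringGraph.crossNic = ncclParamCrossNic();
ringGraph.collNet = 0;
ringGraph.minChannels = 1;                  // illustrative bounds
ringGraph.maxChannels = MAXCHANNELS/2;
NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
// After the search, ringGraph.nChannels tells how many rings were found
// and ringGraph.speedIntra the per-channel intra-node bandwidth.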
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) {
  int nChannels = comm->nChannels;
  int nNodes = comm->nNodes;
  for (int c=0; c<nChannels; c++) {
    int* recv = ringRecv+c*comm->nRanks;...
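The prev/next arrays are simply the ring order expressed per rank: for each channel, ringNext[r] is the rank that follows r in that channel's ring and ringPrev[r] the one before it. The small self-contained sketch below (the function name and arguments are illustrative, not NCCL code) shows how a ring order maps to those arrays:

// Illustrative only: turn a ring order (ranks listed in traversal order)
// into the per-rank prev/next arrays that one channel uses.
#include <vector>

static void ringOrderToPrevNext(const std::vector<int>& order,
                                std::vector<int>& prev,
                                std::vector<int>& next) {
  int n = (int)order.size();
  prev.assign(n, -1);
  next.assign(n, -1);
  for (int i = 0; i < n; i++) {
    int rank = order[i];
    prev[rank] = order[(i - 1 + n) % n];  // rank before us in the ring
    next[rank] = order[(i + 1) % n];      // rank after us in the ring
  }
}
// With order = {0, 2, 3, 1}: next[0] == 2 and prev[0] == 1, i.e. the ring wraps around.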
  TopoRanks topoRanks;
} *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
    std::min(treeGraph.nChannels, ringGraph.nChannels);
...
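After the allgather every rank holds the data of all peers, and the channel count has to be reduced to the global minimum so that all ranks launch the same number of channels. A hedged sketch of that reduction (the field names follow the snippet above; the loop itself is illustrative, not a verbatim quote of init.cc):

// Illustrative: take the minimum nChannels reported by any rank so the
// channel count is consistent across the whole communicator.
int nChannels = allGather3Data[0].nChannels;
for (int i = 1; i < nranks; i++) {
  nChannels = std::min(nChannels, allGather3Data[i].nChannels);
}
comm->nChannels = nChannels;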
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
int c;
for (c=nChannels; c<ncclMinNchannels(); c++) {
  memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
  memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*...
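The loop above duplicates existing rings when the searched channel count is below the configured minimum (for example when NCCL_MIN_NCHANNELS is set): channel c reuses the prev/next maps of channel c - nChannels. A small illustrative sketch of the same duplication on a standalone array:

// Illustrative: duplicate the first nChannels ring maps until the
// requested minimum number of channels is reached.
#include <cstring>

static void duplicateRings(int* ringNext, int nranks, int nChannels, int minChannels) {
  for (int c = nChannels; c < minChannels; c++) {
    // Channel c is a copy of channel (c - nChannels); with nChannels=2 and
    // minChannels=4, channels 2 and 3 repeat channels 0 and 1.
    memcpy(ringNext + c*nranks, ringNext + (c - nChannels)*nranks, nranks*sizeof(int));
  }
}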
realChunkSize = min(chunkSize, divide(size-gridOffset, nChannels*nranks*minChunkSize));
realChunkSize = int(realChunkSize);
}
// Compute the offset of each chunk
auto calcOffset = [&]__device__(int chunk)->size_t {
  if (Proto::id == NCCL_PROTO_SIMPLE) ...
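To make the arithmetic concrete, here is a hedged host-side version of the chunk-size computation. It assumes divide() is a ceiling-division-style helper that spreads the remaining elements over every (channel, rank) pair in multiples of minChunkSize; this is an illustration of the idea, not the kernel code itself:

// Illustrative host-side version of the chunking arithmetic.
#include <cstddef>
#include <algorithm>

static size_t divUpHost(size_t a, size_t b) { return (a + b - 1) / b; }

static size_t calcRealChunkSize(size_t size, size_t gridOffset, int nChannels,
                                int nranks, size_t chunkSize, size_t minChunkSize) {
  size_t remaining = size - gridOffset;
  // Split what is left over all (channel, rank) pairs, rounded to minChunkSize,
  // but never exceed the protocol's chunkSize.
  size_t perPair = divUpHost(remaining, (size_t)nChannels*nranks*minChunkSize) * minChunkSize;
  return std::min(chunkSize, perPair);
}
// Example: size=1<<20, gridOffset=0, nChannels=2, nranks=8, chunkSize=65536,
// minChunkSize=512 gives perPair=65536, so realChunkSize stays at chunkSize.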
NCCL_MIN_NCHANNELS
(NCCL_MIN_NRINGS since 2.2.0, NCCL_MIN_NCHANNELS since 2.5.0)
The NCCL_MIN_NCHANNELS variable controls the minimum number of channels you want NCCL to use. Increasing the number of channels also increases the number of CUDA blocks NCCL uses, which may be useful to improve performance; however...
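The variable is read from the environment when NCCL initializes, so it is usually just exported before launching the job (e.g. NCCL_MIN_NCHANNELS=4 ./my_app). If it has to be set programmatically, a hedged sketch is to set it before the first communicator is created; ncclCommInitRank is the standard NCCL API, everything else below is illustrative and error handling is omitted:

// Illustrative: force a lower bound on the channel count before NCCL
// reads its environment during communicator creation.
#include <cstdlib>
#include <nccl.h>

ncclComm_t createComm(ncclUniqueId id, int nranks, int rank) {
  // Must happen before the first call that initializes the communicator.
  setenv("NCCL_MIN_NCHANNELS", "4", /*overwrite=*/1);
  ncclComm_t comm;
  ncclCommInitRank(&comm, nranks, id, rank);
  return comm;
}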
Hi, recently I tried to use NCCL_MAX_NCHANNELS=10 to limit the grid_size (SM count) of the nccl:all_to_all operation launched from torch/distributed/distributed_c10d.py(3881): all_to_all_single, but the result shows that grid_size is 16, which is still larger t...
NCCL_MAX_NCHANNELS=1 NCCL_P2P_DISABLE=0 NCCL_NTHREADS=256
Given the CPU usage shown here, I'm wondering whether CUDA would be recompiling NCCL every time, considering it doesn't have support for your GPU architecture. Since you mentioned you are using NCCL 2.6.4, may I ask how you...
NCCL_MAX/MIN_NCHANNELS: the maximum and minimum number of rings. More rings put more pressure on GPU memory and bandwidth, and can also affect compute performance.
NCCL_CHECKS_DISABLE: arguments are validated before every collective operation, which adds latency; in production this can be set to 1 to skip the checks. The default is 0.
NCCL_CHECK_POINTERS: validates CUDA memory pointers before every collective operation, which adds latency; in production...