We are experiencing an issue where 8 processes, each controlling one GPU on a node, al

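We now suspect https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-mixing-support is at issue here. We had turned it off to get a significant speedup, but we may be misusing that feature.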

@sjeaugey @KaimingOuyang

Reproducer (on 2 H100s):

Here's a C++ version (thanks claude)

Resolved by this commit (I assume it will be added to master soon): ee3d92b

Leak in FIFO queue about nccl HOT 7 CLOSED

samsamoa commented on June 27, 2024

Leak in FIFO queue

from nccl.

Comments (7)

samsamoa commented on June 27, 2024

We now suspect https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-mixing-support is at issue here. We had turned it off to get a significant speedup, but we may be misusing that feature.

from nccl.

samsamoa commented on June 27, 2024

We actually were still able to reproduce with graph mixing support turned on. Adding a synchronize between usages somehow also doesn't help. We're working on a more minimal reproducer but it will take some time.

from nccl.

WhiteFangBuck commented on June 27, 2024

@sjeaugey @KaimingOuyang

from nccl.

jbachan commented on June 27, 2024

Can you elaborate on the number of graphs, the number of NCCL calls per graph, and the number of non-graph NCCL calls? With that, could you create a minimal reproducer? Also, @ben ***@***.***> can you help them collect an NCCL call trace? Get Outlook for Android<https://aka.ms/AAb9ysg>

…

________________________________ From: WhiteFangBuck ***@***.***> Sent: Saturday, April 13, 2024 10:09:03 AM To: NVIDIA/nccl ***@***.***> Cc: Subscribed ***@***.***> Subject: Re: [NVIDIA/nccl] Leak in FIFO queue (Issue #1251) @sjeaugey<https://github.com/sjeaugey> @KaimingOuyang<https://github.com/KaimingOuyang> — Reply to this email directly, view it on GitHub<#1251 (comment)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AARQAY2MKGKW6O6WGOWNYLLY5FRC7AVCNFSM6AAAAABGFRMS2WVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDANJTG4YDIMJYGM>. You are receiving this because you are subscribed to this thread.Message ID: ***@***.***>

from nccl.

samsamoa commented on June 27, 2024

Reproducer (on 2 H100s):

import torch


def _test(rank):
    """Run the hang reproducer on one rank of a 2-process NCCL group.

    Sequence: two eager all-reduces, one all-reduce captured into a CUDA
    graph (the graph is never replayed here), then 100,000 eager all-reduces
    of a smaller tensor, each followed by a device synchronize.

    Args:
        rank: Rank of this process in the group; also used as the CUDA
            device index.
    """
    # Imported locally because the script's only top-level import is `torch`;
    # without this the `random.seed` call below raises NameError before the
    # loop that is supposed to hang is ever reached.
    import random

    torch.cuda.set_device(rank)
    torch.distributed.init_process_group(
        backend="nccl", rank=rank, world_size=2, init_method="tcp://localhost:2379"
    )

    size = 100_000
    t = torch.zeros(size, dtype=torch.bfloat16, device="cuda")
    torch.distributed.all_reduce(t)
    torch.distributed.all_reduce(t)
    # Capture a single all-reduce into a CUDA graph. The graph object is
    # discarded right away; only the capture itself takes place.
    with torch.cuda.graphs.graph(torch.cuda.graphs.CUDAGraph()):
        torch.distributed.all_reduce(t)

    # Uncommenting this will fix the hang
    # torch.distributed.all_reduce(t)

    # Kept from the original script; nothing in this function draws from the
    # `random` module after seeding.
    random.seed(0)
    for i in range(100_000):
        if i % 100 == 0 and rank == 0:
            print(i)  # progress marker, printed by rank 0 only
        size = 49_000
        t = torch.zeros(size, dtype=torch.bfloat16, device="cuda")
        torch.distributed.all_reduce(t)
        torch.cuda.synchronize()


if __name__ == "__main__":
    # Start the two worker processes that run `_test`.
    torch.multiprocessing.start_processes(_test, nprocs=2)

from nccl.

samsamoa commented on June 27, 2024

Here's a C++ version (thanks claude)

#include <iostream>
#include <nccl.h>
#include <mpi.h>

void test(int rank) {
//    setenv("NCCL_WORK_FIFO_DEPTH", "128", 1);
//    if (rank == 0) {
//        setenv("NCCL_DEBUG", "TRACE", 1);
//        setenv("NCCL_DEBUG_SUBSYS", "ALL", 1);
//    }

    cudaSetDevice(rank);

    ncclComm_t comm;
    ncclUniqueId id;
    if (rank == 0) {
        ncclGetUniqueId(&id);
    }
    MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
    ncclCommInitRank(&comm, 2, id, rank);

    int size = 100000;
    ncclDataType_t dataType = ncclBfloat16;
    size_t elemSize = sizeof(uint16_t);

    uint16_t* d_data;
    cudaMalloc(&d_data, size * elemSize);
    cudaMemset(d_data, 0, size * elemSize);

    ncclAllReduce(d_data, d_data, size, dataType, ncclSum, comm, cudaStreamDefault);
    ncclAllReduce(d_data, d_data, size, dataType, ncclSum, comm, cudaStreamDefault);


    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Create CUDA graph
    cudaGraph_t graph;
    cudaGraphCreate(&graph, 0);

    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    ncclAllReduce(d_data, d_data, size, dataType, ncclSum, comm, stream);
    cudaStreamEndCapture(stream, &graph);

    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);


    for (int i = 0; i < 10000; ++i) {
        if (i % 100 == 0 && rank == 0) {
            std::cout << i << std::endl;
        }
        size = 49000;
        cudaMemset(d_data, 0, size * elemSize);
        ncclAllReduce(d_data, d_data, size, dataType, ncclSum, comm, cudaStreamDefault);
        cudaStreamSynchronize(cudaStreamDefault);
    }

    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaFree(d_data);
    ncclCommDestroy(comm);
}

// Entry point: initializes MPI, checks the launch size, runs the reproducer
// on this rank, and shuts MPI down.
//
// Returns 0 on completion, 1 when launched with a rank count other than 2.
int main(int argc, char* argv[]) {
    int rank, world_size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // test() builds an NCCL communicator with a fixed size of 2, so any other
    // launch size cannot work; reject it up front with a clear message.
    if (world_size != 2) {
        if (rank == 0) {
            std::cerr << "This reproducer must be launched with exactly 2 ranks (got "
                      << world_size << ")" << std::endl;
        }
        MPI_Finalize();
        return 1;
    }

    test(rank);

    MPI_Finalize();
    return 0;
}

# Build repro.cc into ./nccl_test, linking against the NCCL and MPI libraries.
nvcc -o nccl_test repro.cc -lnccl -lmpi
# Launch the reproducer with two MPI processes.
mpirun -np 2 ./nccl_test

from nccl.

samsamoa commented on June 27, 2024

resolved by this commit (i assume will be added to master soon) ee3d92b

from nccl.

Leak in FIFO queue about nccl HOT 7 CLOSED

Comments (7)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent