/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/runtime/utils/mpiUtils.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <cuda.h>

#if defined(__aarch64__) || defined(_M_ARM64)
#define MNNVL
#endif

#define MAX_REGIONS 16
#define MAX_SMS 32
#define MAX_OPS 32
#define MAX_PEERS 8192
#define MAX_REQUESTS 1024
#define LAUNCH_GPU 1
#define LAUNCH_CPU 2
#define MAX_NVLINK 32

#define UB_MEM_UC_CONTIG 1
#define UB_MEM_MC_CREATED 2
#define UB_MEM_ALLOCATED 4
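// Illustrative reading of the UB_MEM_* values, matching the "UC, MC, user/lib
// allocated" note on communicator::memflags below: they are bit flags, so a region
// the library itself allocated as contiguous UC memory would carry
// (UB_MEM_UC_CONTIG | UB_MEM_ALLOCATED).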

// region 0 flag offsets
#define REG0_OPFLAGS (MAX_PEERS * 2)
#define REG0_RECV (REG0_OPFLAGS * userbuffers_op_types)
#define REG0_SINGLENODE (2 * MAX_NVLINK * MAX_SMS + MAX_OPS)
#define REG0_OFFSET(comm) ((2 * MAX_REGIONS) * MAX_NVLINK + REG0_SINGLENODE * 2 + MAX_PEERS)
#define REG0_ONESHOT_MAX (32 * 1024)
#define REG0_ONESHOT_BUFFER (MAX_NVLINK * REG0_ONESHOT_MAX)
#define REG0_COMMBUFFER (REG0_ONESHOT_BUFFER * 2)
#define REG0_FLAGS (REG0_RECV + MAX_PEERS * MAX_REGIONS * 3)
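// Worked example of the layout arithmetic above (illustrative only): with the default
// limits and userbuffers_op_types == 5 from the enum below, REG0_OPFLAGS expands to
// 8192 * 2 == 16384 flag slots and REG0_RECV to 16384 * 5 == 81920.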

namespace tensorrt_llm::runtime::ub
{
enum req_type
{
    userbuffers_allreduceop_sharp,
    userbuffers_sendop,
    userbuffers_allreduceop_nonsharp,
    userbuffers_allreduceop_nonsharp2,
    userbuffers_alltoall,
    userbuffers_op_types
};

struct communicator
{
    int myrank, nranks; // global job communicator
    int nvrank, nvsize; // single-node comm_intra
    int free_region;

    void* gpu_ptrs;
    int sms, threads;
    int use_rr_kernel; // whether to use the RR (rather than RW) NVLink-only kernel
    int cga_size;

    void* mem_ptr[MAX_REGIONS];
    void** peer_ptr[MAX_REGIONS];

    int memflags[MAX_REGIONS]; // UC, MC, user/lib allocated

    CUmemGenericAllocationHandle* uchandles[MAX_REGIONS];
    void* ucbase_ptr[MAX_REGIONS]; // only for cuMem-allocated memory
    size_t mem_size[MAX_REGIONS];

    void* mc_ptr[MAX_REGIONS];
    void* mc_baseptr;
    CUmemGenericAllocationHandle mc_handle;
    size_t mc_offset, mc_maxsize;
    int use_mc; // 1: use MC if available; 0: override not to use MC

    int tp_size, tp_first_rank, tp_rank; // with ar_nvsize as a step
    int sm_arch;
    int oneshot, pdl_launch;
    int oneshot_force_enable_threshold;

    MPI_Comm comm_world, // clone of MPI_COMM_WORLD
        comm_inter,      // reduction-group communicator (a subset of the nodes) along a GPU rail
        comm_intra;      // full intranode communicator (all ndev GPUs)

    int *send_id, *recv_id;
    int mydev;
};
using communicator = struct communicator;

int create_communicator_grouped2(communicator** comm, tensorrt_llm::runtime::WorldConfig const& world_config);
/* Creates a communicator in which
   allreduce1 happens across datagpus x datanodes groups and
   allreduce2 happens across tensorgpus x tensornodes groups,
   where num_nodes = pipenodes x tensornodes x datanodes
   and nvlink_size = pipegpus x tensorgpus x datagpus.
*/
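// Usage sketch (illustrative only): `worldConfig` is an assumed, caller-provided
// WorldConfig, and the zero-on-success convention is an assumption, not part of the
// declared contract.
//
//   communicator* comm = nullptr;
//   int status = create_communicator_grouped2(&comm, worldConfig);
//   // ... register buffers and run collectives ...
//   destroy_communicator(comm);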

int register_user_buffer_collective(void** gpubuff, size_t bytes, communicator* comm);
/* Returns a handler and registers the buffer. Assumed to be collective, i.e. all ranks use the
   same groups and do not mix buffers across different operations. Returns -1 if the buffer
   cannot be registered (too many preregistered regions already). If alloc == true, the call
   allocates the memory and fills the pointers (required for NVL SHARP and NSO/MNNVL).
*/
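// Usage sketch (illustrative only; `bytes` and `comm` come from the caller, and the
// -1 failure convention is taken from the comment above):
//
//   void* gpubuff = nullptr; // filled in when the library allocates the buffer
//   int handler = register_user_buffer_collective(&gpubuff, bytes, comm);
//   if (handler < 0)
//   { /* handle registration failure: too many preregistered regions */ }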

void destroy_communicator(communicator* comm);
} // namespace tensorrt_llm::runtime::ub

namespace tensorrt_llm::kernels::ub
{
using namespace tensorrt_llm::runtime::ub;
void allreduce2_userbuff_inplace_impl(int const handler, size_t const offset, size_t const elements,
    nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream = 0);
// for TP parallelism; only the single-node path is implemented
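// Usage sketch (illustrative only; `handler` is assumed to come from
// register_user_buffer_collective, `elements` counts elements of `dataType`, and
// `offset` locates the data within the registered region):
//
//   allreduce2_userbuff_inplace_impl(handler, /*offset=*/0, elements,
//       nvinfer1::DataType::kHALF, comm, stream);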

// allgather over a registered buffer with a fused residual input
int allgather2_userbuff_residual_impl(int const handler, size_t const offset, size_t const elements,
    int const hidden_size, void* residual, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream,
    bool force_enable);

// allreduce fused with RMSNorm (beta/gamma/eps) and residual input/output
int allreduce2_userbuff_rmsnorm_impl(int const handler, size_t const offset, int const out_handler,
    size_t const out_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps,
    void* residual_in, void* residual_out, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream);

// as above, additionally quantizing the normalized output using scalefactor
int allreduce2_userbuff_inplace_rmsnorm_quant_impl(int const handler, size_t const offset, int const out_handler,
    size_t const out_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps,
    float* scalefactor, void* residual_in, void* residual_out, nvinfer1::DataType dataType, communicator* comm,
    cudaStream_t stream);

// FP4 variant that also writes scales through scale_handler/scale_offset
int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t const offset, int const out_handler,
    size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements,
    int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in,
    void* residual_out, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::ub