// cpp/tensorrt_llm/runtime/moeLoadBalancer/gdrwrap.h
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef _WIN32
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include <cstdint>
#include <cstdlib>
// Dummy types for Windows to allow compilation.
struct gdr;
typedef struct gdr* gdr_t;

typedef struct gdr_mh_s
{
    unsigned long h;
} gdr_mh_t;

struct gdr_info
{
};
typedef struct gdr_info gdr_info_t;

namespace tensorrt_llm
{
namespace runtime
{
namespace gdrcopy
{

struct GdrMemDesc
{
};
// On Windows, GDRCopy is not supported. These are stub implementations.
inline bool initialize()
{
    TLLM_LOG_INFO("GDRCopy is not supported on Windows.");
    return false;
}

inline bool isInitialized()
{
    return false;
}

#define GDRCOPY_UNSUPPORTED() TLLM_THROW("GDRCopy is not supported on Windows")

inline gdr_t open()
{
    GDRCOPY_UNSUPPORTED();
    return nullptr;
}

inline int close(gdr_t /*g*/)
{
    GDRCOPY_UNSUPPORTED();
    return -1;
}

inline int pin_buffer(gdr_t /*g*/, unsigned long /*addr*/, size_t /*size*/, uint64_t /*p2p_token*/,
    uint32_t /*va_space*/, gdr_mh_t* /*handle*/)
{
    GDRCOPY_UNSUPPORTED();
    return -1;
}

inline int unpin_buffer(gdr_t /*g*/, gdr_mh_t /*handle*/)
{
    GDRCOPY_UNSUPPORTED();
    return -1;
}

inline int get_info(gdr_t /*g*/, gdr_mh_t /*handle*/, gdr_info_t* /*info*/)
{
    GDRCOPY_UNSUPPORTED();
    return -1;
}

inline int map(gdr_t /*g*/, gdr_mh_t /*handle*/, void** /*va*/, size_t /*size*/)
{
    GDRCOPY_UNSUPPORTED();
    return -1;
}

inline int unmap(gdr_t /*g*/, gdr_mh_t /*handle*/, void* /*va*/, size_t /*size*/)
{
    GDRCOPY_UNSUPPORTED();
    return -1;
}

inline void runtime_get_version(int* /*major*/, int* /*minor*/)
{
    GDRCOPY_UNSUPPORTED();
}

inline void driver_get_version(gdr_t /*g*/, int* /*major*/, int* /*minor*/)
{
    GDRCOPY_UNSUPPORTED();
}

inline int copy_to_mapping(gdr_mh_t /*handle*/, void* /*map_d_ptr*/, void const* /*h_ptr*/, size_t /*size*/)
{
    GDRCOPY_UNSUPPORTED();
    return -1;
}

inline int copy_from_mapping(gdr_mh_t /*handle*/, void* /*h_ptr*/, void const* /*map_d_ptr*/, size_t /*size*/)
{
    GDRCOPY_UNSUPPORTED();
    return -1;
}

template <typename T>
void gdrCudaMalloc(T** /*ptr*/, T** /*devPtr*/, size_t /*nelem*/, GdrMemDesc** /*memDesc*/, gdr_t /*handle*/)
{
    GDRCOPY_UNSUPPORTED();
}

inline void gdrCudaFree(GdrMemDesc* /*memDesc*/, gdr_t /*handle*/)
{
    GDRCOPY_UNSUPPORTED();
}
} // namespace gdrcopy
} // namespace runtime
} // namespace tensorrt_llm
#else // NOT _WIN32
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/logger.h"
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <cuda_runtime.h>
// These definitions are from gdrapi.h to avoid a direct dependency on the header.
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE - 1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
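
// Example of aligning an arbitrary device address to GPU page boundaries with the
// macros above (a sketch; `addr` and `nbytes` are hypothetical caller values):
//
//     unsigned long alignedAddr = addr & GPU_PAGE_MASK; // round down to a 64 KiB page
//     size_t alignedSize = ((addr + nbytes + GPU_PAGE_OFFSET) & GPU_PAGE_MASK) - alignedAddr;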
struct gdr;
typedef struct gdr* gdr_t;

typedef struct gdr_mh_s
{
    unsigned long h;
} gdr_mh_t;

struct gdr_info
{
    uint64_t va;
    uint64_t mapped_size;
    uint32_t page_size;
    uint64_t tm_cycles;
    uint32_t cycles_per_ms;
    unsigned mapped : 1;
    unsigned wc_mapping : 1;
};
typedef struct gdr_info gdr_info_t;
namespace tensorrt_llm
{
namespace runtime
{
namespace gdrcopy
{
// A store fence is required after CPU writes to GDR memory because the mapping is
// write-combined (WC): without a fence, writes can linger in WC buffers and become
// visible to the device late or out of order.
#if !defined(__NVCC__)
#if defined(__PPC__)
static inline void wc_store_fence(void)
{
    asm volatile("sync");
}
#elif defined(__x86_64__)
// Note: <immintrin.h> is included at the top of this file; including it here, inside
// the namespace, would wrap its declarations in tensorrt_llm::runtime::gdrcopy.
static inline void wc_store_fence(void)
{
    _mm_sfence();
}
#elif defined(__aarch64__)
// This header is C++-only (it uses namespaces), so <atomic> is always available.
static inline void wc_store_fence(void)
{
    std::atomic_thread_fence(std::memory_order_release);
}
#endif
#endif
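
// Typical write path through the WC mapping (a sketch; `mappedPtr` and `flagPtr`
// are hypothetical pointers obtained via map()):
//
//     reinterpret_cast<volatile uint32_t*>(mappedPtr)[0] = payload; // WC write
//     wc_store_fence(); // drain WC buffers before publishing
//     reinterpret_cast<volatile uint32_t*>(flagPtr)[0] = 1;         // now safe to signal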
// Initializes the GDRCopy library by dynamically loading it at runtime.
// This function is thread-safe. Returns true on success, false on failure.
bool initialize();

// Returns true if the GDRCopy library has been successfully initialized.
bool isInitialized();
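
// Initialization guard (a sketch of caller-side usage):
//
//     if (!gdrcopy::isInitialized() && !gdrcopy::initialize())
//     {
//         TLLM_LOG_WARNING("GDRCopy unavailable, falling back to a non-GDR path.");
//     }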

// All functions below are thin, thread-safe wrappers around the corresponding
// GDRCopy library functions; ensure the library has been initialized before
// calling any of them.
gdr_t open();
int close(gdr_t g);
int pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t* handle);
int unpin_buffer(gdr_t g, gdr_mh_t handle);
int get_info(gdr_t g, gdr_mh_t handle, gdr_info_t* info);
int map(gdr_t g, gdr_mh_t handle, void** va, size_t size);
int unmap(gdr_t g, gdr_mh_t handle, void* va, size_t size);
void runtime_get_version(int* major, int* minor);
void driver_get_version(gdr_t g, int* major, int* minor);
int copy_to_mapping(gdr_mh_t handle, void* map_d_ptr, void const* h_ptr, size_t size);
int copy_from_mapping(gdr_mh_t handle, void* h_ptr, void const* map_d_ptr, size_t size);
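
// End-to-end usage of the wrappers (a sketch; error handling elided, and
// `devPtr`, `hostSrc`, `nbytes` are hypothetical caller values):
//
//     gdr_t g = gdrcopy::open();
//     unsigned long addr = reinterpret_cast<unsigned long>(devPtr);
//     gdr_mh_t mh;
//     gdrcopy::pin_buffer(g, addr & GPU_PAGE_MASK, GPU_PAGE_SIZE, 0, 0, &mh);
//     void* va = nullptr;
//     gdrcopy::map(g, mh, &va, GPU_PAGE_SIZE);
//     gdr_info_t info;
//     gdrcopy::get_info(g, mh, &info);
//     void* hostView = static_cast<char*>(va) + (addr - info.va); // adjust for page rounding
//     gdrcopy::copy_to_mapping(mh, hostView, hostSrc, nbytes);    // CPU -> GPU via BAR1
//     gdrcopy::unmap(g, mh, va, GPU_PAGE_SIZE);
//     gdrcopy::unpin_buffer(g, mh);
//     gdrcopy::close(g);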
// --- GDRCopy Memory Management Helpers ---

// Bookkeeping for a single GDRCopy-backed allocation, filled in by gdrCudaMalloc
// and consumed by gdrCudaFree.
struct GdrMemDesc
{
    void* gdrDeviceMem; // base device pointer of the underlying CUDA allocation
    void* gdrMap;       // host-visible mapping of the pinned region
    size_t gdrOffset;   // offset of the usable region within the mapping
    size_t gdrMapSize;  // size of the mapped region, in bytes
    gdr_mh_t gdrMh;     // GDRCopy handle for the pinned region
};

// Allocates CUDA device memory and maps it for direct CPU access with GDRCopy;
// *ptr receives the host-mapped view and *devPtr the corresponding device pointer.
void gdrCudaMalloc(void** ptr, void** devPtr, size_t mapSize, GdrMemDesc** memDesc, gdr_t handle);

// Frees memory allocated with gdrCudaMalloc.
void gdrCudaFree(GdrMemDesc* memDesc, gdr_t handle);
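
// Allocation helper usage (a sketch; `hostView`, `devBuf`, `src`, and `bytes` are
// hypothetical caller-side names):
//
//     void* hostView = nullptr; // CPU-writable view of the allocation
//     void* devBuf = nullptr;   // device pointer usable from kernels
//     GdrMemDesc* desc = nullptr;
//     gdrcopy::gdrCudaMalloc(&hostView, &devBuf, bytes, &desc, g);
//     gdrcopy::copy_to_mapping(desc->gdrMh, hostView, src, bytes); // CPU -> GPU
//     // ... launch kernels that read devBuf ...
//     gdrcopy::gdrCudaFree(desc, g);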
} // namespace gdrcopy
} // namespace runtime
} // namespace tensorrt_llm
#endif // _WIN32