mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
267 lines
6.0 KiB
C++
267 lines
6.0 KiB
C++
/*
|
|
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#ifdef _WIN32
|
|
|
|
#include "tensorrt_llm/common/assert.h"
|
|
#include "tensorrt_llm/common/logger.h"
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
|
|
// Dummy types for Windows to allow compilation.
|
|
struct gdr;
|
|
typedef struct gdr* gdr_t;
|
|
|
|
typedef struct gdr_mh_s
|
|
{
|
|
unsigned long h;
|
|
} gdr_mh_t;
|
|
|
|
struct gdr_info
|
|
{
|
|
};
|
|
typedef struct gdr_info gdr_info_t;
|
|
|
|
namespace tensorrt_llm
|
|
{
|
|
namespace runtime
|
|
{
|
|
namespace gdrcopy
|
|
{
|
|
|
|
struct GdrMemDesc
|
|
{
|
|
};
|
|
|
|
// On Windows, GDRCopy is not supported. These are stub implementations.
|
|
inline bool initialize()
|
|
{
|
|
TLLM_LOG_INFO("GDRCopy is not supported on Windows.");
|
|
return false;
|
|
}
|
|
|
|
inline bool isInitialized()
|
|
{
|
|
return false;
|
|
}
|
|
|
|
#define GDRCOPY_UNSUPPORTED() TLLM_THROW("GDRCopy is not supported on Windows")
|
|
|
|
inline gdr_t open()
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return nullptr;
|
|
}
|
|
|
|
inline int close(gdr_t /*g*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return -1;
|
|
}
|
|
|
|
inline int pin_buffer(gdr_t /*g*/, unsigned long /*addr*/, size_t /*size*/, uint64_t /*p2p_token*/,
|
|
uint32_t /*va_space*/, gdr_mh_t* /*handle*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return -1;
|
|
}
|
|
|
|
inline int unpin_buffer(gdr_t /*g*/, gdr_mh_t /*handle*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return -1;
|
|
}
|
|
|
|
inline int get_info(gdr_t /*g*/, gdr_mh_t /*handle*/, gdr_info_t* /*info*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return -1;
|
|
}
|
|
|
|
inline int map(gdr_t /*g*/, gdr_mh_t /*handle*/, void** /*va*/, size_t /*size*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return -1;
|
|
}
|
|
|
|
inline int unmap(gdr_t /*g*/, gdr_mh_t /*handle*/, void* /*va*/, size_t /*size*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return -1;
|
|
}
|
|
|
|
inline void runtime_get_version(int* /*major*/, int* /*minor*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
}
|
|
|
|
inline void driver_get_version(gdr_t /*g*/, int* /*major*/, int* /*minor*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
}
|
|
|
|
inline int copy_to_mapping(gdr_mh_t /*handle*/, void* /*map_d_ptr*/, void const* /*h_ptr*/, size_t /*size*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return -1;
|
|
}
|
|
|
|
inline int copy_from_mapping(gdr_mh_t /*handle*/, void* /*h_ptr*/, void const* /*map_d_ptr*/, size_t /*size*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
return -1;
|
|
}
|
|
|
|
template <typename T>
|
|
void gdrCudaMalloc(T** /*ptr*/, T** /*devPtr*/, size_t /*nelem*/, GdrMemDesc** /*memDesc*/, gdr_t /*handle*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
}
|
|
|
|
inline void gdrCudaFree(GdrMemDesc* /*memDesc*/, gdr_t /*handle*/)
|
|
{
|
|
GDRCOPY_UNSUPPORTED();
|
|
}
|
|
|
|
} // namespace gdrcopy
|
|
} // namespace runtime
|
|
} // namespace tensorrt_llm
|
|
|
|
#else // NOT _WIN32
|
|
|
|
#include "tensorrt_llm/common/assert.h"
|
|
#include "tensorrt_llm/common/cudaUtils.h"
|
|
#include "tensorrt_llm/common/logger.h"
|
|
#include <atomic>
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <cuda_runtime.h>
|
|
|
|
// These definitions are from gdrapi.h to avoid a direct dependency on the header.
|
|
#define GPU_PAGE_SHIFT 16
|
|
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
|
|
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE - 1)
|
|
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
|
|
|
|
struct gdr;
|
|
typedef struct gdr* gdr_t;
|
|
|
|
typedef struct gdr_mh_s
|
|
{
|
|
unsigned long h;
|
|
} gdr_mh_t;
|
|
|
|
struct gdr_info
|
|
{
|
|
uint64_t va;
|
|
uint64_t mapped_size;
|
|
uint32_t page_size;
|
|
uint64_t tm_cycles;
|
|
uint32_t cycles_per_ms;
|
|
unsigned mapped : 1;
|
|
unsigned wc_mapping : 1;
|
|
};
|
|
typedef struct gdr_info gdr_info_t;
|
|
|
|
namespace tensorrt_llm
|
|
{
|
|
namespace runtime
|
|
{
|
|
namespace gdrcopy
|
|
{
|
|
|
|
// This is required as the GDR memory is mapped WC
|
|
#if !defined(__NVCC__)
|
|
#if defined(__PPC__)
|
|
static inline void wc_store_fence(void)
|
|
{
|
|
asm volatile("sync");
|
|
}
|
|
#elif defined(__x86_64__)
|
|
#include <immintrin.h>
|
|
|
|
static inline void wc_store_fence(void)
|
|
{
|
|
_mm_sfence();
|
|
}
|
|
#elif defined(__aarch64__)
|
|
#ifdef __cplusplus
|
|
#include <atomic>
|
|
|
|
static inline void wc_store_fence(void)
|
|
{
|
|
std::atomic_thread_fence(std::memory_order_release);
|
|
}
|
|
#else
|
|
#include <stdatomic.h>
|
|
|
|
static inline void wc_store_fence(void)
|
|
{
|
|
atomic_thread_fence(memory_order_release);
|
|
}
|
|
#endif
|
|
#endif
|
|
#endif
|
|
|
|
// Initializes the GDRCopy library by dynamically loading it.
|
|
// This function is thread-safe.
|
|
// Returns true on success, false on failure.
|
|
bool initialize();
|
|
|
|
// Returns true if the GDRCopy library has been successfully initialized.
|
|
bool isInitialized();
|
|
|
|
// All functions below are wrappers around the GDRCopy library functions.
|
|
// They are thread-safe.
|
|
// Before calling any of these functions, ensure the library is initialized.
|
|
|
|
gdr_t open();
|
|
int close(gdr_t g);
|
|
int pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t* handle);
|
|
int unpin_buffer(gdr_t g, gdr_mh_t handle);
|
|
int get_info(gdr_t g, gdr_mh_t handle, gdr_info_t* info);
|
|
int map(gdr_t g, gdr_mh_t handle, void** va, size_t size);
|
|
int unmap(gdr_t g, gdr_mh_t handle, void* va, size_t size);
|
|
void runtime_get_version(int* major, int* minor);
|
|
void driver_get_version(gdr_t g, int* major, int* minor);
|
|
int copy_to_mapping(gdr_mh_t handle, void* map_d_ptr, void const* h_ptr, size_t size);
|
|
int copy_from_mapping(gdr_mh_t handle, void* h_ptr, void const* map_d_ptr, size_t size);
|
|
|
|
// --- GDRCopy Memory Management Helpers ---
|
|
|
|
struct GdrMemDesc
|
|
{
|
|
void* gdrDeviceMem;
|
|
void* gdrMap;
|
|
size_t gdrOffset;
|
|
size_t gdrMapSize;
|
|
gdr_mh_t gdrMh;
|
|
};
|
|
|
|
// Allocates memory that can be used with GDRCopy.
|
|
void gdrCudaMalloc(void** ptr, void** devPtr, size_t mapSize, GdrMemDesc** memDesc, gdr_t handle);
|
|
|
|
// Frees memory allocated with gdrCudaMalloc.
|
|
void gdrCudaFree(GdrMemDesc* memDesc, gdr_t handle);
|
|
|
|
} // namespace gdrcopy
|
|
} // namespace runtime
|
|
} // namespace tensorrt_llm
|
|
|
|
#endif // _WIN32
|