mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
Merge remote-tracking branch 'gitlab/main' into feat/gb110_bringup
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
This commit is contained in:
commit
759e7a0ce7
3
.gitignore
vendored
3
.gitignore
vendored
@ -43,6 +43,9 @@ tensorrt_llm/bindings/**/*.pyi
|
||||
tensorrt_llm/deep_ep/
|
||||
tensorrt_llm/deep_ep_cpp_tllm.*.so
|
||||
tensorrt_llm/deep_ep_cpp_tllm.pyi
|
||||
tensorrt_llm/deep_gemm/
|
||||
tensorrt_llm/deep_gemm_cpp_tllm.*.so
|
||||
tensorrt_llm/deep_gemm_cpp_tllm.pyi
|
||||
*docs/cpp_docs*
|
||||
*docs/source/_cpp_gen*
|
||||
docs/source/**/*.rst
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -26,3 +26,6 @@
|
||||
[submodule "3rdparty/cppzmq"]
|
||||
path = 3rdparty/cppzmq
|
||||
url = https://github.com/zeromq/cppzmq.git
|
||||
[submodule "3rdparty/DeepGEMM"]
|
||||
path = 3rdparty/DeepGEMM
|
||||
url = https://github.com/deepseek-ai/DeepGEMM.git
|
||||
|
||||
1
3rdparty/DeepGEMM
vendored
Submodule
1
3rdparty/DeepGEMM
vendored
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c
|
||||
@ -31,6 +31,7 @@ option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON)
|
||||
option(BUILD_TESTS "Build Google tests" ON)
|
||||
option(BUILD_BENCHMARKS "Build benchmarks" ON)
|
||||
option(BUILD_DEEP_EP "Build the Deep EP module" ON)
|
||||
option(BUILD_DEEP_GEMM "Build the DeepGEMM module" ON)
|
||||
option(BUILD_MICRO_BENCHMARKS "Build C++ micro benchmarks" OFF)
|
||||
option(NVTX_DISABLE "Disable all NVTX features" ON)
|
||||
option(WARNING_IS_ERROR "Treat all warnings as errors" OFF)
|
||||
@ -199,7 +200,9 @@ set(TRT_LIB TensorRT::NvInfer)
|
||||
get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
|
||||
|
||||
set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
|
||||
if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
|
||||
if(BINDING_TYPE STREQUAL "pybind"
|
||||
OR BUILD_DEEP_EP
|
||||
OR BUILD_DEEP_GEMM)
|
||||
add_subdirectory(${3RDPARTY_DIR}/pybind11
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pybind11)
|
||||
endif()
|
||||
@ -219,7 +222,9 @@ include_directories(
|
||||
${3RDPARTY_DIR}/cutlass/tools/util/include
|
||||
${3RDPARTY_DIR}/NVTX/include
|
||||
${3RDPARTY_DIR}/json/include)
|
||||
if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
|
||||
if(BINDING_TYPE STREQUAL "pybind"
|
||||
OR BUILD_DEEP_EP
|
||||
OR BUILD_DEEP_GEMM)
|
||||
include_directories(${3RDPARTY_DIR}/pybind11/include)
|
||||
endif()
|
||||
if(BINDING_TYPE STREQUAL "nanobind")
|
||||
|
||||
@ -314,4 +314,8 @@ if(BUILD_DEEP_EP)
|
||||
add_subdirectory(deep_ep)
|
||||
endif()
|
||||
|
||||
if(BUILD_DEEP_GEMM)
|
||||
add_subdirectory(deep_gemm)
|
||||
endif()
|
||||
|
||||
add_subdirectory(plugins)
|
||||
|
||||
126
cpp/tensorrt_llm/deep_gemm/CMakeLists.txt
Normal file
126
cpp/tensorrt_llm/deep_gemm/CMakeLists.txt
Normal file
@ -0,0 +1,126 @@
|
||||
add_custom_target(deep_gemm)
|
||||
|
||||
if(WIN32)
|
||||
return()
|
||||
endif()
|
||||
|
||||
# Prepare files
|
||||
# =============
|
||||
|
||||
# Use DeepGEMM submodule
|
||||
set(DEEP_GEMM_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../3rdparty/DeepGEMM)
|
||||
get_filename_component(DEEP_GEMM_SOURCE_DIR ${DEEP_GEMM_SOURCE_DIR} ABSOLUTE)
|
||||
|
||||
if(NOT EXISTS ${DEEP_GEMM_SOURCE_DIR})
|
||||
message(
|
||||
FATAL_ERROR
|
||||
"DeepGEMM submodule not found at ${DEEP_GEMM_SOURCE_DIR}. Please run: git submodule update --init --recursive"
|
||||
)
|
||||
endif()
|
||||
|
||||
# Check if submodules are initialized
|
||||
if(NOT EXISTS ${DEEP_GEMM_SOURCE_DIR}/third-party/cutlass/include)
|
||||
message(
|
||||
FATAL_ERROR
|
||||
"DeepGEMM submodules not initialized. Please run: git submodule update --init --recursive"
|
||||
)
|
||||
endif()
|
||||
|
||||
# Copy and update python files
|
||||
set(DEEP_GEMM_PYTHON_DEST ${CMAKE_CURRENT_BINARY_DIR}/python/deep_gemm)
|
||||
file(REMOVE_RECURSE ${DEEP_GEMM_PYTHON_DEST})
|
||||
file(MAKE_DIRECTORY ${DEEP_GEMM_PYTHON_DEST})
|
||||
|
||||
# Copy all files from deep_gemm directory
|
||||
file(GLOB_RECURSE DEEP_GEMM_ALL_FILES ${DEEP_GEMM_SOURCE_DIR}/deep_gemm/*)
|
||||
configure_file(${DEEP_GEMM_SOURCE_DIR}/LICENSE ${DEEP_GEMM_PYTHON_DEST}/LICENSE
|
||||
COPYONLY)
|
||||
foreach(SOURCE_FILE ${DEEP_GEMM_ALL_FILES})
|
||||
file(RELATIVE_PATH REL_PATH ${DEEP_GEMM_SOURCE_DIR}/deep_gemm ${SOURCE_FILE})
|
||||
get_filename_component(REL_DIR ${REL_PATH} DIRECTORY)
|
||||
file(MAKE_DIRECTORY ${DEEP_GEMM_PYTHON_DEST}/${REL_DIR})
|
||||
|
||||
# Check if it's a Python file that needs import renaming
|
||||
get_filename_component(FILE_EXT ${SOURCE_FILE} EXT)
|
||||
if(FILE_EXT STREQUAL ".py")
|
||||
# Read file content and replace module imports for Python files
|
||||
file(READ ${SOURCE_FILE} _content)
|
||||
string(REPLACE "deep_gemm_cpp" "tensorrt_llm.deep_gemm_cpp_tllm" _content
|
||||
"${_content}")
|
||||
|
||||
# Add adaptation header
|
||||
string(
|
||||
PREPEND
|
||||
_content
|
||||
"# Adapted from https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/${REL_PATH}\n"
|
||||
)
|
||||
|
||||
# Write modified content
|
||||
set(_dst "${DEEP_GEMM_PYTHON_DEST}/${REL_PATH}")
|
||||
file(WRITE ${_dst} "${_content}")
|
||||
else()
|
||||
# Copy non-Python files as-is
|
||||
set(_dst "${DEEP_GEMM_PYTHON_DEST}/${REL_PATH}")
|
||||
file(COPY ${SOURCE_FILE} DESTINATION ${DEEP_GEMM_PYTHON_DEST}/${REL_DIR})
|
||||
endif()
|
||||
|
||||
# Add dependency tracking
|
||||
set_property(
|
||||
DIRECTORY
|
||||
APPEND
|
||||
PROPERTY CMAKE_CONFIGURE_DEPENDS ${SOURCE_FILE})
|
||||
endforeach()
|
||||
|
||||
# Copy third-party includes (cutlass and fmt) to the include directory
|
||||
set(DEEP_GEMM_INCLUDE_DEST ${DEEP_GEMM_PYTHON_DEST}/include)
|
||||
file(MAKE_DIRECTORY ${DEEP_GEMM_INCLUDE_DEST})
|
||||
file(COPY ${DEEP_GEMM_SOURCE_DIR}/third-party/cutlass/include/cute
|
||||
DESTINATION ${DEEP_GEMM_INCLUDE_DEST})
|
||||
file(COPY ${DEEP_GEMM_SOURCE_DIR}/third-party/cutlass/include/cutlass
|
||||
DESTINATION ${DEEP_GEMM_INCLUDE_DEST})
|
||||
|
||||
# Find torch_python
|
||||
find_library(TORCH_PYTHON_LIB torch_python REQUIRED
|
||||
HINTS ${TORCH_INSTALL_PREFIX}/lib)
|
||||
|
||||
# Build deep_gemm_cpp_tllm extension (matching deep_gemm's setup.py)
|
||||
set(DEEP_GEMM_SOURCES ${DEEP_GEMM_SOURCE_DIR}/csrc/python_api.cpp)
|
||||
|
||||
pybind11_add_module(deep_gemm_cpp_tllm ${DEEP_GEMM_SOURCES})
|
||||
set_target_properties(
|
||||
deep_gemm_cpp_tllm
|
||||
PROPERTIES CXX_STANDARD_REQUIRED ON
|
||||
CXX_STANDARD 17
|
||||
CXX_SCAN_FOR_MODULES OFF
|
||||
CUDA_SEPARABLE_COMPILATION ON
|
||||
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/deep_gemm_cpp_tllm.version
|
||||
INSTALL_RPATH "${TORCH_INSTALL_PREFIX}/lib"
|
||||
BUILD_WITH_INSTALL_RPATH TRUE)
|
||||
|
||||
target_compile_options(deep_gemm_cpp_tllm PRIVATE ${TORCH_CXX_FLAGS} -std=c++17
|
||||
-O3 -fPIC -Wno-psabi)
|
||||
|
||||
# Extension name definition
|
||||
target_compile_definitions(deep_gemm_cpp_tllm
|
||||
PRIVATE TORCH_EXTENSION_NAME=deep_gemm_cpp_tllm)
|
||||
|
||||
# Include directories matching deep_gemm setup.py
|
||||
target_include_directories(
|
||||
deep_gemm_cpp_tllm
|
||||
PRIVATE ${CUDA_INCLUDE_DIRS} ${DEEP_GEMM_SOURCE_DIR}/deep_gemm/include
|
||||
${DEEP_GEMM_SOURCE_DIR}/third-party/cutlass/include
|
||||
${DEEP_GEMM_SOURCE_DIR}/third-party/fmt/include)
|
||||
|
||||
# Link libraries (matching deep_gemm setup.py: cuda, cudart + torch)
|
||||
target_link_libraries(
|
||||
deep_gemm_cpp_tllm PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIB}
|
||||
CUDA::cuda_driver CUDA::cudart)
|
||||
|
||||
# Link directories
|
||||
target_link_directories(
|
||||
deep_gemm_cpp_tllm PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/lib64
|
||||
${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs)
|
||||
|
||||
# Set targets
|
||||
# ===========
|
||||
add_dependencies(deep_gemm deep_gemm_cpp_tllm)
|
||||
4
cpp/tensorrt_llm/deep_gemm/deep_gemm_cpp_tllm.version
Normal file
4
cpp/tensorrt_llm/deep_gemm/deep_gemm_cpp_tllm.version
Normal file
@ -0,0 +1,4 @@
|
||||
{
|
||||
global: PyInit_deep_gemm_cpp_tllm;
|
||||
local: *;
|
||||
};
|
||||
@ -61,6 +61,5 @@ etcd3
|
||||
blake3
|
||||
llguidance==0.7.29
|
||||
soundfile
|
||||
deep_gemm@git+https://github.com/VALLIS-NERIA/DeepGEMM.git@97d97a20c2ecd53a248ab64242219d780cf822b8 ; platform_machine == "x86_64"
|
||||
triton==3.3.1; platform_machine == "x86_64"
|
||||
blobfile
|
||||
|
||||
@ -448,10 +448,12 @@ def main(*,
|
||||
if cpp_only:
|
||||
build_pyt = "OFF"
|
||||
build_deep_ep = "OFF"
|
||||
build_deep_gemm = "OFF"
|
||||
else:
|
||||
targets.extend(["th_common", "bindings", "deep_ep"])
|
||||
targets.extend(["th_common", "bindings", "deep_ep", "deep_gemm"])
|
||||
build_pyt = "ON"
|
||||
build_deep_ep = "ON"
|
||||
build_deep_gemm = "ON"
|
||||
|
||||
if benchmarks:
|
||||
targets.append("benchmarks")
|
||||
@ -490,7 +492,7 @@ def main(*,
|
||||
)
|
||||
cmake_def_args = " ".join(cmake_def_args)
|
||||
cmake_configure_command = (
|
||||
f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBINDING_TYPE="{binding_type}" -DBUILD_DEEP_EP="{build_deep_ep}"'
|
||||
f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBINDING_TYPE="{binding_type}" -DBUILD_DEEP_EP="{build_deep_ep}" -DBUILD_DEEP_GEMM="{build_deep_gemm}"'
|
||||
f' -DNVTX_DISABLE="{disable_nvtx}" -DBUILD_MICRO_BENCHMARKS={build_micro_benchmarks}'
|
||||
f' -DBUILD_WHEEL_TARGETS="{";".join(targets)}"'
|
||||
f' -DPython_EXECUTABLE={venv_python} -DPython3_EXECUTABLE={venv_python}'
|
||||
@ -637,6 +639,14 @@ def main(*,
|
||||
clear_folder(deep_ep_dir)
|
||||
deep_ep_dir.rmdir()
|
||||
|
||||
# Handle deep_gemm installation
|
||||
deep_gemm_dir = pkg_dir / "deep_gemm"
|
||||
if deep_gemm_dir.is_symlink():
|
||||
deep_gemm_dir.unlink()
|
||||
elif deep_gemm_dir.is_dir():
|
||||
clear_folder(deep_gemm_dir)
|
||||
deep_gemm_dir.rmdir()
|
||||
|
||||
bin_dir = pkg_dir / "bin"
|
||||
if bin_dir.exists():
|
||||
clear_folder(bin_dir)
|
||||
@ -684,6 +694,14 @@ def main(*,
|
||||
build_dir /
|
||||
"tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103",
|
||||
lib_dir / "nvshmem")
|
||||
|
||||
install_file(get_binding_lib("deep_gemm", "deep_gemm_cpp_tllm"),
|
||||
pkg_dir)
|
||||
install_tree(build_dir / "tensorrt_llm" / "deep_gemm" / "python" /
|
||||
"deep_gemm",
|
||||
deep_gemm_dir,
|
||||
dirs_exist_ok=True)
|
||||
|
||||
if not skip_stubs:
|
||||
with working_directory(project_dir):
|
||||
if binding_type == "nanobind":
|
||||
@ -757,6 +775,9 @@ def main(*,
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
|
||||
env=env_ld)
|
||||
build_run(
|
||||
f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
|
||||
env=env_ld)
|
||||
|
||||
if not skip_building_wheel:
|
||||
if dist_dir is None:
|
||||
|
||||
3
setup.py
3
setup.py
@ -107,7 +107,8 @@ else:
|
||||
'libs/libdecoder_attention_1.so', 'libs/nvshmem/License.txt',
|
||||
'libs/nvshmem/nvshmem_bootstrap_uid.so.3',
|
||||
'libs/nvshmem/nvshmem_transport_ibgda.so.103', 'bindings.*.so',
|
||||
'deep_ep/LICENSE', 'deep_ep_cpp_tllm.*.so', "include/**/*"
|
||||
'deep_ep/LICENSE', 'deep_ep_cpp_tllm.*.so', "include/**/*",
|
||||
'deep_gemm/LICENSE', 'deep_gemm/include/**/*', 'deep_gemm_cpp_tllm.*.so'
|
||||
]
|
||||
|
||||
package_data += [
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import deep_gemm
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
import tensorrt_llm.quantization.utils.fp8_utils as fp8_utils
|
||||
from tensorrt_llm import deep_gemm
|
||||
from tensorrt_llm._utils import nvtx_range
|
||||
|
||||
from ...distributed import allgather
|
||||
|
||||
@ -573,7 +573,7 @@ class FP8BlockScalesLinearMethod(LinearMethodBase):
|
||||
assert input.dtype == torch.bfloat16
|
||||
|
||||
if get_sm_version() == 100:
|
||||
import deep_gemm
|
||||
from tensorrt_llm import deep_gemm
|
||||
a, a_sf = fp8_utils.per_token_quant_and_transform(input)
|
||||
output = torch.empty((input.shape[0], module.weight.shape[0]),
|
||||
device=input.device,
|
||||
|
||||
@ -800,7 +800,7 @@ class PyExecutor:
|
||||
# Second last rank does not need to since last rank has original decoded tokens
|
||||
if not self.dist.is_second_last_pp_rank:
|
||||
if self.send_handles[prev_microbatch_id] is not None:
|
||||
self.send_handles[prev_microbatch_id].Wait()
|
||||
self.send_handles[prev_microbatch_id].wait()
|
||||
needs_logits = (
|
||||
self._need_return_logits(scheduled_batch)
|
||||
or (self._need_return_log_probs(scheduled_batch)
|
||||
|
||||
@ -196,6 +196,7 @@ MODEL_MAP = {
|
||||
'Qwen2VLForConditionalGeneration': QWenForCausalLM,
|
||||
'Qwen2VLModel': QWenForCausalLM,
|
||||
'Qwen3ForCausalLM': QWenForCausalLM,
|
||||
'Qwen3MoeForCausalLM': QWenForCausalLM,
|
||||
'WhisperEncoder': WhisperEncoder,
|
||||
'EncoderModel': EncoderModel,
|
||||
'DecoderModel': DecoderModel,
|
||||
|
||||
@ -32,6 +32,8 @@ class QWenConfig(PretrainedConfig):
|
||||
use_logn_attn: bool = False,
|
||||
moe: Optional[Union[MoeConfig, dict]] = None,
|
||||
num_labels: int = 1,
|
||||
mlp_only_layers: Optional[list] = None,
|
||||
decoder_sparse_step: int = 1,
|
||||
**kwargs):
|
||||
self.mlp_bias = mlp_bias
|
||||
self.attn_bias = attn_bias
|
||||
@ -40,6 +42,8 @@ class QWenConfig(PretrainedConfig):
|
||||
self.disable_weight_only_quant_plugin = disable_weight_only_quant_plugin
|
||||
self.num_labels = num_labels
|
||||
self.use_logn_attn = use_logn_attn
|
||||
self.mlp_only_layers = mlp_only_layers or []
|
||||
self.decoder_sparse_step = decoder_sparse_step
|
||||
if moe is None:
|
||||
# Legacy MOE config fields
|
||||
moe = MoeConfig(num_experts=kwargs.pop('moe_num_experts', 0),
|
||||
@ -64,6 +68,8 @@ class QWenConfig(PretrainedConfig):
|
||||
output[
|
||||
'disable_weight_only_quant_plugin'] = self.disable_weight_only_quant_plugin
|
||||
output['use_logn_attn'] = self.use_logn_attn
|
||||
output['mlp_only_layers'] = self.mlp_only_layers
|
||||
output['decoder_sparse_step'] = self.decoder_sparse_step
|
||||
output['moe'] = self.moe.to_dict()
|
||||
return output
|
||||
|
||||
@ -114,7 +120,7 @@ class QWenConfig(PretrainedConfig):
|
||||
hf_config.hidden_size // hf_config.num_attention_heads)
|
||||
head_size = getattr(hf_config, "kv_channels", head_dim)
|
||||
hidden_act = getattr(hf_config, "hidden_act", "silu")
|
||||
if qwen_type == "qwen2_moe":
|
||||
if qwen_type in ("qwen2_moe", "qwen3_moe"):
|
||||
hidden_act = "swiglu"
|
||||
|
||||
# Qwen3 models have no attention bias, while legacy models have bias
|
||||
@ -144,6 +150,11 @@ class QWenConfig(PretrainedConfig):
|
||||
moe_shared_expert_intermediate_size = getattr(
|
||||
hf_config, "shared_expert_intermediate_size", 0)
|
||||
moe_normalization_mode = MoeConfig.ExpertScaleNormalizationMode.NONE
|
||||
|
||||
# Add support for mlp_only_layers and decoder_sparse_step (Qwen3 MoE)
|
||||
mlp_only_layers = getattr(hf_config, "mlp_only_layers", [])
|
||||
decoder_sparse_step = getattr(hf_config, "decoder_sparse_step", 1)
|
||||
|
||||
moe_config = MoeConfig(num_experts=moe_num_experts,
|
||||
top_k=moe_top_k,
|
||||
normalization_mode=moe_normalization_mode)
|
||||
@ -189,6 +200,8 @@ class QWenConfig(PretrainedConfig):
|
||||
moe_intermediate_size=moe_intermediate_size,
|
||||
moe_shared_expert_intermediate_size=
|
||||
moe_shared_expert_intermediate_size,
|
||||
mlp_only_layers=mlp_only_layers,
|
||||
decoder_sparse_step=decoder_sparse_step,
|
||||
moe=moe_config,
|
||||
mapping=mapping,
|
||||
quantization=quant_config,
|
||||
|
||||
@ -714,57 +714,58 @@ def convert_hf_qwen(hf_model,
|
||||
dtype,
|
||||
use_gemm_woq_plugin))
|
||||
|
||||
if qwen_type == "qwen2_moe" and moe_config and moe_config.has_moe():
|
||||
if moe_config and moe_config.has_moe():
|
||||
if qwen_type == "qwen2_moe":
|
||||
# shared_expert for qwen2_moe
|
||||
shared_expert_up_proj = model_params[
|
||||
f'model.layers.{l}.mlp.shared_expert.up_proj.weight']
|
||||
shared_expert_down_proj = model_params[
|
||||
f'model.layers.{l}.mlp.shared_expert.down_proj.weight']
|
||||
shared_expert_gate = model_params[
|
||||
f'model.layers.{l}.mlp.shared_expert.gate_proj.weight']
|
||||
shared_expert_up_proj = split(shared_expert_up_proj,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
shared_expert_down_proj = split(shared_expert_down_proj,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=1)
|
||||
shared_expert_gate = split(shared_expert_gate,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
shared_expert_gate_up_proj = torch.concat(
|
||||
[shared_expert_up_proj, shared_expert_gate],
|
||||
dim=-2).to(dtype)
|
||||
|
||||
# shared_expert for qwen2_moe
|
||||
shared_expert_up_proj = model_params[
|
||||
f'model.layers.{l}.mlp.shared_expert.up_proj.weight']
|
||||
shared_expert_down_proj = model_params[
|
||||
f'model.layers.{l}.mlp.shared_expert.down_proj.weight']
|
||||
shared_expert_gate = model_params[
|
||||
f'model.layers.{l}.mlp.shared_expert.gate_proj.weight']
|
||||
shared_expert_up_proj = split(shared_expert_up_proj,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
shared_expert_down_proj = split(shared_expert_down_proj,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=1)
|
||||
shared_expert_gate = split(shared_expert_gate,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
shared_expert_gate_up_proj = torch.concat(
|
||||
[shared_expert_up_proj, shared_expert_gate], dim=-2).to(dtype)
|
||||
## mlp.shared_expert.gate_up_proj.weight
|
||||
weights.update(
|
||||
get_tllm_linear_weight(shared_expert_gate_up_proj,
|
||||
tllm_prex + 'mlp.shared_expert.fc.',
|
||||
None, use_weight_only,
|
||||
plugin_weight_only_quant_type, dtype,
|
||||
use_gemm_woq_plugin))
|
||||
|
||||
## mlp.shared_expert.gate_up_proj.weight
|
||||
weights.update(
|
||||
get_tllm_linear_weight(shared_expert_gate_up_proj,
|
||||
tllm_prex + 'mlp.shared_expert.fc.',
|
||||
None, use_weight_only,
|
||||
plugin_weight_only_quant_type, dtype,
|
||||
use_gemm_woq_plugin))
|
||||
## mlp.shared_expert.down_proj.weight
|
||||
weights.update(
|
||||
get_tllm_linear_weight(
|
||||
shared_expert_down_proj.to(dtype),
|
||||
tllm_prex + 'mlp.shared_expert.proj.', None,
|
||||
use_weight_only, plugin_weight_only_quant_type, dtype,
|
||||
use_gemm_woq_plugin))
|
||||
|
||||
## mlp.shared_expert.down_proj.weight
|
||||
weights.update(
|
||||
get_tllm_linear_weight(shared_expert_down_proj.to(dtype),
|
||||
tllm_prex + 'mlp.shared_expert.proj.',
|
||||
None, use_weight_only,
|
||||
plugin_weight_only_quant_type, dtype,
|
||||
use_gemm_woq_plugin))
|
||||
|
||||
moe_shared_expert_gate_weights = get_weight(
|
||||
model_params, prefix + 'mlp.shared_expert_gate', dtype)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(
|
||||
moe_shared_expert_gate_weights,
|
||||
tllm_prex + 'mlp.shared_expert_gate.',
|
||||
None,
|
||||
False, # Router should never be quantized
|
||||
plugin_weight_only_quant_type,
|
||||
dtype,
|
||||
use_gemm_woq_plugin))
|
||||
moe_shared_expert_gate_weights = get_weight(
|
||||
model_params, prefix + 'mlp.shared_expert_gate', dtype)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(
|
||||
moe_shared_expert_gate_weights,
|
||||
tllm_prex + 'mlp.shared_expert_gate.',
|
||||
None,
|
||||
False, # Router should never be quantized
|
||||
plugin_weight_only_quant_type,
|
||||
dtype,
|
||||
use_gemm_woq_plugin))
|
||||
|
||||
## fine-grained experts
|
||||
rank_experts = list(range(moe_config.num_experts))
|
||||
@ -811,6 +812,7 @@ def convert_hf_qwen(hf_model,
|
||||
plugin_weight_only_quant_type,
|
||||
dtype,
|
||||
use_gemm_woq_plugin))
|
||||
|
||||
else:
|
||||
mlp_gate_weight = get_weight(model_params, prefix + key_list[2],
|
||||
dtype)
|
||||
|
||||
@ -90,11 +90,15 @@ class QWenDecoderLayer(Module):
|
||||
if config.moe.has_moe():
|
||||
mlp_kwargs = {'moe_config': config.moe, 'mapping': config.mapping}
|
||||
if config.qwen_type == 'qwen2_moe':
|
||||
# Qwen2 MoE uses SharedMoE with shared expert
|
||||
ClsMLP = SharedMoE
|
||||
mlp_kwargs['use_shared_gate'] = True
|
||||
mlp_kwargs['use_side_stream'] = True
|
||||
mlp_kwargs['moe_config'].shared_expert_intermediate_size = \
|
||||
config.moe_shared_expert_intermediate_size
|
||||
elif config.qwen_type == 'qwen3_moe':
|
||||
# Qwen3 MoE uses standard MOE without shared expert
|
||||
ClsMLP = MOE
|
||||
else:
|
||||
ClsMLP = MOE
|
||||
else:
|
||||
@ -104,7 +108,7 @@ class QWenDecoderLayer(Module):
|
||||
# Qwen's real inter_size depends on qwen_type
|
||||
if self.config.qwen_type == 'qwen':
|
||||
intermediate_size = config.intermediate_size // 2
|
||||
elif self.config.qwen_type == 'qwen2_moe':
|
||||
elif self.config.qwen_type in ('qwen2_moe', 'qwen3_moe'):
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
else:
|
||||
intermediate_size = config.intermediate_size
|
||||
@ -264,18 +268,11 @@ class QWenForCausalLM(DecoderModelForCausalLM):
|
||||
"mlp_4h_to_h": "mlp.c_proj",
|
||||
"mlp_gate": "w1",
|
||||
}
|
||||
elif config.qwen_type == 'qwen2_moe':
|
||||
elif config.qwen_type in ('qwen2_moe', 'qwen3_moe'):
|
||||
self.trtllm_modules_to_hf_modules = copy.copy(
|
||||
get_default_trtllm_modules_to_hf_modules())
|
||||
# Common MoE expert mappings for both Qwen2 and Qwen3 MoE
|
||||
self.trtllm_modules_to_hf_modules.update({
|
||||
"mlp_h_to_4h":
|
||||
"mlp.shared_expert.gate_proj",
|
||||
"mlp_4h_to_h":
|
||||
"mlp.shared_expert.down_proj",
|
||||
"mlp_gate":
|
||||
"mlp.shared_expert.up_proj",
|
||||
"mlp_router":
|
||||
"mlp.shared_expert_gate",
|
||||
"moe_h_to_4h":
|
||||
"mlp.experts.gate_proj",
|
||||
"moe_4h_to_h":
|
||||
@ -283,6 +280,18 @@ class QWenForCausalLM(DecoderModelForCausalLM):
|
||||
"moe_gate":
|
||||
"mlp.experts.up_proj",
|
||||
})
|
||||
# Qwen2 MoE additionally has shared expert
|
||||
if config.qwen_type == 'qwen2_moe':
|
||||
self.trtllm_modules_to_hf_modules.update({
|
||||
"mlp_h_to_4h":
|
||||
"mlp.shared_expert.gate_proj",
|
||||
"mlp_4h_to_h":
|
||||
"mlp.shared_expert.down_proj",
|
||||
"mlp_gate":
|
||||
"mlp.shared_expert.up_proj",
|
||||
"mlp_router":
|
||||
"mlp.shared_expert_gate",
|
||||
})
|
||||
else:
|
||||
self.trtllm_modules_to_hf_modules = None
|
||||
super().__init__(config, transformer, lm_head)
|
||||
@ -343,6 +352,12 @@ class QWenForCausalLM(DecoderModelForCausalLM):
|
||||
"mlp.shared_expert_gate": "mlp.shared_expert_gate",
|
||||
"fc": ["up_proj", "gate_proj"],
|
||||
}
|
||||
elif config.qwen_type == "qwen3_moe":
|
||||
custom_dict = {
|
||||
"fc": ["up_proj", "gate_proj"],
|
||||
"q_layernorm": "q_norm",
|
||||
"k_layernorm": "k_norm",
|
||||
}
|
||||
elif config.qwen_type in {"qwen2", "qwen2_vl"
|
||||
} and config.tie_word_embeddings:
|
||||
custom_dict = {"lm_head": "model.embed_tokens"}
|
||||
@ -360,7 +375,7 @@ class QWenForCausalLM(DecoderModelForCausalLM):
|
||||
"transformer": "language_model.model",
|
||||
"lm_head": "language_model.lm_head",
|
||||
}
|
||||
elif config.qwen_type in ("qwen3", "qwen3_moe"):
|
||||
elif config.qwen_type == "qwen3":
|
||||
custom_dict = {
|
||||
"q_layernorm": "q_norm",
|
||||
"k_layernorm": "k_norm",
|
||||
@ -412,7 +427,7 @@ class QWenForCausalLM(DecoderModelForCausalLM):
|
||||
loader.load(tllm_key,
|
||||
custom_postprocess_kwargs=arg_dict))
|
||||
loader.fill(tllm_weights)
|
||||
elif config.qwen_type == "qwen2_moe":
|
||||
elif config.qwen_type in ("qwen2_moe", "qwen3_moe"):
|
||||
for tllm_key, _ in model.named_parameters():
|
||||
sub_module = model
|
||||
for attr in tllm_key.split(".")[:-1]:
|
||||
|
||||
@ -95,6 +95,8 @@ MODEL_PATH_DICT = {
|
||||
"gemma_3_1b_it": "gemma/gemma-3-1b-it",
|
||||
"deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
|
||||
"deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
|
||||
"deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
|
||||
"deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/",
|
||||
"deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
|
||||
"deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
|
||||
"qwen2_7b_instruct": "Qwen2-7B-Instruct",
|
||||
|
||||
@ -68,8 +68,26 @@ trt_llm_release_perf_cluster_test:
|
||||
- perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4]
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4]
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4]
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120)
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:4-tp:4-gpus:4] TIMEOUT(120)
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120)
|
||||
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4]
|
||||
#- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4]
|
||||
#llama_v3.1_405b_instruct_fp4
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1024,2048-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-gpus:4] TIMEOUT(120)
|
||||
#llama_v3.3_70b_instruct_fp4
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-gpus:4] TIMEOUT(120)
|
||||
#llama_v4_scout_17b_16e_instruct_fp4
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:1500-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:1500-gpus:4]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-gpus:4] TIMEOUT(120)
|
||||
|
||||
|
||||
# Tests for systems with 8+ GPUs
|
||||
- condition:
|
||||
@ -83,20 +101,19 @@ trt_llm_release_perf_cluster_test:
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-con:250-gpus:8]
|
||||
#llama_v3.3_70b_instruct_fp4
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
|
||||
#llama_v3.1_405b_instruct_fp4
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:512,32-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-tp:8-gpus:8] TIMEOUT(120)
|
||||
#llama_v4_scout_17b_16e_instruct_fp4
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:512,32-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-tp:8-gpus:8]
|
||||
#mixtral_8x22b_v0.1
|
||||
- perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:8]
|
||||
- perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8]
|
||||
@ -111,6 +128,10 @@ trt_llm_release_perf_cluster_test:
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] TIMEOUT (120) #max throughput test
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
|
||||
#deepseek_r1_0528_fp4
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120)
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120)
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120)
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8]
|
||||
|
||||
@ -465,29 +465,34 @@ trt_llm_release_perf_test:
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8]
|
||||
# llama_v3.1_405b_fp8
|
||||
#pytorch backend
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2000,500-reqs:8-con:1-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2000,500-reqs:8-con:1-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:1500-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-tp:8-gpus:8] TIMEOUT(120)
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]
|
||||
#llama_v3.3_70b_instruct_fp8
|
||||
#pytorch backend
|
||||
- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-tp:8-gpus:8] TIMEOUT(120)
|
||||
|
||||
#llama_v4_maverick_17b_128e_instruct_fp8
|
||||
#pytorch backend
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:512,32-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8]
|
||||
#rcca case
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]
|
||||
|
||||
#llama_v4_scout_17b_16e_instruct_fp8
|
||||
#pytorch backend
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.6-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.6-input_output_len:512,32-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(100)
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(100)
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(100)
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(100)
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.85-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-kv_frac:0.85-input_output_len:512,32-ep:8-tp:8-gpus:8]
|
||||
|
||||
#deepseek_r1_fp8
|
||||
#pytorch backend
|
||||
@ -509,7 +514,9 @@ trt_llm_release_perf_test:
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-streaming-float8-maxbs:32-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-con:1-ep:4-tp:8-gpus:8] TIMEOUT(40)#min latency test
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(80) #max throughput test
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8] TIMEOUT(20)
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_0528_fp8-bench-pytorch-float8-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120)
|
||||
- perf/test_perf.py::test_perf[deepseek_r1_0528_fp8-bench-pytorch-float8-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(100)
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8]
|
||||
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8]
|
||||
|
||||
@ -18,21 +18,21 @@ from argparse import ArgumentParser
|
||||
# isort: off
|
||||
import torch
|
||||
# isort: on
|
||||
from cuda import cuda, cudart
|
||||
from cuda import cudart
|
||||
|
||||
import tensorrt_llm as tllm
|
||||
from tensorrt_llm import Mapping, Tensor
|
||||
from tensorrt_llm import Mapping
|
||||
from tensorrt_llm._torch.distributed import AllReduce, AllReduceFusionOp
|
||||
from tensorrt_llm._torch.modules.rms_norm import RMSNorm
|
||||
from tensorrt_llm._utils import local_mpi_rank, local_mpi_size
|
||||
from tensorrt_llm.functional import (AllReduceParams, AllReduceStrategy,
|
||||
allreduce)
|
||||
from tensorrt_llm.plugin.plugin import (current_all_reduce_helper,
|
||||
init_all_reduce_helper)
|
||||
from tensorrt_llm.runtime import Session
|
||||
from tensorrt_llm.bindings.internal.runtime import delay_kernel
|
||||
from tensorrt_llm.functional import AllReduceParams, AllReduceStrategy
|
||||
|
||||
|
||||
def allreduce_benchmark(dtype: str,
|
||||
test_range: str = "10,10000000,10",
|
||||
no_header: bool = False):
|
||||
test_range: str = "1,10000000,10",
|
||||
no_header: bool = False,
|
||||
enable_cudagraph: bool = False):
|
||||
tllm.logger.set_level('error')
|
||||
world_size = tllm.mpi_world_size()
|
||||
rank = tllm.mpi_rank()
|
||||
@ -49,80 +49,120 @@ def allreduce_benchmark(dtype: str,
|
||||
|
||||
torch_dtype = tllm._utils.str_dtype_to_torch(dtype)
|
||||
min_size, max_size, ratio = [int(i) for i in test_range.split(",")]
|
||||
inner_loop = 1000
|
||||
inner_loop = 1200
|
||||
outer_loop = 10
|
||||
|
||||
size = min_size
|
||||
dtype_size = torch.finfo(torch_dtype).bits // 8
|
||||
hidden_size = size
|
||||
bs = 1
|
||||
if mapping.rank == 0 and not no_header:
|
||||
print(
|
||||
f"{'world_size':<15}, {'dtype':<10}, {'message size':<15}, {'strategy':<15}, {'duration (ms)':<10}"
|
||||
f"{'world_size':<15}, {'dtype':<10}, {'message size':<15}, {'strategy':<10}, {'fusion':<20}, {'version':<10}, {'duration (ms)':<10}"
|
||||
)
|
||||
while size < max_size:
|
||||
input = torch.ones(size, dtype=torch_dtype, device="cuda")
|
||||
input = torch.ones((bs, hidden_size), dtype=torch_dtype, device="cuda")
|
||||
|
||||
for strategy in [
|
||||
AllReduceStrategy.AUTO,
|
||||
AllReduceStrategy.NCCL,
|
||||
AllReduceStrategy.ONESHOT,
|
||||
AllReduceStrategy.TWOSHOT,
|
||||
]:
|
||||
builder = tllm.Builder()
|
||||
net = builder.create_network()
|
||||
net.plugin_config.set_nccl_plugin(dtype)
|
||||
init_all_reduce_helper()
|
||||
_buffers, workspace = current_all_reduce_helper(
|
||||
).allocate_workspace(mapping, size * dtype_size)
|
||||
for version in ["v1"]:
|
||||
for fusion in [
|
||||
AllReduceFusionOp.RESIDUAL_RMS_NORM, AllReduceFusionOp.NONE
|
||||
]:
|
||||
for strategy in [
|
||||
AllReduceStrategy.NCCL,
|
||||
AllReduceStrategy.ONESHOT,
|
||||
AllReduceStrategy.TWOSHOT,
|
||||
]:
|
||||
if size >= 25600000 and fusion != AllReduceFusionOp.NONE:
|
||||
continue
|
||||
allreduce = AllReduce(mapping=mapping, strategy=strategy)
|
||||
if fusion == AllReduceFusionOp.RESIDUAL_RMS_NORM:
|
||||
norm_weight = torch.randn((hidden_size, ),
|
||||
dtype=torch_dtype,
|
||||
device="cuda")
|
||||
norm = RMSNorm(hidden_size=hidden_size,
|
||||
dtype=torch_dtype,
|
||||
eps=1e-5).cuda()
|
||||
norm.weight.data.copy_(norm_weight)
|
||||
if version == "v1":
|
||||
params = {
|
||||
"all_reduce_params":
|
||||
AllReduceParams(fusion_op=fusion,
|
||||
residual=input,
|
||||
norm_weight=norm.weight,
|
||||
eps=norm.variance_epsilon)
|
||||
}
|
||||
else:
|
||||
params = {
|
||||
"reduce_fusion_inputs": [input, norm.weight],
|
||||
"eps": norm.variance_epsilon,
|
||||
"fusion_op": fusion
|
||||
}
|
||||
else:
|
||||
if version == "v1":
|
||||
params = {
|
||||
"all_reduce_params":
|
||||
AllReduceParams(fusion_op=fusion)
|
||||
}
|
||||
else:
|
||||
continue
|
||||
|
||||
with tllm.net_guard(net):
|
||||
tllm.default_trtnet()
|
||||
def func(input):
|
||||
for _ in range(inner_loop):
|
||||
input = allreduce(input, **params)
|
||||
if fusion == AllReduceFusionOp.RESIDUAL_RMS_NORM:
|
||||
input = input[0]
|
||||
return input
|
||||
|
||||
x = Tensor(name='x',
|
||||
shape=input.shape,
|
||||
dtype=tllm.str_dtype_to_trt(dtype))
|
||||
start = [
|
||||
torch.cuda.Event(enable_timing=True)
|
||||
for _ in range(outer_loop)
|
||||
]
|
||||
stop = [
|
||||
torch.cuda.Event(enable_timing=True)
|
||||
for _ in range(outer_loop)
|
||||
]
|
||||
graph = torch.cuda.CUDAGraph()
|
||||
|
||||
current_all_reduce_helper().set_workspace_tensor(mapping)
|
||||
stream = torch.cuda.Stream()
|
||||
with torch.cuda.stream(stream):
|
||||
if enable_cudagraph:
|
||||
for _ in range(2):
|
||||
func(input)
|
||||
with torch.cuda.graph(graph, stream=stream):
|
||||
output = func(input)
|
||||
tllm.mpi_barrier()
|
||||
delay_kernel(2000000, stream)
|
||||
torch.cuda.profiler.start()
|
||||
for i in range(outer_loop):
|
||||
start[i].record(stream)
|
||||
if enable_cudagraph:
|
||||
graph.replay()
|
||||
else:
|
||||
output = func(input)
|
||||
stop[i].record(stream)
|
||||
|
||||
current = x
|
||||
for _ in range(inner_loop):
|
||||
current = allreduce(
|
||||
current,
|
||||
mapping.tp_group,
|
||||
all_reduce_params=AllReduceParams(strategy=strategy))
|
||||
current.mark_output('output', dtype)
|
||||
feed_dict = {'x': input, 'all_reduce_workspace': workspace}
|
||||
builder_config = builder.create_builder_config(precision=dtype)
|
||||
engine = builder.build_engine(net, builder_config)
|
||||
assert engine is not None, "Failed to build engine"
|
||||
session = Session.from_serialized_engine(engine)
|
||||
torch.cuda.synchronize()
|
||||
torch.cuda.profiler.stop()
|
||||
runtimes = [
|
||||
start[i].elapsed_time(stop[i])
|
||||
for i in range(outer_loop)
|
||||
]
|
||||
median_ms = sorted(runtimes)[len(runtimes) // 2]
|
||||
|
||||
_, start = cuda.cuEventCreate(0)
|
||||
_, stop = cuda.cuEventCreate(0)
|
||||
runtimes = []
|
||||
if fusion == AllReduceFusionOp.NONE:
|
||||
allreduce_ref = (input * world_size)**inner_loop
|
||||
torch.testing.assert_close(output, allreduce_ref)
|
||||
|
||||
tllm.mpi_barrier()
|
||||
output = torch.empty(input.shape, dtype=torch_dtype, device='cuda')
|
||||
stream = torch.cuda.current_stream()
|
||||
for _ in range(10):
|
||||
cuda.cuEventRecord(start, stream.cuda_stream)
|
||||
session.run(inputs=feed_dict,
|
||||
outputs={"output": output},
|
||||
stream=stream.cuda_stream)
|
||||
cuda.cuEventRecord(stop, stream.cuda_stream)
|
||||
torch.cuda.synchronize()
|
||||
_, ms = cuda.cuEventElapsedTime(start, stop)
|
||||
runtimes.append(ms)
|
||||
|
||||
median_ms = sorted(runtimes)[len(runtimes) // 2]
|
||||
|
||||
allreduce_ref = (input * world_size)**inner_loop
|
||||
assert torch.allclose(output, allreduce_ref)
|
||||
|
||||
if mapping.rank == 0:
|
||||
print(
|
||||
f"{mapping.world_size:<15}, {dtype:<10}, {size:<15}, {strategy.name:<15}, {median_ms:<10.2f}"
|
||||
)
|
||||
if mapping.rank == 0:
|
||||
print(
|
||||
f"{mapping.world_size:<15}, {dtype:<10}, {size:<15}, {strategy.name:<10}, {fusion.name:<20}, {version:<10}, {median_ms:<10.2f}"
|
||||
)
|
||||
|
||||
size *= ratio
|
||||
if hidden_size * ratio > 4096:
|
||||
bs *= ratio
|
||||
else:
|
||||
hidden_size *= ratio
|
||||
assert size == bs * hidden_size
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -134,6 +174,8 @@ if __name__ == "__main__":
|
||||
default="256,256000000,10", # 256 to 256M
|
||||
help="min_size,max_size,multiplicative_ratio")
|
||||
parser.add_argument("--no-header", action="store_true")
|
||||
parser.add_argument("--enable-cudagraph", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
allreduce_benchmark(args.dtype, args.range, args.no_header)
|
||||
allreduce_benchmark(args.dtype, args.range, args.no_header,
|
||||
args.enable_cudagraph)
|
||||
|
||||
@ -50,7 +50,7 @@ def test_fp8_block_scale_deep_gemm(dtype, m, k, n):
|
||||
act_b_fp8, act_b_sf = per_block_cast_to_fp8_e8m0(b)
|
||||
|
||||
output_expected = a @ b.t()
|
||||
import deep_gemm
|
||||
from tensorrt_llm import deep_gemm
|
||||
output = torch.empty((act_a_fp8.shape[0], act_b_fp8.shape[0]),
|
||||
device=act_a_fp8.device,
|
||||
dtype=torch.bfloat16)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user