commit 52ad4436bc
parent 469a38d0d8
Author: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
Date:   2025-07-21 14:38:59 +08:00

    disable 3xfp4

    Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>

10 changed files with 37 additions and 19 deletions

.gitmodules

@@ -25,7 +25,4 @@
 	url = https://github.com/wjakob/nanobind
 [submodule "3rdparty/cppzmq"]
 	path = 3rdparty/cppzmq
-	url = https://github.com/zeromq/cppzmq.git
-[submodule "3rdparty/dynamic-kernel-generator"]
-	path = 3rdparty/dynamic-kernel-generator
-	url = ssh://git@gitlab-master.nvidia.com:12051/dlarch-fastkernels/dynamic-kernel-generator.git
+	url = https://github.com/zeromq/cppzmq.git

3rdparty/dynamic-kernel-generator (submodule deleted)

@@ -1 +0,0 @@
-Subproject commit 34bfe3557372d1d2cebe3c90448b03756c6a16eb


@@ -215,8 +215,8 @@ include_directories(
   ${CUDAToolkit_INCLUDE_DIRS}/cccl
   ${CUDNN_ROOT_DIR}/include
   $<TARGET_PROPERTY:TensorRT::NvInfer,INTERFACE_INCLUDE_DIRECTORIES>
-  ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/include
-  ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/tools/util/include
+  ${3RDPARTY_DIR}/cutlass/include
+  ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${3RDPARTY_DIR}/NVTX/include
   ${3RDPARTY_DIR}/json/include)
 if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)


@@ -28,7 +28,7 @@ if(NOT Python3_EXECUTABLE)
 endif()
 execute_process(
-  WORKING_DIRECTORY ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/python/
+  WORKING_DIRECTORY ${3RDPARTY_DIR}/cutlass/python/
   COMMAND ${Python3_EXECUTABLE} setup_library.py develop --user
   RESULT_VARIABLE _CUTLASS_LIBRARY_SUCCESS)


@@ -135,7 +135,14 @@ void tma_warp_specialized_generic_moe_gemm_kernelLauncher(TmaWarpSpecializedGrou
 #ifndef COMPILE_BLACKWELL_SM103_TMA_GROUPED_GEMMS
     else if constexpr (ArchTag::kMinComputeCapability == 103)
     {
-        TLLM_THROW("Please recompile with support for blackwell by passing 103-real as an arch to build_wheel.py.");
+        static bool first_time = true;
+        if (first_time)
+        {
+            TLLM_LOG_WARNING(
+                "Falling back to sm100f version. For best performance please recompile with support for blackwell by "
+                "passing 103-real as an arch to build_wheel.py.");
+            first_time = false;
+        }
     }
 #endif
 #ifndef COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS
@@ -344,13 +351,8 @@ using SafeBF16 = void;
     using EpilogueScheduleSM90 = cutlass::epilogue::PtrArrayNoSmemWarpSpecialized; \
 \
     constexpr static bool Is2SM = IsBlackwell && (cute::size<0>(ClusterShape{}) % 2) == 0; \
-    using EpilogueScheduleSM100 = std::conditional_t<Is2SM, cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm, \
+    using EpilogueScheduleSM10x = std::conditional_t<Is2SM, cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm, \
         cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm>; \
-    using EpilogueScheduleSM103 \
-        = std::conditional_t<Is2SM, cutlass::epilogue::PtrArrayNoSmemWarpSpecialized2Sm, \
-            cutlass::epilogue::PtrArrayNoSmemWarpSpecialized1Sm>; \
-    using EpilogueScheduleSM10x \
-        = std::conditional_t<IsSM103FP4, EpilogueScheduleSM103, EpilogueScheduleSM100>; \
 \
     using EpilogueScheduleSM120 = cutlass::epilogue::TmaWarpSpecialized; \
     using EpilogueScheduleBW = std::conditional_t<IsSM120, EpilogueScheduleSM120, EpilogueScheduleSM10x>; \

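The hunk at line 135 above replaces a hard TLLM_THROW with a warn-once fallback. A minimal standalone sketch of that pattern, with std::cerr standing in for TensorRT-LLM's TLLM_LOG_WARNING macro and the surrounding kernel dispatch elided (the trailing comment marks an assumption, since the diff shows only the warning):

#include <iostream>

// Sketch: emit the fallback warning only on the first call; subsequent
// calls skip the log entirely. The function-local static persists for
// the lifetime of the program.
void warn_sm103_fallback()
{
    static bool first_time = true;
    if (first_time)
    {
        std::cerr << "Falling back to sm100f version. For best performance please "
                     "recompile with support for blackwell by passing 103-real as "
                     "an arch to build_wheel.py.\n";
        first_time = false;
    }
    // ... presumably continue with the sm100f dispatch instead of throwing ...
}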

@@ -67,6 +67,7 @@ namespace kernels
 {
 namespace cutlass_kernels_oss
 {
+using namespace tensorrt_llm::kernels::cutlass_kernels;
 namespace tk = tensorrt_llm::common;
 namespace tkc = tensorrt_llm::cutlass_extensions;

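The added using-directive lets code split out into the cutlass_kernels_oss namespace keep referring to types from the original cutlass_kernels namespace without qualification. A minimal illustration; the struct body and member here are invented, only the namespace names and TmaWarpSpecializedGroupedGemmInput come from the diff:

namespace tensorrt_llm::kernels::cutlass_kernels
{
struct TmaWarpSpecializedGroupedGemmInput
{
    int num_experts = 0; // placeholder member for illustration only
};
} // namespace tensorrt_llm::kernels::cutlass_kernels

namespace tensorrt_llm::kernels::cutlass_kernels_oss
{
using namespace tensorrt_llm::kernels::cutlass_kernels;

// Resolves thanks to the using-directive; without it, this unqualified
// name would fail to compile inside the _oss namespace.
int numExperts(TmaWarpSpecializedGroupedGemmInput const& input)
{
    return input.num_experts;
}
} // namespace tensorrt_llm::kernels::cutlass_kernels_oss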

@@ -105,7 +105,14 @@ void dispatchMoeGemmSelectBiasTmaWarpSpecialized(TmaWarpSpecializedGroupedGemmIn
 #ifndef COMPILE_BLACKWELL_SM103_TMA_GROUPED_GEMMS
     else if constexpr (Arch::kMinComputeCapability == 103)
     {
-        TLLM_THROW("Please recompile with support for blackwell by passing 103-real as an arch to build_wheel.py.");
+        static bool first_time = true;
+        if (first_time)
+        {
+            TLLM_LOG_WARNING(
+                "Falling back to sm100f version. For best performance please recompile with support for blackwell by "
+                "passing 103-real as an arch to build_wheel.py.");
+            first_time = false;
+        }
     }
 #endif
 #ifndef COMPILE_BLACKWELL_TMA_GROUPED_GEMMS

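The same warn-once fallback is applied to this second dispatch path. One caveat worth noting: a plain static bool is benignly racy if several threads reach the branch concurrently, so the warning may print more than once. If that mattered, a thread-safe variant could use std::call_once; a sketch of that alternative, not what the commit does:

#include <iostream>
#include <mutex>

// Thread-safe warn-once: std::call_once guarantees the lambda runs
// exactly once, even under concurrent callers.
void warn_sm103_fallback_once()
{
    static std::once_flag warned;
    std::call_once(warned, []
    {
        std::cerr << "Falling back to sm100f version. For best performance please "
                     "recompile with support for blackwell by passing 103-real as "
                     "an arch to build_wheel.py.\n";
    });
}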

@@ -24,6 +24,18 @@
 #include <cuda_fp4.h>
 #endif
+#if !defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
+namespace cutlass::arch
+{
+using Sm103 = Sm100;
+}
+namespace cutlass::gemm
+{
+using KernelPtrArrayTmaWarpSpecialized1SmBlockScaled3xOmmaVs16Sm103 = void;
+using KernelPtrArrayTmaWarpSpecialized2SmBlockScaled3xOmmaVs16Sm103 = void;
+} // namespace cutlass::gemm
+#endif
 namespace tensorrt_llm::kernels::cutlass_kernels
 {

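Aliasing the missing SM103 kernel types to void gives downstream templates a well-formed name to mention even when those kernels are compiled out, and dispatch code can detect the stub at compile time. A sketch of that detect-and-skip idiom; the kernel names and the HAVE_SM103_KERNELS macro are illustrative, not the library's:

#include <iostream>
#include <type_traits>

struct Sm103KernelReal
{
    static void run() { std::cout << "running sm103 kernel\n"; }
};

#if defined(HAVE_SM103_KERNELS)
using Sm103Kernel = Sm103KernelReal;
#else
using Sm103Kernel = void; // stub alias, mirroring the void aliases in the diff
#endif

// Dispatch detects the void stub at compile time and takes the fallback
// branch instead of instantiating a nonexistent kernel.
template <typename Kernel>
void dispatch()
{
    if constexpr (std::is_void_v<Kernel>)
    {
        std::cout << "sm103 kernels not compiled in; falling back\n";
    }
    else
    {
        Kernel::run();
    }
}

int main()
{
    dispatch<Sm103Kernel>();
}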

@@ -26,8 +26,8 @@ include(GoogleTest)
 include_directories(
   ${PROJECT_SOURCE_DIR}/tensorrt_llm/cutlass_extensions/include
   ${PROJECT_SOURCE_DIR}/include
-  ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/include
-  ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/tools/util/include
+  ${3RDPARTY_DIR}/cutlass/include
+  ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${PROJECT_SOURCE_DIR}/tests/batch_manager
   ${PROJECT_SOURCE_DIR}/tests/utils)


@@ -217,7 +217,7 @@ target_include_directories(
   ${CUDA_INCLUDE_DIRS}
   ${CUDNN_ROOT_DIR}/include
   ${NCCL_INCLUDE_DIR}
-  ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/include
+  ${3RDPARTY_DIR}/cutlass/include
   ${MPI_INCLUDE_PATH}
   ${COMMON_HEADER_DIR})