From 52ad4436bcd91cc6baa43e6751b9d3e689c3f89f Mon Sep 17 00:00:00 2001 From: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> Date: Mon, 21 Jul 2025 14:38:59 +0800 Subject: [PATCH] disable 3xfp4 Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> --- .gitmodules | 5 +---- 3rdparty/dynamic-kernel-generator | 1 - cpp/CMakeLists.txt | 4 ++-- .../kernels/cutlass_kernels/CMakeLists.txt | 2 +- .../launchers/moe_gemm_tma_ws_launcher.inl | 16 +++++++++------- .../moe_gemm_tma_ws_mixed_input_launcher.inl | 1 + .../moe_gemm/moe_gemm_template_dispatch_tma_ws.h | 9 ++++++++- .../moe_gemm/moe_tma_warp_specialized_traits.h | 12 ++++++++++++ cpp/tests/CMakeLists.txt | 4 ++-- .../inflight_batcher_llm/CMakeLists.txt | 2 +- 10 files changed, 37 insertions(+), 19 deletions(-) delete mode 160000 3rdparty/dynamic-kernel-generator diff --git a/.gitmodules b/.gitmodules index cb7cb5e4ac..067d732783 100644 --- a/.gitmodules +++ b/.gitmodules @@ -25,7 +25,4 @@ url = https://github.com/wjakob/nanobind [submodule "3rdparty/cppzmq"] path = 3rdparty/cppzmq - url = https://github.com/zeromq/cppzmq.git -[submodule "3rdparty/dynamic-kernel-generator"] - path = 3rdparty/dynamic-kernel-generator - url = ssh://git@gitlab-master.nvidia.com:12051/dlarch-fastkernels/dynamic-kernel-generator.git + url = https://github.com/zeromq/cppzmq.github diff --git a/3rdparty/dynamic-kernel-generator b/3rdparty/dynamic-kernel-generator deleted file mode 160000 index 34bfe35573..0000000000 --- a/3rdparty/dynamic-kernel-generator +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 34bfe3557372d1d2cebe3c90448b03756c6a16eb diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 44127ab087..24779ef6b0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -215,8 +215,8 @@ include_directories( ${CUDAToolkit_INCLUDE_DIRS}/cccl ${CUDNN_ROOT_DIR}/include $ - ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/include - ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/tools/util/include + ${3RDPARTY_DIR}/cutlass/include + ${3RDPARTY_DIR}/cutlass/tools/util/include ${3RDPARTY_DIR}/NVTX/include ${3RDPARTY_DIR}/json/include) if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt index 5a0eb518ee..4bdd71076a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt @@ -28,7 +28,7 @@ if(NOT Python3_EXECUTABLE) endif() execute_process( - WORKING_DIRECTORY ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/python/ + WORKING_DIRECTORY ${3RDPARTY_DIR}/cutlass/python/ COMMAND ${Python3_EXECUTABLE} setup_library.py develop --user RESULT_VARIABLE _CUTLASS_LIBRARY_SUCCESS) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl index 6d61f66d50..efcb4b0e5d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl @@ -135,7 +135,14 @@ void tma_warp_specialized_generic_moe_gemm_kernelLauncher(TmaWarpSpecializedGrou #ifndef COMPILE_BLACKWELL_SM103_TMA_GROUPED_GEMMS else if constexpr (ArchTag::kMinComputeCapability == 103) { - TLLM_THROW("Please recompile with support for blackwell by passing 103-real as an arch to build_wheel.py."); + static bool first_time = true; + if (first_time) + { + TLLM_LOG_WARNING( + "Falling back to sm100f version. For best performance please recompile with support for blackwell by " + "passing 103-real as an arch to build_wheel.py."); + first_time = false; + } } #endif #ifndef COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS @@ -344,13 +351,8 @@ using SafeBF16 = void; using EpilogueScheduleSM90 = cutlass::epilogue::PtrArrayNoSmemWarpSpecialized; \ \ constexpr static bool Is2SM = IsBlackwell && (cute::size<0>(ClusterShape{}) % 2) == 0; \ - using EpilogueScheduleSM100 = std::conditional_t; \ - using EpilogueScheduleSM103 \ - = std::conditional_t; \ - using EpilogueScheduleSM10x \ - = std::conditional_t; \ \ using EpilogueScheduleSM120 = cutlass::epilogue::TmaWarpSpecialized; \ using EpilogueScheduleBW = std ::conditional_t; \ diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl index eac301fe82..764a53b107 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl @@ -67,6 +67,7 @@ namespace kernels { namespace cutlass_kernels_oss { +using namespace tensorrt_llm::kernels::cutlass_kernels; namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h index 48726f4b32..bf15aed55f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h @@ -105,7 +105,14 @@ void dispatchMoeGemmSelectBiasTmaWarpSpecialized(TmaWarpSpecializedGroupedGemmIn #ifndef COMPILE_BLACKWELL_SM103_TMA_GROUPED_GEMMS else if constexpr (Arch::kMinComputeCapability == 103) { - TLLM_THROW("Please recompile with support for blackwell by passing 103-real as an arch to build_wheel.py."); + static bool first_time = true; + if (first_time) + { + TLLM_LOG_WARNING( + "Falling back to sm100f version. For best performance please recompile with support for blackwell by " + "passing 103-real as an arch to build_wheel.py."); + first_time = false; + } } #endif #ifndef COMPILE_BLACKWELL_TMA_GROUPED_GEMMS diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h index 4ce64f2bdb..bc16af1d26 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h @@ -24,6 +24,18 @@ #include #endif +#if !defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED) +namespace cutlass::arch +{ +using Sm103 = Sm100; +} + +namespace cutlass::gemm +{ +using KernelPtrArrayTmaWarpSpecialized1SmBlockScaled3xOmmaVs16Sm103 = void; +using KernelPtrArrayTmaWarpSpecialized2SmBlockScaled3xOmmaVs16Sm103 = void; +} // namespace cutlass::gemm +#endif namespace tensorrt_llm::kernels::cutlass_kernels { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index da4a15284f..e43226a69d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -26,8 +26,8 @@ include(GoogleTest) include_directories( ${PROJECT_SOURCE_DIR}/tensorrt_llm/cutlass_extensions/include ${PROJECT_SOURCE_DIR}/include - ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/include - ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/tools/util/include + ${3RDPARTY_DIR}/cutlass/include + ${3RDPARTY_DIR}/cutlass/tools/util/include ${PROJECT_SOURCE_DIR}/tests/batch_manager ${PROJECT_SOURCE_DIR}/tests/utils) diff --git a/triton_backend/inflight_batcher_llm/CMakeLists.txt b/triton_backend/inflight_batcher_llm/CMakeLists.txt index 5d3a11269e..0f26015922 100644 --- a/triton_backend/inflight_batcher_llm/CMakeLists.txt +++ b/triton_backend/inflight_batcher_llm/CMakeLists.txt @@ -217,7 +217,7 @@ target_include_directories( ${CUDA_INCLUDE_DIRS} ${CUDNN_ROOT_DIR}/include ${NCCL_INCLUDE_DIR} - ${3RDPARTY_DIR}/dynamic-kernel-generator/cutlass/include + ${3RDPARTY_DIR}/cutlass/include ${MPI_INCLUDE_PATH} ${COMMON_HEADER_DIR})