diff --git a/cpp/cmake/modules/cuda_configuration.cmake b/cpp/cmake/modules/cuda_configuration.cmake
index 57f957da39..3e40c9b15b 100644
--- a/cpp/cmake/modules/cuda_configuration.cmake
+++ b/cpp/cmake/modules/cuda_configuration.cmake
@@ -160,13 +160,19 @@ function(setup_cuda_architectures)
       ${CMAKE_CUDA_ARCHITECTURES_ORIG}
       PARENT_SCOPE)
 
-  set(ARCHITECTURES_WITH_KERNELS 80 86 89 90 100 120)
+  set(ARCHITECTURES_WITH_KERNELS 80 86 89 90 120)
   foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
     if(NOT ${CUDA_ARCH} IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
       add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
       message(STATUS "Excluding SM ${CUDA_ARCH}")
     endif()
   endforeach()
+  # deal with SM100/f
+  if(NOT "100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
+     AND NOT "100f" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
+    add_definitions("-DEXCLUDE_SM_100")
+    message(STATUS "Excluding SM 100(f)")
+  endif()
 
   # -a suffix supported from Hopper (90)
   set(MIN_ARCHITECTURE_HAS_ACCEL 90)
diff --git a/cpp/include/tensorrt_llm/common/cudaUtils.h b/cpp/include/tensorrt_llm/common/cudaUtils.h
index 1e442399d6..44af025c30 100644
--- a/cpp/include/tensorrt_llm/common/cudaUtils.h
+++ b/cpp/include/tensorrt_llm/common/cudaUtils.h
@@ -311,6 +311,16 @@ inline int getSMVersion()
     return sm;
 }
 
+inline int getSMFamily()
+{
+    int sm = getSMVersion();
+    if (sm == 100 || sm == 103)
+    {
+        return 100;
+    }
+    return sm;
+}
+
 inline int getDevice()
 {
     int deviceID{0};
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h
index 34aa05ddc4..f8f01f1a85 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h
@@ -330,7 +330,7 @@ size_t CutlassFp4GemmRunner::dispatchToArch(T* D, void const* A,
     {
         if constexpr (fp4GemmType == FP4GemmType::W4A8_MXFP4_MXFP8)
         {
-            if (mSm == 100)
+            if (mSm == 100 || mSm == 103)
             {
                 return dispatchMXFP8xMXFP4GemmCTAShapeSm100(D, A, B, input_sf, weight_sf, global_sf, m, n, k,
                     batch_count, gemmConfig, workspace, workspaceBytes, stream, occupancy);
@@ -343,7 +343,7 @@ size_t CutlassFp4GemmRunner::dispatchToArch(T* D, void const* A,
     }
     else if constexpr (fp4GemmType == FP4GemmType::W4A4_NVFP4_NVFP4)
     {
-        if (mSm == 100)
+        if (mSm == 100 || mSm == 103)
         {
             return dispatchNVFP4xNVFP4GemmCTAShapeSm100(D, A, B, input_sf, weight_sf, global_sf, m, n, k,
                 batch_count, gemmConfig, workspace, workspaceBytes, stream, occupancy);
@@ -384,7 +384,7 @@ std::vector CutlassFp4GemmRunner::getCon
 
     std::vector candidateConfigs;
 
-    if (mSm == 100)
+    if (mSm == 100 || mSm == 103)
     {
         std::vector tilesSm100 = {
             tkc::CutlassTileConfigSM100::CtaShape128x64x128B,
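Note (illustration only, not part of the patch): the dispatch changes above fold SM 103 into the SM100 kernel family, mirroring the new getSMFamily() helper in cudaUtils.h. A minimal Python sketch of that mapping:

    # Sketch of the SM-family collapsing done by getSMFamily() above
    # (the real helper is C++ and queries the device via getSMVersion()).
    def sm_family(sm: int) -> int:
        # SM 100 and SM 103 share the SM100 (Blackwell) kernel family.
        return 100 if sm in (100, 103) else sm

    assert sm_family(103) == 100  # SM 103 devices reuse the SM100 kernel paths
    assert sm_family(90) == 90    # other architectures are reported unchanged
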
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
index ff582ec6e6..73617e63e0 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
@@ -665,8 +665,9 @@ void MoeGemmRunner::dispatchToArch(
     // numbers of tokens SM80 is faster. We check here to see which is selected
     if (inputs.gemm_config.sm_version >= 90)
     {
-        TLLM_CHECK_WITH_INFO(inputs.gemm_config.sm_version == sm_, "Using SM %d configuration for SM %d device",
-            inputs.gemm_config.sm_version, sm_);
+        TLLM_CHECK_WITH_INFO(
+            (inputs.gemm_config.sm_version == sm_) || (inputs.gemm_config.sm_version == 100 && sm_ == 103),
+            "Using SM %d configuration for SM %d device", inputs.gemm_config.sm_version, sm_);
         TLLM_CHECK_WITH_INFO(inputs.biases != nullptr || hopper_inputs.ptr_c == nullptr,
             "Input biases and hopper input disagree if bias is enabled");
         TLLM_CHECK_WITH_INFO(
@@ -788,6 +789,14 @@ size_t MoeGemmRunner::calcMaxWorkspace
     {
         return 0;
     }
+    // #ifndef CUTLASS_ARCH_MMA_SM100F_SUPPORTED
+    //     static_assert(__CUDA_ARCH__ == 1000, "__CUDA_ARCH__");
+    //     static_assert(CUTLASS_ARCH_MMA_SM100_SUPPORTED, "CUTLASS_ARCH_MMA_SM100F_SUPPORTED");
+    //     static_assert(CUTLASS_ARCH_MMA_SM100_ENABLED, "CUTLASS_ARCH_MMA_SM100_ENABLED");
+    //     static_assert(CUTLASS_ARCH_MMA_SM100F_SUPPORTED, "CUTLASS_ARCH_MMA_SM100F_SUPPORTED");
+    //     static_assert(CUTLASS_ARCH_MMA_SM100F_ENABLED, "CUTLASS_ARCH_MMA_SM100F_ENABLED");
+    // // #error "SM100F not supported!"
+    // #endif
     if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation() && !use_w4afp8)
     {
         auto configs = getTmaWarpSpecializedConfigs(sm_);
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py
index 838120136c..7e55098cb6 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py
@@ -809,7 +809,7 @@ if __name__ == "__main__":
     }
 
     def has_arch(sm):
-        return f"{sm}" in arches or f"{sm}-real" in arches
+        return f"{sm}" in arches or f"{sm}-real" in arches or f"{sm}f-real" in arches or f"{sm}f" in arches
 
     # The goal here is to group kernels with common instantiations together in order to reduce template instantiation overheads.
     # Template instantiation dominates the time in a compilation unit, so it is the most important factor to improve.
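Note (illustration only, not part of the patch): with the extended has_arch() check, a 100f or 100f-real entry in the requested architecture list now counts as SM 100 when deciding which kernels to generate. A self-contained sketch, assuming an example architecture list:

    # Self-contained sketch of the extended has_arch() behaviour
    # (the real function reads the arches list parsed from the CMake architecture string).
    arches = ["80-real", "90-real", "100f-real", "120-real"]  # example input, not from the patch

    def has_arch(sm):
        return f"{sm}" in arches or f"{sm}-real" in arches or f"{sm}f-real" in arches or f"{sm}f" in arches

    assert has_arch(100)      # matched through the new "100f-real" form
    assert not has_arch(103)  # only the exact strings in arches are consulted
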
diff --git a/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp b/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
index d41aa157c5..8156cc9c24 100644
--- a/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
+++ b/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp
@@ -364,7 +364,8 @@ void* HostAccessibleDeviceAllocator::allocate(size_t memorySize)
         TLLM_CHECK_WITH_INFO(
             mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system.");
         TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize));
-        TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId));
+        TLLM_CUDA_CHECK(cudaMemAdvise(
+            devPtr, memorySize, cudaMemAdviseSetPreferredLocation, {cudaMemLocationTypeDevice, currentDevId}));
         hostPtr = devPtr;
     }
     recordAllocation(devPtr, memorySize, hostPtr, memDesc);
diff --git a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
index 401158b3c2..10b39bc335 100644
--- a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
+++ b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp
@@ -38,8 +38,8 @@ std::vector run_fp4_block_scale_moe_runner(torch::Tensor const& r
     int64_t const local_num_experts, std::optional const routed_scaling_factor, int64_t const tile_tokens_dim,
     int64_t const routing_method_type, bool const do_finalize, MoeRunnerType& moe_runner, int64_t const moeConfigIndex)
 {
-    auto const sm = tensorrt_llm::common::getSMVersion();
-    TORCH_CHECK(sm == 100, "Only SM100 is supported by FP4 block scale MOE");
+    auto const sm = tensorrt_llm::common::getSMFamily();
+    TORCH_CHECK(sm == 100, "Only SM100f is supported by FP4 block scale MOE");
     TORCH_CHECK(tile_tokens_dim == 8 || tile_tokens_dim == 16 || tile_tokens_dim == 32 || tile_tokens_dim == 64,
         "tile_tokens_dim must be 8, 16, 32, 64");
     if (static_cast(routing_method_type) == RoutingMethodType::DeepSeekV3)
diff --git a/tests/unittest/_torch/thop/test_moe.py b/tests/unittest/_torch/thop/test_moe.py
index 8f70ecebeb..f9dc149c17 100644
--- a/tests/unittest/_torch/thop/test_moe.py
+++ b/tests/unittest/_torch/thop/test_moe.py
@@ -568,7 +568,7 @@ def quant_dequant_per_tensor_fp8(a):
 
 
 @pytest.mark.skipif(
-    getSMVersion() != 100,
+    getSMVersion() < 100 or getSMVersion() >= 110,
     reason="The kernel only supports Blackwell. Current SM is %d." % getSMVersion(),
 )
@@ -702,7 +702,7 @@ class TestMoeFP8:
 
 
 @pytest.mark.skipif(
-    getSMVersion() != 100,
+    getSMVersion() < 100 or getSMVersion() >= 110,
     reason="The kernel only supports Blackwell. Current SM is %d." % getSMVersion(),
 )
@@ -1061,7 +1061,7 @@ class TestMoeFp4:
 
 
 @pytest.mark.skipif(
-    getSMVersion() != 100,
+    getSMVersion() < 100 or getSMVersion() >= 110,
     reason="The kernel only supports Blackwell. Current SM is %d." % getSMVersion(),
 )
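Note (illustration only, not part of the patch): the relaxed skipif conditions in test_moe.py treat any SM in [100, 110) as Blackwell-family, so the tests stay enabled on both SM 100 and SM 103. A sketch of that predicate (is_blackwell_family is a hypothetical name, not a helper in the test suite):

    # Sketch of the Blackwell-family check implied by the new skipif conditions.
    def is_blackwell_family(sm: int) -> bool:
        return 100 <= sm < 110

    assert is_blackwell_family(100) and is_blackwell_family(103)
    assert not (is_blackwell_family(90) or is_blackwell_family(120))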