From 6c694f85baf72adadb5b77edfe56f098643d1330 Mon Sep 17 00:00:00 2001
From: Bo Deng
Date: Tue, 27 Jan 2026 00:02:22 +0800
Subject: [PATCH] [None][fix] fix TinyGemm accuracy issue. cherry-pick
 https://github.com/NVIDIA/TensorRT-LLM/pull/10619 and
 https://github.com/NVIDIA/TensorRT-LLM/pull/10873 (#10990)

Signed-off-by: Bo Deng
---
 cpp/tensorrt_llm/kernels/quantization.cu                | 6 +++---
 cpp/tensorrt_llm/kernels/quantization.cuh               | 1 +
 cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh | 4 +++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/quantization.cu b/cpp/tensorrt_llm/kernels/quantization.cu
index d86595c298..78248214c1 100644
--- a/cpp/tensorrt_llm/kernels/quantization.cu
+++ b/cpp/tensorrt_llm/kernels/quantization.cu
@@ -177,7 +177,7 @@ void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFS
     config.stream = stream;
     cudaLaunchAttribute attrs[1];
     attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    attrs[0].val.programmaticStreamSerializationAllowed = false;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
     config.numAttrs = 1;
     config.attrs = attrs;
     cudaLaunchKernelEx(&config, kernel_instance, b, m, n, n, input, SFScale, reinterpret_cast(output),
@@ -212,7 +212,7 @@ void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input,
     config.stream = stream;
     cudaLaunchAttribute attrs[1];
     attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    attrs[0].val.programmaticStreamSerializationAllowed = false;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
     config.numAttrs = 1;
     config.attrs = attrs;
     cudaLaunchKernelEx(&config,
@@ -387,7 +387,7 @@ void computePerTokenGlobalScaleForFP4Quantization(int b, int m, int n, T const*
     config.stream = stream;
     cudaLaunchAttribute attrs[1];
     attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    attrs[0].val.programmaticStreamSerializationAllowed = false;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
     config.numAttrs = 1;
     config.attrs = attrs;
     TLLM_CUDA_CHECK(cudaLaunchKernelEx(
diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh
index 7aacc0f31d..20ccfae1ae 100644
--- a/cpp/tensorrt_llm/kernels/quantization.cuh
+++ b/cpp/tensorrt_llm/kernels/quantization.cuh
@@ -777,6 +777,7 @@ quantize_with_block_size(

     // Get the global scaling factor, which will be applied to the SF.
     // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
+    // This value is prepared by model, no need to be protected by ACKBULK
     float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];

     // Is it swizzled layout?
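For reference, the three launcher changes above are the host-side half of programmatic dependent launch (PDL): instead of hard-coding the attribute to false, each quantization kernel now opts in to programmatic stream serialization only when tensorrt_llm::common::getEnvEnablePDL() reports PDL is enabled, and a kernel launched this way must call cudaGridDependencySynchronize() before it reads data written by the preceding kernel in the stream. The sketch below shows that pattern only; it is not code from the repository, and the consumerKernel name, enablePdl flag, and launch shape are made up for illustration.

#include <cuda_runtime.h>

// Consumer kernel: with PDL it may start before the preceding grid in the
// stream has finished, so it must synchronize before reading its output.
__global__ void consumerKernel(float const* producerOutput, float* result, int n)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    cudaGridDependencySynchronize(); // wait for the producer grid's writes
#endif
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        result[i] = producerOutput[i] * 2.f;
}

void launchConsumer(float const* producerOutput, float* result, int n, bool enablePdl, cudaStream_t stream)
{
    cudaLaunchConfig_t config = {};
    config.gridDim = dim3((n + 255) / 256);
    config.blockDim = dim3(256);
    config.stream = stream;

    cudaLaunchAttribute attrs[1];
    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
    // Mirrors the patch: allow overlap only when PDL is enabled, otherwise
    // fall back to the usual stream ordering.
    attrs[0].val.programmaticStreamSerializationAllowed = enablePdl ? 1 : 0;
    config.attrs = attrs;
    config.numAttrs = 1;

    cudaLaunchKernelEx(&config, consumerKernel, producerOutput, result, n);
}

Gating the attribute on getEnvEnablePDL() rather than hard-coding false lets PDL deployments overlap the quantization kernel with the kernel that produces its input, while non-PDL runs keep the usual stream ordering.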
diff --git a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
index cc76f35cc0..dbedacad50 100644
--- a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
+++ b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
@@ -236,7 +236,6 @@ __global__ __launch_bounds__(384, 1) void kernel(__nv_bfloat16* output, __nv_bfl
     if (!weight_warp)
     {
         cudaGridDependencySynchronize();
-        cudaTriggerProgrammaticLaunchCompletion();
     }

     for (int ki = 0; ki < K_LOOPS_DMA; ki++)
@@ -411,6 +410,8 @@ __global__ __launch_bounds__(384, 1) void kernel(__nv_bfloat16* output, __nv_bfl

     __syncthreads();

+    cudaTriggerProgrammaticLaunchCompletion();
+
     if (warp_id == 0)
     {
@@ -442,6 +443,7 @@ __global__ __launch_bounds__(384, 1) void kernel(__nv_bfloat16* output, __nv_bfl
         if (PROFILE && blockIdx.y == 0 && threadIdx.x == 0)
             profile[blockIdx.x].complete = gclock64();
     }
+    __syncthreads();
 }
 #endif // end if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
 }
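The tinygemm2 hunks are the device-side half of the same contract. cudaGridDependencySynchronize() stays in the prologue (wait on the upstream grid before reading its data), but cudaTriggerProgrammaticLaunchCompletion() moves from the prologue to after the epilogue's __syncthreads(), so the next PDL-enabled grid in the stream is no longer released while this kernel's output may still be incomplete, which is the accuracy issue the patch addresses. Below is a minimal sketch of that ordering with a made-up producer kernel, not the real tinygemm2 code.

#include <cuda_runtime.h>

__global__ void producerKernel(float const* upstreamData, float* output, int n)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    // Wait for the grid this kernel depends on before reading its output.
    cudaGridDependencySynchronize();

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        output[i] = upstreamData[i] + 1.f;

    // Make sure every thread in the block has issued its stores ...
    __syncthreads();

    // ... and only then allow the next PDL-enabled grid to start. Triggering
    // in the prologue, as tinygemm2 previously did, releases the consumer
    // while output[] may still be being written.
    cudaTriggerProgrammaticLaunchCompletion();
#endif
}

The extra __syncthreads() added at the very end of the real kernel presumably serves a similar purpose, keeping all warps in the block from exiting before the epilogue's final stores have been issued.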