From 6c2ecad2fe061bdac1902520605c746d256c988f Mon Sep 17 00:00:00 2001 From: Zhenhuan Chen Date: Thu, 22 Jan 2026 12:52:27 +0800 Subject: [PATCH] [https://nvbugs/5769425][fix] add syncthreads for tinygemm to resolve intermittent accuracy problem (#10873) Signed-off-by: Zhenhuan Chen Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/quantization.cuh | 1 + cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh index 6589cc67d5..89b96b288b 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cuh +++ b/cpp/tensorrt_llm/kernels/quantization.cuh @@ -778,6 +778,7 @@ quantize_with_block_size( // Get the global scaling factor, which will be applied to the SF. // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)). + // This value is prepared by the model, so it does not need to be protected by ACKBULK float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0]; // Is it swizzled layout? diff --git a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh index 7f60e787bf..6c1d72c353 100644 --- a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh +++ b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh @@ -410,6 +410,8 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output, __syncthreads(); + cudaTriggerProgrammaticLaunchCompletion(); + if (warp_id == 0) { @@ -440,10 +442,8 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output, if (PROFILE && blockIdx.y == 0 && threadIdx.x == 0) profile[blockIdx.x].complete = gclock64(); - - if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0) - cudaTriggerProgrammaticLaunchCompletion(); } + __syncthreads(); } #endif // end if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) }