diff --git a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu
index c7fa6d4e08..59f3a67f13 100644
--- a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu
+++ b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu
@@ -120,6 +120,11 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
 
     auto warp = cg::tiled_partition<WARP_SIZE>(block);
     BaseType minScore = BaseType{-INFINITY};
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    cudaGridDependencySynchronize();
+#endif
+
     for (uint32_t tokenId = warpIdx; tokenId < numTokens; tokenId += warpNum)
     {
         auto scoreOffset = tokenId * numExperts;
@@ -168,6 +173,10 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
             }
         }
     } // end for tokenId
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    cudaTriggerProgrammaticLaunchCompletion();
+#endif
 }
 
 int nextPowerOfTwo(int num)
diff --git a/tensorrt_llm/_torch/models/modeling_gpt_oss.py b/tensorrt_llm/_torch/models/modeling_gpt_oss.py
index 64c96a191e..d6c90a308a 100644
--- a/tensorrt_llm/_torch/models/modeling_gpt_oss.py
+++ b/tensorrt_llm/_torch/models/modeling_gpt_oss.py
@@ -39,6 +39,8 @@ from .modeling_utils import (DecoderModel, duplicate_kv_weight, filter_weights,
 
 # Use TinyGEMM when the number of tokens is not larger than this threshold
 MIN_LATENCY_TINYGEMM_NUM_TOKENS = 128
+# Enable TinyGEMM optimization (disabled by default, set ENABLE_TINYGEMM=1 to enable)
+ENABLE_TINYGEMM = os.environ.get('ENABLE_TINYGEMM', '0') == '1'
 
 
 class AttentionBlock(Attention):
@@ -226,7 +228,7 @@ class MLPBlock(torch.nn.Module):
             dtype=pretrained_config.torch_dtype)
 
     def compute_gate_output(self, x: torch.Tensor) -> torch.Tensor:
-        if get_sm_version() in [
+        if ENABLE_TINYGEMM and get_sm_version() in [
                 90, 100, 103
         ] and x.shape[0] <= MIN_LATENCY_TINYGEMM_NUM_TOKENS:
             weight = self.gate.weight
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index e6da9fc216..a2987bdb16 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1530,7 +1530,8 @@ class PyTorchModelEngine(ModelEngine):
             num_draft_tokens = len(draft_tokens)
             total_num_tokens = len(position_ids)
             assert total_num_tokens <= self.max_num_tokens, (
-                "total_num_tokens should be less than or equal to max_num_tokens")
+                f"total_num_tokens ({total_num_tokens}) should be less than or equal to max_num_tokens ({self.max_num_tokens})"
+            )
             # if exist requests that do not have previous batch, copy input_ids and draft_tokens
             if num_tokens > 0:
                 input_ids = torch.tensor(input_ids,
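
The `cudaGridDependencySynchronize()` / `cudaTriggerProgrammaticLaunchCompletion()` pair added to the kernel only does useful work when the kernel is launched with Programmatic Dependent Launch (PDL) enabled, i.e. with the programmatic-stream-serialization launch attribute set on a Hopper-or-newer GPU; without that attribute, the calls are effectively no-ops, which is why the diff can guard them purely on `__CUDA_ARCH__` and leave launch sites untouched. The sketch below shows the host-side half of the pattern under stated assumptions: `producerKernel`, `consumerKernel`, and `launchWithPdl` are hypothetical stand-ins, not TensorRT-LLM's actual launch path, which wires this up through its own kernel-launch helpers.

```cuda
#include <cuda_runtime.h>

// Hypothetical stand-ins: "producer" plays the role of the kernel whose output
// the routing kernel consumes; "consumer" mirrors the PDL pattern in the diff.
__global__ void producerKernel(float* out)
{
    out[threadIdx.x] = static_cast<float>(threadIdx.x);
}

__global__ void consumerKernel(float* in)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    // Block until the producer grid's memory writes are visible. Code before
    // this point may run early, overlapping the producer's tail, but must not
    // read anything the producer writes.
    cudaGridDependencySynchronize();
#endif
    float v = in[threadIdx.x]; // safe to read the producer's output now
    in[threadIdx.x] = v * 2.0f;
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
    // Signal that a dependent grid launched with PDL may begin its prologue.
    cudaTriggerProgrammaticLaunchCompletion();
#endif
}

// Launch the consumer with the programmatic-stream-serialization attribute so
// its prologue can overlap the tail of the producer on sm_90+.
void launchWithPdl(float* buf, cudaStream_t stream)
{
    producerKernel<<<1, 32, 0, stream>>>(buf);

    cudaLaunchAttribute attr{};
    attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1;

    cudaLaunchConfig_t config{};
    config.gridDim = dim3(1);
    config.blockDim = dim3(32);
    config.dynamicSmemBytes = 0;
    config.stream = stream;
    config.attrs = &attr;
    config.numAttrs = 1;

    cudaLaunchKernelEx(&config, consumerKernel, buf);
}
```

On a non-PDL launch of the same kernel, all grid dependencies are already resolved before the first instruction executes, so `cudaGridDependencySynchronize()` returns immediately; this makes the guarded calls safe to leave in unconditionally for sm_90+ builds.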