From 22219bc37e75b10097c959769414442390c20d07 Mon Sep 17 00:00:00 2001 From: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> Date: Fri, 5 Sep 2025 15:29:50 +0800 Subject: [PATCH] Add B300 & GB300 CI Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 9 +++++- cpp/tensorrt_llm/thop/fp8Quantize.cpp | 2 +- .../_torch/models/modeling_deepseekv3.py | 4 +-- tensorrt_llm/_torch/modules/attention.py | 6 ++-- .../modules/fused_moe/fused_moe_cute_dsl.py | 4 +-- .../_torch/modules/fused_moe/quantization.py | 6 ++-- .../defs/accuracy/test_llm_api_pytorch.py | 2 +- tests/integration/defs/test_unittests.py | 7 ++++- .../unittest/_torch/modules/test_fused_moe.py | 30 +++++++++++-------- ..._test_trtllm_serve_multimodal_benchmark.py | 21 +++++++------ .../trt/attention/test_gpt_attention.py | 4 +-- tests/unittest/utils/util.py | 8 ++--- 12 files changed, 61 insertions(+), 42 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 7894ec8dd6..cdae331b94 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -21,7 +21,14 @@ foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES) if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9) # The FP4-related conversion instructions in DeepEP require SM100a, SM110a, # or SM120a. - if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0) + if(${CUDA_ARCH_MAJOR} EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0) + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.31) + list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100f${CUDA_ARCH_POSTFIX}") + else() + list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100a${CUDA_ARCH_POSTFIX}" + "103a${CUDA_ARCH_POSTFIX}") + endif() + elseif(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0) list(APPEND DEEP_EP_CUDA_ARCHITECTURES "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}a${CUDA_ARCH_POSTFIX}") else() diff --git a/cpp/tensorrt_llm/thop/fp8Quantize.cpp b/cpp/tensorrt_llm/thop/fp8Quantize.cpp index 0203eb76cf..ec5f68ec3e 100644 --- a/cpp/tensorrt_llm/thop/fp8Quantize.cpp +++ b/cpp/tensorrt_llm/thop/fp8Quantize.cpp @@ -67,7 +67,7 @@ std::tuple fp8_quantize_1x128(at::Tensor const& self) act_buffer, act_scale_buffer, reinterpret_cast<__nv_bfloat16 const*>(self.data_ptr()), n, m, stream); // Post-process the scale tensor for sm100 gemm/moe kernel - if (tensorrt_llm::common::getSMVersion() == 100) + if (tensorrt_llm::common::getSMFamily() == 100) { auto const num_n_blocks = (n + 127) / 128; auto const act_scal_elesize = num_n_blocks * m_padded; diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 0f886d0cd5..fa7f5c94d6 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -40,7 +40,7 @@ from tqdm import tqdm from transformers import PretrainedConfig from tensorrt_llm._ipc_utils import can_access_peer -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_sm_family, get_sm_version from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.llmapi.utils import enable_llm_debug from tensorrt_llm.mapping import Mapping @@ -1479,7 +1479,7 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model, p.data.copy_(module_weights[n][:]) if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales( - ) and get_sm_version() == 100 and hasattr( + ) and get_sm_family() == 100 and hasattr( module, 
"weight_scale"): weight, weight_scale = resmooth_to_fp8_e8m0( module.weight, module.weight_scale) diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index abf703c11c..f148471f16 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -5,7 +5,7 @@ from typing import Optional, Union, cast import torch from torch import nn -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_sm_family from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping @@ -571,7 +571,7 @@ def fp8_block_scaling_bmm_out( out: torch.Tensor, mat2_dequant: Optional[torch.Tensor] = None, ) -> torch.Tensor: - sm_version = get_sm_version() + sm_version = get_sm_family() if sm_version == 90 or sm_version == 89: mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102( mat1) @@ -892,7 +892,7 @@ class MLA(nn.Module): ), requires_grad=False, ) - if get_sm_version() == 100: + if get_sm_family() == 100: assert self.dtype == torch.bfloat16 self.k_b_proj_trans_dequant = nn.Parameter( torch.empty( diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py index 13f34fbb02..5237095b28 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py @@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Union import torch import torch.nn.functional as F -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_sm_family from ...model_config import ModelConfig from ...utils import AuxStreamType, Fp4QuantizedTensor @@ -34,7 +34,7 @@ def cute_dsl_fp8_group_blockwise_gemm_ref( b_tmp = b.permute(1, 2, 0) # Note: we have different output scale shape for fp8_quantize_1x128, so we need to handle it differently for sm100 and other archs. 
- if get_sm_version() == 100: + if get_sm_family() == 100: input_scale_tmp = a_sf.permute(1, 0).as_strided((m, w_k, 1), (1, m, m * w_k)) else: diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py index 646ea6d5e0..ea4766d203 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py +++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py @@ -7,7 +7,7 @@ import torch.nn.functional as F from torch import nn import tensorrt_llm.logger as trtllm_logger -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_sm_family, get_sm_version from tensorrt_llm.quantization.functional import \ preprocess_weights_for_mixed_gemm from tensorrt_llm.quantization.utils.fp4_utils import ( @@ -742,7 +742,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm( def load_weights(self, module: torch.nn.Module, weights: List[Dict], weight_loading_mode: MoEWeightLoadingMode): - if get_sm_version() == 100: + if get_sm_family() == 100: expert_ids = set(module.initial_local_expert_ids) if self.need_load_shared_weights(module): expert_ids.update( @@ -759,7 +759,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm( weight, scale) super().load_weights(module, weights, weight_loading_mode) - if get_sm_version() == 100: + if get_sm_family() == 100: transfromed_w3_w1_scale = transform_sf_into_required_layout( module.quant_scales[0], mn=module.w3_w1_weight.shape[1], diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 20bd2a7c32..c71f0224a9 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1882,7 +1882,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, max_batch_size): - if get_sm_version() == 100: + if get_sm_version() == 100 or get_sm_version() == 103: moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) else: diff --git a/tests/integration/defs/test_unittests.py b/tests/integration/defs/test_unittests.py index 3f13ed10b7..0e83cb6d38 100644 --- a/tests/integration/defs/test_unittests.py +++ b/tests/integration/defs/test_unittests.py @@ -138,7 +138,12 @@ def test_unittests_v2(llm_root, llm_venv, case: str, output_dir, request): def run_command(cmd): try: - llm_venv.run_cmd(cmd, cwd=test_root) + pythonpath = os.environ.get("PYTHONPATH", "") + llm_venv.run_cmd( + cmd, + cwd=test_root, + env={'PYTHONPATH': f"{llm_root}/tests/unittest:{pythonpath}"}, + ) except CalledProcessError: return False return True diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 397314bcab..8e13a7f880 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -30,12 +30,9 @@ from tensorrt_llm._torch.modules.fused_moe.interface import MoEWeightLoadingMode # isort and yapf will fight against each other here, so we disable isort # isort: off -from tensorrt_llm._torch.modules.fused_moe import (BaseMoeRoutingMethod, - CutlassFusedMoE, - DefaultMoeRoutingMethod, - RenormalizeMoeRoutingMethod, - TritonFusedMoE, VanillaMoE, - create_moe, WideEPMoE) +from tensorrt_llm._torch.modules.fused_moe import ( + BaseMoeRoutingMethod, CutlassFusedMoE, DefaultMoeRoutingMethod, + 
RenormalizeMoeRoutingMethod, TritonFusedMoE, create_moe, WideEPMoE) # isort: on from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ IS_TRITON_KERNELS_AVAILABLE @@ -76,9 +73,11 @@ def test_fused_moe(moe_backend, if bias and moe_backend not in ["TRITON"]: pytest.skip("Bias not supported.") - mapping = Mapping() + mapping = mapping or Mapping() mapping.rank = mpi_rank() + torch.cuda.set_device(mapping.rank) + with torch.device(f'cuda:{mapping.rank}'): SEQ_LEN = 8 HIDDEN_SIZE = 64 @@ -165,18 +164,19 @@ def test_fused_moe(moe_backend, @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") -@pytest.mark.parametrize("moe_cls", [CutlassFusedMoE, VanillaMoE]) +@pytest.mark.parametrize("moe_cls", ["CUTLASS", "VANILLA"]) @pytest.mark.parametrize("ep_size", [1, 2, 4]) def test_fused_moe_multi_gpu(moe_cls, ep_size): world_size = 4 with MPIPoolExecutor(max_workers=world_size) as executor: results = executor.map( test_fused_moe, - *zip(*[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod, - Mapping(world_size=world_size, - tp_size=world_size, - moe_ep_size=ep_size, - moe_tp_size=world_size // ep_size))] * world_size), + *zip( + *[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod, False, + Mapping(world_size=world_size, + tp_size=world_size, + moe_ep_size=ep_size, + moe_tp_size=world_size // ep_size))] * world_size), ) for r in results: assert r is None @@ -275,6 +275,10 @@ def test_fused_moe_alltoall(alltoall_method_type): all_rank_max_num_tokens=m, use_dp_padding=False) + if alltoall_method_type == AlltoallMethodType.MNNVL and output.ndim == 3: + output = output.sum(dim=1) + print(f"output: {output.shape}") + print(f"ref_output: {ref_output.shape}") # Evaluate outputs torch.testing.assert_close(output, ref_output, diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_benchmark.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_benchmark.py index 14146efa3d..4c11e82eb7 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_benchmark.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_benchmark.py @@ -71,15 +71,18 @@ def vision_arena_dataset_path(): @skip_gpu_memory_less_than_80gb -@pytest.mark.parametrize("dataset_name,dataset_args", [("random_image", { - "--random-image-size": "1", - "--random-image-size": "512", -}), ("random_image", { - "--random-num-images": "2", - "--random-image-size": "512", -}), ("hf", { - "--dataset-path": vision_arena_dataset_path(), -})], +@pytest.mark.parametrize("dataset_name,dataset_args", + [("random_image", { + "--random-image-size": "1", + "--random-image-size": "512", + }), + ("random_image", { + "--random-num-images": "2", + "--random-image-size": "512", + }), + ("hf", { + "--dataset-path": vision_arena_dataset_path(), + })], ids=[ "random_image-single_image", "random_image-dual_images", diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index 38638b198d..d176046816 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -427,7 +427,7 @@ class TestFunctional(unittest.TestCase): # skip tests based on the gpu_arch_lists if gpu_arch != 'all': - assert gpu_arch in [80, 86, 89, 90, 100, 120] + assert gpu_arch in [80, 86, 89, 90, 100, 103, 120] if getSMVersion() != gpu_arch: pytest.skip( "Skip the test as the target gpu arch doesn't match this gpu arch." 
@@ -439,7 +439,7 @@ class TestFunctional(unittest.TestCase):
         skip_blackwell_for_fmha_tests(context_fmha_type, head_size)
 
         # Skip custom mask tests for Blackwell
-        if getSMVersion() == 100 and custom_mask_input:
+        if (getSMVersion() == 100 or getSMVersion() == 103) and custom_mask_input:
             pytest.skip("Custom masked is not supported by TRTLLM-GEN for now.")
 
         if num_kv_heads == 0:
diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py
index 893af5d93b..28ad21adbe 100644
--- a/tests/unittest/utils/util.py
+++ b/tests/unittest/utils/util.py
@@ -84,7 +84,7 @@ skip_pre_blackwell = pytest.mark.skipif(
     getSMVersion() < 100,
     reason="This test is not supported in pre-Blackwell architecture")
 skip_blackwell = pytest.mark.skipif(
-    getSMVersion() == 100,
+    getSMVersion() == 100 or getSMVersion() == 103,
     reason="This test is not supported in Blackwell architecture")
 skip_blackwell_geforce = pytest.mark.skipif(
     getSMVersion() == 120, reason="This test is not supported on SM 120")
@@ -127,9 +127,9 @@ def skip_fp8_pre_ada(use_fp8):
 
 
 def skip_blackwell_for_fmha_tests(context_fmha_type, head_size):
-    if getSMVersion() == 100 and (head_size not in [32, 64, 128]
-                                  and context_fmha_type
-                                  != ContextFMHAType.disabled):
+    if (getSMVersion() == 100 or getSMVersion()
+            == 103) and (head_size not in [32, 64, 128]
+                         and context_fmha_type != ContextFMHAType.disabled):
         pytest.skip(
             "Context FMHA only supports head sizes [32, 64, 128] currently on blackwell."
         )
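
Note for reviewers: the switch from get_sm_version()/getSMVersion() to
get_sm_family()/getSMFamily() at the call sites above relies on SM 103
(B300/GB300) being reported as a member of the SM 100 family, so existing
`== 100` branches pick up the new parts without further edits. The snippet
below is only a minimal sketch of that assumed mapping; the real helpers live
in tensorrt_llm._utils and tensorrt_llm::common and are not part of this diff.

    # Illustrative sketch only -- not the actual tensorrt_llm._utils code.
    # Assumption: SM 100 (B200/GB200) and SM 103 (B300/GB300) collapse to
    # family 100, while other architectures report their plain SM version.
    import torch

    _SAME_FAMILY = {103: 100}  # hypothetical table of same-family revisions

    def sm_family(device: int = 0) -> int:
        major, minor = torch.cuda.get_device_capability(device)
        sm = major * 10 + minor
        return _SAME_FAMILY.get(sm, sm)

    # Example: on a GB300 node this returns 100, so a branch written as
    # `if get_sm_family() == 100:` takes the same code path as on GB200.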