Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-12 05:53:33 +08:00)
Add B300 & GB300 CI
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
parent 5d4f7f4e8d
commit 22219bc37e
@@ -21,7 +21,14 @@ foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
   if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9)
     # The FP4-related conversion instructions in DeepEP require SM100a, SM110a,
     # or SM120a.
-    if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
+    if(${CUDA_ARCH_MAJOR} EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
+      if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.31)
+        list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100f${CUDA_ARCH_POSTFIX}")
+      else()
+        list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100a${CUDA_ARCH_POSTFIX}"
+             "103a${CUDA_ARCH_POSTFIX}")
+      endif()
+    elseif(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
       list(APPEND DEEP_EP_CUDA_ARCHITECTURES
            "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}a${CUDA_ARCH_POSTFIX}")
     else()
@@ -67,7 +67,7 @@ std::tuple<at::Tensor, at::Tensor> fp8_quantize_1x128(at::Tensor const& self)
         act_buffer, act_scale_buffer, reinterpret_cast<__nv_bfloat16 const*>(self.data_ptr()), n, m, stream);

     // Post-process the scale tensor for sm100 gemm/moe kernel
-    if (tensorrt_llm::common::getSMVersion() == 100)
+    if (tensorrt_llm::common::getSMFamily() == 100)
     {
         auto const num_n_blocks = (n + 127) / 128;
         auto const act_scal_elesize = num_n_blocks * m_padded;
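For orientation, fp8_quantize_1x128 emits one scale per 1x128 block along the n dimension, so the sm100/sm103 post-processing operates on ceil(n / 128) scale blocks. A tiny Python sketch of that bookkeeping (the concrete n and m_padded values are made up for illustration):

# Illustrative only: mirrors the block-count arithmetic in the C++ hunk above.
n, m_padded = 576, 128
num_n_blocks = (n + 127) // 128        # ceil(576 / 128) == 5
act_scale_elems = num_n_blocks * m_padded
print(num_n_blocks, act_scale_elems)   # 5 640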
@@ -40,7 +40,7 @@ from tqdm import tqdm
 from transformers import PretrainedConfig

 from tensorrt_llm._ipc_utils import can_access_peer
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family, get_sm_version
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
 from tensorrt_llm.mapping import Mapping
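The diff switches call sites from get_sm_version() to a new get_sm_family() helper so that SM 103 (B300/GB300) takes the same code paths as SM 100 (B200/GB200). The helper's actual implementation is not part of this diff; a minimal sketch of the assumed behavior, with hypothetical names, could look like this:

# Sketch only: the real tensorrt_llm._utils.get_sm_family may be implemented differently.
import torch


def sm_version_sketch() -> int:
    """Compute capability of the current device encoded as major * 10 + minor."""
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor


def sm_family_sketch() -> int:
    """Collapse SM 100 and SM 103 into one family so B300/GB300 follow the B200 paths."""
    sm = sm_version_sketch()
    return 100 if sm in (100, 103) else sm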
@@ -1479,7 +1479,7 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model,
                     p.data.copy_(module_weights[n][:])

             if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
-            ) and get_sm_version() == 100 and hasattr(
+            ) and get_sm_family() == 100 and hasattr(
                     module, "weight_scale"):
                 weight, weight_scale = resmooth_to_fp8_e8m0(
                     module.weight, module.weight_scale)
@@ -5,7 +5,7 @@ from typing import Optional, Union, cast
 import torch
 from torch import nn

-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping

@@ -571,7 +571,7 @@ def fp8_block_scaling_bmm_out(
     out: torch.Tensor,
     mat2_dequant: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    sm_version = get_sm_version()
+    sm_version = get_sm_family()
     if sm_version == 90 or sm_version == 89:
         mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
             mat1)
@@ -892,7 +892,7 @@ class MLA(nn.Module):
                 ),
                 requires_grad=False,
             )
-            if get_sm_version() == 100:
+            if get_sm_family() == 100:
                 assert self.dtype == torch.bfloat16
                 self.k_b_proj_trans_dequant = nn.Parameter(
                     torch.empty(
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Union
 import torch
 import torch.nn.functional as F

-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family

 from ...model_config import ModelConfig
 from ...utils import AuxStreamType, Fp4QuantizedTensor
@@ -34,7 +34,7 @@ def cute_dsl_fp8_group_blockwise_gemm_ref(
     b_tmp = b.permute(1, 2, 0)

     # Note: we have different output scale shape for fp8_quantize_1x128, so we need to handle it differently for sm100 and other archs.
-    if get_sm_version() == 100:
+    if get_sm_family() == 100:
         input_scale_tmp = a_sf.permute(1, 0).as_strided((m, w_k, 1),
                                                         (1, m, m * w_k))
     else:
@@ -7,7 +7,7 @@ import torch.nn.functional as F
 from torch import nn

 import tensorrt_llm.logger as trtllm_logger
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family, get_sm_version
 from tensorrt_llm.quantization.functional import \
     preprocess_weights_for_mixed_gemm
 from tensorrt_llm.quantization.utils.fp4_utils import (
@@ -742,7 +742,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(

     def load_weights(self, module: torch.nn.Module, weights: List[Dict],
                      weight_loading_mode: MoEWeightLoadingMode):
-        if get_sm_version() == 100:
+        if get_sm_family() == 100:
             expert_ids = set(module.initial_local_expert_ids)
             if self.need_load_shared_weights(module):
                 expert_ids.update(
@@ -759,7 +759,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(
                     weight, scale)
         super().load_weights(module, weights, weight_loading_mode)

-        if get_sm_version() == 100:
+        if get_sm_family() == 100:
            transfromed_w3_w1_scale = transform_sf_into_required_layout(
                module.quant_scales[0],
                mn=module.w3_w1_weight.shape[1],
@@ -1882,7 +1882,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
                             max_batch_size):
-        if get_sm_version() == 100:
+        if get_sm_version() == 100 or get_sm_version() == 103:
            moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
        else:
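This test keeps an explicit version check (100 or 103) rather than the get_sm_family() spelling used in the library code; under the family mapping assumed in the earlier sketch the two forms admit the same devices:

# Illustrative only: both spellings of the check admit SM 100 and SM 103.
for sm in (100, 103, 90, 120):
    family = 100 if sm in (100, 103) else sm
    assert (family == 100) == (sm == 100 or sm == 103)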
@@ -138,7 +138,12 @@ def test_unittests_v2(llm_root, llm_venv, case: str, output_dir, request):

     def run_command(cmd):
         try:
-            llm_venv.run_cmd(cmd, cwd=test_root)
+            pythonpath = os.environ.get("PYTHONPATH", "")
+            llm_venv.run_cmd(
+                cmd,
+                cwd=test_root,
+                env={'PYTHONPATH': f"{llm_root}/tests/unittest:{pythonpath}"},
+            )
         except CalledProcessError:
             return False
         return True
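The new env argument prepends the repository's tests/unittest directory to PYTHONPATH so test-local helper modules resolve first. Whether llm_venv.run_cmd merges that dict into the inherited environment is not visible here; a standalone sketch of the same pattern with plain subprocess (names and paths are placeholders):

import os
import subprocess


def run_with_unittest_on_path(cmd, llm_root, cwd):
    # Put <llm_root>/tests/unittest ahead of any existing PYTHONPATH entries.
    pythonpath = os.environ.get("PYTHONPATH", "")
    env = dict(os.environ)
    env["PYTHONPATH"] = f"{llm_root}/tests/unittest:{pythonpath}"
    return subprocess.run(cmd, cwd=cwd, env=env, check=True)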
@@ -30,12 +30,9 @@ from tensorrt_llm._torch.modules.fused_moe.interface import MoEWeightLoadingMode

 # isort and yapf will fight against each other here, so we disable isort
 # isort: off
-from tensorrt_llm._torch.modules.fused_moe import (BaseMoeRoutingMethod,
-                                                   CutlassFusedMoE,
-                                                   DefaultMoeRoutingMethod,
-                                                   RenormalizeMoeRoutingMethod,
-                                                   TritonFusedMoE, VanillaMoE,
-                                                   create_moe, WideEPMoE)
+from tensorrt_llm._torch.modules.fused_moe import (
+    BaseMoeRoutingMethod, CutlassFusedMoE, DefaultMoeRoutingMethod,
+    RenormalizeMoeRoutingMethod, TritonFusedMoE, create_moe, WideEPMoE)
 # isort: on
 from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
     IS_TRITON_KERNELS_AVAILABLE
@@ -76,9 +73,11 @@ def test_fused_moe(moe_backend,
     if bias and moe_backend not in ["TRITON"]:
         pytest.skip("Bias not supported.")

-    mapping = Mapping()
+    mapping = mapping or Mapping()
     mapping.rank = mpi_rank()

+    torch.cuda.set_device(mapping.rank)
+
     with torch.device(f'cuda:{mapping.rank}'):
         SEQ_LEN = 8
         HIDDEN_SIZE = 64
@@ -165,18 +164,19 @@ def test_fused_moe(moe_backend,

 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
-@pytest.mark.parametrize("moe_cls", [CutlassFusedMoE, VanillaMoE])
+@pytest.mark.parametrize("moe_cls", ["CUTLASS", "VANILLA"])
 @pytest.mark.parametrize("ep_size", [1, 2, 4])
 def test_fused_moe_multi_gpu(moe_cls, ep_size):
     world_size = 4
     with MPIPoolExecutor(max_workers=world_size) as executor:
         results = executor.map(
             test_fused_moe,
-            *zip(*[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod,
-                    Mapping(world_size=world_size,
-                            tp_size=world_size,
-                            moe_ep_size=ep_size,
-                            moe_tp_size=world_size // ep_size))] * world_size),
+            *zip(
+                *[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod, False,
+                   Mapping(world_size=world_size,
+                           tp_size=world_size,
+                           moe_ep_size=ep_size,
+                           moe_tp_size=world_size // ep_size))] * world_size),
         )
         for r in results:
             assert r is None
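The *zip(*[args] * world_size) idiom broadcasts one argument tuple to every worker: executor.map takes one iterable per positional parameter, so the tuple is repeated world_size times and transposed by zip. A self-contained illustration with a stand-in for executor.map (all names here are hypothetical):

def map_like(func, *iterables):
    # Minimal stand-in for MPIPoolExecutor.map: transpose the per-parameter
    # iterables back into one argument tuple per call.
    return [func(*call_args) for call_args in zip(*iterables)]


def fake_case(moe_cls, dtype, hidden_size, routing, bias, mapping):
    return (moe_cls, bias, mapping)


world_size = 4
args = ("CUTLASS", "bf16", 512, "DefaultMoeRoutingMethod", False, {"tp": world_size})
results = map_like(fake_case, *zip(*[args] * world_size))
assert len(results) == world_size  # every worker received the same argument tuple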
@@ -275,6 +275,10 @@ def test_fused_moe_alltoall(alltoall_method_type):
                 all_rank_max_num_tokens=m,
                 use_dp_padding=False)

+            if alltoall_method_type == AlltoallMethodType.MNNVL and output.ndim == 3:
+                output = output.sum(dim=1)
+            print(f"output: {output.shape}")
+            print(f"ref_output: {ref_output.shape}")
             # Evaluate outputs
             torch.testing.assert_close(output,
                                        ref_output,
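The added branch reduces the extra dimension that the MNNVL all-to-all path can return so the comparison against the 2-D reference still applies; what dim=1 holds (e.g. partial results to be accumulated) is not shown in this hunk, so the shapes below are assumptions:

import torch

output = torch.randn(8, 2, 64)      # assumed: 8 tokens, 2 partials each, hidden 64
ref_output = torch.randn(8, 64)

if output.ndim == 3:
    output = output.sum(dim=1)      # (8, 2, 64) -> (8, 64)

assert output.shape == ref_output.shape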
@@ -71,15 +71,18 @@ def vision_arena_dataset_path():


 @skip_gpu_memory_less_than_80gb
-@pytest.mark.parametrize("dataset_name,dataset_args", [("random_image", {
-    "--random-image-size": "1",
-    "--random-image-size": "512",
-}), ("random_image", {
-    "--random-num-images": "2",
-    "--random-image-size": "512",
-}), ("hf", {
-    "--dataset-path": vision_arena_dataset_path(),
-})],
+@pytest.mark.parametrize("dataset_name,dataset_args",
+                         [("random_image", {
+                             "--random-image-size": "1",
+                             "--random-image-size": "512",
+                         }),
+                          ("random_image", {
+                              "--random-num-images": "2",
+                              "--random-image-size": "512",
+                          }),
+                          ("hf", {
+                              "--dataset-path": vision_arena_dataset_path(),
+                          })],
                          ids=[
                              "random_image-single_image",
                              "random_image-dual_images",
@@ -427,7 +427,7 @@ class TestFunctional(unittest.TestCase):

         # skip tests based on the gpu_arch_lists
         if gpu_arch != 'all':
-            assert gpu_arch in [80, 86, 89, 90, 100, 120]
+            assert gpu_arch in [80, 86, 89, 90, 100, 103, 120]
             if getSMVersion() != gpu_arch:
                 pytest.skip(
                     "Skip the test as the target gpu arch doesn't match this gpu arch."
@@ -439,7 +439,7 @@ class TestFunctional(unittest.TestCase):
         skip_blackwell_for_fmha_tests(context_fmha_type, head_size)

         # Skip custom mask tests for Blackwell
-        if getSMVersion() == 100 and custom_mask_input:
+        if (getSMVersion() == 100 or getSMVersion() == 103) and custom_mask_input:
             pytest.skip("Custom masked is not supported by TRTLLM-GEN for now.")

         if num_kv_heads == 0:
@@ -84,7 +84,7 @@ skip_pre_blackwell = pytest.mark.skipif(
     getSMVersion() < 100,
     reason="This test is not supported in pre-Blackwell architecture")
 skip_blackwell = pytest.mark.skipif(
-    getSMVersion() == 100,
+    getSMVersion() == 100 or getSMVersion() == 103,
     reason="This test is not supported in Blackwell architecture")
 skip_blackwell_geforce = pytest.mark.skipif(
     getSMVersion() == 120, reason="This test is not supported on SM 120")
@@ -127,9 +127,9 @@ def skip_fp8_pre_ada(use_fp8):


 def skip_blackwell_for_fmha_tests(context_fmha_type, head_size):
-    if getSMVersion() == 100 and (head_size not in [32, 64, 128]
-                                  and context_fmha_type
-                                  != ContextFMHAType.disabled):
+    if (getSMVersion() == 100 or getSMVersion()
+            == 103) and (head_size not in [32, 64, 128]
+                         and context_fmha_type != ContextFMHAType.disabled):
         pytest.skip(
             "Context FMHA only supports head sizes [32, 64, 128] currently on blackwell."
         )
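How these markers are consumed is unchanged by the commit; a marker defined this way is simply applied as a decorator. A self-contained sketch for reference (getSMVersion is stubbed so the snippet stands alone):

import pytest


def getSMVersion():
    return 100  # stub for illustration; the real helper queries the device


skip_blackwell = pytest.mark.skipif(
    getSMVersion() == 100 or getSMVersion() == 103,
    reason="This test is not supported in Blackwell architecture")


@skip_blackwell
def test_path_not_available_on_blackwell():
    assert True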