From 22219bc37e75b10097c959769414442390c20d07 Mon Sep 17 00:00:00 2001 From: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> Date: Fri, 5 Sep 2025 15:29:50 +0800 Subject: [PATCH] Add B300 & GB300 CI Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 9 +++++- cpp/tensorrt_llm/thop/fp8Quantize.cpp | 2 +- .../_torch/models/modeling_deepseekv3.py | 4 +-- tensorrt_llm/_torch/modules/attention.py | 6 ++-- .../modules/fused_moe/fused_moe_cute_dsl.py | 4 +-- .../_torch/modules/fused_moe/quantization.py | 6 ++-- .../defs/accuracy/test_llm_api_pytorch.py | 2 +- tests/integration/defs/test_unittests.py | 7 ++++- .../unittest/_torch/modules/test_fused_moe.py | 30 +++++++++++-------- ..._test_trtllm_serve_multimodal_benchmark.py | 21 +++++++------ .../trt/attention/test_gpt_attention.py | 4 +-- tests/unittest/utils/util.py | 8 ++--- 12 files changed, 61 insertions(+), 42 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 7894ec8dd6..cdae331b94 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -21,7 +21,14 @@ foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES) if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9) # The FP4-related conversion instructions in DeepEP require SM100a, SM110a, # or SM120a. - if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0) + if(${CUDA_ARCH_MAJOR} EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0) + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.31) + list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100f${CUDA_ARCH_POSTFIX}") + else() + list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100a${CUDA_ARCH_POSTFIX}" + "103a${CUDA_ARCH_POSTFIX}") + endif() + elseif(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0) list(APPEND DEEP_EP_CUDA_ARCHITECTURES "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}a${CUDA_ARCH_POSTFIX}") else() diff --git a/cpp/tensorrt_llm/thop/fp8Quantize.cpp b/cpp/tensorrt_llm/thop/fp8Quantize.cpp index 0203eb76cf..ec5f68ec3e 100644 --- a/cpp/tensorrt_llm/thop/fp8Quantize.cpp +++ b/cpp/tensorrt_llm/thop/fp8Quantize.cpp @@ -67,7 +67,7 @@ std::tuple fp8_quantize_1x128(at::Tensor const& self) act_buffer, act_scale_buffer, reinterpret_cast<__nv_bfloat16 const*>(self.data_ptr()), n, m, stream); // Post-process the scale tensor for sm100 gemm/moe kernel - if (tensorrt_llm::common::getSMVersion() == 100) + if (tensorrt_llm::common::getSMFamily() == 100) { auto const num_n_blocks = (n + 127) / 128; auto const act_scal_elesize = num_n_blocks * m_padded; diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 0f886d0cd5..fa7f5c94d6 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -40,7 +40,7 @@ from tqdm import tqdm from transformers import PretrainedConfig from tensorrt_llm._ipc_utils import can_access_peer -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_sm_family, get_sm_version from tensorrt_llm.functional import PositionEmbeddingType from tensorrt_llm.llmapi.utils import enable_llm_debug from tensorrt_llm.mapping import Mapping @@ -1479,7 +1479,7 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model, p.data.copy_(module_weights[n][:]) if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales( - ) and get_sm_version() == 100 and hasattr( + ) and get_sm_family() == 100 and hasattr( module, 
"weight_scale"): weight, weight_scale = resmooth_to_fp8_e8m0( module.weight, module.weight_scale) diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index abf703c11c..f148471f16 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -5,7 +5,7 @@ from typing import Optional, Union, cast import torch from torch import nn -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_sm_family from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping @@ -571,7 +571,7 @@ def fp8_block_scaling_bmm_out( out: torch.Tensor, mat2_dequant: Optional[torch.Tensor] = None, ) -> torch.Tensor: - sm_version = get_sm_version() + sm_version = get_sm_family() if sm_version == 90 or sm_version == 89: mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102( mat1) @@ -892,7 +892,7 @@ class MLA(nn.Module): ), requires_grad=False, ) - if get_sm_version() == 100: + if get_sm_family() == 100: assert self.dtype == torch.bfloat16 self.k_b_proj_trans_dequant = nn.Parameter( torch.empty( diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py index 13f34fbb02..5237095b28 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py @@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Union import torch import torch.nn.functional as F -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_sm_family from ...model_config import ModelConfig from ...utils import AuxStreamType, Fp4QuantizedTensor @@ -34,7 +34,7 @@ def cute_dsl_fp8_group_blockwise_gemm_ref( b_tmp = b.permute(1, 2, 0) # Note: we have different output scale shape for fp8_quantize_1x128, so we need to handle it differently for sm100 and other archs. 
- if get_sm_version() == 100: + if get_sm_family() == 100: input_scale_tmp = a_sf.permute(1, 0).as_strided((m, w_k, 1), (1, m, m * w_k)) else: diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py index 646ea6d5e0..ea4766d203 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py +++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py @@ -7,7 +7,7 @@ import torch.nn.functional as F from torch import nn import tensorrt_llm.logger as trtllm_logger -from tensorrt_llm._utils import get_sm_version +from tensorrt_llm._utils import get_sm_family, get_sm_version from tensorrt_llm.quantization.functional import \ preprocess_weights_for_mixed_gemm from tensorrt_llm.quantization.utils.fp4_utils import ( @@ -742,7 +742,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm( def load_weights(self, module: torch.nn.Module, weights: List[Dict], weight_loading_mode: MoEWeightLoadingMode): - if get_sm_version() == 100: + if get_sm_family() == 100: expert_ids = set(module.initial_local_expert_ids) if self.need_load_shared_weights(module): expert_ids.update( @@ -759,7 +759,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm( weight, scale) super().load_weights(module, weights, weight_loading_mode) - if get_sm_version() == 100: + if get_sm_family() == 100: transfromed_w3_w1_scale = transform_sf_into_required_layout( module.quant_scales[0], mn=module.w3_w1_weight.shape[1], diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 20bd2a7c32..c71f0224a9 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1882,7 +1882,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, max_batch_size): - if get_sm_version() == 100: + if get_sm_version() == 100 or get_sm_version() == 103: moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) else: diff --git a/tests/integration/defs/test_unittests.py b/tests/integration/defs/test_unittests.py index 3f13ed10b7..0e83cb6d38 100644 --- a/tests/integration/defs/test_unittests.py +++ b/tests/integration/defs/test_unittests.py @@ -138,7 +138,12 @@ def test_unittests_v2(llm_root, llm_venv, case: str, output_dir, request): def run_command(cmd): try: - llm_venv.run_cmd(cmd, cwd=test_root) + pythonpath = os.environ.get("PYTHONPATH", "") + llm_venv.run_cmd( + cmd, + cwd=test_root, + env={'PYTHONPATH': f"{llm_root}/tests/unittest:{pythonpath}"}, + ) except CalledProcessError: return False return True diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 397314bcab..8e13a7f880 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -30,12 +30,9 @@ from tensorrt_llm._torch.modules.fused_moe.interface import MoEWeightLoadingMode # isort and yapf will fight against each other here, so we disable isort # isort: off -from tensorrt_llm._torch.modules.fused_moe import (BaseMoeRoutingMethod, - CutlassFusedMoE, - DefaultMoeRoutingMethod, - RenormalizeMoeRoutingMethod, - TritonFusedMoE, VanillaMoE, - create_moe, WideEPMoE) +from tensorrt_llm._torch.modules.fused_moe import ( + BaseMoeRoutingMethod, CutlassFusedMoE, DefaultMoeRoutingMethod, + 
RenormalizeMoeRoutingMethod, TritonFusedMoE, create_moe, WideEPMoE) # isort: on from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \ IS_TRITON_KERNELS_AVAILABLE @@ -76,9 +73,11 @@ def test_fused_moe(moe_backend, if bias and moe_backend not in ["TRITON"]: pytest.skip("Bias not supported.") - mapping = Mapping() + mapping = mapping or Mapping() mapping.rank = mpi_rank() + torch.cuda.set_device(mapping.rank) + with torch.device(f'cuda:{mapping.rank}'): SEQ_LEN = 8 HIDDEN_SIZE = 64 @@ -165,18 +164,19 @@ def test_fused_moe(moe_backend, @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") -@pytest.mark.parametrize("moe_cls", [CutlassFusedMoE, VanillaMoE]) +@pytest.mark.parametrize("moe_cls", ["CUTLASS", "VANILLA"]) @pytest.mark.parametrize("ep_size", [1, 2, 4]) def test_fused_moe_multi_gpu(moe_cls, ep_size): world_size = 4 with MPIPoolExecutor(max_workers=world_size) as executor: results = executor.map( test_fused_moe, - *zip(*[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod, - Mapping(world_size=world_size, - tp_size=world_size, - moe_ep_size=ep_size, - moe_tp_size=world_size // ep_size))] * world_size), + *zip( + *[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod, False, + Mapping(world_size=world_size, + tp_size=world_size, + moe_ep_size=ep_size, + moe_tp_size=world_size // ep_size))] * world_size), ) for r in results: assert r is None @@ -275,6 +275,10 @@ def test_fused_moe_alltoall(alltoall_method_type): all_rank_max_num_tokens=m, use_dp_padding=False) + if alltoall_method_type == AlltoallMethodType.MNNVL and output.ndim == 3: + output = output.sum(dim=1) + print(f"output: {output.shape}") + print(f"ref_output: {ref_output.shape}") # Evaluate outputs torch.testing.assert_close(output, ref_output, diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_benchmark.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_benchmark.py index 14146efa3d..4c11e82eb7 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_benchmark.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_benchmark.py @@ -71,15 +71,18 @@ def vision_arena_dataset_path(): @skip_gpu_memory_less_than_80gb -@pytest.mark.parametrize("dataset_name,dataset_args", [("random_image", { - "--random-image-size": "1", - "--random-image-size": "512", -}), ("random_image", { - "--random-num-images": "2", - "--random-image-size": "512", -}), ("hf", { - "--dataset-path": vision_arena_dataset_path(), -})], +@pytest.mark.parametrize("dataset_name,dataset_args", + [("random_image", { + "--random-image-size": "1", + "--random-image-size": "512", + }), + ("random_image", { + "--random-num-images": "2", + "--random-image-size": "512", + }), + ("hf", { + "--dataset-path": vision_arena_dataset_path(), + })], ids=[ "random_image-single_image", "random_image-dual_images", diff --git a/tests/unittest/trt/attention/test_gpt_attention.py b/tests/unittest/trt/attention/test_gpt_attention.py index 38638b198d..d176046816 100644 --- a/tests/unittest/trt/attention/test_gpt_attention.py +++ b/tests/unittest/trt/attention/test_gpt_attention.py @@ -427,7 +427,7 @@ class TestFunctional(unittest.TestCase): # skip tests based on the gpu_arch_lists if gpu_arch != 'all': - assert gpu_arch in [80, 86, 89, 90, 100, 120] + assert gpu_arch in [80, 86, 89, 90, 100, 103, 120] if getSMVersion() != gpu_arch: pytest.skip( "Skip the test as the target gpu arch doesn't match this gpu arch." 
@@ -439,7 +439,7 @@ class TestFunctional(unittest.TestCase):
         skip_blackwell_for_fmha_tests(context_fmha_type, head_size)
 
         # Skip custom mask tests for Blackwell
-        if getSMVersion() == 100 and custom_mask_input:
+        if (getSMVersion() == 100 or getSMVersion() == 103) and custom_mask_input:
             pytest.skip("Custom masked is not supported by TRTLLM-GEN for now.")
 
         if num_kv_heads == 0:
diff --git a/tests/unittest/utils/util.py b/tests/unittest/utils/util.py
index 893af5d93b..28ad21adbe 100644
--- a/tests/unittest/utils/util.py
+++ b/tests/unittest/utils/util.py
@@ -84,7 +84,7 @@ skip_pre_blackwell = pytest.mark.skipif(
     getSMVersion() < 100,
     reason="This test is not supported in pre-Blackwell architecture")
 skip_blackwell = pytest.mark.skipif(
-    getSMVersion() == 100,
+    getSMVersion() == 100 or getSMVersion() == 103,
     reason="This test is not supported in Blackwell architecture")
 skip_blackwell_geforce = pytest.mark.skipif(
     getSMVersion() == 120, reason="This test is not supported on SM 120")
@@ -127,9 +127,9 @@ def skip_fp8_pre_ada(use_fp8):
 
 
 def skip_blackwell_for_fmha_tests(context_fmha_type, head_size):
-    if getSMVersion() == 100 and (head_size not in [32, 64, 128]
-                                  and context_fmha_type
-                                  != ContextFMHAType.disabled):
+    if (getSMVersion() == 100 or getSMVersion()
+            == 103) and (head_size not in [32, 64, 128]
+                         and context_fmha_type != ContextFMHAType.disabled):
         pytest.skip(
             "Context FMHA only supports head sizes [32, 64, 128] currently on blackwell."
         )
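
Note for reviewers: the switch from get_sm_version()/getSMVersion() to
get_sm_family()/getSMFamily() at the call sites above relies on SM 103
(B300/GB300) being reported as a member of the SM 100 family, so existing
`== 100` branches pick up the new parts without further edits. The snippet
below is only a minimal sketch of that assumed mapping; the real helpers live
in tensorrt_llm._utils and tensorrt_llm::common and are not part of this diff.

    # Illustrative sketch only -- not the actual tensorrt_llm._utils code.
    # Assumption: SM 100 (B200/GB200) and SM 103 (B300/GB300) collapse to
    # family 100, while other architectures report their plain SM version.
    import torch

    _SAME_FAMILY = {103: 100}  # hypothetical table of same-family revisions

    def sm_family(device: int = 0) -> int:
        major, minor = torch.cuda.get_device_capability(device)
        sm = major * 10 + minor
        return _SAME_FAMILY.get(sm, sm)

    # Example: on a GB300 node this returns 100, so a branch written as
    # `if get_sm_family() == 100:` takes the same code path as on GB200.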