Add B300 & GB300 CI

Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
Xiwen Yu 2025-09-05 15:29:50 +08:00
parent 5d4f7f4e8d
commit 22219bc37e
12 changed files with 61 additions and 42 deletions

View File

@@ -21,7 +21,14 @@ foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
   if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9)
     # The FP4-related conversion instructions in DeepEP require SM100a, SM110a,
     # or SM120a.
-    if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
+    if(${CUDA_ARCH_MAJOR} EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
+      if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.31)
+        list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100f${CUDA_ARCH_POSTFIX}")
+      else()
+        list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100a${CUDA_ARCH_POSTFIX}"
+             "103a${CUDA_ARCH_POSTFIX}")
+      endif()
+    elseif(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
       list(APPEND DEEP_EP_CUDA_ARCHITECTURES
            "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}a${CUDA_ARCH_POSTFIX}")
     else()

View File

@@ -67,7 +67,7 @@ std::tuple<at::Tensor, at::Tensor> fp8_quantize_1x128(at::Tensor const& self)
         act_buffer, act_scale_buffer, reinterpret_cast<__nv_bfloat16 const*>(self.data_ptr()), n, m, stream);
     // Post-process the scale tensor for sm100 gemm/moe kernel
-    if (tensorrt_llm::common::getSMVersion() == 100)
+    if (tensorrt_llm::common::getSMFamily() == 100)
     {
         auto const num_n_blocks = (n + 127) / 128;
         auto const act_scal_elesize = num_n_blocks * m_padded;

View File

@@ -40,7 +40,7 @@ from tqdm import tqdm
 from transformers import PretrainedConfig
 from tensorrt_llm._ipc_utils import can_access_peer
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family, get_sm_version
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
 from tensorrt_llm.mapping import Mapping
@@ -1479,7 +1479,7 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model,
                     p.data.copy_(module_weights[n][:])
                 if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
-                ) and get_sm_version() == 100 and hasattr(
+                ) and get_sm_family() == 100 and hasattr(
                         module, "weight_scale"):
                     weight, weight_scale = resmooth_to_fp8_e8m0(
                         module.weight, module.weight_scale)
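
The switch from get_sm_version() to get_sm_family() is the core of this commit: SM 100 (B200/GB200) and SM 103 (B300/GB300) are treated as one kernel family, so family-level checks avoid adding `or get_sm_version() == 103` at every call site. Below is a minimal Python sketch of the idea only; it assumes SM 103 maps to family 100, and it is not the actual `tensorrt_llm._utils.get_sm_family` implementation, which is not shown in this diff.

```python
# Illustrative sketch only; the real helper in tensorrt_llm._utils may differ.
def sm_family_of(sm_version: int) -> int:
    """Collapse compatible Blackwell variants into one family, e.g. 103 -> 100."""
    # Assumption: SM 103 (B300/GB300) shares the SM 100 family code paths.
    family_map = {103: 100}
    return family_map.get(sm_version, sm_version)

assert sm_family_of(100) == 100
assert sm_family_of(103) == 100  # B300/GB300 takes the same branches as B200
assert sm_family_of(90) == 90    # Hopper is unaffected
```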

View File

@@ -5,7 +5,7 @@ from typing import Optional, Union, cast
 import torch
 from torch import nn
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping
@@ -571,7 +571,7 @@ def fp8_block_scaling_bmm_out(
     out: torch.Tensor,
     mat2_dequant: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    sm_version = get_sm_version()
+    sm_version = get_sm_family()
     if sm_version == 90 or sm_version == 89:
         mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
             mat1)
@@ -892,7 +892,7 @@ class MLA(nn.Module):
                 ),
                 requires_grad=False,
             )
-            if get_sm_version() == 100:
+            if get_sm_family() == 100:
                 assert self.dtype == torch.bfloat16
                 self.k_b_proj_trans_dequant = nn.Parameter(
                     torch.empty(

View File

@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Union
 import torch
 import torch.nn.functional as F
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family
 from ...model_config import ModelConfig
 from ...utils import AuxStreamType, Fp4QuantizedTensor
@@ -34,7 +34,7 @@ def cute_dsl_fp8_group_blockwise_gemm_ref(
     b_tmp = b.permute(1, 2, 0)
     # Note: we have different output scale shape for fp8_quantize_1x128, so we need to handle it differently for sm100 and other archs.
-    if get_sm_version() == 100:
+    if get_sm_family() == 100:
         input_scale_tmp = a_sf.permute(1, 0).as_strided((m, w_k, 1),
                                                         (1, m, m * w_k))
     else:

View File

@@ -7,7 +7,7 @@ import torch.nn.functional as F
 from torch import nn
 import tensorrt_llm.logger as trtllm_logger
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family, get_sm_version
 from tensorrt_llm.quantization.functional import \
     preprocess_weights_for_mixed_gemm
 from tensorrt_llm.quantization.utils.fp4_utils import (
@@ -742,7 +742,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(
     def load_weights(self, module: torch.nn.Module, weights: List[Dict],
                      weight_loading_mode: MoEWeightLoadingMode):
-        if get_sm_version() == 100:
+        if get_sm_family() == 100:
             expert_ids = set(module.initial_local_expert_ids)
             if self.need_load_shared_weights(module):
                 expert_ids.update(
@@ -759,7 +759,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(
                     weight, scale)
         super().load_weights(module, weights, weight_loading_mode)
-        if get_sm_version() == 100:
+        if get_sm_family() == 100:
             transfromed_w3_w1_scale = transform_sf_into_required_layout(
                 module.quant_scales[0],
                 mn=module.w3_w1_weight.shape[1],

View File

@@ -1882,7 +1882,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
                             max_batch_size):
-        if get_sm_version() == 100:
+        if get_sm_version() == 100 or get_sm_version() == 103:
            moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
        else:

View File

@@ -138,7 +138,12 @@ def test_unittests_v2(llm_root, llm_venv, case: str, output_dir, request):
     def run_command(cmd):
         try:
-            llm_venv.run_cmd(cmd, cwd=test_root)
+            pythonpath = os.environ.get("PYTHONPATH", "")
+            llm_venv.run_cmd(
+                cmd,
+                cwd=test_root,
+                env={'PYTHONPATH': f"{llm_root}/tests/unittest:{pythonpath}"},
+            )
         except CalledProcessError:
             return False
         return True
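
For reference, here is a plain-subprocess sketch of what the new `env` argument accomplishes; the paths are placeholders and `llm_venv.run_cmd` is assumed to forward the environment to the spawned process. Prepending the repo's `tests/unittest` directory to `PYTHONPATH` lets the pytest subprocess import helper modules that live there.

```python
import os
import subprocess

# Sketch with placeholder paths; not the fixture-based code above.
llm_root = "/workspace/TensorRT-LLM"              # assumed repo checkout
test_root = f"{llm_root}/tests/integration/defs"  # assumed working directory

env = dict(os.environ)
env["PYTHONPATH"] = f"{llm_root}/tests/unittest:" + env.get("PYTHONPATH", "")

# The spawned pytest can now import helpers from tests/unittest.
subprocess.run(["pytest", "-q", "some_unittest_case.py"],
               cwd=test_root, env=env, check=True)
```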

View File

@@ -30,12 +30,9 @@ from tensorrt_llm._torch.modules.fused_moe.interface import MoEWeightLoadingMode
 # isort and yapf will fight against each other here, so we disable isort
 # isort: off
-from tensorrt_llm._torch.modules.fused_moe import (BaseMoeRoutingMethod,
-                                                    CutlassFusedMoE,
-                                                    DefaultMoeRoutingMethod,
-                                                    RenormalizeMoeRoutingMethod,
-                                                    TritonFusedMoE, VanillaMoE,
-                                                    create_moe, WideEPMoE)
+from tensorrt_llm._torch.modules.fused_moe import (
+    BaseMoeRoutingMethod, CutlassFusedMoE, DefaultMoeRoutingMethod,
+    RenormalizeMoeRoutingMethod, TritonFusedMoE, create_moe, WideEPMoE)
 # isort: on
 from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
     IS_TRITON_KERNELS_AVAILABLE
@@ -76,9 +73,11 @@ def test_fused_moe(moe_backend,
     if bias and moe_backend not in ["TRITON"]:
         pytest.skip("Bias not supported.")
-    mapping = Mapping()
+    mapping = mapping or Mapping()
     mapping.rank = mpi_rank()
+    torch.cuda.set_device(mapping.rank)
     with torch.device(f'cuda:{mapping.rank}'):
         SEQ_LEN = 8
         HIDDEN_SIZE = 64
@@ -165,18 +164,19 @@
 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
-@pytest.mark.parametrize("moe_cls", [CutlassFusedMoE, VanillaMoE])
+@pytest.mark.parametrize("moe_cls", ["CUTLASS", "VANILLA"])
 @pytest.mark.parametrize("ep_size", [1, 2, 4])
 def test_fused_moe_multi_gpu(moe_cls, ep_size):
     world_size = 4
     with MPIPoolExecutor(max_workers=world_size) as executor:
         results = executor.map(
             test_fused_moe,
-            *zip(*[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod,
-                    Mapping(world_size=world_size,
-                            tp_size=world_size,
-                            moe_ep_size=ep_size,
-                            moe_tp_size=world_size // ep_size))] * world_size),
+            *zip(
+                *[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod, False,
+                   Mapping(world_size=world_size,
+                           tp_size=world_size,
+                           moe_ep_size=ep_size,
+                           moe_tp_size=world_size // ep_size))] * world_size),
         )
         for r in results:
             assert r is None
@@ -275,6 +275,10 @@ def test_fused_moe_alltoall(alltoall_method_type):
                                 all_rank_max_num_tokens=m,
                                 use_dp_padding=False)
+        if alltoall_method_type == AlltoallMethodType.MNNVL and output.ndim == 3:
+            output = output.sum(dim=1)
+        print(f"output: {output.shape}")
+        print(f"ref_output: {ref_output.shape}")
         # Evaluate outputs
         torch.testing.assert_close(output,
                                    ref_output,
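
The new MNNVL branch reduces a 3-D result back to the 2-D layout the reference uses. Below is a shape-only illustration with made-up sizes; it assumes the extra dimension holds partial outputs that are meant to be accumulated, which is inferred from the `sum(dim=1)` in the diff rather than stated there.

```python
import torch

# Assumed shapes for illustration: the MNNVL all-to-all path is taken to return
# per-slot partial outputs as [num_tokens, num_slots, hidden]; summing over
# dim=1 combines them into the [num_tokens, hidden] tensor that ref_output uses.
num_tokens, num_slots, hidden = 8, 4, 64
output = torch.randn(num_tokens, num_slots, hidden)
if output.ndim == 3:
    output = output.sum(dim=1)
assert output.shape == (num_tokens, hidden)
```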

View File

@@ -71,15 +71,18 @@ def vision_arena_dataset_path():
 @skip_gpu_memory_less_than_80gb
-@pytest.mark.parametrize("dataset_name,dataset_args", [("random_image", {
-    "--random-image-size": "1",
-    "--random-image-size": "512",
-}), ("random_image", {
-    "--random-num-images": "2",
-    "--random-image-size": "512",
-}), ("hf", {
-    "--dataset-path": vision_arena_dataset_path(),
-})],
+@pytest.mark.parametrize("dataset_name,dataset_args",
+                         [("random_image", {
+                             "--random-image-size": "1",
+                             "--random-image-size": "512",
+                         }),
+                          ("random_image", {
+                              "--random-num-images": "2",
+                              "--random-image-size": "512",
+                          }),
+                          ("hf", {
+                              "--dataset-path": vision_arena_dataset_path(),
+                          })],
                          ids=[
                              "random_image-single_image",
                              "random_image-dual_images",

View File

@@ -427,7 +427,7 @@ class TestFunctional(unittest.TestCase):
         # skip tests based on the gpu_arch_lists
         if gpu_arch != 'all':
-            assert gpu_arch in [80, 86, 89, 90, 100, 120]
+            assert gpu_arch in [80, 86, 89, 90, 100, 103, 120]
             if getSMVersion() != gpu_arch:
                 pytest.skip(
                     "Skip the test as the target gpu arch doesn't match this gpu arch."
@@ -439,7 +439,7 @@ class TestFunctional(unittest.TestCase):
         skip_blackwell_for_fmha_tests(context_fmha_type, head_size)
         # Skip custom mask tests for Blackwell
-        if getSMVersion() == 100 and custom_mask_input:
+        if (getSMVersion() == 100 or getSMVersion() == 103) and custom_mask_input:
             pytest.skip("Custom masked is not supported by TRTLLM-GEN for now.")
         if num_kv_heads == 0:

View File

@@ -84,7 +84,7 @@ skip_pre_blackwell = pytest.mark.skipif(
     getSMVersion() < 100,
     reason="This test is not supported in pre-Blackwell architecture")
 skip_blackwell = pytest.mark.skipif(
-    getSMVersion() == 100,
+    getSMVersion() == 100 or getSMVersion() == 103,
     reason="This test is not supported in Blackwell architecture")
 skip_blackwell_geforce = pytest.mark.skipif(
     getSMVersion() == 120, reason="This test is not supported on SM 120")
@@ -127,9 +127,9 @@ def skip_fp8_pre_ada(use_fp8):
 def skip_blackwell_for_fmha_tests(context_fmha_type, head_size):
-    if getSMVersion() == 100 and (head_size not in [32, 64, 128]
-                                  and context_fmha_type
-                                  != ContextFMHAType.disabled):
+    if (getSMVersion() == 100 or getSMVersion()
+            == 103) and (head_size not in [32, 64, 128]
+                         and context_fmha_type != ContextFMHAType.disabled):
         pytest.skip(
             "Context FMHA only supports head sizes [32, 64, 128] currently on blackwell."
         )