Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-12 05:53:33 +08:00)
Add B300 & GB300 CI
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
parent 5d4f7f4e8d
commit 22219bc37e
@@ -21,7 +21,14 @@ foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
   if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 9)
     # The FP4-related conversion instructions in DeepEP require SM100a, SM110a,
     # or SM120a.
-    if(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
+    if(${CUDA_ARCH_MAJOR} EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
+      if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.31)
+        list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100f${CUDA_ARCH_POSTFIX}")
+      else()
+        list(APPEND DEEP_EP_CUDA_ARCHITECTURES "100a${CUDA_ARCH_POSTFIX}"
+             "103a${CUDA_ARCH_POSTFIX}")
+      endif()
+    elseif(${CUDA_ARCH_MAJOR} GREATER_EQUAL 10 AND ${CUDA_ARCH_MINOR} EQUAL 0)
       list(APPEND DEEP_EP_CUDA_ARCHITECTURES
            "${CUDA_ARCH_MAJOR}${CUDA_ARCH_MINOR}a${CUDA_ARCH_POSTFIX}")
     else()
@@ -67,7 +67,7 @@ std::tuple<at::Tensor, at::Tensor> fp8_quantize_1x128(at::Tensor const& self)
         act_buffer, act_scale_buffer, reinterpret_cast<__nv_bfloat16 const*>(self.data_ptr()), n, m, stream);

     // Post-process the scale tensor for sm100 gemm/moe kernel
-    if (tensorrt_llm::common::getSMVersion() == 100)
+    if (tensorrt_llm::common::getSMFamily() == 100)
     {
         auto const num_n_blocks = (n + 127) / 128;
         auto const act_scal_elesize = num_n_blocks * m_padded;
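For orientation, fp8_quantize_1x128 emits one scale per 1x128 block along the n dimension, so the sm100/sm103 post-processing operates on ceil(n / 128) scale blocks. A tiny Python sketch of that bookkeeping (the concrete n and m_padded values are made up for illustration):

# Illustrative only: mirrors the block-count arithmetic in the C++ hunk above.
n, m_padded = 576, 128
num_n_blocks = (n + 127) // 128        # ceil(576 / 128) == 5
act_scale_elems = num_n_blocks * m_padded
print(num_n_blocks, act_scale_elems)   # 5 640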
@@ -40,7 +40,7 @@ from tqdm import tqdm
 from transformers import PretrainedConfig

 from tensorrt_llm._ipc_utils import can_access_peer
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family, get_sm_version
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
 from tensorrt_llm.mapping import Mapping
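The diff switches call sites from get_sm_version() to a new get_sm_family() helper so that SM 103 (B300/GB300) takes the same code paths as SM 100 (B200/GB200). The helper's actual implementation is not part of this diff; a minimal sketch of the assumed behavior, with hypothetical names, could look like this:

# Sketch only: the real tensorrt_llm._utils.get_sm_family may be implemented differently.
import torch


def sm_version_sketch() -> int:
    """Compute capability of the current device encoded as major * 10 + minor."""
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor


def sm_family_sketch() -> int:
    """Collapse SM 100 and SM 103 into one family so B300/GB300 follow the B200 paths."""
    sm = sm_version_sketch()
    return 100 if sm in (100, 103) else sm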
@@ -1479,7 +1479,7 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model,
                     p.data.copy_(module_weights[n][:])

             if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
-            ) and get_sm_version() == 100 and hasattr(
+            ) and get_sm_family() == 100 and hasattr(
                     module, "weight_scale"):
                 weight, weight_scale = resmooth_to_fp8_e8m0(
                     module.weight, module.weight_scale)
@@ -5,7 +5,7 @@ from typing import Optional, Union, cast
 import torch
 from torch import nn

-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping

@@ -571,7 +571,7 @@ def fp8_block_scaling_bmm_out(
     out: torch.Tensor,
     mat2_dequant: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
-    sm_version = get_sm_version()
+    sm_version = get_sm_family()
     if sm_version == 90 or sm_version == 89:
         mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
             mat1)
@@ -892,7 +892,7 @@ class MLA(nn.Module):
                 ),
                 requires_grad=False,
             )
-            if get_sm_version() == 100:
+            if get_sm_family() == 100:
                 assert self.dtype == torch.bfloat16
                 self.k_b_proj_trans_dequant = nn.Parameter(
                     torch.empty(
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Union
 import torch
 import torch.nn.functional as F

-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family

 from ...model_config import ModelConfig
 from ...utils import AuxStreamType, Fp4QuantizedTensor
@@ -34,7 +34,7 @@ def cute_dsl_fp8_group_blockwise_gemm_ref(
     b_tmp = b.permute(1, 2, 0)

     # Note: we have different output scale shape for fp8_quantize_1x128, so we need to handle it differently for sm100 and other archs.
-    if get_sm_version() == 100:
+    if get_sm_family() == 100:
         input_scale_tmp = a_sf.permute(1, 0).as_strided((m, w_k, 1),
                                                         (1, m, m * w_k))
     else:
@@ -7,7 +7,7 @@ import torch.nn.functional as F
 from torch import nn

 import tensorrt_llm.logger as trtllm_logger
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_family, get_sm_version
 from tensorrt_llm.quantization.functional import \
     preprocess_weights_for_mixed_gemm
 from tensorrt_llm.quantization.utils.fp4_utils import (
@@ -742,7 +742,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(

     def load_weights(self, module: torch.nn.Module, weights: List[Dict],
                      weight_loading_mode: MoEWeightLoadingMode):
-        if get_sm_version() == 100:
+        if get_sm_family() == 100:
             expert_ids = set(module.initial_local_expert_ids)
             if self.need_load_shared_weights(module):
                 expert_ids.update(
@@ -759,7 +759,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(
                     weight, scale)
         super().load_weights(module, weights, weight_loading_mode)

-        if get_sm_version() == 100:
+        if get_sm_family() == 100:
            transfromed_w3_w1_scale = transform_sf_into_required_layout(
                module.quant_scales[0],
                mn=module.w3_w1_weight.shape[1],
@@ -1882,7 +1882,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
                             max_batch_size):
-        if get_sm_version() == 100:
+        if get_sm_version() == 100 or get_sm_version() == 103:
            moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
        else:
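This test keeps an explicit version check (100 or 103) rather than the get_sm_family() spelling used in the library code; under the family mapping assumed in the earlier sketch the two forms admit the same devices:

# Illustrative only: both spellings of the check admit SM 100 and SM 103.
for sm in (100, 103, 90, 120):
    family = 100 if sm in (100, 103) else sm
    assert (family == 100) == (sm == 100 or sm == 103)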
@@ -138,7 +138,12 @@ def test_unittests_v2(llm_root, llm_venv, case: str, output_dir, request):

     def run_command(cmd):
         try:
-            llm_venv.run_cmd(cmd, cwd=test_root)
+            pythonpath = os.environ.get("PYTHONPATH", "")
+            llm_venv.run_cmd(
+                cmd,
+                cwd=test_root,
+                env={'PYTHONPATH': f"{llm_root}/tests/unittest:{pythonpath}"},
+            )
         except CalledProcessError:
             return False
         return True
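The new env argument prepends the repository's tests/unittest directory to PYTHONPATH so test-local helper modules resolve first. Whether llm_venv.run_cmd merges that dict into the inherited environment is not visible here; a standalone sketch of the same pattern with plain subprocess (names and paths are placeholders):

import os
import subprocess


def run_with_unittest_on_path(cmd, llm_root, cwd):
    # Put <llm_root>/tests/unittest ahead of any existing PYTHONPATH entries.
    pythonpath = os.environ.get("PYTHONPATH", "")
    env = dict(os.environ)
    env["PYTHONPATH"] = f"{llm_root}/tests/unittest:{pythonpath}"
    return subprocess.run(cmd, cwd=cwd, env=env, check=True)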
@@ -30,12 +30,9 @@ from tensorrt_llm._torch.modules.fused_moe.interface import MoEWeightLoadingMode

 # isort and yapf will fight against each other here, so we disable isort
 # isort: off
-from tensorrt_llm._torch.modules.fused_moe import (BaseMoeRoutingMethod,
-                                                   CutlassFusedMoE,
-                                                   DefaultMoeRoutingMethod,
-                                                   RenormalizeMoeRoutingMethod,
-                                                   TritonFusedMoE, VanillaMoE,
-                                                   create_moe, WideEPMoE)
+from tensorrt_llm._torch.modules.fused_moe import (
+    BaseMoeRoutingMethod, CutlassFusedMoE, DefaultMoeRoutingMethod,
+    RenormalizeMoeRoutingMethod, TritonFusedMoE, create_moe, WideEPMoE)
 # isort: on
 from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import \
     IS_TRITON_KERNELS_AVAILABLE
@@ -76,9 +73,11 @@ def test_fused_moe(moe_backend,
     if bias and moe_backend not in ["TRITON"]:
         pytest.skip("Bias not supported.")

-    mapping = Mapping()
+    mapping = mapping or Mapping()
     mapping.rank = mpi_rank()

+    torch.cuda.set_device(mapping.rank)
+
     with torch.device(f'cuda:{mapping.rank}'):
         SEQ_LEN = 8
         HIDDEN_SIZE = 64
@@ -165,18 +164,19 @@ def test_fused_moe(moe_backend,

 @pytest.mark.skipif(torch.cuda.device_count() < 4,
                     reason="needs 4 GPUs to run this test")
-@pytest.mark.parametrize("moe_cls", [CutlassFusedMoE, VanillaMoE])
+@pytest.mark.parametrize("moe_cls", ["CUTLASS", "VANILLA"])
 @pytest.mark.parametrize("ep_size", [1, 2, 4])
 def test_fused_moe_multi_gpu(moe_cls, ep_size):
     world_size = 4
     with MPIPoolExecutor(max_workers=world_size) as executor:
         results = executor.map(
             test_fused_moe,
-            *zip(*[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod,
-                    Mapping(world_size=world_size,
-                            tp_size=world_size,
-                            moe_ep_size=ep_size,
-                            moe_tp_size=world_size // ep_size))] * world_size),
+            *zip(
+                *[(moe_cls, torch.bfloat16, 512, DefaultMoeRoutingMethod, False,
+                   Mapping(world_size=world_size,
+                           tp_size=world_size,
+                           moe_ep_size=ep_size,
+                           moe_tp_size=world_size // ep_size))] * world_size),
         )
         for r in results:
             assert r is None
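The *zip(*[args] * world_size) idiom broadcasts one argument tuple to every worker: executor.map takes one iterable per positional parameter, so the tuple is repeated world_size times and transposed by zip. A self-contained illustration with a stand-in for executor.map (all names here are hypothetical):

def map_like(func, *iterables):
    # Minimal stand-in for MPIPoolExecutor.map: transpose the per-parameter
    # iterables back into one argument tuple per call.
    return [func(*call_args) for call_args in zip(*iterables)]


def fake_case(moe_cls, dtype, hidden_size, routing, bias, mapping):
    return (moe_cls, bias, mapping)


world_size = 4
args = ("CUTLASS", "bf16", 512, "DefaultMoeRoutingMethod", False, {"tp": world_size})
results = map_like(fake_case, *zip(*[args] * world_size))
assert len(results) == world_size  # every worker received the same argument tuple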
@@ -275,6 +275,10 @@ def test_fused_moe_alltoall(alltoall_method_type):
                 all_rank_max_num_tokens=m,
                 use_dp_padding=False)

+            if alltoall_method_type == AlltoallMethodType.MNNVL and output.ndim == 3:
+                output = output.sum(dim=1)
+            print(f"output: {output.shape}")
+            print(f"ref_output: {ref_output.shape}")
             # Evaluate outputs
             torch.testing.assert_close(output,
                                        ref_output,
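The added branch reduces the extra dimension that the MNNVL all-to-all path can return so the comparison against the 2-D reference still applies; what dim=1 holds (e.g. partial results to be accumulated) is not shown in this hunk, so the shapes below are assumptions:

import torch

output = torch.randn(8, 2, 64)      # assumed: 8 tokens, 2 partials each, hidden 64
ref_output = torch.randn(8, 64)

if output.ndim == 3:
    output = output.sum(dim=1)      # (8, 2, 64) -> (8, 64)

assert output.shape == ref_output.shape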
@@ -71,15 +71,18 @@ def vision_arena_dataset_path():


 @skip_gpu_memory_less_than_80gb
-@pytest.mark.parametrize("dataset_name,dataset_args", [("random_image", {
-    "--random-image-size": "1",
-    "--random-image-size": "512",
-}), ("random_image", {
-    "--random-num-images": "2",
-    "--random-image-size": "512",
-}), ("hf", {
-    "--dataset-path": vision_arena_dataset_path(),
-})],
+@pytest.mark.parametrize("dataset_name,dataset_args",
+                         [("random_image", {
+                             "--random-image-size": "1",
+                             "--random-image-size": "512",
+                         }),
+                          ("random_image", {
+                              "--random-num-images": "2",
+                              "--random-image-size": "512",
+                          }),
+                          ("hf", {
+                              "--dataset-path": vision_arena_dataset_path(),
+                          })],
                          ids=[
                              "random_image-single_image",
                              "random_image-dual_images",
@@ -427,7 +427,7 @@ class TestFunctional(unittest.TestCase):

         # skip tests based on the gpu_arch_lists
         if gpu_arch != 'all':
-            assert gpu_arch in [80, 86, 89, 90, 100, 120]
+            assert gpu_arch in [80, 86, 89, 90, 100, 103, 120]
             if getSMVersion() != gpu_arch:
                 pytest.skip(
                     "Skip the test as the target gpu arch doesn't match this gpu arch."
@@ -439,7 +439,7 @@ class TestFunctional(unittest.TestCase):
         skip_blackwell_for_fmha_tests(context_fmha_type, head_size)

         # Skip custom mask tests for Blackwell
-        if getSMVersion() == 100 and custom_mask_input:
+        if (getSMVersion() == 100 or getSMVersion() == 103) and custom_mask_input:
             pytest.skip("Custom masked is not supported by TRTLLM-GEN for now.")

         if num_kv_heads == 0:
@@ -84,7 +84,7 @@ skip_pre_blackwell = pytest.mark.skipif(
     getSMVersion() < 100,
     reason="This test is not supported in pre-Blackwell architecture")
 skip_blackwell = pytest.mark.skipif(
-    getSMVersion() == 100,
+    getSMVersion() == 100 or getSMVersion() == 103,
     reason="This test is not supported in Blackwell architecture")
 skip_blackwell_geforce = pytest.mark.skipif(
     getSMVersion() == 120, reason="This test is not supported on SM 120")
@@ -127,9 +127,9 @@ def skip_fp8_pre_ada(use_fp8):


 def skip_blackwell_for_fmha_tests(context_fmha_type, head_size):
-    if getSMVersion() == 100 and (head_size not in [32, 64, 128]
-                                  and context_fmha_type
-                                  != ContextFMHAType.disabled):
+    if (getSMVersion() == 100 or getSMVersion()
+            == 103) and (head_size not in [32, 64, 128]
+                         and context_fmha_type != ContextFMHAType.disabled):
         pytest.skip(
             "Context FMHA only supports head sizes [32, 64, 128] currently on blackwell."
         )
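How these markers are consumed is unchanged by the commit; a marker defined this way is simply applied as a decorator. A self-contained sketch for reference (getSMVersion is stubbed so the snippet stands alone):

import pytest


def getSMVersion():
    return 100  # stub for illustration; the real helper queries the device


skip_blackwell = pytest.mark.skipif(
    getSMVersion() == 100 or getSMVersion() == 103,
    reason="This test is not supported in Blackwell architecture")


@skip_blackwell
def test_path_not_available_on_blackwell():
    assert True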