[None][chore] cherry-pick 6940 (#7097)

Signed-off-by: Bo Deng <deemod@nvidia.com>
This commit is contained in:
Bo Deng 2025-08-25 10:28:45 +08:00 committed by GitHub
parent 3ba9afcc7b
commit c038fb3ef4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 11 additions and 10 deletions

View File

@ -23,7 +23,7 @@ from tensorrt_llm.llmapi.llm_args import LlmArgs
from tensorrt_llm.llmapi.tokenizer import load_hf_tokenizer
from ..conftest import (get_device_count, llm_models_root, parametrize_with_ids,
skip_pre_blackwell, skip_pre_hopper)
skip_no_hopper, skip_pre_blackwell, skip_pre_hopper)
from ..trt_test_alternative import popen
from .accuracy_core import (GSM8K, MMLU, JsonModeEval,
LlmapiAccuracyTestHarness, get_accuracy_task)
@ -640,6 +640,9 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(60000)
@skip_no_hopper
def test_nixl_backend(self):
ctx_server_config = {
"disable_overlap_scheduler": True,
@ -776,6 +779,8 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
MODEL_NAME = "Qwen3/Qwen3-8B"
MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"
@pytest.mark.skip_less_device(2)
@skip_no_hopper
def test_nixl_backend(self):
ctx_server_config = {
"disable_overlap_scheduler": True,
@ -805,8 +810,6 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
with launch_disaggregated_llm(disaggregated_server_config,
ctx_server_config, gen_server_config,
self.MODEL_PATH) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

View File

@ -20,7 +20,8 @@ import tempfile
import pytest
import yaml
from defs.conftest import llm_models_root, skip_arm, skip_no_hopper
from defs.conftest import (get_sm_version, llm_models_root, skip_arm,
skip_no_hopper)
from defs.trt_test_alternative import check_call, check_output, popen
from tensorrt_llm.logger import logger
@ -1270,6 +1271,9 @@ def get_config_for_benchmark(model_root, backend):
def test_disaggregated_benchmark_on_diff_backends(
disaggregated_test_root, disaggregated_example_root, llm_venv,
benchmark_model_root, benchmark_root, shared_gpt_path):
if "DeepSeek-V3-Lite" in benchmark_model_root and "fp8" in benchmark_model_root and get_sm_version(
) != 90:
pytest.skip("The test should only run on Hopper")
nixl_config = get_config_for_benchmark(benchmark_model_root, "NIXL")
ucx_config = get_config_for_benchmark(benchmark_model_root, "UCX")
temp_dir = tempfile.TemporaryDirectory()

View File

@ -48,8 +48,6 @@ l0_dgx_b200:
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
- accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
- condition:
ranges:
@ -102,6 +100,5 @@ l0_dgx_b200:
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-CUTLASS]
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype

View File

@ -296,8 +296,6 @@ triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482)
triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5456485)
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4] SKIP (https://nvbugs/5434320)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384)
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend SKIP (https://nvbugs/5448437)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5448437)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5445466)
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817)
@ -313,7 +311,6 @@ disaggregated/test_disaggregated.py::test_disaggregated_diff_max_tokens[TinyLlam
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5465642)
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5431146)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5464461)
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5448449)
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True] SKIP (https://nvbugs/5467815)
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False] SKIP (https://nvbugs/5467815)
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] SKIP (https://nvbugs/5467815)