mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
Unwaive disaggregated serving accuracy tests (#5095)
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com> Signed-off-by: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com>
This commit is contained in:
parent
857108aeca
commit
e5ee5c5352
@ -58,7 +58,7 @@ class MyThreadPoolExecutor(ThreadPoolExecutor):
|
||||
|
||||
for future in self.futures:
|
||||
future.cancel()
|
||||
self.shutdown(wait=False, cancel_futures=True)
|
||||
self.shutdown(wait=True, cancel_futures=True)
|
||||
return False
|
||||
|
||||
|
||||
@ -163,15 +163,16 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
|
||||
thread_pool.futures.append(future)
|
||||
return future
|
||||
|
||||
yield DuckLLM(args, generate_async)
|
||||
try:
|
||||
yield DuckLLM(args, generate_async)
|
||||
finally:
|
||||
ctx_server.terminate()
|
||||
gen_server.terminate()
|
||||
disaggregated_server.terminate()
|
||||
|
||||
ctx_server.terminate()
|
||||
gen_server.terminate()
|
||||
disaggregated_server.terminate()
|
||||
|
||||
ctx_server.wait()
|
||||
gen_server.wait()
|
||||
disaggregated_server.wait()
|
||||
ctx_server.wait()
|
||||
gen_server.wait()
|
||||
disaggregated_server.wait()
|
||||
|
||||
|
||||
@pytest.mark.timeout(3600)
|
||||
@ -252,16 +253,8 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
|
||||
@parametrize_with_ids("mtp_nextn",
|
||||
[0, pytest.param(2, marks=skip_pre_hopper)])
|
||||
def test_auto_dtype(self, overlap_scheduler, mtp_nextn):
|
||||
ctx_server_config = {
|
||||
"pytorch_backend_config": {
|
||||
"disable_overlap_scheduler": True
|
||||
}
|
||||
}
|
||||
gen_server_config = {
|
||||
"pytorch_backend_config": {
|
||||
"disable_overlap_scheduler": not overlap_scheduler
|
||||
}
|
||||
}
|
||||
ctx_server_config = {"disable_overlap_scheduler": True}
|
||||
gen_server_config = {"disable_overlap_scheduler": not overlap_scheduler}
|
||||
if mtp_nextn > 0:
|
||||
ctx_server_config["speculative_config"] = {
|
||||
"decoding_type": "MTP",
|
||||
|
||||
@ -402,10 +402,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama
|
||||
test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugspro.nvidia.com/bug/5324239)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5318143)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5318143)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] SKIP (https://nvbugs/5322354)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5322354)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] SKIP (https://nvbugs/5322354)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/5322354)
|
||||
test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K] SKIP (https://nvbugs/5325284)
|
||||
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-70B] SKIP (https://nvbugs/5323316)
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5328160)
|
||||
@ -416,6 +412,8 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta
|
||||
test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1] SKIP (https://nvbugs/5333659)
|
||||
test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5333659)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5331031)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5322354)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/5322354)
|
||||
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5336321)
|
||||
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False] SKIP (https://nvbugs/5336321)
|
||||
full:B200/examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8] SKIP (https://nvbugs/5292737)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user