diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 57168aa581..c5e5d584d7 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -135,16 +135,18 @@ def run_accuracy_test(llm: "DuckLLM",

 @contextlib.contextmanager
 def launch_disaggregated_llm(
-        disaggregated_server_config: Dict[str, Any],
-        ctx_server_config: Dict[str, Any],
-        gen_server_config: Dict[str, Any],
-        model_name: str,
-        tensor_parallel_size: int = 1,
-        ctx_model: str = None,
-        gen_model: str = None,
-        server_waiting_timeout: int = DEFAULT_SERVER_WAITING_TIMEOUT,
-        max_workers: int = 16,
-        enable_perf=False):
+    disaggregated_server_config: Dict[str, Any],
+    ctx_server_config: Dict[str, Any],
+    gen_server_config: Dict[str, Any],
+    model_name: str,
+    tensor_parallel_size: int = 1,
+    ctx_model: str = None,
+    gen_model: str = None,
+    server_waiting_timeout: int = DEFAULT_SERVER_WAITING_TIMEOUT,
+    max_workers: int = 16,
+    enable_perf=False,
+    extra_env: Optional[Dict[str, str]] = None,
+):
     temp_dir = tempfile.TemporaryDirectory()
     disaggregated_serving_config_path = os.path.join(
         temp_dir.name, "disaggregated_serving_config.yaml")
@@ -229,10 +231,14 @@ def launch_disaggregated_llm(

     ctx_servers = []
     current_gpu_offset = 0

+    base_env = os.environ.copy()
+    if extra_env:
+        base_env.update(extra_env)
+
    kv_cache_perf_dir = os.path.join(temp_dir.name, "kv_cache_perf")
     for i, port in enumerate(ctx_ports):
-        env = os.environ.copy()
+        env = base_env.copy()
         env["TRTLLM_USE_UCX_KVCACHE"] = "1"
         if enable_perf:
             env["TRTLLM_KVCACHE_TIME_OUTPUT_PATH"] = kv_cache_perf_dir
@@ -262,7 +268,7 @@ def launch_disaggregated_llm(

     gen_servers = []
     for i, port in enumerate(gen_ports):
-        env = os.environ.copy()
+        env = base_env.copy()
         env["TRTLLM_USE_UCX_KVCACHE"] = "1"
         if enable_perf:
             env["TRTLLM_KVCACHE_TIME_OUTPUT_PATH"] = kv_cache_perf_dir
@@ -1220,43 +1226,44 @@ class TestDeepSeekV32Exp(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device(8)
     @pytest.mark.parametrize("overlap_scheduler", [False])
     def test_auto_dtype(self, overlap_scheduler):
-        ctx_server_config = {"disable_overlap_scheduler": True}
-        gen_server_config = {"disable_overlap_scheduler": overlap_scheduler}
-        ctx_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}
-        gen_server_config["cache_transceiver_config"] = {"backend": "DEFAULT"}
-        ctx_server_config["kv_cache_config"] = {
-            "free_gpu_memory_fraction": 0.7,
+        cache_transceiver_config = {"backend": "DEFAULT"}
+        max_num_tokens = 8192
+        ctx_kv_cache_config = {
+            "free_gpu_memory_fraction": 0.3,
             "tokens_per_block": 64,
-            "dtype": "fp8"
+            "dtype": "fp8",
         }
-        ctx_server_config["moe_config"] = {
-            "backend": "TRTLLM",
-            "max_num_tokens": 16384
+        moe_config = {"backend": "TRTLLM", "max_num_tokens": max_num_tokens}
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": cache_transceiver_config,
+            "kv_cache_config": ctx_kv_cache_config,
+            "tensor_parallel_size": 4,
+            "pipeline_parallel_size": 1,
+            "max_batch_size": 16,
+            "max_num_tokens": max_num_tokens,
+            "enable_autotuner": False,
         }
-        ctx_server_config["tensor_parallel_size"] = 4
-        ctx_server_config["pipeline_parallel_size"] = 1
-        ctx_server_config["moe_expert_parallel_size"] = 4
-        ctx_server_config["max_batch_size"] = 24
-        ctx_server_config["cuda_graph_config"] = None
-        ctx_server_config["enable_attention_dp"] = True
-        ctx_server_config["enable_autotuner"] = False
-        gen_server_config["kv_cache_config"] = {
+        gen_kv_cache_config = {
+            "free_gpu_memory_fraction": 0.5,
             "tokens_per_block": 64,
-            "free_gpu_memory_fraction": 0.7,
-            "dtype": "fp8"
+            "dtype": "fp8",
         }
-        gen_server_config["moe_config"] = {
-            "backend": "TRTLLM",
-            "max_num_tokens": 16384
+        gen_server_config = {
+            "disable_overlap_scheduler": overlap_scheduler,
+            "cuda_graph_config": None,
+            "cache_transceiver_config": cache_transceiver_config,
+            "kv_cache_config": gen_kv_cache_config,
+            "moe_config": moe_config,
+            "max_batch_size": 128,
+            "max_num_tokens": 1024,
+            "tensor_parallel_size": 4,
+            "pipeline_parallel_size": 1,
+            "moe_expert_parallel_size": 4,
+            "enable_attention_dp": True,
+            "enable_autotuner": False,
         }
-        gen_server_config["max_batch_size"] = 128
-        gen_server_config["max_num_tokens"] = 128
-        gen_server_config["cuda_graph_config"] = None
-        gen_server_config["tensor_parallel_size"] = 4
-        gen_server_config["pipeline_parallel_size"] = 1
-        gen_server_config["moe_expert_parallel_size"] = 4
-        gen_server_config["enable_attention_dp"] = True
-        gen_server_config["enable_autotuner"] = False
         disaggregated_server_config = {
             "hostname": "localhost",
             "port": 8000,
@@ -1271,11 +1278,13 @@ class TestDeepSeekV32Exp(LlmapiAccuracyTestHarness):
             }
         }
         with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config,
-                                      gen_server_config,
-                                      self.MODEL_PATH,
+                                      ctx_server_config=ctx_server_config,
+                                      gen_server_config=gen_server_config,
+                                      model_name=self.MODEL_PATH,
                                       max_workers=128) as llm:
-            run_accuracy_test(llm, self.MODEL_NAME, ["MMLU", "GSM8K"])
+            run_accuracy_test(llm,
+                              model_name=self.MODEL_NAME,
+                              test_sets=["MMLU", "GSM8K"])


     @pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 4eb031afe6..ece97ca4c9 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -315,7 +315,6 @@ unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] S
 unittest/_torch/modeling/test_modeling_nemotron_h.py::test_nemotron_h_cuda_graph_overlap_scheduler SKIP (https://nvbugs/5843316)
 examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1] SKIP (https://nvbugs/5846178)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5846024)
-accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5847284)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_autotune_fp8_fp4[RoutingDSlite-384-1024-1] SKIP (https://nvbugs/5859881)