mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[TRTLLM-9381][test] add disag-serving kimi k2 thinking tests (#10357)
Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
This commit is contained in:
parent
4931c5eb3a
commit
b1733d56f6
@ -150,6 +150,8 @@ moonshotai/Kimi-K2-Thinking:
|
||||
- quant_algo: NVFP4
|
||||
kv_cache_quant_algo: FP8
|
||||
accuracy: 90.84
|
||||
- quant_algo: NVFP4
|
||||
accuracy: 90.84
|
||||
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
|
||||
- accuracy: 92.57
|
||||
- quant_algo: FP8
|
||||
|
||||
@ -1339,3 +1339,60 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
|
||||
gen_model=gen_model,
|
||||
ctx_instances=1,
|
||||
gen_instances=1)
|
||||
|
||||
|
||||
@pytest.mark.timeout(10800)
@skip_pre_blackwell
class TestKimiK2(LlmapiAccuracyTestHarness):
    """Disaggregated-serving accuracy tests for the Kimi-K2 model family."""

    MODEL_NAME = "moonshotai/Kimi-K2-Instruct"
    MODEL_PATH = f"{llm_models_root()}/Kimi-K2-Instruct"

    @pytest.mark.skip_less_device(8)
    @pytest.mark.skip_less_device_memory(200000)
    def test_nvfp4(self):
        """Evaluate GSM8K accuracy of an NVFP4 Kimi-K2-Thinking checkpoint
        served disaggregated: one context and one generation server, each
        TP4 with attention DP enabled.
        """
        model_name = "moonshotai/Kimi-K2-Thinking"
        model_path = f"{llm_models_root()}/Kimi-K2-Thinking-NVFP4"

        def _worker_config():
            # Context and generation servers use identical settings; build a
            # fresh dict per call so each server owns an independent copy.
            return {
                "max_batch_size": 16,
                "disable_overlap_scheduler": True,
                "cache_transceiver_config": {
                    "backend": "DEFAULT"
                },
                "tensor_parallel_size": 4,
                "enable_attention_dp": True,
                "trust_remote_code": True,
                "kv_cache_config": {
                    "free_gpu_memory_fraction": 0.8,
                },
            }

        ctx_server_config = _worker_config()
        gen_server_config = _worker_config()

        # Top-level disaggregated router: one ctx instance, one gen instance.
        disaggregated_server_config = {
            "hostname": "localhost",
            "port": 8000,
            "backend": "pytorch",
            "context_servers": {
                "num_instances": 1,
                "urls": ["localhost:8001"]
            },
            "generation_servers": {
                "num_instances": 1,
                "urls": ["localhost:8002"]
            }
        }

        with launch_disaggregated_llm(disaggregated_server_config,
                                      ctx_server_config, gen_server_config,
                                      model_path) as llm:
            task = GSM8K(model_name)
            task.evaluate(llm)
|
||||
@ -637,6 +637,7 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype
|
||||
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
|
||||
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
|
||||
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
|
||||
accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user