Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
[None][fix] fix same pp disagg (#6730)
Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
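In short: CacheFormatter::inquireSupport (and the equivalent check used by MLACacheFormatter) previously rejected a disaggregated transfer whenever a side's layer count was not divisible by its pipeline-parallel size, even when context and generation used the same PP size. This change short-circuits that case, makes the warnings report the offending layer and PP values, and switches the ctxpp4_genpp4 test from llama-3.1-8b to TinyLlama-1.1B-Chat-v1.0.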
commit c566a8d2a2
parent 767879ef85
@@ -846,16 +846,23 @@ void CacheFormatter::unformat(TransferSession& session)
     }

     int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
     int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism;
+    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
+    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+
+    if (selfPPSize == destPPSize)
+    {
+        return true;
+    }
     if (selfNumLayers % selfPPSize != 0)
     {
-        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers must be divisible by pipeline parallelism");
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d",
+            selfNumLayers, selfPPSize);
         return false;
     }
-    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
-    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
     if (destNumLayers % destPPSize != 0)
     {
-        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers must be divisible by pipeline parallelism");
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d ",
+            destNumLayers, destPPSize);
         return false;
     }
     return true;
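To see the control flow of this hunk in isolation, here is a minimal standalone sketch. The function name supportsPpLayout and the printf logging are illustrative stand-ins for the real CacheFormatter::inquireSupport and TLLM_LOG_WARNING, not the actual API:

    #include <cstdio>

    // Minimal sketch (hypothetical names) of the pipeline-parallel support check:
    // identical PP sizes are always compatible, so the divisibility requirement
    // only applies when the two sides must re-slice layers across stages.
    static bool supportsPpLayout(int selfNumLayers, int selfPPSize, int destNumLayers, int destPPSize)
    {
        if (selfPPSize == destPPSize)
        {
            return true; // same layer-to-stage layout on both sides
        }
        if (selfNumLayers % selfPPSize != 0)
        {
            std::printf("layers %d must be divisible by pipeline parallelism %d\n", selfNumLayers, selfPPSize);
            return false;
        }
        if (destNumLayers % destPPSize != 0)
        {
            std::printf("layers %d must be divisible by pipeline parallelism %d\n", destNumLayers, destPPSize);
            return false;
        }
        return true;
    }

    int main()
    {
        // A 30-layer model splits unevenly over pp=4, but a 4-vs-4 setup
        // no longer trips the divisibility check.
        std::printf("%d\n", supportsPpLayout(30, 4, 30, 4)); // prints 1 (same PP size)
        std::printf("%d\n", supportsPpLayout(30, 4, 30, 2)); // prints 0 (30 % 4 != 0)
        return 0;
    }

This illustrates the "same pp" case the commit title names: with equal PP sizes on both sides no re-slicing of layers is needed, so the early return skips a check that previously rejected valid setups.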
@@ -583,6 +583,28 @@ void MLACacheFormatter::unformat(TransferSession& session)
         return false;
     }

+    int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+    int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism;
+    int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
+    int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
+
+    if (selfPPSize == destPPSize)
+    {
+        return true;
+    }
+    if (selfNumLayers % selfPPSize != 0)
+    {
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d",
+            selfNumLayers, selfPPSize);
+        return false;
+    }
+    if (destNumLayers % destPPSize != 0)
+    {
+        TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d ",
+            destNumLayers, destPPSize);
+        return false;
+    }
+
     return true;
 }
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
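For intuition about why divisibility matters when the PP sizes differ, consider this toy illustration. It assumes each pipeline stage owns a contiguous, equal-sized slice of layers; the actual block mapping in TensorRT-LLM's transfer path is more involved:

    #include <cstdio>

    // Toy model of layer placement: stage i of ppSize stages owns layers
    // [i * perStage, (i + 1) * perStage). When the two sides differ in ppSize,
    // their slice boundaries only line up if numLayers divides evenly.
    static void printLayerSlices(const char* side, int numLayers, int ppSize)
    {
        int perStage = numLayers / ppSize; // assumes numLayers % ppSize == 0
        for (int stage = 0; stage < ppSize; ++stage)
        {
            std::printf("%s stage %d: layers [%d, %d)\n", side, stage, stage * perStage, (stage + 1) * perStage);
        }
    }

    int main()
    {
        printLayerSlices("ctx", 32, 4); // context side, pp=4: 8 layers per stage
        printLayerSlices("gen", 32, 2); // generation side, pp=2: 16 layers per stage
        return 0;
    }

With 32 layers, every gen stage's slice is exactly two ctx slices, so the transfer can be expressed as whole-slice copies; an uneven split would leave a layer range straddling a stage boundary.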
@@ -663,13 +663,14 @@ def test_disaggregated_ctxtp2pp2_gentp2pp2(disaggregated_test_root, llm_venv,


 @pytest.mark.skip_less_device(8)
-@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
+@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
+                         indirect=True)
 def test_disaggregated_ctxpp4_genpp4(disaggregated_test_root, llm_venv,
                                      disaggregated_example_root,
                                      llama_model_root):
     src_dst_dict = {
         llama_model_root:
-        f"{llm_venv.get_working_directory()}/llama-3.1-models/Meta-Llama-3.1-8B",
+        f"{llm_venv.get_working_directory()}/TinyLlama/TinyLlama-1.1B-Chat-v1.0",
     }
     for src, dst in src_dst_dict.items():
         if not os.path.islink(dst):
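The remaining hunks update the test lists so they reference the new TinyLlama-1.1B-Chat-v1.0 parametrization of test_disaggregated_ctxpp4_genpp4.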
@@ -678,7 +678,7 @@ disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-False-Qwen3-8B-FP8]
 disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_qwen3[True-True-Qwen3-8B-FP8]
 disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
-disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b]
+disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
@@ -30,7 +30,7 @@ l0_dgx_h200:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
   - disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
-  - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b]
+  - disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
   - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout]
   - unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout]
   - unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora