Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
fix a bug of global cuda graph dummy request (#4894)
Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
commit 154f7cc40a (parent 7e921c78b5)
@@ -78,6 +78,13 @@ def add_llm_args(parser):
                         default=False,
                         action='store_true')
     parser.add_argument('--use_cuda_graph', default=False, action='store_true')
+    parser.add_argument('--cuda_graph_padding_enabled',
+                        default=False,
+                        action='store_true')
+    parser.add_argument('--cuda_graph_batch_sizes',
+                        nargs='+',
+                        type=int,
+                        default=[])
     parser.add_argument('--print_iter_log',
                         default=False,
                         action='store_true',
@@ -160,6 +167,8 @@ def setup_llm(args):
         kv_cache_config=kv_cache_config,
         attn_backend=args.attention_backend,
         use_cuda_graph=args.use_cuda_graph,
+        cuda_graph_padding_enabled=args.cuda_graph_padding_enabled,
+        cuda_graph_batch_sizes=args.cuda_graph_batch_sizes,
         load_format=args.load_format,
         print_iter_log=args.print_iter_log,
         enable_iter_perf_stats=args.print_iter_log,
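For reference, a minimal standalone sketch (not part of the commit) of how the three CUDA graph flags above parse, including the list-of-ints behaviour of --cuda_graph_batch_sizes; the argument values are illustrative only:

# Hypothetical standalone reproduction of the argparse setup from add_llm_args.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--use_cuda_graph', default=False, action='store_true')
parser.add_argument('--cuda_graph_padding_enabled', default=False, action='store_true')
parser.add_argument('--cuda_graph_batch_sizes', nargs='+', type=int, default=[])

args = parser.parse_args([
    '--use_cuda_graph',
    '--cuda_graph_padding_enabled',
    '--cuda_graph_batch_sizes', '1', '2', '4', '8',
])
assert args.use_cuda_graph is True
assert args.cuda_graph_padding_enabled is True
assert args.cuda_graph_batch_sizes == [1, 2, 4, 8]

The parsed values are then forwarded to the LLM constructor in setup_llm, as the hunk above shows.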
@@ -478,6 +478,10 @@ class PyTorchModelEngine(ModelEngine):
             logger.info("Skipping warm up as no KV Cache manager allocated.")
             return

+        # The lifetime of model engine and kv cache manager can be different.
+        # Reset the global cuda graph dummy request to None in warmup.
+        self.cuda_graph_dummy_request = None
+
         def get_cuda_graph_warmup_request(batch_size):
             available_blocks = kv_cache_manager.get_num_free_blocks()
             if available_blocks >= batch_size:
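The two new comments are the core of the fix: cuda_graph_dummy_request is cached on the model engine, but it is built against one particular KV cache manager, and the engine can outlive that manager. The sketch below is hypothetical (class bodies and attribute names other than cuda_graph_dummy_request are illustrative, not TensorRT-LLM code) and only shows why the cached request has to be dropped at the start of warmup:

# Hypothetical sketch of the bug pattern; names loosely mirror the diff, the
# bodies are illustrative only.
class KVCacheManager:
    pass


class DummyRequest:
    def __init__(self, manager):
        # The dummy request holds state owned by one specific manager.
        self.manager = manager


class ModelEngine:
    def __init__(self):
        self.cuda_graph_dummy_request = None

    def warmup(self, kv_cache_manager):
        # The fix: the engine can outlive the KV cache manager, so any dummy
        # request built against a previous manager must be discarded here.
        self.cuda_graph_dummy_request = None
        if self.cuda_graph_dummy_request is None:
            self.cuda_graph_dummy_request = DummyRequest(kv_cache_manager)
        # Without the reset above, the second warmup() call below would keep a
        # request that still points at the first, already-released manager.
        assert self.cuda_graph_dummy_request.manager is kv_cache_manager


engine = ModelEngine()
engine.warmup(KVCacheManager())  # first KV cache manager
engine.warmup(KVCacheManager())  # a new manager after the first is torn down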
@@ -1569,6 +1569,31 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
         _check_mem_usage(running_log, [54.50, 0, 0, 0])


+@pytest.mark.skip_less_device(4)
+def test_ptq_quickstart_advanced_bs1(llm_root, llm_venv):
+    model_name = "DeepSeek-V3-Lite-FP8"
+    model_path = "DeepSeek-V3-Lite/fp8"
+    print(f"Testing {model_name}.")
+    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    llm_venv.run_cmd([
+        str(example_root / "quickstart_advanced.py"),
+        "--use_cuda_graph",
+        "--cuda_graph_padding_enabled",
+        "--cuda_graph_batch_sizes",
+        "8",
+        "--disable_overlap_scheduler",
+        "--enable_attention_dp",
+        "--tp_size",
+        "4",
+        "--moe_ep_size",
+        "4",
+        "--prompt",
+        "\"NVIDIA is a great company because\"",
+        "--model_dir",
+        f"{llm_models_root()}/{model_path}",
+    ])
+
+
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(8)
 @skip_pre_hopper
@@ -53,6 +53,7 @@ l0_dgx_h100:
       auto_trigger: deepseek
   tests:
   - unittest/_torch/multi_gpu_modeling -k "deepseek"
+  - test_e2e.py::test_ptq_quickstart_advanced_bs1
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]