diff --git a/examples/pytorch/quickstart_advanced.py b/examples/pytorch/quickstart_advanced.py
index 59a6cbd86c..f5b25da0a5 100644
--- a/examples/pytorch/quickstart_advanced.py
+++ b/examples/pytorch/quickstart_advanced.py
@@ -78,6 +78,13 @@ def add_llm_args(parser):
                         default=False,
                         action='store_true')
     parser.add_argument('--use_cuda_graph', default=False, action='store_true')
+    parser.add_argument('--cuda_graph_padding_enabled',
+                        default=False,
+                        action='store_true')
+    parser.add_argument('--cuda_graph_batch_sizes',
+                        nargs='+',
+                        type=int,
+                        default=[])
     parser.add_argument('--print_iter_log',
                         default=False,
                         action='store_true',
@@ -160,6 +167,8 @@ def setup_llm(args):
         kv_cache_config=kv_cache_config,
         attn_backend=args.attention_backend,
         use_cuda_graph=args.use_cuda_graph,
+        cuda_graph_padding_enabled=args.cuda_graph_padding_enabled,
+        cuda_graph_batch_sizes=args.cuda_graph_batch_sizes,
         load_format=args.load_format,
         print_iter_log=args.print_iter_log,
         enable_iter_perf_stats=args.print_iter_log,
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 49bf6194b2..c6cbc47bfb 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -478,6 +478,10 @@ class PyTorchModelEngine(ModelEngine):
             logger.info("Skipping warm up as no KV Cache manager allocated.")
             return
 
+        # The lifetimes of the model engine and the KV cache manager can differ,
+        # so reset the global CUDA graph dummy request to None during warmup.
+        self.cuda_graph_dummy_request = None
+
         def get_cuda_graph_warmup_request(batch_size):
             available_blocks = kv_cache_manager.get_num_free_blocks()
             if available_blocks >= batch_size:
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index c907ae0ac7..943194f94a 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1569,6 +1569,31 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
         _check_mem_usage(running_log, [54.50, 0, 0, 0])
 
 
+@pytest.mark.skip_less_device(4)
+def test_ptq_quickstart_advanced_bs1(llm_root, llm_venv):
+    model_name = "DeepSeek-V3-Lite-FP8"
+    model_path = "DeepSeek-V3-Lite/fp8"
+    print(f"Testing {model_name}.")
+    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    llm_venv.run_cmd([
+        str(example_root / "quickstart_advanced.py"),
+        "--use_cuda_graph",
+        "--cuda_graph_padding_enabled",
+        "--cuda_graph_batch_sizes",
+        "8",
+        "--disable_overlap_scheduler",
+        "--enable_attention_dp",
+        "--tp_size",
+        "4",
+        "--moe_ep_size",
+        "4",
+        "--prompt",
+        "\"NVIDIA is a great company because\"",
+        "--model_dir",
+        f"{llm_models_root()}/{model_path}",
+    ])
+
+
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(8)
 @skip_pre_hopper
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index e1b96b68c1..b319a30df3 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -53,6 +53,7 @@ l0_dgx_h100:
   auto_trigger: deepseek
   tests:
   - unittest/_torch/multi_gpu_modeling -k "deepseek"
+  - test_e2e.py::test_ptq_quickstart_advanced_bs1
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
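
For reference, the new flags can also be exercised outside the test harness. A minimal sketch of the equivalent standalone invocation, assuming the repo root as working directory and a placeholder checkpoint path:

```python
# Sketch of the command the new test drives via llm_venv.run_cmd; the model
# path below is a placeholder. --cuda_graph_batch_sizes is declared with
# nargs='+', so several sizes can be captured in a single run.
import subprocess

subprocess.run(
    [
        "python", "examples/pytorch/quickstart_advanced.py",
        "--use_cuda_graph",
        "--cuda_graph_padding_enabled",
        "--cuda_graph_batch_sizes", "1", "2", "4", "8",
        "--prompt", "NVIDIA is a great company because",
        "--model_dir", "/models/DeepSeek-V3-Lite/fp8",  # placeholder path
    ],
    check=True,
)
```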
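The padding flag's intent, inferred from the option names rather than from code in this diff, is that a decode batch whose size is not among the captured sizes is padded up to the next captured size instead of falling back to eager execution. A toy illustration with a hypothetical helper:

```python
# Hypothetical helper, not TensorRT-LLM internals: pick the smallest captured
# CUDA graph batch size that can hold the runtime batch, or None to signal
# an eager-mode fallback.
def pick_graph_batch_size(batch_size, captured_sizes):
    candidates = [s for s in sorted(captured_sizes) if s >= batch_size]
    return candidates[0] if candidates else None

assert pick_graph_batch_size(3, [1, 2, 4, 8]) == 4     # pad 3 up to 4
assert pick_graph_batch_size(8, [1, 2, 4, 8]) == 8     # exact hit
assert pick_graph_batch_size(9, [1, 2, 4, 8]) is None  # no captured graph fits
```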