Fix a bug in the global CUDA graph dummy request (#4894)

Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com>
Authored by QI JUN, 2025-06-05 19:47:40 +08:00; committed via GitHub
parent 7e921c78b5
commit 154f7cc40a
4 changed files with 39 additions and 0 deletions

File: examples/pytorch/quickstart_advanced.py

@@ -78,6 +78,13 @@ def add_llm_args(parser):
                        default=False,
                        action='store_true')
    parser.add_argument('--use_cuda_graph', default=False, action='store_true')
    parser.add_argument('--cuda_graph_padding_enabled',
                        default=False,
                        action='store_true')
    parser.add_argument('--cuda_graph_batch_sizes',
                        nargs='+',
                        type=int,
                        default=[])
    parser.add_argument('--print_iter_log',
                        default=False,
                        action='store_true',
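The two new flags follow standard argparse semantics: --cuda_graph_padding_enabled is a boolean switch, and --cuda_graph_batch_sizes takes one or more integers. A minimal standalone sketch (not part of the commit) of how they parse:

# Standalone sketch: parsing behavior of the new CLI flags.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--use_cuda_graph', default=False, action='store_true')
parser.add_argument('--cuda_graph_padding_enabled',
                    default=False,
                    action='store_true')
parser.add_argument('--cuda_graph_batch_sizes', nargs='+', type=int, default=[])

args = parser.parse_args(['--use_cuda_graph',
                          '--cuda_graph_padding_enabled',
                          '--cuda_graph_batch_sizes', '1', '2', '4', '8'])
print(args.cuda_graph_batch_sizes)  # [1, 2, 4, 8]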
@@ -160,6 +167,8 @@ def setup_llm(args):
        kv_cache_config=kv_cache_config,
        attn_backend=args.attention_backend,
        use_cuda_graph=args.use_cuda_graph,
        cuda_graph_padding_enabled=args.cuda_graph_padding_enabled,
        cuda_graph_batch_sizes=args.cuda_graph_batch_sizes,
        load_format=args.load_format,
        print_iter_log=args.print_iter_log,
        enable_iter_perf_stats=args.print_iter_log,
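The parsed values are forwarded unchanged into the LLM configuration in setup_llm. The padding policy the flag names suggest (an assumption, not the library code) is to round a runtime batch size up to the nearest captured graph size, falling back to eager execution when no captured size is large enough:

# Hedged sketch of batch-size padding (assumed behavior, not TensorRT-LLM code).
import bisect

def pick_padded_batch_size(batch_size, captured_sizes):
    """Smallest captured CUDA-graph batch size >= batch_size, else None."""
    sizes = sorted(captured_sizes)
    i = bisect.bisect_left(sizes, batch_size)
    return sizes[i] if i < len(sizes) else None

assert pick_padded_batch_size(3, [1, 2, 4, 8]) == 4      # pad 3 -> 4
assert pick_padded_batch_size(8, [1, 2, 4, 8]) == 8      # exact hit
assert pick_padded_batch_size(9, [1, 2, 4, 8]) is None   # no graph; run eagerly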

File: PyTorch model engine (PyTorchModelEngine)

@@ -478,6 +478,10 @@ class PyTorchModelEngine(ModelEngine):
            logger.info("Skipping warm up as no KV Cache manager allocated.")
            return

        # The lifetime of model engine and kv cache manager can be different.
        # Reset the global cuda graph dummy request to None in warmup.
        self.cuda_graph_dummy_request = None

        def get_cuda_graph_warmup_request(batch_size):
            available_blocks = kv_cache_manager.get_num_free_blocks()
            if available_blocks >= batch_size:
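The actual fix is the two added lines above: the dummy request used to pad CUDA-graph batches is cached on the engine, while the KV cache manager it borrowed blocks from may be torn down and recreated between warmups. A toy illustration of that failure mode (simplified, hypothetical class and field names; not the real implementation):

# Toy illustration: why warmup must drop the cached dummy request when the
# KV cache manager may have been replaced since the last warmup.
class ToyKVCacheManager:
    def __init__(self, num_blocks):
        self.free_blocks = list(range(num_blocks))

    def allocate_block(self):
        return self.free_blocks.pop()


class ToyModelEngine:
    def __init__(self):
        self.cuda_graph_dummy_request = None  # cached across calls ("global")

    def _get_dummy_request(self, kv_cache_manager):
        # Lazily create the padding request and keep reusing it afterwards.
        if self.cuda_graph_dummy_request is None:
            self.cuda_graph_dummy_request = {
                'manager': kv_cache_manager,
                'kv_block': kv_cache_manager.allocate_block(),
            }
        return self.cuda_graph_dummy_request

    def warmup(self, kv_cache_manager):
        # The fix: a dummy request built against a previous manager would
        # point at blocks that manager no longer owns, so drop it first.
        self.cuda_graph_dummy_request = None
        return self._get_dummy_request(kv_cache_manager)


engine = ToyModelEngine()
r1 = engine.warmup(ToyKVCacheManager(num_blocks=4))  # bound to manager #1
r2 = engine.warmup(ToyKVCacheManager(num_blocks=4))  # rebuilt for manager #2
assert r1['manager'] is not r2['manager']            # without the reset, the
                                                     # stale r1 would be reused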

File: test_e2e.py

@@ -1569,6 +1569,31 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
        _check_mem_usage(running_log, [54.50, 0, 0, 0])


@pytest.mark.skip_less_device(4)
def test_ptq_quickstart_advanced_bs1(llm_root, llm_venv):
    model_name = "DeepSeek-V3-Lite-FP8"
    model_path = "DeepSeek-V3-Lite/fp8"
    print(f"Testing {model_name}.")
    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
    llm_venv.run_cmd([
        str(example_root / "quickstart_advanced.py"),
        "--use_cuda_graph",
        "--cuda_graph_padding_enabled",
        "--cuda_graph_batch_sizes",
        "8",
        "--disable_overlap_scheduler",
        "--enable_attention_dp",
        "--tp_size",
        "4",
        "--moe_ep_size",
        "4",
        "--prompt",
        "\"NVIDIA is a great company because\"",
        "--model_dir",
        f"{llm_models_root()}/{model_path}",
    ])

@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(8)
@skip_pre_hopper

File: l0_dgx_h100 test list

@@ -53,6 +53,7 @@ l0_dgx_h100:
      auto_trigger: deepseek
  tests:
  - unittest/_torch/multi_gpu_modeling -k "deepseek"
  - test_e2e.py::test_ptq_quickstart_advanced_bs1
  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]