Breaking change: perf: Enable scheduling overlap by default (#4174)

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Kaiyu Xie committed via GitHub on 2025-05-15 14:27:36 +08:00
parent 404fbe9b32
commit b4e5df0ee0
54 changed files with 110 additions and 127 deletions
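
The PyTorch backend now overlaps scheduling with model execution by default: the `enable_overlap_scheduler` option is replaced by `disable_overlap_scheduler` everywhere, and config entries that only turned the scheduler on are dropped. A minimal migration sketch for user code, assuming only the import paths and keyword arguments that appear in the diffs below:

```
# Hedged sketch of the before/after API; mirrors code touched in this commit.
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

# Before: overlap scheduling had to be requested explicitly.
# pytorch_config = PyTorchConfig(enable_overlap_scheduler=True, kv_cache_dtype="auto")

# After: overlap scheduling is on by default, so no flag is needed.
pytorch_config = PyTorchConfig(kv_cache_dtype="auto")

# Opt out explicitly where overlap is not supported (e.g. guided decoding).
# pytorch_config = PyTorchConfig(disable_overlap_scheduler=True, kv_cache_dtype="auto")

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          pytorch_backend_config=pytorch_config)
```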

View File

@@ -135,7 +135,6 @@ YOUR_DATA_PATH=<your dataset file following the format>
 cat >./extra-llm-api-config.yml<<EOF
 pytorch_backend_config:
-enable_overlap_scheduler: true
 use_cuda_graph: true
 moe_backend: TRTLLM
 speculative_config:
@@ -218,7 +217,6 @@ pytorch_backend_config:
 - 256
 - 384
 print_iter_log: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
@@ -260,7 +258,6 @@ YOUR_DATA_PATH=<your dataset file following the format>
 cat >./extra-llm-api-config.yml<<EOF
 pytorch_backend_config:
-enable_overlap_scheduler: true
 use_cuda_graph: true
 speculative_config:
 decoding_type: MTP
@@ -314,7 +311,6 @@ pytorch_backend_config:
 use_cuda_graph: true
 cuda_graph_batch_sizes:
 - 128
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF

View File

@@ -9,7 +9,7 @@ You can use multiple `trtllm-serve` commands to launch the context and generatio
 for disaggregated serving. For example, you could launch two context servers and one generation servers as follows:
 ```
-echo -e "pytorch_backend_config:\n enable_overlap_scheduler: False\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml
+echo -e "pytorch_backend_config:\n disable_overlap_scheduler: True\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml
 echo -e "cache_transceiver_config:\n max_num_tokens: 2048" > gen_extra-llm-api-config.yml
 export TRTLLM_USE_UCX_KVCACHE=1
@@ -65,7 +65,7 @@ model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 1
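
The hunks above (and the disaggregated test configs later in this commit) follow one pattern: context servers disable the overlap scheduler, while generation servers keep the new default. A rough sketch of the equivalent worker configs in Python, assuming only the PyTorchConfig fields exercised elsewhere in this diff:

```
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

# Context worker: prefill only, so overlap scheduling is switched off.
ctx_config = PyTorchConfig(disable_overlap_scheduler=True,
                           kv_cache_dtype="auto",
                           use_cuda_graph=False)

# Generation worker: keep the new default (overlap scheduling enabled).
gen_config = PyTorchConfig(kv_cache_dtype="auto",
                           use_cuda_graph=False)
```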

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 1

View File

@@ -6,8 +6,7 @@ from tensorrt_llm.llmapi import KvCacheConfig
 def main():
-pytorch_config = PyTorchConfig(enable_overlap_scheduler=True,
-autotuner_enabled=False,
+pytorch_config = PyTorchConfig(autotuner_enabled=False,
 kv_cache_dtype='auto')
 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",

View File

@@ -76,7 +76,6 @@ srun -l \
 cat > /tmp/pytorch_extra_args.txt << EOF
 pytorch_backend_config:
 use_cuda_graph: false
-enable_overlap_scheduler: true
 cuda_graph_padding_enabled: false
 print_iter_log: true
 enable_attention_dp: false

View File

@@ -21,7 +21,10 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
 - [Quick Start](#quick-start)
 - [Run a single inference](#run-a-single-inference)
 - [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
+- [Relaxed acceptance](#relaxed-acceptance)
 - [Long context support](#long-context-support)
+- [ISL-64k-OSL-1024](#isl-64k-osl-1024)
+- [ISL-128k-OSL-1024](#isl-128k-osl-1024)
 - [Evaluation](#evaluation)
 - [Serving](#serving)
 - [Use trtllm-serve](#use-trtllm-serve)
@@ -36,6 +39,7 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
 - [FP8 KV Cache and MLA](#fp8-kv-cache-and-mla)
 - [W4AFP8](#w4afp8)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
+- [Known Issues](#known-issues)
 ## Hardware Requirements
@@ -136,7 +140,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 cat <<EOF > /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-enable_overlap_scheduler: true
 use_cuda_graph: true
 cuda_graph_padding_enabled: true
 cuda_graph_batch_sizes: [1, 4, 8, 12]
@@ -165,7+168,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 cat <<EOF > /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-enable_overlap_scheduler: true
 use_cuda_graph: true
 cuda_graph_padding_enabled: true
 cuda_graph_batch_sizes: [1, 2]
@@ -192,7 +194,6 @@ Evaluate the model accuracy using `trtllm-eval`.
 cat >./extra-llm-api-config.yml <<EOF
 pytorch_backend_config:
 use_cuda_graph: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
 ```
@@ -249,7 +250,6 @@ pytorch_backend_config:
 - 256
 - 384
 print_iter_log: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
@@ -441,7 +441,6 @@ pytorch_backend_config:
 - 256
 - 384
 print_iter_log: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
 ```

View File

@@ -22,7 +22,7 @@ This document shows how to build and run a [Qwen](https://huggingface.co/Qwen) m
 - [Run a single inference](#run-a-single-inference)
 - [Evaluation](#evaluation)
 - [Serving](#serving)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
 - [Credits](#credits)
 ## Overview
@@ -668,7 +668,6 @@ pytorch_backend_config:
 - 256
 - 384
 print_iter_log: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF

View File

@@ -72,7 +72,7 @@ def add_llm_args(parser):
 parser.add_argument("--kv_cache_fraction", type=float, default=None)
 # Runtime
-parser.add_argument('--enable_overlap_scheduler',
+parser.add_argument('--disable_overlap_scheduler',
 default=False,
 action='store_true')
 parser.add_argument('--enable_chunked_prefill',
@@ -124,7 +124,7 @@ def parse_arguments():
 def setup_llm(args):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=args.enable_overlap_scheduler,
+disable_overlap_scheduler=args.disable_overlap_scheduler,
 kv_cache_dtype=args.kv_cache_dtype,
 attn_backend=args.attention_backend,
 use_cuda_graph=args.use_cuda_graph,

View File

@@ -39,7 +39,7 @@ def main():
 max_batch_size=args.sample_num,
 max_num_tokens=8192,
 kv_cache_free_gpu_memory_fraction=0.2,
-enable_overlap_scheduler=False)
+disable_overlap_scheduler=True)
 workers[NativeGenerationController.WorkerTag.GENERATION] = gen_worker
 workers[QwenRewardController.WorkerTag.REWARD] = reward_worker

View File

@@ -302,7 +302,7 @@ def create_autodeploy_executor(
 model_engine=engine,
 decoder=decoder,
 dist=mpi_dist,
-enable_overlap_scheduler=py_config.enable_overlap_scheduler,
+disable_overlap_scheduler=py_config.disable_overlap_scheduler,
 max_input_len=executor_config.max_input_len,
 max_batch_size=executor_config.max_batch_size,
 max_draft_tokens=executor_config.speculative_config.max_draft_tokens

View File

@@ -343,7 +343,7 @@ def create_py_executor_instance(dist,
 if spec_config is not None:
 raise ValueError(
 "Guided decoding is not supported with speculative decoding.")
-if pytorch_backend_config.enable_overlap_scheduler:
+if not pytorch_backend_config.disable_overlap_scheduler:
 raise ValueError(
 "Guided decoding is not supported with overlap scheduler.")
@@ -415,7 +415,7 @@ def create_py_executor_instance(dist,
 if mapping.has_pp():
 num_micro_batches = mapping.pp_size
 else:
-num_micro_batches = 2 if pytorch_backend_config.enable_overlap_scheduler else 1
+num_micro_batches = 1 if pytorch_backend_config.disable_overlap_scheduler else 2
 resources["seq_slot_manager"] = SeqSlotManager(
 executor_config.max_batch_size * num_micro_batches)
@@ -450,8 +450,8 @@ def create_py_executor_instance(dist,
 model_engine=model_engine,
 decoder=decoder,
 dist=dist,
-enable_overlap_scheduler=pytorch_backend_config.
-enable_overlap_scheduler,
+disable_overlap_scheduler=pytorch_backend_config.
+disable_overlap_scheduler,
 max_batch_size=executor_config.max_batch_size,
 max_draft_tokens=spec_config.max_draft_tokens
 if spec_config is not None else 0,
@@ -471,9 +471,9 @@ def instantiate_decoder(model_engine, executor_config, pytorch_backend_config,
 spec_config=model_engine.spec_config)
 elif pytorch_backend_config.enable_trtllm_decoder:
 decoding_mode = get_decoding_mode(executor_config)
-decoder = TRTLLMDecoder(executor_config, model_engine.model,
-model_engine.dtype, mapping, decoding_mode,
-pytorch_backend_config.enable_overlap_scheduler)
+decoder = TRTLLMDecoder(
+executor_config, model_engine.model, model_engine.dtype, mapping,
+decoding_mode, pytorch_backend_config.disable_overlap_scheduler)
 elif not model_engine.model.model_config.is_generation:
 # NOTE: choose decoder based on model type
 decoder = EarlyStopDecoder()

View File

@@ -45,7 +45,7 @@ class PyTorchConfig:
 # If true, batches are rounded up to the nearest cuda_graph_batch_size.
 # This is usually a net win for performance.
 cuda_graph_padding_enabled: bool = False
-enable_overlap_scheduler: bool = False
+disable_overlap_scheduler: bool = False
 # If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time.
 # If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used.
 moe_max_num_tokens: Optional[int] = None
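
The field keeps a `False` default, so flipping its polarity is what changes the effective default from overlap-off to overlap-on. A short sketch of what the new default implies, using only the renamed field:

```
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

# Default construction now means the overlap scheduler is active.
config = PyTorchConfig()
assert config.disable_overlap_scheduler is False

# Explicit opt-out, e.g. for guided decoding, which rejects the overlap
# scheduler in create_py_executor_instance (see the hunk above).
no_overlap = PyTorchConfig(disable_overlap_scheduler=True)
```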

View File

@@ -449,7 +449,7 @@ class TRTLLMDecoder(Decoder):
 model_dtype,
 mapping: Mapping,
 decoding_mode: DecodingMode,
-enable_overlap_scheduler: bool,
+disable_overlap_scheduler: bool,
 ):
 vocab_size = model.config.vocab_size
@@ -468,7 +468,7 @@ class TRTLLMDecoder(Decoder):
 self.max_num_sequences = mapping.pp_size * self.executor_config.max_batch_size
 self.max_seq_idle_microseconds = 180 * 1000 * 1000
 self.max_decoding_tokens = 1 # It must be 1 when not in speculative decoding
-self.is_trt_overlap = enable_overlap_scheduler
+self.is_trt_overlap = not disable_overlap_scheduler
 self.world_config = WorldConfig.mpi(mapping.gpus_per_node,
 mapping.tp_size, mapping.pp_size)

View File

@@ -331,7 +331,7 @@ class PyTorchModelEngine(ModelEngine):
 layerwise_nvtx_marker.register_hooks(self.model, module_prefix)
 self.enable_attention_dp = self.model.model_config.mapping.enable_attention_dp
-self._enable_overlap_scheduler = self.pytorch_backend_config.enable_overlap_scheduler
+self._disable_overlap_scheduler = self.pytorch_backend_config.disable_overlap_scheduler
 self._torch_compile_backend = None
 self.dtype = self.model.config.torch_dtype
 self._init_model_capacity()
@@ -982,7 +982,7 @@ class PyTorchModelEngine(ModelEngine):
 """
 Make some changes to the device inputs and avoid block the async data transfer
 """
-if self.is_spec_decode and self._enable_overlap_scheduler:
+if self.is_spec_decode and not self._disable_overlap_scheduler:
 # When enabling overlap scheduler, the kv cache for draft tokens will
 # be prepared in advance by using the max_draft_len. But we need to use
 # new_tokens_lens_device to get the real past kv lengths and the
@@ -1086,7 +1086,7 @@ class PyTorchModelEngine(ModelEngine):
 dtype=torch.int32).to('cuda',
 non_blocking=True))
-if self._enable_overlap_scheduler and self.is_spec_decode:
+if not self._disable_overlap_scheduler and self.is_spec_decode:
 spec_dec_mode = self.spec_config.spec_dec_mode
 assert spec_dec_mode.support_overlap_scheduler(
 ), f"{self.spec_config.spec_dec_name} does not support overlap scheduler"

View File

@@ -162,7 +162,7 @@ class PyExecutor:
 model_engine: ModelEngine,
 decoder: Decoder,
 dist: Distributed,
-enable_overlap_scheduler: bool = False,
+disable_overlap_scheduler: bool = False,
 max_input_len: int = 2048,
 max_batch_size: int = 8,
 max_draft_tokens: int = 0,
@@ -187,7 +187,7 @@ class PyExecutor:
 self.enable_attention_dp = model_engine.enable_attention_dp
 self.decoder = decoder
 self.dist = dist
-self.enable_overlap_scheduler = enable_overlap_scheduler
+self.disable_overlap_scheduler = disable_overlap_scheduler
 # Draft model for certain spec decode algorithms, e.g. EAGLE3
 self.draft_model_engine = draft_model_engine
@@ -258,7 +258,7 @@ class PyExecutor:
 if self.dist.pp_size > 1:
 self.event_loop = self._executor_loop_pp
 else:
-self.event_loop = self._executor_loop_overlap if enable_overlap_scheduler else self._executor_loop
+self.event_loop = self._executor_loop if disable_overlap_scheduler else self._executor_loop_overlap
 if is_trace_enabled("TLLM_TRACE_EXECUTOR_LOOP"):
 self.event_loop = trace_func(self.event_loop)
@@ -1975,7 +1975,7 @@ class PyExecutor:
 # If request is in transmission, so we don't need to emit a response
 # Also, for the first iteration with overlap, we should skip since first token has already been emitted by context server
 if request.is_disagg_generation_transmission_in_progress or (
-self.enable_overlap_scheduler
+not self.disable_overlap_scheduler
 and request.py_decoding_iter <= 1):
 new_active_requests.append(request)
 continue

View File

@@ -104,7 +104,7 @@ def create_py_executor(executor_config: ExecutorConfig,
 # PyTorchModelEngine modifies these fields, update them to executor_config
 max_seq_len = model_engine.max_seq_len
 origin_seq_len = max_seq_len
-if pytorch_backend_config.enable_overlap_scheduler:
+if not pytorch_backend_config.disable_overlap_scheduler:
 max_seq_len = model_engine.max_seq_len + 1
 if spec_config is not None:
 max_seq_len += spec_config.max_draft_tokens

View File

@@ -148,7 +148,6 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
 pyt_options = {
 "use_cuda_graph": True,
 "cuda_graph_padding_enabled": True,
-"enable_overlap_scheduler": True,
 "kv_cache_dtype": kv_cache_dtype,
 "cuda_graph_max_batch_size": max_batch_size,
 }

View File

@@ -115,7 +115,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
 backend = None
 pytorch_backend_config = None
 if backend == "pytorch":
-pytorch_backend_config = PyTorchConfig(enable_overlap_scheduler=True)
+pytorch_backend_config = PyTorchConfig()
 llm_args = {
 "model": model,

View File

@@ -50,8 +50,7 @@ def get_llm_args(model: str,
 kv_cache_config = KvCacheConfig(
 free_gpu_memory_fraction=free_gpu_memory_fraction)
-pytorch_backend_config = PyTorchConfig(
-enable_overlap_scheduler=True) if backend == "pytorch" else None
+pytorch_backend_config = PyTorchConfig() if backend == "pytorch" else None
 dynamic_batch_config = DynamicBatchConfig(
 enable_batch_size_tuning=True,
 enable_max_num_tokens_tuning=False,

View File

@@ -384,7 +384,7 @@ class ExecutorBindingsWorker(GenerationExecutor):
 context_phase_params = request.disaggregated_params.get_context_phase_params(
 )
-is_overlap_enabled = self._is_pytorch_backend and self._executor_config.pytorch_backend_config.enable_overlap_scheduler
+is_overlap_enabled = self._is_pytorch_backend and not self._executor_config.pytorch_backend_config.disable_overlap_scheduler
 if is_overlap_enabled:
 is_disaggregated = self.engine.kv_cache_transceiver is not None
 if is_disaggregated and (

View File

@@ -136,11 +136,11 @@ class TRTLLMWorker(Worker):
 max_batch_size: int = 32,
 max_num_tokens: int = 4096,
 kv_cache_free_gpu_memory_fraction: float = 0.9,
-enable_overlap_scheduler: bool = True,
+disable_overlap_scheduler: bool = False,
 ):
 pytorch_backend_config = PyTorchConfig(
 mixed_decoder=True,
-enable_overlap_scheduler=enable_overlap_scheduler,
+disable_overlap_scheduler=disable_overlap_scheduler,
 )
 kv_cache_config = KvCacheConfig(
 free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, )

View File

@@ -144,16 +144,16 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
 @pytest.mark.skip_less_device_memory(32000)
 @pytest.mark.skip_device_not_contain(["H100"])
-@pytest.mark.parametrize("overlap_scheduler", [False, True])
-def test_auto_dtype(self, overlap_scheduler):
+@pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
+def test_auto_dtype(self, disable_overlap_scheduler):
 ctx_server_config = {
 "pytorch_backend_config": {
-"enable_overlap_scheduler": False
+"disable_overlap_scheduler": True
 }
 }
 gen_server_config = {
 "pytorch_backend_config": {
-"enable_overlap_scheduler": overlap_scheduler
+"disable_overlap_scheduler": disable_overlap_scheduler
 }
 }
 disaggregated_server_config = {

View File

@@ -78,6 +78,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
 cuda_graph_padding_enabled=torch_compile,
 cuda_graph_batch_sizes=[4],
 attn_backend=attn_backend,
+disable_overlap_scheduler=torch_compile,
 )
 llm = LLM(self.MODEL_PATH, pytorch_backend_config=pytorch_config)
 with llm:
@@ -102,6 +103,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
 cuda_graph_padding_enabled=torch_compile,
 cuda_graph_batch_sizes=[4],
 attn_backend=attn_backend,
+disable_overlap_scheduler=torch_compile,
 )
 llm = LLM(self.MODEL_PATH,
 tensor_parallel_size=tp_size,
@@ -124,6 +126,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
 cuda_graph_padding_enabled=torch_compile,
 cuda_graph_batch_sizes=[4],
 attn_backend=attn_backend,
+disable_overlap_scheduler=torch_compile,
 )
 if fp8kv:
 quant_config.kv_cache_quant_algo = QuantAlgo.FP8
@@ -160,6 +163,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
 cuda_graph_padding_enabled=torch_compile,
 cuda_graph_batch_sizes=[4],
 attn_backend=attn_backend,
+disable_overlap_scheduler=torch_compile,
 )
 if fp8kv:
 quant_config.kv_cache_quant_algo = QuantAlgo.FP8
@@ -361,7 +365,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 # OOM on H100 with default free_gpu_memory_fraction=0.9
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 mtp_config = None
 if mtp_nextn > 0:
@@ -393,7 +397,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 # OOM on H100 with default free_gpu_memory_fraction=0.9
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 mtp_config = None
 if mtp_nextn > 0:
@@ -426,7 +430,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 # OOM on H100 with default free_gpu_memory_fraction=0.9
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 quant_config = QuantConfig()
@@ -477,7 +481,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 # OOM on H100 with default free_gpu_memory_fraction=0.9
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 quant_config = QuantConfig()
@@ -522,7 +526,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 (True, True, True, True)])
 def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 quant_config = QuantConfig()
@@ -563,7 +567,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
 overlap_scheduler, tp_size, pp_size, ep_size):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 quant_config = QuantConfig()
@@ -615,7 +619,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 quant_config = QuantConfig()
@@ -663,7 +667,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
 batch_size):
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 quant_config = QuantConfig()
@@ -715,7 +719,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
 @pytest.mark.skip_less_device(8)
 def test_auto_dtype_tp8(self):
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
-pytorch_config = PyTorchConfig(enable_overlap_scheduler=True)
+pytorch_config = PyTorchConfig()
 with LLM(self.MODEL_PATH,
 tensor_parallel_size=8,
@@ -798,7 +802,7 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
 def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
 cuda_graph, overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8",
@@ -825,7 +829,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
 def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
 cuda_graph, overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8",
@@ -848,7 +852,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
 def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
 overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 llm = LLM(
@@ -872,7 +876,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
 def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
 overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 llm = LLM(
@@ -900,7 +904,7 @@ class TestQwen3_32B(LlmapiAccuracyTestHarness):
 def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
 cuda_graph, overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8",

View File

@@ -5,7 +5,7 @@ backend: "pytorch"
 free_gpu_memory_fraction: 0.1
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 autotuner_enabled: False
 context_servers:
 num_instances: 2

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.15
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 autotuner_enabled: False
 context_servers:
 num_instances: 1

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.1
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 1

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.1
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 speculative_config:
 decoding_type: MTP
 num_nextn_predict_layers: 1

View File

@@ -13,7 +13,7 @@ context_servers:
 enable_attention_dp: true
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -23,6 +23,6 @@ generation_servers:
 enable_attention_dp: true
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 2

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 2

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 2

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 2

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 speculative_config:
 decoding_type: MTP
 num_nextn_predict_layers: 1

View File

@@ -10,7 +10,7 @@ context_servers:
 enable_attention_dp: True
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -20,6 +20,6 @@ generation_servers:
 enable_attention_dp: True
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"

View File

@@ -10,7 +10,7 @@ context_servers:
 enable_attention_dp: true
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -20,6 +20,6 @@ generation_servers:
 enable_attention_dp: true
 pytorch_backend_config:
 use_cuda_graph: True
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"

View File

@@ -9,7 +9,7 @@ context_servers:
 pipeline_parallel_size: 1
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -18,6 +18,6 @@ generation_servers:
 pipeline_parallel_size: 1
 pytorch_backend_config:
 use_cuda_graph: True
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"

View File

@@ -15,7 +15,7 @@ context_servers:
 pytorch_backend_config:
 use_cuda_graph: True
 cuda_graph_batch_sizes: [1,3000]
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -30,7 +30,7 @@ generation_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: True
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 cuda_graph_padding_enabled: True
 cuda_graph_batch_sizes: [1,4,8,16,24,32]
 urls:

View File

@@ -18,7 +18,7 @@ context_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 - "localhost:8002"
@@ -37,7 +37,7 @@ generation_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8003"
 - "localhost:8004"

View File

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 1

View File

@@ -15,7 +15,7 @@ context_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -30,6 +30,6 @@ generation_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"

View File

@@ -110,13 +110,13 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
 # Context worker
 worker_pytorch_configs.append(
-PyTorchConfig(enable_overlap_scheduler=False,
+PyTorchConfig(disable_overlap_scheduler=True,
 kv_cache_dtype="auto",
 use_cuda_graph=enable_cuda_graph))
 # Generation worker
 worker_pytorch_configs.append(
-PyTorchConfig(enable_overlap_scheduler=generation_overlap,
+PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
 kv_cache_dtype="auto",
 use_cuda_graph=enable_cuda_graph))
@@ -228,13 +228,13 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
 # Context worker
 worker_pytorch_configs.append(
-PyTorchConfig(enable_overlap_scheduler=False,
+PyTorchConfig(disable_overlap_scheduler=True,
 kv_cache_dtype="auto",
 use_cuda_graph=enable_cuda_graph))
 # Generation worker
 worker_pytorch_configs.append(
-PyTorchConfig(enable_overlap_scheduler=generation_overlap,
+PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
 kv_cache_dtype="auto",
 use_cuda_graph=enable_cuda_graph))

View File

@@ -29,7 +29,6 @@ def get_model_yaml_config(model_label: str) -> dict:
 base_config = {
 'enable_attention_dp': True,
 'pytorch_backend_config': {
-'enable_overlap_scheduler': True,
 'print_iter_log': True,
 'use_cuda_graph': True,
 'cuda_graph_padding_enabled': True,
@@ -40,7 +39,6 @@ def get_model_yaml_config(model_label: str) -> dict:
 'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8':
 {
 'pytorch_backend_config': {
-'enable_overlap_scheduler': True,
 'use_cuda_graph': True,
 },
 'speculative_config': {
@@ -51,7 +49,6 @@ def get_model_yaml_config(model_label: str) -> dict:
 'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8':
 {
 'pytorch_backend_config': {
-'enable_overlap_scheduler': True,
 'use_cuda_graph': True,
 },
 'speculative_config': {

View File

@@ -502,9 +502,6 @@ def stress_test(config,
 "capacity_scheduler_policy":
 test_server_config.capacity_scheduler_policy
 },
-"pytorch_backend_config": {
-"enable_overlap_scheduler": True,
-},
 }
 # Add DeepSeek-V3 specific configuration
@@ -519,7 +516,6 @@ def stress_test(config,
 "cuda_graph_batch_sizes":
 [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
 "print_iter_log": True,
-"enable_overlap_scheduler": True
 }
 with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',

View File

@@ -425,7 +425,6 @@ def temp_extra_llm_api_options_file(request):
 if request.node.callspec.params['pytorch_backend_config']:
 extra_llm_api_options_dict["pytorch_backend_config"] = {
-"enable_overlap_scheduler": True,
 "use_cuda_graph": True,
 "cuda_graph_batch_sizes": [1, 2, 3],
 }
@@ -1303,7 +1302,6 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
 kv_cache_fraction = 0.6 if "Qwen3" in model_name else None
 llm_venv.run_cmd([
 str(example_root / "quickstart_advanced.py"),
-"--enable_overlap_scheduler",
 "--enable_chunked_prefill",
 f"--kv_cache_fraction={kv_cache_fraction}",
 "--model_dir",
@@ -1329,7 +1327,6 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
 llm_venv.run_cmd(
 [
 str(example_root / "quickstart_advanced.py"),
-"--enable_overlap_scheduler",
 "--use_cuda_graph",
 "--spec_decode_nextn",
 "1", # test 1 MTP module
@@ -1359,7 +1356,6 @@ def test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus(
 delete_on_close=True) as running_log:
 llm_venv.run_cmd([
 str(example_root / "quickstart_advanced.py"),
-"--enable_overlap_scheduler",
 "--model_dir",
 f"{llm_models_root()}/{model_path}",
 "--moe_ep_size=8",
@@ -1397,6 +1393,7 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
 "--eagle_model_dir",
 f"{llm_models_root()}/{eagle_model_path}",
 "--disable_kv_cache_reuse",
+"--disable_overlap_scheduler",
 ],
 running_log=running_log)
 _check_mem_usage(running_log, [25.2, 0, 0, 0])
@@ -1420,7 +1417,6 @@ def test_ptp_quickstart_advanced_deepseek_r1_8gpus(llm_root, llm_venv,
 delete_on_close=True) as running_log:
 llm_venv.run_cmd([
 str(example_root / "quickstart_advanced.py"),
-"--enable_overlap_scheduler",
 "--model_dir",
 f"{llm_models_root()}/{model_path}",
 "--moe_tp_size=1",
@@ -1454,7 +1450,6 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus(
 delete_on_close=True) as running_log:
 llm_venv.run_cmd([
 str(example_root / "quickstart_advanced.py"),
-"--enable_overlap_scheduler",
 "--model_dir",
 f"{llm_models_root()}/{model_path}",
 "--moe_tp_size=1",
@@ -1518,7 +1513,6 @@ def test_ptp_quickstart_advanced_8gpus(llm_root, llm_venv, model_name,
 delete_on_close=True) as running_log:
 llm_venv.run_cmd([
 str(example_root / "quickstart_advanced.py"),
-"--enable_overlap_scheduler",
 "--enable_chunked_prefill",
 "--model_dir",
 f"{llm_models_root()}/{model_path}",
@@ -1544,7 +1538,6 @@ def test_ptp_quickstart_advanced_2gpus_sm120(llm_root, llm_venv, model_name,
 example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
 llm_venv.run_cmd([
 str(example_root / "quickstart_advanced.py"),
-"--enable_overlap_scheduler",
 "--enable_chunked_prefill",
 "--model_dir",
 f"{llm_models_root()}/{model_path}",
@@ -1789,7 +1782,8 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path,
 sampling_param = SamplingParams(max_tokens=32, return_context_logits=True)
 with LLM(
 model=model_dir,
-pytorch_backend_config=PyTorchConfig(attn_backend=backend),
+pytorch_backend_config=PyTorchConfig(
+attn_backend=backend, disable_overlap_scheduler=True),
 ) as llm:
 outputs = llm.generate(prompts, sampling_params=sampling_param)

View File

@@ -57,7 +57,7 @@ def test_deepseek_trtllmgen(model_name):
 ] * 4
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=False,
+disable_overlap_scheduler=True,
 use_cuda_graph=False,
 kv_cache_dtype="auto",
 attn_backend="TRTLLM",

View File

@@ -3,6 +3,7 @@ import unittest
 from parameterized import parameterized
 from tensorrt_llm._torch import LLM
+from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
 from tensorrt_llm.llmapi import KvCacheConfig
 from tensorrt_llm.sampling_params import SamplingParams
@@ -40,7 +41,9 @@ class TestOutOfTree(unittest.TestCase):
 llm = LLM(model=model_dir,
 kv_cache_config=kv_cache_config,
-max_num_tokens=2048)
+max_num_tokens=2048,
+pytorch_backend_config=PyTorchConfig(
+disable_overlap_scheduler=True))
 prompts = [
 "Hello, my name is",

View File

@@ -62,7 +62,8 @@ def test_model(backend, model_name, quant, sp_size, sa_block_size,
 max_output_tokens = 128
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
 pytorch_backend_config = PyTorchConfig(
-attn_backend='FLASHINFER_STAR_ATTENTION')
+attn_backend='FLASHINFER_STAR_ATTENTION',
+disable_overlap_scheduler=True)
 llm = LLM(model=model_dir,
 backend=backend,

View File

@@ -57,7 +57,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
 ] * 32
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=False,
+disable_overlap_scheduler=True,
 use_cuda_graph=False,
 kv_cache_dtype="auto",
 attn_backend=backend,

View File

@@ -25,7 +25,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str):
 models_path = llm_models_root()
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=False,
+disable_overlap_scheduler=True,
 use_cuda_graph=use_cuda_graph,
 # Only create a single CUDA graph to prevent OOM in CI
 attn_backend=attn_backend,

View File

@@ -22,11 +22,11 @@ def model_path():
 return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
-def create_llm(model_dir, enable_overlap_scheduler, enable_trtllm_decoder):
+def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_decoder):
 """Create LLM with specific overlap scheduler setting"""
 pytorch_config = PyTorchConfig(
 use_cuda_graph=True,
-enable_overlap_scheduler=enable_overlap_scheduler,
+disable_overlap_scheduler=disable_overlap_scheduler,
 enable_trtllm_decoder=enable_trtllm_decoder)
 trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False)
@@ -62,7 +62,7 @@ def test_overlap_scheduler_consistency(model_path, test_case,
 # Test with overlap scheduler enabled
 llm = create_llm(model_path,
-enable_overlap_scheduler=True,
+disable_overlap_scheduler=False,
 enable_trtllm_decoder=enable_trtllm_decoder)
 outputs_with_overlap = llm.generate(prompts,
 sampling_params=sampling_config,
@@ -74,7 +74,7 @@ def test_overlap_scheduler_consistency(model_path, test_case,
 # Test with overlap scheduler disabled
 llm = create_llm(model_path,
-enable_overlap_scheduler=False,
+disable_overlap_scheduler=True,
 enable_trtllm_decoder=enable_trtllm_decoder)
 outputs_without_overlap = llm.generate(prompts,
 sampling_params=sampling_config,

View File

@@ -26,7 +26,7 @@ def temp_extra_llm_api_options_file(request):
 extra_llm_api_options_dict = {
 "guided_decoding_backend": "xgrammar",
 "pytorch_backend_config": {
-"enable_overlap_scheduler": False,
+"disable_overlap_scheduler": True,
 }
 }

View File

@@ -24,9 +24,7 @@ def client():
 kv_cache_config=KvCacheConfig(),
 backend="pytorch",
 pytorch_backend_config=PyTorchConfig(
-enable_overlap_scheduler=True,
-enable_iter_perf_stats=True,
-))
+enable_iter_perf_stats=True, ))
 hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
 app_instance = OpenAIServer(llm,

View File

@@ -1875,7 +1875,7 @@ def llm_get_stats_test_harness(tp_size: int = 1,
 llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
 enable_iter_perf_stats=True,
 enable_iter_req_stats=enable_iter_req_stats,
-enable_overlap_scheduler=use_overlap)
+disable_overlap_scheduler=not use_overlap)
 LLM_CLASS = LLM_torch
 else:
 LLM_CLASS = LLM
@@ -1944,8 +1944,8 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
 llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
 enable_iter_perf_stats=True,
-enable_overlap_scheduler=use_overlap,
-enable_iter_req_stats=enable_iter_req_stats)
+enable_iter_req_stats=enable_iter_req_stats,
+disable_overlap_scheduler=not use_overlap)
 LLM_CLASS = LLM_torch
 else:
 LLM_CLASS = LLM

View File

@@ -82,9 +82,9 @@ def test_llm_reward_model():
 from tensorrt_llm._torch import LLM as LLM_torch
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-llm = LLM_torch(
-model=rm_model_path,
-pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
+llm = LLM_torch(model=rm_model_path,
+pytorch_backend_config=PyTorchConfig(
+attn_backend="VANILLA", disable_overlap_scheduler=True))
 sampling_params = SamplingParams(return_context_logits=True)