Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00

Breaking change: perf: Enable scheduling overlap by default (#4174)

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>

commit b4e5df0ee0 (parent 404fbe9b32)
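This commit removes the `enable_overlap_scheduler` option from `pytorch_backend_config` (overlap scheduling is now on by default) and introduces `disable_overlap_scheduler` as the opt-out. As a rough migration sketch, not taken from the commit itself, a typical `extra-llm-api-config.yml` changes like this:

```yaml
# Before: overlap scheduling had to be requested explicitly.
pytorch_backend_config:
  enable_overlap_scheduler: true   # option removed by this commit
  use_cuda_graph: true

# After: overlap scheduling is the default; set the new flag only to opt out.
pytorch_backend_config:
  use_cuda_graph: true
  # disable_overlap_scheduler: true
```

The same rename applies to the `PyTorchConfig` Python dataclass and to the `--enable_overlap_scheduler`/`--disable_overlap_scheduler` flag of the example scripts, as the hunks below show.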
@@ -135,7 +135,6 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 pytorch_backend_config:
-enable_overlap_scheduler: true
 use_cuda_graph: true
 moe_backend: TRTLLM
 speculative_config:
@@ -218,7 +217,6 @@ pytorch_backend_config:
 - 256
 - 384
 print_iter_log: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
 
@@ -260,7 +258,6 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 pytorch_backend_config:
-enable_overlap_scheduler: true
 use_cuda_graph: true
 speculative_config:
 decoding_type: MTP
@@ -314,7 +311,6 @@ pytorch_backend_config:
 use_cuda_graph: true
 cuda_graph_batch_sizes:
 - 128
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
 
@@ -9,7 +9,7 @@ You can use multiple `trtllm-serve` commands to launch the context and generation
 for disaggregated serving. For example, you could launch two context servers and one generation servers as follows:
 
 ```
-echo -e "pytorch_backend_config:\n enable_overlap_scheduler: False\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml
+echo -e "pytorch_backend_config:\n disable_overlap_scheduler: True\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml
 echo -e "cache_transceiver_config:\n max_num_tokens: 2048" > gen_extra-llm-api-config.yml
 
 export TRTLLM_USE_UCX_KVCACHE=1
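For readability, the `echo -e` command above writes a `context_extra-llm-api-config.yml` whose expanded contents are the following (literal expansion of the escape sequences; the generation-server file only contains the `cache_transceiver_config` section):

```yaml
pytorch_backend_config:
 disable_overlap_scheduler: True
cache_transceiver_config:
 max_num_tokens: 2048
```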
@@ -65,7 +65,7 @@ model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 1
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 1
@@ -6,8 +6,7 @@ from tensorrt_llm.llmapi import KvCacheConfig
 
 
 def main():
-pytorch_config = PyTorchConfig(enable_overlap_scheduler=True,
-autotuner_enabled=False,
+pytorch_config = PyTorchConfig(autotuner_enabled=False,
 kv_cache_dtype='auto')
 
 llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
@@ -76,7 +76,6 @@ srun -l \
 cat > /tmp/pytorch_extra_args.txt << EOF
 pytorch_backend_config:
 use_cuda_graph: false
-enable_overlap_scheduler: true
 cuda_graph_padding_enabled: false
 print_iter_log: true
 enable_attention_dp: false
@@ -21,7 +21,10 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
 - [Quick Start](#quick-start)
 - [Run a single inference](#run-a-single-inference)
 - [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
+- [Relaxed acceptance](#relaxed-acceptance)
 - [Long context support](#long-context-support)
+- [ISL-64k-OSL-1024](#isl-64k-osl-1024)
+- [ISL-128k-OSL-1024](#isl-128k-osl-1024)
 - [Evaluation](#evaluation)
 - [Serving](#serving)
 - [Use trtllm-serve](#use-trtllm-serve)
@@ -36,6 +39,7 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
 - [FP8 KV Cache and MLA](#fp8-kv-cache-and-mla)
 - [W4AFP8](#w4afp8)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
+- [Known Issues](#known-issues)
 
 
 ## Hardware Requirements
@@ -136,7 +140,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-enable_overlap_scheduler: true
 use_cuda_graph: true
 cuda_graph_padding_enabled: true
 cuda_graph_batch_sizes: [1, 4, 8, 12]
@@ -165,7 +168,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-enable_overlap_scheduler: true
 use_cuda_graph: true
 cuda_graph_padding_enabled: true
 cuda_graph_batch_sizes: [1, 2]
@@ -192,7 +194,6 @@ Evaluate the model accuracy using `trtllm-eval`.
 cat >./extra-llm-api-config.yml <<EOF
 pytorch_backend_config:
 use_cuda_graph: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
 ```
@@ -249,7 +250,6 @@ pytorch_backend_config:
 - 256
 - 384
 print_iter_log: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
 
@@ -441,7 +441,6 @@ pytorch_backend_config:
 - 256
 - 384
 print_iter_log: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
 ```
@@ -22,7 +22,7 @@ This document shows how to build and run a [Qwen](https://huggingface.co/Qwen) m
 - [Run a single inference](#run-a-single-inference)
 - [Evaluation](#evaluation)
 - [Serving](#serving)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
 - [Credits](#credits)
 
 ## Overview
@@ -668,7 +668,6 @@ pytorch_backend_config:
 - 256
 - 384
 print_iter_log: true
-enable_overlap_scheduler: true
 enable_attention_dp: true
 EOF
 
@@ -72,7 +72,7 @@ def add_llm_args(parser):
 parser.add_argument("--kv_cache_fraction", type=float, default=None)
 
 # Runtime
-parser.add_argument('--enable_overlap_scheduler',
+parser.add_argument('--disable_overlap_scheduler',
 default=False,
 action='store_true')
 parser.add_argument('--enable_chunked_prefill',
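With the argument flipped from `--enable_overlap_scheduler` to `--disable_overlap_scheduler` (both `store_true` with `default=False`), example invocations simply drop the old flag. A hypothetical command line, with the model path as a placeholder:

```bash
# Overlap scheduling is now on by default; pass the new flag only to turn it off,
# e.g. when a feature such as guided decoding does not support it.
python examples/pytorch/quickstart_advanced.py \
    --model_dir /path/to/model \
    --disable_overlap_scheduler
```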
@@ -124,7 +124,7 @@ def parse_arguments():
 
 def setup_llm(args):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=args.enable_overlap_scheduler,
+disable_overlap_scheduler=args.disable_overlap_scheduler,
 kv_cache_dtype=args.kv_cache_dtype,
 attn_backend=args.attention_backend,
 use_cuda_graph=args.use_cuda_graph,
@@ -39,7 +39,7 @@ def main():
 max_batch_size=args.sample_num,
 max_num_tokens=8192,
 kv_cache_free_gpu_memory_fraction=0.2,
-enable_overlap_scheduler=False)
+disable_overlap_scheduler=True)
 workers[NativeGenerationController.WorkerTag.GENERATION] = gen_worker
 workers[QwenRewardController.WorkerTag.REWARD] = reward_worker
 
@@ -302,7 +302,7 @@ def create_autodeploy_executor(
 model_engine=engine,
 decoder=decoder,
 dist=mpi_dist,
-enable_overlap_scheduler=py_config.enable_overlap_scheduler,
+disable_overlap_scheduler=py_config.disable_overlap_scheduler,
 max_input_len=executor_config.max_input_len,
 max_batch_size=executor_config.max_batch_size,
 max_draft_tokens=executor_config.speculative_config.max_draft_tokens
@@ -343,7 +343,7 @@ def create_py_executor_instance(dist,
 if spec_config is not None:
 raise ValueError(
 "Guided decoding is not supported with speculative decoding.")
-if pytorch_backend_config.enable_overlap_scheduler:
+if not pytorch_backend_config.disable_overlap_scheduler:
 raise ValueError(
 "Guided decoding is not supported with overlap scheduler.")
 
@@ -415,7 +415,7 @@ def create_py_executor_instance(dist,
 if mapping.has_pp():
 num_micro_batches = mapping.pp_size
 else:
-num_micro_batches = 2 if pytorch_backend_config.enable_overlap_scheduler else 1
+num_micro_batches = 1 if pytorch_backend_config.disable_overlap_scheduler else 2
 
 resources["seq_slot_manager"] = SeqSlotManager(
 executor_config.max_batch_size * num_micro_batches)
@@ -450,8 +450,8 @@ def create_py_executor_instance(dist,
 model_engine=model_engine,
 decoder=decoder,
 dist=dist,
-enable_overlap_scheduler=pytorch_backend_config.
-enable_overlap_scheduler,
+disable_overlap_scheduler=pytorch_backend_config.
+disable_overlap_scheduler,
 max_batch_size=executor_config.max_batch_size,
 max_draft_tokens=spec_config.max_draft_tokens
 if spec_config is not None else 0,
@@ -471,9 +471,9 @@ def instantiate_decoder(model_engine, executor_config, pytorch_backend_config,
 spec_config=model_engine.spec_config)
 elif pytorch_backend_config.enable_trtllm_decoder:
 decoding_mode = get_decoding_mode(executor_config)
-decoder = TRTLLMDecoder(executor_config, model_engine.model,
-model_engine.dtype, mapping, decoding_mode,
-pytorch_backend_config.enable_overlap_scheduler)
+decoder = TRTLLMDecoder(
+executor_config, model_engine.model, model_engine.dtype, mapping,
+decoding_mode, pytorch_backend_config.disable_overlap_scheduler)
 elif not model_engine.model.model_config.is_generation:
 # NOTE: choose decoder based on model type
 decoder = EarlyStopDecoder()
@@ -45,7 +45,7 @@ class PyTorchConfig:
 # If true, batches are rounded up to the nearest cuda_graph_batch_size.
 # This is usually a net win for performance.
 cuda_graph_padding_enabled: bool = False
-enable_overlap_scheduler: bool = False
+disable_overlap_scheduler: bool = False
 # If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time.
 # If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used.
 moe_max_num_tokens: Optional[int] = None
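A minimal sketch of how the renamed `PyTorchConfig` field is used through the LLM API after this change (imports and the TinyLlama model id follow the examples and tests touched in this diff; treat the exact paths as assumptions):

```python
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

# Overlap scheduling is enabled unless explicitly disabled; opting out is only
# needed for cases the executor still rejects, such as guided decoding.
pytorch_config = PyTorchConfig(disable_overlap_scheduler=True,
                               use_cuda_graph=False)
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          pytorch_backend_config=pytorch_config)
```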
@@ -449,7 +449,7 @@ class TRTLLMDecoder(Decoder):
 model_dtype,
 mapping: Mapping,
 decoding_mode: DecodingMode,
-enable_overlap_scheduler: bool,
+disable_overlap_scheduler: bool,
 ):
 
 vocab_size = model.config.vocab_size
@@ -468,7 +468,7 @@ class TRTLLMDecoder(Decoder):
 self.max_num_sequences = mapping.pp_size * self.executor_config.max_batch_size
 self.max_seq_idle_microseconds = 180 * 1000 * 1000
 self.max_decoding_tokens = 1 # It must be 1 when not in speculative decoding
-self.is_trt_overlap = enable_overlap_scheduler
+self.is_trt_overlap = not disable_overlap_scheduler
 
 self.world_config = WorldConfig.mpi(mapping.gpus_per_node,
 mapping.tp_size, mapping.pp_size)
@@ -331,7 +331,7 @@ class PyTorchModelEngine(ModelEngine):
 layerwise_nvtx_marker.register_hooks(self.model, module_prefix)
 
 self.enable_attention_dp = self.model.model_config.mapping.enable_attention_dp
-self._enable_overlap_scheduler = self.pytorch_backend_config.enable_overlap_scheduler
+self._disable_overlap_scheduler = self.pytorch_backend_config.disable_overlap_scheduler
 self._torch_compile_backend = None
 self.dtype = self.model.config.torch_dtype
 self._init_model_capacity()
@@ -982,7 +982,7 @@ class PyTorchModelEngine(ModelEngine):
 """
 Make some changes to the device inputs and avoid block the async data transfer
 """
-if self.is_spec_decode and self._enable_overlap_scheduler:
+if self.is_spec_decode and not self._disable_overlap_scheduler:
 # When enabling overlap scheduler, the kv cache for draft tokens will
 # be prepared in advance by using the max_draft_len. But we need to use
 # new_tokens_lens_device to get the real past kv lengths and the
@@ -1086,7 +1086,7 @@ class PyTorchModelEngine(ModelEngine):
 dtype=torch.int32).to('cuda',
 non_blocking=True))
 
-if self._enable_overlap_scheduler and self.is_spec_decode:
+if not self._disable_overlap_scheduler and self.is_spec_decode:
 spec_dec_mode = self.spec_config.spec_dec_mode
 assert spec_dec_mode.support_overlap_scheduler(
 ), f"{self.spec_config.spec_dec_name} does not support overlap scheduler"
@@ -162,7 +162,7 @@ class PyExecutor:
 model_engine: ModelEngine,
 decoder: Decoder,
 dist: Distributed,
-enable_overlap_scheduler: bool = False,
+disable_overlap_scheduler: bool = False,
 max_input_len: int = 2048,
 max_batch_size: int = 8,
 max_draft_tokens: int = 0,
@@ -187,7 +187,7 @@ class PyExecutor:
 self.enable_attention_dp = model_engine.enable_attention_dp
 self.decoder = decoder
 self.dist = dist
-self.enable_overlap_scheduler = enable_overlap_scheduler
+self.disable_overlap_scheduler = disable_overlap_scheduler
 
 # Draft model for certain spec decode algorithms, e.g. EAGLE3
 self.draft_model_engine = draft_model_engine
@@ -258,7 +258,7 @@ class PyExecutor:
 if self.dist.pp_size > 1:
 self.event_loop = self._executor_loop_pp
 else:
-self.event_loop = self._executor_loop_overlap if enable_overlap_scheduler else self._executor_loop
+self.event_loop = self._executor_loop if disable_overlap_scheduler else self._executor_loop_overlap
 
 if is_trace_enabled("TLLM_TRACE_EXECUTOR_LOOP"):
 self.event_loop = trace_func(self.event_loop)
@@ -1975,7 +1975,7 @@ class PyExecutor:
 # If request is in transmission, so we don't need to emit a response
 # Also, for the first iteration with overlap, we should skip since first token has already been emitted by context server
 if request.is_disagg_generation_transmission_in_progress or (
-self.enable_overlap_scheduler
+not self.disable_overlap_scheduler
 and request.py_decoding_iter <= 1):
 new_active_requests.append(request)
 continue
@@ -104,7 +104,7 @@ def create_py_executor(executor_config: ExecutorConfig,
 # PyTorchModelEngine modifies these fields, update them to executor_config
 max_seq_len = model_engine.max_seq_len
 origin_seq_len = max_seq_len
-if pytorch_backend_config.enable_overlap_scheduler:
+if not pytorch_backend_config.disable_overlap_scheduler:
 max_seq_len = model_engine.max_seq_len + 1
 if spec_config is not None:
 max_seq_len += spec_config.max_draft_tokens
@@ -148,7 +148,6 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
 pyt_options = {
 "use_cuda_graph": True,
 "cuda_graph_padding_enabled": True,
-"enable_overlap_scheduler": True,
 "kv_cache_dtype": kv_cache_dtype,
 "cuda_graph_max_batch_size": max_batch_size,
 }
@@ -115,7 +115,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
 backend = None
 pytorch_backend_config = None
 if backend == "pytorch":
-pytorch_backend_config = PyTorchConfig(enable_overlap_scheduler=True)
+pytorch_backend_config = PyTorchConfig()
 
 llm_args = {
 "model": model,
@@ -50,8 +50,7 @@ def get_llm_args(model: str,
 kv_cache_config = KvCacheConfig(
 free_gpu_memory_fraction=free_gpu_memory_fraction)
 
-pytorch_backend_config = PyTorchConfig(
-enable_overlap_scheduler=True) if backend == "pytorch" else None
+pytorch_backend_config = PyTorchConfig() if backend == "pytorch" else None
 dynamic_batch_config = DynamicBatchConfig(
 enable_batch_size_tuning=True,
 enable_max_num_tokens_tuning=False,
@@ -384,7 +384,7 @@ class ExecutorBindingsWorker(GenerationExecutor):
 context_phase_params = request.disaggregated_params.get_context_phase_params(
 )
 
-is_overlap_enabled = self._is_pytorch_backend and self._executor_config.pytorch_backend_config.enable_overlap_scheduler
+is_overlap_enabled = self._is_pytorch_backend and not self._executor_config.pytorch_backend_config.disable_overlap_scheduler
 if is_overlap_enabled:
 is_disaggregated = self.engine.kv_cache_transceiver is not None
 if is_disaggregated and (
@@ -136,11 +136,11 @@ class TRTLLMWorker(Worker):
 max_batch_size: int = 32,
 max_num_tokens: int = 4096,
 kv_cache_free_gpu_memory_fraction: float = 0.9,
-enable_overlap_scheduler: bool = True,
+disable_overlap_scheduler: bool = False,
 ):
 pytorch_backend_config = PyTorchConfig(
 mixed_decoder=True,
-enable_overlap_scheduler=enable_overlap_scheduler,
+disable_overlap_scheduler=disable_overlap_scheduler,
 )
 kv_cache_config = KvCacheConfig(
 free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, )
@@ -144,16 +144,16 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
 
 @pytest.mark.skip_less_device_memory(32000)
 @pytest.mark.skip_device_not_contain(["H100"])
-@pytest.mark.parametrize("overlap_scheduler", [False, True])
-def test_auto_dtype(self, overlap_scheduler):
+@pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
+def test_auto_dtype(self, disable_overlap_scheduler):
 ctx_server_config = {
 "pytorch_backend_config": {
-"enable_overlap_scheduler": False
+"disable_overlap_scheduler": True
 }
 }
 gen_server_config = {
 "pytorch_backend_config": {
-"enable_overlap_scheduler": overlap_scheduler
+"disable_overlap_scheduler": disable_overlap_scheduler
 }
 }
 disaggregated_server_config = {
@@ -78,6 +78,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
 cuda_graph_padding_enabled=torch_compile,
 cuda_graph_batch_sizes=[4],
 attn_backend=attn_backend,
+disable_overlap_scheduler=torch_compile,
 )
 llm = LLM(self.MODEL_PATH, pytorch_backend_config=pytorch_config)
 with llm:
@@ -102,6 +103,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
 cuda_graph_padding_enabled=torch_compile,
 cuda_graph_batch_sizes=[4],
 attn_backend=attn_backend,
+disable_overlap_scheduler=torch_compile,
 )
 llm = LLM(self.MODEL_PATH,
 tensor_parallel_size=tp_size,
@@ -124,6 +126,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
 cuda_graph_padding_enabled=torch_compile,
 cuda_graph_batch_sizes=[4],
 attn_backend=attn_backend,
+disable_overlap_scheduler=torch_compile,
 )
 if fp8kv:
 quant_config.kv_cache_quant_algo = QuantAlgo.FP8
@@ -160,6 +163,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
 cuda_graph_padding_enabled=torch_compile,
 cuda_graph_batch_sizes=[4],
 attn_backend=attn_backend,
+disable_overlap_scheduler=torch_compile,
 )
 if fp8kv:
 quant_config.kv_cache_quant_algo = QuantAlgo.FP8
@@ -361,7 +365,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 # OOM on H100 with default free_gpu_memory_fraction=0.9
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 mtp_config = None
 if mtp_nextn > 0:
@@ -393,7 +397,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 # OOM on H100 with default free_gpu_memory_fraction=0.9
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 mtp_config = None
 if mtp_nextn > 0:
@@ -426,7 +430,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 # OOM on H100 with default free_gpu_memory_fraction=0.9
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 quant_config = QuantConfig()
@@ -477,7 +481,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 # OOM on H100 with default free_gpu_memory_fraction=0.9
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 quant_config = QuantConfig()
@@ -522,7 +526,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 (True, True, True, True)])
 def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 quant_config = QuantConfig()
@@ -563,7 +567,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
 def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
 overlap_scheduler, tp_size, pp_size, ep_size):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 quant_config = QuantConfig()
@@ -615,7 +619,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
 
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 quant_config = QuantConfig()
@@ -663,7 +667,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
 batch_size):
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 quant_config = QuantConfig()
@@ -715,7 +719,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
 @pytest.mark.skip_less_device(8)
 def test_auto_dtype_tp8(self):
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
-pytorch_config = PyTorchConfig(enable_overlap_scheduler=True)
+pytorch_config = PyTorchConfig()
 
 with LLM(self.MODEL_PATH,
 tensor_parallel_size=8,
@@ -798,7 +802,7 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
 def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
 cuda_graph, overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8",
@@ -825,7 +829,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
 def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
 cuda_graph, overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8",
@@ -848,7 +852,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
 def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
 overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 llm = LLM(
@@ -872,7 +876,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
 def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
 overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 llm = LLM(
@@ -900,7 +904,7 @@ class TestQwen3_32B(LlmapiAccuracyTestHarness):
 def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
 cuda_graph, overlap_scheduler):
 pytorch_config = PyTorchConfig(
-enable_overlap_scheduler=overlap_scheduler,
+disable_overlap_scheduler=not overlap_scheduler,
 use_cuda_graph=cuda_graph)
 
 llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8",
@@ -5,7 +5,7 @@ backend: "pytorch"
 free_gpu_memory_fraction: 0.1
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 autotuner_enabled: False
 context_servers:
 num_instances: 2
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.15
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 autotuner_enabled: False
 context_servers:
 num_instances: 1
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.1
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 1
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.1
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 speculative_config:
 decoding_type: MTP
 num_nextn_predict_layers: 1
@@ -13,7 +13,7 @@ context_servers:
 enable_attention_dp: true
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -23,6 +23,6 @@ generation_servers:
 enable_attention_dp: true
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 2
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 2
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 2
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 2
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 speculative_config:
 decoding_type: MTP
 num_nextn_predict_layers: 1
@@ -10,7 +10,7 @@ context_servers:
 enable_attention_dp: True
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -20,6 +20,6 @@ generation_servers:
 enable_attention_dp: True
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"
@@ -10,7 +10,7 @@ context_servers:
 enable_attention_dp: true
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -20,6 +20,6 @@ generation_servers:
 enable_attention_dp: true
 pytorch_backend_config:
 use_cuda_graph: True
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"
@@ -9,7 +9,7 @@ context_servers:
 pipeline_parallel_size: 1
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -18,6 +18,6 @@ generation_servers:
 pipeline_parallel_size: 1
 pytorch_backend_config:
 use_cuda_graph: True
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"
@@ -15,7 +15,7 @@ context_servers:
 pytorch_backend_config:
 use_cuda_graph: True
 cuda_graph_batch_sizes: [1,3000]
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -30,7 +30,7 @@ generation_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: True
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 cuda_graph_padding_enabled: True
 cuda_graph_batch_sizes: [1,4,8,16,24,32]
 urls:
@@ -18,7 +18,7 @@ context_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 - "localhost:8002"
@@ -37,7 +37,7 @@ generation_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8003"
 - "localhost:8004"
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 context_servers:
 num_instances: 1
 tensor_parallel_size: 1
@@ -15,7 +15,7 @@ context_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: False
+disable_overlap_scheduler: True
 urls:
 - "localhost:8001"
 generation_servers:
@@ -30,6 +30,6 @@ generation_servers:
 enable_partial_reuse: False
 pytorch_backend_config:
 use_cuda_graph: False
-enable_overlap_scheduler: True
+disable_overlap_scheduler: False
 urls:
 - "localhost:8002"
@ -110,13 +110,13 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
|
|||||||
|
|
||||||
# Context worker
|
# Context worker
|
||||||
worker_pytorch_configs.append(
|
worker_pytorch_configs.append(
|
||||||
PyTorchConfig(enable_overlap_scheduler=False,
|
PyTorchConfig(disable_overlap_scheduler=True,
|
||||||
kv_cache_dtype="auto",
|
kv_cache_dtype="auto",
|
||||||
use_cuda_graph=enable_cuda_graph))
|
use_cuda_graph=enable_cuda_graph))
|
||||||
|
|
||||||
# Generation worker
|
# Generation worker
|
||||||
worker_pytorch_configs.append(
|
worker_pytorch_configs.append(
|
||||||
PyTorchConfig(enable_overlap_scheduler=generation_overlap,
|
PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
|
||||||
kv_cache_dtype="auto",
|
kv_cache_dtype="auto",
|
||||||
use_cuda_graph=enable_cuda_graph))
|
use_cuda_graph=enable_cuda_graph))
|
||||||
|
|
||||||
@ -228,13 +228,13 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
|
|||||||
|
|
||||||
# Context worker
|
# Context worker
|
||||||
worker_pytorch_configs.append(
|
worker_pytorch_configs.append(
|
||||||
PyTorchConfig(enable_overlap_scheduler=False,
|
PyTorchConfig(disable_overlap_scheduler=True,
|
||||||
kv_cache_dtype="auto",
|
kv_cache_dtype="auto",
|
||||||
use_cuda_graph=enable_cuda_graph))
|
use_cuda_graph=enable_cuda_graph))
|
||||||
|
|
||||||
# Generation worker
|
# Generation worker
|
||||||
worker_pytorch_configs.append(
|
worker_pytorch_configs.append(
|
||||||
PyTorchConfig(enable_overlap_scheduler=generation_overlap,
|
PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
|
||||||
kv_cache_dtype="auto",
|
kv_cache_dtype="auto",
|
||||||
use_cuda_graph=enable_cuda_graph))
|
use_cuda_graph=enable_cuda_graph))
|
||||||
|
|
||||||
|
|||||||
@ -29,7 +29,6 @@ def get_model_yaml_config(model_label: str) -> dict:
|
|||||||
base_config = {
|
base_config = {
|
||||||
'enable_attention_dp': True,
|
'enable_attention_dp': True,
|
||||||
'pytorch_backend_config': {
|
'pytorch_backend_config': {
|
||||||
'enable_overlap_scheduler': True,
|
|
||||||
'print_iter_log': True,
|
'print_iter_log': True,
|
||||||
'use_cuda_graph': True,
|
'use_cuda_graph': True,
|
||||||
'cuda_graph_padding_enabled': True,
|
'cuda_graph_padding_enabled': True,
|
||||||
@ -40,7 +39,6 @@ def get_model_yaml_config(model_label: str) -> dict:
|
|||||||
'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8':
|
'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8':
|
||||||
{
|
{
|
||||||
'pytorch_backend_config': {
|
'pytorch_backend_config': {
|
||||||
'enable_overlap_scheduler': True,
|
|
||||||
'use_cuda_graph': True,
|
'use_cuda_graph': True,
|
||||||
},
|
},
|
||||||
'speculative_config': {
|
'speculative_config': {
|
||||||
@ -51,7 +49,6 @@ def get_model_yaml_config(model_label: str) -> dict:
|
|||||||
'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8':
|
'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8':
|
||||||
{
|
{
|
||||||
'pytorch_backend_config': {
|
'pytorch_backend_config': {
|
||||||
'enable_overlap_scheduler': True,
|
|
||||||
'use_cuda_graph': True,
|
'use_cuda_graph': True,
|
||||||
},
|
},
|
||||||
'speculative_config': {
|
'speculative_config': {
|
||||||
|
|||||||
@ -502,9 +502,6 @@ def stress_test(config,
|
|||||||
"capacity_scheduler_policy":
|
"capacity_scheduler_policy":
|
||||||
test_server_config.capacity_scheduler_policy
|
test_server_config.capacity_scheduler_policy
|
||||||
},
|
},
|
||||||
"pytorch_backend_config": {
|
|
||||||
"enable_overlap_scheduler": True,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add DeepSeek-V3 specific configuration
|
# Add DeepSeek-V3 specific configuration
|
||||||
@ -519,7 +516,6 @@ def stress_test(config,
|
|||||||
"cuda_graph_batch_sizes":
|
"cuda_graph_batch_sizes":
|
||||||
[1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
|
[1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
|
||||||
"print_iter_log": True,
|
"print_iter_log": True,
|
||||||
"enable_overlap_scheduler": True
|
|
||||||
}
|
}
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
|
with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
|
||||||
|
|||||||
@ -425,7 +425,6 @@ def temp_extra_llm_api_options_file(request):
|
|||||||
|
|
||||||
if request.node.callspec.params['pytorch_backend_config']:
|
if request.node.callspec.params['pytorch_backend_config']:
|
||||||
extra_llm_api_options_dict["pytorch_backend_config"] = {
|
extra_llm_api_options_dict["pytorch_backend_config"] = {
|
||||||
"enable_overlap_scheduler": True,
|
|
||||||
"use_cuda_graph": True,
|
"use_cuda_graph": True,
|
||||||
"cuda_graph_batch_sizes": [1, 2, 3],
|
"cuda_graph_batch_sizes": [1, 2, 3],
|
||||||
}
|
}
|
||||||
@ -1303,7 +1302,6 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
|
|||||||
kv_cache_fraction = 0.6 if "Qwen3" in model_name else None
|
kv_cache_fraction = 0.6 if "Qwen3" in model_name else None
|
||||||
llm_venv.run_cmd([
|
llm_venv.run_cmd([
|
||||||
str(example_root / "quickstart_advanced.py"),
|
str(example_root / "quickstart_advanced.py"),
|
||||||
"--enable_overlap_scheduler",
|
|
||||||
"--enable_chunked_prefill",
|
"--enable_chunked_prefill",
|
||||||
f"--kv_cache_fraction={kv_cache_fraction}",
|
f"--kv_cache_fraction={kv_cache_fraction}",
|
||||||
"--model_dir",
|
"--model_dir",
|
||||||
@ -1329,7 +1327,6 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
|
|||||||
llm_venv.run_cmd(
|
llm_venv.run_cmd(
|
||||||
[
|
[
|
||||||
str(example_root / "quickstart_advanced.py"),
|
str(example_root / "quickstart_advanced.py"),
|
||||||
"--enable_overlap_scheduler",
|
|
||||||
"--use_cuda_graph",
|
"--use_cuda_graph",
|
||||||
"--spec_decode_nextn",
|
"--spec_decode_nextn",
|
||||||
"1", # test 1 MTP module
|
"1", # test 1 MTP module
|
||||||
@ -1359,7 +1356,6 @@ def test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus(
|
|||||||
delete_on_close=True) as running_log:
|
delete_on_close=True) as running_log:
|
||||||
llm_venv.run_cmd([
|
llm_venv.run_cmd([
|
||||||
str(example_root / "quickstart_advanced.py"),
|
str(example_root / "quickstart_advanced.py"),
|
||||||
"--enable_overlap_scheduler",
|
|
||||||
"--model_dir",
|
"--model_dir",
|
||||||
f"{llm_models_root()}/{model_path}",
|
f"{llm_models_root()}/{model_path}",
|
||||||
"--moe_ep_size=8",
|
"--moe_ep_size=8",
|
||||||
@ -1397,6 +1393,7 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
|
|||||||
"--eagle_model_dir",
|
"--eagle_model_dir",
|
||||||
f"{llm_models_root()}/{eagle_model_path}",
|
f"{llm_models_root()}/{eagle_model_path}",
|
||||||
"--disable_kv_cache_reuse",
|
"--disable_kv_cache_reuse",
|
||||||
|
"--disable_overlap_scheduler",
|
||||||
],
|
],
|
||||||
running_log=running_log)
|
running_log=running_log)
|
||||||
_check_mem_usage(running_log, [25.2, 0, 0, 0])
|
_check_mem_usage(running_log, [25.2, 0, 0, 0])
|
||||||
@@ -1420,7 +1417,6 @@ def test_ptp_quickstart_advanced_deepseek_r1_8gpus(llm_root, llm_venv,
                          delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
             "--model_dir",
             f"{llm_models_root()}/{model_path}",
             "--moe_tp_size=1",
@@ -1454,7 +1450,6 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus(
                          delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
             "--model_dir",
             f"{llm_models_root()}/{model_path}",
             "--moe_tp_size=1",
@@ -1518,7 +1513,6 @@ def test_ptp_quickstart_advanced_8gpus(llm_root, llm_venv, model_name,
                          delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
             "--enable_chunked_prefill",
             "--model_dir",
             f"{llm_models_root()}/{model_path}",
@@ -1544,7 +1538,6 @@ def test_ptp_quickstart_advanced_2gpus_sm120(llm_root, llm_venv, model_name,
     example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
     llm_venv.run_cmd([
         str(example_root / "quickstart_advanced.py"),
-        "--enable_overlap_scheduler",
         "--enable_chunked_prefill",
         "--model_dir",
         f"{llm_models_root()}/{model_path}",
@@ -1789,7 +1782,8 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path,
     sampling_param = SamplingParams(max_tokens=32, return_context_logits=True)
     with LLM(
             model=model_dir,
-            pytorch_backend_config=PyTorchConfig(attn_backend=backend),
+            pytorch_backend_config=PyTorchConfig(
+                attn_backend=backend, disable_overlap_scheduler=True),
     ) as llm:

         outputs = llm.generate(prompts, sampling_params=sampling_param)
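
The BERT quickstart hunk above now passes `disable_overlap_scheduler=True` next to `attn_backend` when building `PyTorchConfig`. A minimal sketch of the same construction pattern, assuming the `tensorrt_llm._torch` LLM API used throughout these tests and a placeholder model path:

```python
# Minimal sketch, assuming the PyTorch-backend LLM API used by these tests;
# "/path/to/model" is a placeholder, not a path taken from the diff.
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.sampling_params import SamplingParams

config = PyTorchConfig(
    attn_backend="TRTLLM",           # backend names seen in this diff: TRTLLM, VANILLA
    disable_overlap_scheduler=True,  # opt out now that overlap is the default
)

with LLM(model="/path/to/model", pytorch_backend_config=config) as llm:
    for output in llm.generate(["Hello, my name is"],
                               sampling_params=SamplingParams(max_tokens=32)):
        print(output)  # inspect the returned request output
```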
@@ -57,7 +57,7 @@ def test_deepseek_trtllmgen(model_name):
     ] * 4

     pytorch_config = PyTorchConfig(
-        enable_overlap_scheduler=False,
+        disable_overlap_scheduler=True,
         use_cuda_graph=False,
         kv_cache_dtype="auto",
         attn_backend="TRTLLM",
@@ -3,6 +3,7 @@ import unittest
 from parameterized import parameterized

 from tensorrt_llm._torch import LLM
+from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
 from tensorrt_llm.llmapi import KvCacheConfig
 from tensorrt_llm.sampling_params import SamplingParams

@@ -40,7 +41,9 @@ class TestOutOfTree(unittest.TestCase):

         llm = LLM(model=model_dir,
                   kv_cache_config=kv_cache_config,
-                  max_num_tokens=2048)
+                  max_num_tokens=2048,
+                  pytorch_backend_config=PyTorchConfig(
+                      disable_overlap_scheduler=True))

         prompts = [
             "Hello, my name is",
@@ -62,7 +62,8 @@ def test_model(backend, model_name, quant, sp_size, sa_block_size,
     max_output_tokens = 128
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
     pytorch_backend_config = PyTorchConfig(
-        attn_backend='FLASHINFER_STAR_ATTENTION')
+        attn_backend='FLASHINFER_STAR_ATTENTION',
+        disable_overlap_scheduler=True)

     llm = LLM(model=model_dir,
               backend=backend,
@@ -57,7 +57,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
     ] * 32

     pytorch_config = PyTorchConfig(
-        enable_overlap_scheduler=False,
+        disable_overlap_scheduler=True,
         use_cuda_graph=False,
         kv_cache_dtype="auto",
         attn_backend=backend,
@@ -25,7 +25,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str):
     models_path = llm_models_root()

     pytorch_config = PyTorchConfig(
-        enable_overlap_scheduler=False,
+        disable_overlap_scheduler=True,
         use_cuda_graph=use_cuda_graph,
         # Only create a single CUDA graph to prevent OOM in CI
         attn_backend=attn_backend,
@@ -22,11 +22,11 @@ def model_path():
     return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"


-def create_llm(model_dir, enable_overlap_scheduler, enable_trtllm_decoder):
+def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_decoder):
     """Create LLM with specific overlap scheduler setting"""
     pytorch_config = PyTorchConfig(
         use_cuda_graph=True,
-        enable_overlap_scheduler=enable_overlap_scheduler,
+        disable_overlap_scheduler=disable_overlap_scheduler,
         enable_trtllm_decoder=enable_trtllm_decoder)

     trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False)
@@ -62,7 +62,7 @@ def test_overlap_scheduler_consistency(model_path, test_case,

     # Test with overlap scheduler enabled
     llm = create_llm(model_path,
-                     enable_overlap_scheduler=True,
+                     disable_overlap_scheduler=False,
                      enable_trtllm_decoder=enable_trtllm_decoder)
     outputs_with_overlap = llm.generate(prompts,
                                         sampling_params=sampling_config,
@@ -74,7 +74,7 @@ def test_overlap_scheduler_consistency(model_path, test_case,

     # Test with overlap scheduler disabled
     llm = create_llm(model_path,
-                     enable_overlap_scheduler=False,
+                     disable_overlap_scheduler=True,
                      enable_trtllm_decoder=enable_trtllm_decoder)
     outputs_without_overlap = llm.generate(prompts,
                                            sampling_params=sampling_config,
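
With the rename, the consistency test flips the boolean it hands to `create_llm`: "overlap enabled" is now expressed as `disable_overlap_scheduler=False`. A small, pure-Python sketch of that inversion (the `overlap_kwargs` helper is illustrative, not part of the repository):

```python
# Sketch of the inverted flag semantics used by the updated test; `want_overlap`
# and overlap_kwargs() are illustrative names, not repository code.
def overlap_kwargs(want_overlap: bool) -> dict:
    # Old API: enable_overlap_scheduler=want_overlap
    # New API: the scheduler is on unless explicitly disabled.
    return {"disable_overlap_scheduler": not want_overlap}


assert overlap_kwargs(True) == {"disable_overlap_scheduler": False}
assert overlap_kwargs(False) == {"disable_overlap_scheduler": True}
```

Under the new default, a caller such as `create_llm(model_path, **overlap_kwargs(True), enable_trtllm_decoder=...)` would exercise the overlap path without naming the flag positively.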
@@ -26,7 +26,7 @@ def temp_extra_llm_api_options_file(request):
     extra_llm_api_options_dict = {
         "guided_decoding_backend": "xgrammar",
         "pytorch_backend_config": {
-            "enable_overlap_scheduler": False,
+            "disable_overlap_scheduler": True,
         }
     }

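
The fixture above now carries `"disable_overlap_scheduler": True` under `pytorch_backend_config`. As a sketch only, a dict of this shape could be serialized into an extra LLM API options file like this (assumes PyYAML; the fixture's real serialization code is not shown in this diff):

```python
# Sketch: write an options dict matching the fixture above to a temporary
# YAML file. PyYAML is an assumed dependency for this illustration.
import tempfile

import yaml

extra_llm_api_options_dict = {
    "guided_decoding_backend": "xgrammar",
    "pytorch_backend_config": {
        "disable_overlap_scheduler": True,
    },
}

with tempfile.NamedTemporaryFile("w", suffix=".yml", delete=False) as f:
    yaml.safe_dump(extra_llm_api_options_dict, f)
    print(f.name)  # hand this path wherever an extra-llm-api-config file is expected
```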
@@ -24,9 +24,7 @@ def client():
         kv_cache_config=KvCacheConfig(),
         backend="pytorch",
         pytorch_backend_config=PyTorchConfig(
-            enable_overlap_scheduler=True,
-            enable_iter_perf_stats=True,
-        ))
+            enable_iter_perf_stats=True, ))
     hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)

     app_instance = OpenAIServer(llm,
@@ -1875,7 +1875,7 @@ def llm_get_stats_test_harness(tp_size: int = 1,
         llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
             enable_iter_perf_stats=True,
             enable_iter_req_stats=enable_iter_req_stats,
-            enable_overlap_scheduler=use_overlap)
+            disable_overlap_scheduler=not use_overlap)
         LLM_CLASS = LLM_torch
     else:
         LLM_CLASS = LLM
@@ -1944,8 +1944,8 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
         from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
         llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
             enable_iter_perf_stats=True,
-            enable_overlap_scheduler=use_overlap,
-            enable_iter_req_stats=enable_iter_req_stats)
+            enable_iter_req_stats=enable_iter_req_stats,
+            disable_overlap_scheduler=not use_overlap)
         LLM_CLASS = LLM_torch
     else:
         LLM_CLASS = LLM
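
Both stats harnesses above derive the new key from the old boolean via `not use_overlap`. For callers that still hold old-style config dicts, a small migration helper along these lines (illustrative only, not part of TensorRT-LLM) captures the same inversion:

```python
# Illustrative helper: rewrite a backend-config dict that still uses the
# removed "enable_overlap_scheduler" key into the new "disable_overlap_scheduler"
# form, leaving all other keys untouched.
def migrate_overlap_key(backend_cfg: dict) -> dict:
    cfg = dict(backend_cfg)
    if "enable_overlap_scheduler" in cfg:
        cfg["disable_overlap_scheduler"] = not cfg.pop("enable_overlap_scheduler")
    return cfg


assert migrate_overlap_key({"enable_overlap_scheduler": False}) == {
    "disable_overlap_scheduler": True
}
assert migrate_overlap_key({"use_cuda_graph": True}) == {"use_cuda_graph": True}
```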
@@ -82,9 +82,9 @@ def test_llm_reward_model():

     from tensorrt_llm._torch import LLM as LLM_torch
     from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-    llm = LLM_torch(
-        model=rm_model_path,
-        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
+    llm = LLM_torch(model=rm_model_path,
+                    pytorch_backend_config=PyTorchConfig(
+                        attn_backend="VANILLA", disable_overlap_scheduler=True))

     sampling_params = SamplingParams(return_context_logits=True)
