Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[None][feat] Add support for DeepSeek v3.2 tests (#10561)
Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com>
This commit is contained in:
parent 7295af68ba
commit d80f01d205
@@ -43,13 +43,12 @@ environment:
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
enable_accuracy_test: false # Set to true to enable accuracy evaluation
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true

@@ -80,17 +79,20 @@ worker_config:
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 4608
backend: NIXL
backend: NIXLf
stream_interval: 20
num_postprocess_workers: 4
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4608
max_seq_len: 2251

@@ -101,6 +103,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85

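A minimal sketch of how the generation-worker MoE keys in the hunks above typically nest; the grouping and indentation are assumptions, since the flattened diff view neither preserves YAML structure nor marks which of the two moe_config backend values (WIDEEP vs CUTEDSL) is the added one:

    worker_config:
      gen:
        tensor_parallel_size: 32
        moe_expert_parallel_size: 32
        enable_attention_dp: true
        moe_config:
          backend: WIDEEP                # wide expert-parallel MoE backend
          use_low_precision_moe_combine: true
          load_balancer:
            num_slots: 288               # expert slots managed by the MoE load balancer
            layer_updates_per_iter: 1    # layers rebalanced per iteration
        nvfp4_gemm_config:
          allowed_backends:              # NVFP4 GEMM backends the runtime may choose from
            - cutlass
            - cublaslt
            - cutedsl
            - cuda_core
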
@@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true

@@ -80,10 +79,14 @@ worker_config:
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 4608
backend: NIXL

@@ -93,7 +96,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4608
max_seq_len: 2251

@@ -104,6 +106,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85

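The decoding_type: MTP and num_nextn_predict_layers: 3 lines above configure multi-token-prediction speculative decoding on the generation worker. A short sketch of how that pair is usually written, assuming it sits under a speculative_config key; that parent key falls outside the hunk and is an assumption here:

    worker_config:
      gen:
        speculative_config:              # assumed parent key, not shown in the hunk
          decoding_type: MTP             # multi-token prediction used for drafting
          num_nextn_predict_layers: 3    # number of next-n prediction layers
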
@@ -50,7 +50,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 48
moe_expert_parallel_size: 48
enable_attention_dp: true

@@ -81,16 +80,19 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8320
backend: DEFAULT
stream_interval: 20
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4480
max_seq_len: 2176

@@ -101,6 +103,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85

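Across these files the context (prefill) worker carries the same pattern of settings. A sketch with assumed nesting, using the values from the 48-way config above:

    worker_config:
      ctx:
        enable_layerwise_nvtx_marker: true
        max_batch_size: 4
        max_num_tokens: 4480
        max_seq_len: 2176
        print_iter_log: true
        cuda_graph_config: null          # CUDA graphs disabled on the prefill side
        disable_overlap_scheduler: true
        moe_config:
          backend: TRTLLM
        kv_cache_config:
          enable_block_reuse: false
          free_gpu_memory_fraction: 0.85
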
@@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true

@@ -81,10 +80,14 @@ worker_config:
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: DEFAULT

@@ -94,7 +97,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9423

@@ -109,6 +111,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: DEFAULT

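One knob that varies across the six configs is cache_transceiver_config: the file above uses the DEFAULT backend while other variants use NIXL, and max_tokens_in_buffer tracks the worker's max_num_tokens. A sketch with assumed nesting; the sizing comment is an observation from the values shown, not a documented rule:

    cache_transceiver_config:
      max_tokens_in_buffer: 8448   # matches max_num_tokens in this config
      backend: DEFAULT             # NIXL in the NIXL-based variants
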
@@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true

@@ -80,17 +79,20 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL
stream_interval: 20
num_postprocess_workers: 4
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9419

@@ -105,6 +107,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL

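Every file shares the same accuracy block, which, when enabled, drives a GSM8K evaluation through an lm-eval-style local-completions endpoint. A sketch of how those keys fit together, with assumed indentation:

    accuracy:
      enable_accuracy_test: false   # set to true to enable accuracy evaluation
      model: local-completions
      tasks: gsm8k
      model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
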
@@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true

@@ -80,10 +79,14 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL

@@ -93,7 +96,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9419

@@ -108,6 +110,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL

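Taken together, the six configs follow one top-level shape. A condensed skeleton, with nesting inferred from the hunk contexts (environment:, accuracy:, worker_config:) and therefore an assumption rather than a copy of any single file:

    environment:                  # cluster/runtime settings (contents fall outside these hunks)
    profiling:
      nsys_on: false
    accuracy:
      enable_accuracy_test: false
    worker_config:
      gen:                        # generation (decode) workers: wide-EP MoE, attention DP,
        enable_attention_dp: true #   KV-cache transfer via NIXL or DEFAULT
        # moe_config, nvfp4_gemm_config, cache_transceiver_config as in the hunks above
      ctx:                        # context (prefill) workers: small batches, CUDA graphs off,
        disable_overlap_scheduler: true  #   TRTLLM MoE backend
        # kv_cache_config, cache_transceiver_config as in the hunks above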