[None][feat] Add support for DeepSeek v3.2 tests (#10561)

Signed-off-by: yingguo-trt <244492186+yingguo-trt@users.noreply.github.com>
This commit is contained in:
yingguo-trt 2026-01-09 23:20:29 +08:00 committed by GitHub
parent 7295af68ba
commit d80f01d205
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 62 additions and 38 deletions

View File

@ -43,13 +43,12 @@ environment:
profiling:
nsys_on: false
accuracy:
enable_accuracy_test: false
enable_accuracy_test: false # Set to true to enable accuracy evaluation
model: local-completions
tasks: gsm8k
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
@ -80,17 +79,20 @@ worker_config:
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 4608
backend: NIXL
backend: NIXL
stream_interval: 20
num_postprocess_workers: 4
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4608
max_seq_len: 2251
@ -101,6 +103,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85

View File

@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
@ -80,10 +79,14 @@ worker_config:
free_gpu_memory_fraction: 0.9
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 4608
backend: NIXL
@ -93,7 +96,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4608
max_seq_len: 2251
@ -104,6 +106,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85

View File

@ -50,7 +50,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 48
moe_expert_parallel_size: 48
enable_attention_dp: true
@ -81,16 +80,19 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8320
backend: DEFAULT
stream_interval: 20
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 4
max_num_tokens: 4480
max_seq_len: 2176
@ -101,6 +103,8 @@ worker_config:
print_iter_log: true
cuda_graph_config: null
disable_overlap_scheduler: true
moe_config:
backend: TRTLLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.85

View File

@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
@ -81,10 +80,14 @@ worker_config:
free_gpu_memory_fraction: 0.6
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: DEFAULT
@ -94,7 +97,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9423
@ -109,6 +111,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: DEFAULT

View File

@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 16
moe_expert_parallel_size: 16
enable_attention_dp: true
@ -80,17 +79,20 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL
stream_interval: 20
num_postprocess_workers: 4
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9419
@ -105,6 +107,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL

View File

@ -49,7 +49,6 @@ accuracy:
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
@ -80,10 +79,14 @@ worker_config:
free_gpu_memory_fraction: 0.7
dtype: fp8
moe_config:
backend: WIDEEP
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
backend: CUTEDSL
use_low_precision_moe_combine: true
nvfp4_gemm_config:
allowed_backends:
- cutlass
- cublaslt
- cutedsl
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL
@ -93,7 +96,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 9419
@ -108,6 +110,8 @@ worker_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.75
dtype: fp8
moe_config:
backend: TRTLLM
cache_transceiver_config:
max_tokens_in_buffer: 8448
backend: NIXL