diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 3ced7b395f..09dc8b2267 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -1,391 +1,4 @@ -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased] -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaModel-bert/roberta-base] -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha_fp32_acc-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased] -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased] -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertModel-bert/bert-base-uncased] -examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] -examples/test_bindings.py::test_llm_bindings_example[llama-7b] -examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-disable_weight_only] -examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-enable_weight_only] -examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-enable_weight_only] -examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only] -examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only] -examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (180) -examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (180) -examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] -examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] -examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-enable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] TIMEOUT (90) -examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-disable_fp8] TIMEOUT (90) -examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:1] TIMEOUT (90) -examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:4] TIMEOUT (90) -examples/test_exaone.py::test_llm_exaone_1gpu[disable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:4] TIMEOUT (90) -examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_deep_2.4b-float16-nb:1] TIMEOUT (90) -examples/test_exaone.py::test_llm_exaone_2gpu[exaone_3.0_7.8b_instruct-float16-nb:1] TIMEOUT (90) -examples/test_gemma.py::test_llm_gemma_1gpu_summary[gemma-2-27b-it-other-bfloat16-8] -examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] -examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] -examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] -examples/test_gpt.py::test_llm_gpt2_medium_1gpu[non_streaming-use_py_session-disable_gemm_plugin] -examples/test_gpt.py::test_llm_gpt2_medium_1gpu[streaming-use_cpp_session-enable_gemm_plugin] -examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp1pp4] -examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp2pp2] -examples/test_gpt.py::test_llm_gpt2_medium_1node_4gpus[tp4pp1] -examples/test_gpt.py::test_llm_gpt2_medium_bad_words_1gpu[non_streaming-use_cpp_session] -examples/test_gpt.py::test_llm_gpt2_medium_stop_words_1gpu[streaming-use_cpp_session] -examples/test_gpt.py::test_llm_gpt2_multi_lora_1gpu[900_stories] -examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_cpp_session-tp1] -examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-1] -examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0] -examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2] -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct] -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct] -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] -examples/test_gpt.py::test_streaming_beam[batch_size_1-disable_return_all_generated_tokens-num_beams_1] -examples/test_gpt.py::test_streaming_beam[batch_size_1-disable_return_all_generated_tokens-num_beams_4] -examples/test_gpt.py::test_streaming_beam[batch_size_1-return_all_generated_tokens-num_beams_1] -examples/test_gpt.py::test_streaming_beam[batch_size_1-return_all_generated_tokens-num_beams_4] -examples/test_gpt.py::test_streaming_beam[batch_size_3-disable_return_all_generated_tokens-num_beams_1] -examples/test_gpt.py::test_streaming_beam[batch_size_3-disable_return_all_generated_tokens-num_beams_4] -examples/test_gpt.py::test_streaming_beam[batch_size_3-return_all_generated_tokens-num_beams_1] -examples/test_gpt.py::test_streaming_beam[batch_size_3-return_all_generated_tokens-num_beams_4] -examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] -examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] -examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] -examples/test_granite.py::test_granite_bf16_lora[granite-3.0-2b-instruct] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_logits-draft_len_8-float16-bs1] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-draft_len_4-float16-bs2] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_tokens-draft_len_8-float16-bs1] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_tokens-draft_len_4-float16-bs2] -examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-llama_v2-use_cpp_session-use_logits-draft_len_4-float16-bs2] -examples/test_draft_target_model.py::test_llm_draft_target_llama_1gpu -examples/test_draft_target_model.py::test_llm_draft_target_llama_fp8_2gpu -examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1] -examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] -examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1] -examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] -examples/test_internlm.py::test_llm_internlm2_7b_1node_1gpu[bfloat16-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:2] -examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct] -examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa] -examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa] -examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa] -examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Instruct-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:4] -examples/test_llama.py::test_llm_llama_code_llama_1gpu_summary[CodeLlama-7b-Instruct-enable_with_fp32_acc-enable_gemm_plugin-enable_attention_plugin-nb:1] -examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-34b-Instruct-tp4pp1-nb:4] -examples/test_llama.py::test_llm_llama_code_llama_multi_gpus_summary[CodeLlama-70b-hf-tp2pp2-nb:1] -examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp2pp2-int4_awq-nb:4] -examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1] -examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp2pp2-int4_awq-nb:1] -examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4] -examples/test_llama.py::test_codellama_fp8_with_bf16_lora[CodeLlama-7b-Instruct] -examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v2-7b-hf] -examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v3-8b-instruct-hf] -examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b] -examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-1b] -examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.2-3b] -examples/test_llama.py::test_llm_llama_long_alpaca_8gpu_summary[pg64317-tp8pp1-nb:1] -examples/test_llama.py::test_llm_llama_lookahead_single_gpu_summary[llama-3.1-8b] -examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b] -examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b] -examples/test_llama.py::test_llm_api_lookahead_decoding_1gpu[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] -examples/test_llama.py::test_llm_llama_v1_2gpu_summary[llama-7b-nb:4] -examples/test_llama.py::test_llm_llama_v1_4gpu_paged_kv_cache[llama-3.1-8b] -examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp16] -examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8] -examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_awq] -examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp16] -examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_fp8] -examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo] -examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb] -examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-enable_fp8] TIMEOUT (120) -examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-405b-fp8-disable_fp8] TIMEOUT (90) -examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8] -examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8] -examples/test_llama.py::test_llm_llama_v3_1m_long_context_8gpus[Llama-3-8B-Instruct-Gradient-1048k] TIMEOUT (180) -examples/test_llama.py::test_llm_llama_v3_dora_1gpu[commonsense-llama-v3-8b-dora-r32-llama-v3-8b-hf-base_fp16] -examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16] -examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba-1.4b-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba-130m-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba-2.8b-float16-disable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba-370m-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba-790m-float16-disable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-1.3b-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-2.7b-float16-disable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-370m-float16-enable_gemm_plugin] -examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-780m-float16-disable_gemm_plugin] -examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1] -examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8] -examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1] -examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8] -examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] -examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] -examples/test_mixtral.py::test_llm_mixtral_int4_awq_1gpu_summary[mixtral-8x7b-v0.1-AWQ] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] -examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] -# Multimodal Executor Cpp E2E Tests -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] -examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] -examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] - -examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] -examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] -examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp8] -examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16] -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16] -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16] -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16] -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] -examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora] -examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] -examples/test_qwen.py::test_llm_qwen1_5_moe_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] -examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen1.5_72b_chat-tp4pp2-context_fmha] -examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2_72b_instruct-tp8pp1-context_fmha_fp32_acc] -examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2.5_72b_chat-tp4pp2-context_fmha] -examples/test_qwen.py::test_llm_qwen_1node_8gpus_summary[qwen2.5_72b_chat-tp8pp1-context_fmha_fp32_acc] -examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen1.5_7b_chat-enable_gemm_plugin-enable_weight_only] -examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_7b_instruct-enable_gemm_plugin-enable_weight_only] -examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] -examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2.5_7b_chat-enable_gemm_plugin-enable_weight_only] -examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] -examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] -examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] -examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_0.5b_chat-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha] -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_0.5b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-disable_fmha] -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_7b_chat-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha] -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen1.5_7b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc] -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2_vl_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-disable_fmha] -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2_vl_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc] -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_0.5b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc] -examples/test_internlm.py::test_llm_internlm2_7b_1node_1gpu[bfloat16-enable_context_fmha-enable_gemm_plugin-enable_attention_plugin-nb:2] # 5 mins -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-disable_weight_only-enable_fmha_fp32_acc] -examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc] -examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen1.5_7b_chat-nb:4] -examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_7b_instruct-nb:4] -examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4] -examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2.5_7b_instruct-nb:4] -examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen1.5_14b_chat_int4-nb:4] -examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen1.5_7b_chat_awq-nb:1] -examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] -examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen1.5_7b_chat-enable_ptpc-nb:4] -examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2_7b_instruct-enable_ptpc-nb:4] -examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2_vl_7b_instruct-enable_ptpc-nb:4] -examples/test_qwen.py::test_llm_qwen_smooth_quant_single_gpu_summary[qwen2.5_7b_instruct-enable_ptpc-nb:4] -examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] -examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2_0.5b_instruct] -examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_0.5b_instruct] -examples/test_qwen.py::test_llm_hf_qwen_multi_lora_1gpu[qwen2.5_1.5b_instruct] -examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] -examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-flax-no_paged_cache-disable_quant-float16-enable_attn_plugin-disable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] -examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8] -examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8] -examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime] -examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] -examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int8-float16-nb:1-use_cpp_runtime] -examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int4-float16-nb:1-use_cpp_runtime] - -# Accuracy test list -accuracy/test_cli_flow.py::TestGpt2::test_auto_dtype -accuracy/test_cli_flow.py::TestGpt2::test_gemm_plugin -accuracy/test_cli_flow.py::TestGpt2::test_attention_ootb -accuracy/test_cli_flow.py::TestGpt2::test_context_fmha_disabled -accuracy/test_cli_flow.py::TestGpt2::test_context_fmha_fp32_acc -accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int8] -accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] -accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache -accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=False-per_channel=False] -accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=True-per_channel=True] -accuracy/test_cli_flow.py::TestGpt2::test_beam_search -accuracy/test_cli_flow.py::TestGpt2::test_beam_search_large -accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search -accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb -accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin -accuracy/test_cli_flow.py::TestGpt2::test_cuda_graph -accuracy/test_cli_flow.py::TestGpt2Medium::test_auto_dtype -accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8 -accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8_lm_head -accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype -accuracy/test_cli_flow.py::TestStarcoder2_3B::test_auto_dtype -accuracy/test_cli_flow.py::TestStarcoder2_15B::test_smooth_quant_ootb -accuracy/test_cli_flow.py::TestGptNext::test_auto_dtype -accuracy/test_cli_flow.py::TestMinitron4BBase::test_auto_dtype -accuracy/test_cli_flow.py::TestMinitron4BBase::test_fp8 -accuracy/test_cli_flow.py::TestPhi2::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi2::test_tp2 -accuracy/test_cli_flow.py::TestPhi3Mini4kInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3Mini128kInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype -accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive -accuracy/test_cli_flow.py::TestMamba130M::test_auto_dtype -accuracy/test_cli_flow.py::TestVicuna7B::test_lookahead -accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] -accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] -accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=False-chunked_context=False-typical_acceptance=False] -accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=False] -accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=True-typical_acceptance=False] -accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True] -accuracy/test_cli_flow.py::TestLlama7B::test_auto_dtype -accuracy/test_cli_flow.py::TestLlama7B::test_beam_search -accuracy/test_cli_flow.py::TestLlama7B::test_int4_gptq -accuracy/test_cli_flow.py::TestLlama7B::test_streamingllm -accuracy/test_cli_flow.py::TestLlama2_7B::test_auto_dtype -accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant -accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 -accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[tp2] -accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[pp2] -accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2] -accuracy/test_cli_flow.py::TestLlama2_7B::test_tp2cp2 -accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_plugin -accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_gemm_swiglu_plugin -accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_low_latency_gemm_plugin -accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2 -accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_tp2 -accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_awq_prequantized_tp2 -accuracy/test_cli_flow.py::TestLlama2_7B::test_int4_gptq_prequantized_tp2 -accuracy/test_cli_flow.py::TestLlama2_7B::test_weight_sparsity -accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_float32 -accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only[int8] -accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only[int4] -accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8] -accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_pp4 -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_fp8 -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4 -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-disable_fused_quant] -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-enable_fused_quant] -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-disable_fused_quant] -accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-enable_fused_quant] -accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context -accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_auto_dtype -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8 -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin] -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin] -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin] -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[enable_gemm_allreduce_plugin] -accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq -accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized -accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_medusa_fp8_prequantized -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_auto_dtype -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_smooth_quant -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_smooth_quant_ootb -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_int4_awq_manage_weights -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8 -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_pp2 -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_fp8_rowwise -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_weight_streaming[1.0] -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache -accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache_beam_search -accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized -accuracy/test_llm_api.py::TestLlama3_2_1B::test_auto_dtype -accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant -accuracy/test_llm_api.py::TestLlama3_2_1B::test_smooth_quant_ootb -accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq -accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache -accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2 -accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_rowwise -accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized -accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4 -accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4 -accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2 -accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights -accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin -accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int4_tp2 -accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int8_tp2 -accuracy/test_cli_flow.py::TestMixtral8x7B::test_pp_reduce_scatter_tp2pp2 -accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[expert_parallel] -accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[mixed_parallel] -accuracy/test_cli_flow.py::TestMixtral8x7B::test_ootb_except_mha_tp8[tensor_parallel] -accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[no_renormalize-tensor_parallel] -accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-expert_parallel] -accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-mixed_parallel] -accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-tensor_parallel] -accuracy/test_cli_flow.py::TestMixtral8x7B::test_nvfp4_prequantized -accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2 TIMEOUT (120) -accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel] TIMEOUT (90) -accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype -accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] -accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] -accuracy/test_cli_flow.py::TestQwen1_5MoeA2_7BChat::test_auto_dtype -accuracy/test_cli_flow.py::TestQwen1_5MoeA2_7BChat::test_weight_only -accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_auto_dtype -accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_weight_only -accuracy/test_cli_flow.py::TestQwen2_0_5BInstruct::test_fp8 -accuracy/test_cli_flow.py::TestQwen2_1_5B::test_auto_dtype_cp4 -accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_auto_dtype -accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_weight_only -accuracy/test_cli_flow.py::TestQwen2_7BInstruct::test_int4_awq_prequantized -accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4 -accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2 -accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] -accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] -accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph -accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype -accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only -accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise -accuracy/test_llm_api.py::TestQwen2_7BInstruct::test_fp8 -accuracy/test_llm_api.py::TestQwen2_5_0_5BInstruct::test_fp8 -accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_fp8 -accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8 -accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache -accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2 -accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2 -accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 +# text generation accuracy test accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] @@ -418,6 +31,10 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True] accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False] @@ -428,14 +45,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse -accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar] -accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized -accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] @@ -464,8 +73,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2 -accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True] @@ -492,9 +99,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[xgrammar-mtp_nextn=2] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=0] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding_4gpus[llguidance-mtp_nextn=2] -accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized -accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 -accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] @@ -522,13 +126,12 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[disable accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_pp4_mtp1] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency] -accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput] -accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm] -accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] -accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] +accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] @@ -536,40 +139,18 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] -accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] -accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp2tp1cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp2tp1cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp2tp1cp2] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2] -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto] @@ -621,12 +202,98 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_m accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] +accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8 +accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2 +accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2 +accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 +accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse +accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar] +accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 +accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput] +accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm] +accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] +accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] +accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] +accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] +accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus] +accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 +accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 +accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8 +accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2 +accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_fp8 +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True] +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-True] +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False] +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False] +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-False] +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False] +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True] +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True] +accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus + +# multimodal accuracy tests +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestNano_V2_VLM::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype +accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized +accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype + +# disaggregated serving accuracy test +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:without_padding-pp2tp1cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp2tp1cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:without_padding-pp2tp1cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] +accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] +accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] +accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True] @@ -653,63 +320,12 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True] accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] -accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4 -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] -accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] -accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] -accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus] -accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8 -accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 -accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope -accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 -accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 -accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8 -accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend -accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2 -accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestNemotronV3Nano::test_fp8 -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-True] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-False] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-False] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-False] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus +accuracy/test_disaggregated_serving.py::TestKimiK2::test_nvfp4 -accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestNano_V2_VLM::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype -accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized -accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype - -test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-] +# e2e test test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-] test_e2e.py::test_llama_e2e[use_py_session--] -llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-codellama/CodeLlama-7b-Instruct-hf] # 5min -llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-models/llama-7b-hf] # 5min -test_e2e.py::test_mistral_e2e[use_cpp_session-remove_input_padding--] test_e2e.py::test_mistral_e2e[use_py_session-remove_input_padding--] test_e2e.py::test_mistral_e2e[use_py_session---] test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B] @@ -720,9 +336,6 @@ test_e2e.py::test_openai_chat_harmony test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B] test_e2e.py::test_trtllm_benchmark_serving[gpt_oss/gpt-oss-20b] test_e2e.py::test_trtllm_multimodal_benchmark_serving - -llmapi/test_llm_examples.py::test_llmapi_server_example -# Pivot to Pytorch test cases. test_e2e.py::test_ptp_quickstart test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] @@ -766,47 +379,9 @@ test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B- test_e2e.py::test_eagle3_output_consistency_4gpus[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-Llama-4-Maverick-17B-128E-Eagle3] test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-Qwen3/qwen3-235B-eagle3] unittest/llmapi/test_llm_pytorch.py::test_gemma3_1b_instruct_multi_lora -examples/test_medusa.py::test_codellama_medusa_1gpu[CodeLlama-7b-Instruct] - -examples/test_medusa.py::test_qwen_medusa_1gpu[qwen_7b_chat] -examples/test_medusa.py::test_qwen_medusa_1gpu[qwen1.5_7b_chat] -examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2_7b_instruct] -examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2_0.5b_instruct] -examples/test_medusa.py::test_qwen_medusa_1gpu[qwen2.5_1.5b_instruct] -examples/test_medusa.py::test_phi_medusa_1gpu[phi-2] -examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3-mini-128k-instruct] -examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3-small-128k-instruct] -examples/test_medusa.py::test_phi_medusa_1gpu[Phi-3.5-mini-instruct] -examples/test_medusa.py::test_phi_medusa_1gpu[Phi-4-mini-instruct] -examples/test_eagle.py::test_codellama_eagle_1gpu[CodeLlama-7b-Instruct-eagle1] -examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle1] -examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle1] -examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle1] - -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle1] -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle1] -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle1] -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle1] -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle1] -examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle1] -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle1] -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle1] -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle1] -examples/test_eagle.py::test_codellama_eagle_1gpu[CodeLlama-7b-Instruct-eagle2] -examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle2] -examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle2] -examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle2] - -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle2] -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle2] -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle2] -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle2] -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle2] -examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle2] -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle2] -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2] -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2] +llmapi/test_llm_examples.py::test_llmapi_server_example +# e2e serve test examples/serve/test_serve.py::test_config_file_loading[--extra_llm_api_options] examples/serve/test_serve.py::test_config_file_loading[--config] examples/serve/test_serve.py::test_env_overrides_pdl @@ -827,8 +402,7 @@ examples/serve/test_serve_negative.py::test_malformed_json_request examples/serve/test_serve_negative.py::test_missing_content_type_header examples/serve/test_serve_negative.py::test_extremely_large_batch - -# PyTorch flow disaggregated tests +# e2e disaggregated serving test disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 3d2f6fde86..75282580e0 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -1,39 +1,53 @@ -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend -accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] -accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] -accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized -accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp1] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] -accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False] -accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True] -accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] -accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] -accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8 +# text generation accuracy test +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM] +accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0-moe_backend=WIDEEP] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False-moe_backend=WIDEEP] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=WIDEEP] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] @@ -61,20 +75,28 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[disable accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_pp4_mtp1] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0-moe_backend=WIDEEP] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False-moe_backend=WIDEEP] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=WIDEEP] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3 +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-fp8] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton-auto] @@ -122,117 +144,29 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-one_model-no_overlap_scheduler] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-overlap_scheduler] accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[triton-two_model-no_overlap_scheduler] -accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] -accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] -accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=True-sampler_async_worker=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search[enable_cuda_graph=True-enable_padding=True-disable_overlap_scheduler=False-sampler_async_worker=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized -accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2[torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep2-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp4ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep2-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp4ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep2-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp4ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] -accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 -accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 -accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2 +accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 +accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized +accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 +accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] +accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[4gpus] +accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4[8gpus] +accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] -accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] -accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3 -accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype +# multimodal accuracy tests accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype @@ -240,28 +174,41 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] -disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] -disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin] -disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] -disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] -disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] +# disaggregated serving accuracy test +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=False] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] +accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] +accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=2] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp2] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp1] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] +accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False] +accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True] +accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] +accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend +accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] +accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] + +# e2e test test_e2e.py::test_openai_chat_harmony test_e2e.py::test_openai_consistent_chat test_e2e.py::test_openai_multi_chat_example @@ -291,3 +238,27 @@ test_e2e.py::test_trtllm_multimodal_benchmark_serving test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf-Qwen3/qwen3-235B-eagle3] test_e2e.py::test_eagle3_output_consistency_4gpus[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-Llama-4-Maverick-17B-128E-Eagle3] test_e2e.py::test_eagle3_output_consistency_4gpus[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-Qwen3/qwen3-235B-eagle3] + +# e2e disaggregated serving test +disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] +disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] +disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin] +disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] +disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] +disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] diff --git a/tests/integration/test_lists/qa/llm_function_rtx6k.txt b/tests/integration/test_lists/qa/llm_function_rtx6k.txt index 51cff78ff3..750011c726 100644 --- a/tests/integration/test_lists/qa/llm_function_rtx6k.txt +++ b/tests/integration/test_lists/qa/llm_function_rtx6k.txt @@ -179,7 +179,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] -accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[xgrammar-eagle3_one_model=True] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 811cb07edd..1352c282ae 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -1,6 +1,5 @@ examples/test_openai.py::test_llm_openai_triton_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654) examples/test_openai.py::test_llm_openai_triton_plugingen_1gpu SKIP (https://nvbugspro.nvidia.com/bug/4963654) -full:GH200/examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] SKIP (arm is not supported) full:GH200/examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (arm is not supported) full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec] SKIP (arm is not supported) full:GH200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (arm is not supported) @@ -13,15 +12,10 @@ full:GH200/examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int perf/test_perf.py::test_perf[t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) perf/test_perf.py::test_perf[flan_t5_base-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) perf/test_perf.py::test_perf[bart_large_cnn-plugin-float16-bs:8-input_output_len:60,20] SKIP # (https://nvidia.slack.com/archives/C059LSY62BT/p1704525727177449) -accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2 SKIP (not supported yet) full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514) -full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514) -full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/4731514) -examples/test_qwen.py::test_llm_qwen1_5_moe_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/4781396) perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:512,200-quant:fp8-tp:4] SKIP (SKIP due to timeout of quantization) perf/test_perf.py::test_perf[llama_v3.1_70b-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-quant:fp8-gpus:8] SKIP (SKIP due to timeout of quantization) cpp/test_e2e.py::test_model[-encoder-90] SKIP (waive Encoder-only test because it doesn't take batched input) -full:L40S/examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] SKIP (skip on L40S commit f9a0fcb0) full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugspro.nvidia.com/bug/4979955) examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5014327) examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec] SKIP (https://nvbugs/5000026) @@ -31,7 +25,6 @@ examples/test_nemotron.py::test_llm_nemotron_4_15b_1gpu[bfloat16-full_prec] SKIP examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-fp8] SKIP (https://nvbugs/5000026) examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-full_prec] SKIP (https://nvbugs/5000026) examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int4_awq] SKIP (https://nvbugs/5000026) -examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5000026) examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931) examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624) examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371) @@ -47,42 +40,15 @@ full:sm100/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blac full:sm100/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell) full:sm100/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96) full:sm100/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12) -full:sm100/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin SKIP (Disable for Blackwell OOM) full:sm100/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM) full:sm100/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12) -examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] SKIP (https://nvbugs/5064768) test_e2e.py::test_openai_consistent_chat SKIP (https://nvbugs/5112075) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle1] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle2] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle2] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle2] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_0.5b_instruct-eagle2] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2.5_1.5b_instruct-eagle2] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle2] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle2] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2] SKIP (https://nvbugs/5206383) -examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2] SKIP (https://nvbugs/5206383) examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp16] SKIP (https://nvbugs/5114678) -examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] SKIP (https://nvbugs/5135328) -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5141288) -examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/5155141) full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851) -full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851) -full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851) full:L40S/accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8 SKIP (https://nvbugs/5176867) full:L40S/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2 SKIP (https://nvbugs/5176867) full:L40S/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights SKIP (https://nvbugs/5176867) full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851) -full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851) -full:L20/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851) full:B200/perf/test_perf.py::test_perf[quant:w4a8_awq] SKIP (https://nvbugspro.nvidia.com/bug/5161074) full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_tensor] SKIP (https://nvbugspro.nvidia.com/bug/5161074) full:B200/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074) @@ -106,10 +72,6 @@ examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-rec examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221) examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5222697) -examples/test_eagle.py::test_llama_eagle_1gpu[llama-v2-7b-hf-eagle1] SKIP (https://nvbugs/5219535) -examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.2-1b-eagle1] SKIP (https://nvbugs/5219535) -examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle1] SKIP (https://nvbugs/5219535) -examples/test_eagle.py::test_llama_eagle_1gpu[llama-3.1-8b-eagle2] SKIP (https://nvbugs/5219535) examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339) perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477 perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP @@ -132,15 +94,10 @@ full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[deepseek full:B200/perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float16-input_output_len:128,128-quant:fp8] SKIP (https://nvbugspro.nvidia.com/bug/5150255) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5232405) accuracy/test_cli_flow.py::TestLlama3_2_1B::test_cyclic_kv_cache SKIP (https://nvbugs/5231310) -examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-disable_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5234058) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5234058) examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:2-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5234058) -full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) -full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) -full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) -accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel] SKIP (https://nvbugs/5273695) examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570) triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060) triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP @@ -186,33 +143,19 @@ examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb SKIP (https://nvbugs/5338552) unittest/llmapi/test_llm_multi_gpu.py -m "gpu4 and part0" SKIP (https://nvbugs/5348958) accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar] SKIP (https://nvbugs/5346443) -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936) -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936) -examples/test_llama.py::test_llm_llama_1gpu_streaming_llm[ailab-deepseek-coder-6.7b-instruct] SKIP (https://nvbugs/5435714) test_e2e.py::test_openai_multinodes_chat_tp16pp1 SKIP (https://nvbugs/5112075) -examples/test_qwen.py::test_llm_hf_qwen_quantization_1gpu[qwen2_vl_7b_instruct-fp8-bfloat16] SKIP (https://nvbugs/5322488) accuracy/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5234043) full:L40S/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620) full:L20/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620) test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] SKIP (https://nvbugs/5380570) test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-8] SKIP (https://nvbugs/5380570) examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992) -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422) -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233) examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233) test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5409420) -examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5141288) -examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5419067) -examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4] SKIP (https://nvbugs/5419068) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5419070) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] SKIP (https://nvbugs/5421989) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForSequenceClassification-bert/twitter-roberta-base-emotion] SKIP (https://nvbugs/5421989) examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5431132) -accuracy/test_llm_api.py::TestLlama3_2_1B::test_int4_awq_int8_kv_cache SKIP (https://nvbugs/5433541) -accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2 SKIP (https://nvbugs/5433541) examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451) -examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451) examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451) examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636) accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive SKIP (https://nvbugs/5444627) @@ -226,14 +169,6 @@ triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (http triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624) triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343) triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482) -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct] SKIP (https://nvbugs/5465143) -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] SKIP (https://nvbugs/5465143) -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct] SKIP (https://nvbugs/5465143) -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-4-mini-instruct] SKIP (https://nvbugs/5465143) -full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) -full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) -full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5359696) -full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837) accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl SKIP (https://nvbugs/5413362) examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5431146) triton_server/test_triton.py::test_python_bls_unit_tests[python-bls-unit-tests] SKIP (https://nvbugs/5477392) @@ -241,17 +176,6 @@ triton_server/test_triton.py::test_mistral_ib[mistral-ib] SKIP (https://nvbugs/5 triton_server/test_triton.py::test_eagle[eagle] SKIP (https://nvbugs/5477378) examples/test_mixtral.py::test_llm_mixtral_moe_plugin_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] SKIP (https://nvbugs/5477421) test_e2e.py::test_openai_chat_example[trt] SKIP (https://nvbugs/5477444) -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5448462) -examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5448462) -examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5448479) -examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp8] SKIP (https://nvbugs/5465143) -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16] SKIP (https://nvbugs/5465143) -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16] SKIP (https://nvbugs/5465143) -examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-MoE-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143) -accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5465143, 5481206 WNF) -accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2 SKIP (https://nvbugs/5465143, 5481206 WNF) -accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbugs/5481075) -accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143, 5481206 WNF) accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5738168) test_e2e.py::test_trtllm_bench_iteration_log[TRT-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B] SKIP (https://nvbugs/5448523) accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype SKIP (https://nvbugs/5520319) @@ -260,10 +184,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] SKIP (https://nvbugs/5546507) examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] SKIP (https://nvbugs/5546507) examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] SKIP (https://nvbugs/5546507) -examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1] SKIP (https://nvbugs/5546507) -examples/test_ngram.py::test_llm_ngram_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507) -examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs1] SKIP (https://nvbugs/5546507) -examples/test_ngram.py::test_llm_ngram_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-max_draft_len_8-float16-bs2] SKIP (https://nvbugs/5546507) cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5550689) cpp/test_e2e.py::test_benchmarks[bart-90] SKIP (https://nvbugs/5550689) full:H20/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) @@ -278,26 +198,9 @@ full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_f full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5574553) full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) full:H20-3e/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5574553) -full:GB200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1] SKIP (https://nvbugs/5568052) -full:GB200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4] SKIP (https://nvbugs/5568052) -full:GB200/examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8] SKIP (https://nvbugs/5568052) -full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052) -full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052) -full:GB200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052) -full:B200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp4pp1-fp8-nb:1] SKIP (https://nvbugs/5568052) -full:B200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp4pp1-fp8-nb:4] SKIP (https://nvbugs/5568052) -full:B200/examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp8] SKIP (https://nvbugs/5568052) -full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-enable_reduce_fusion-enable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052) -full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-v2-13b-hf-disable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052) -full:B200/examples/test_llama.py::test_llm_llama_2gpu_fp8_summary[llama-7b-enable_reduce_fusion-disable_fp8_context_fmha_xqa] SKIP (https://nvbugs/5568052) -accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin SKIP (https://nvbugs/5451207) -accuracy/test_cli_flow.py::TestMixtral8x22B::test_fp8_tp2pp2 SKIP (https://nvbugs/5511944) triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] SKIP (https://nvbugs/5470830) full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_variable_beam_width_search SKIP (https://nvbugs/5481075) -full:L40S/accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_plugin SKIP (https://nvbugs/5568052) full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] SKIP (https://nvbugs/5596337) -accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2 SKIP (https://nvbugs/5598847) -examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-MoE-instruct] SKIP (https://nvbugs/5465143) unittest/llmapi/test_memory_profiling.py::test_profile_kvcache SKIP (https://nvbugs/5580781) triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414) full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696) @@ -317,14 +220,8 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-f examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684) accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697) accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697) -examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441) accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441) -examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) -examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) -examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) -examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) -examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5655832) test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5647825) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] SKIP (https://nvbugs/5664904) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] SKIP (https://nvbugs/5664904) @@ -335,7 +232,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5673610) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] SKIP (https://nvbugs/5756804) -examples/test_qwen.py::test_llm_qwen_int4_single_gpu_summary[qwen2.5_14b_instruct_int4-nb:4] SKIP (https://nvbugs/5666826) examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16] SKIP (https://nvbugs/5451216) accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype SKIP (https://nvbugs/5588376) accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5673527) @@ -387,34 +283,18 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_tr accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304) unittest/_torch/multi_gpu/test_allreduce.py::test_allreduce_fusion_patterns[2-residual_rms_norm_out_quant_fp8-hidden:7168-seqlen:8192] SKIP (https://nvbugs/5741392) unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476) -examples/test_phi.py::test_phi_fp8_with_bf16_lora[phi-2] SKIP (https://nvbugs/5744293) -examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1] SKIP (https://nvbugs/5744293) -examples/test_phi.py::test_llm_phi_quantization_1gpu[phi-2-fp8-bfloat16] SKIP (https://nvbugs/5744293) +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5740377) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5740377) test_e2e.py::test_trtllm_bench_llmapi_launch[pytorch_backend-llama-v3-llama3-8b] SKIP (https://nvbugs/5744432) test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5747920) test_e2e.py::test_trtllm_serve_example SKIP (https://nvbugs/5747938) triton_server/test_triton.py::test_opt[opt] SKIP (https://nvbugs/5739981) unittest/llmapi/test_llm_pytorch.py::test_tinyllama_logits_processor[False] SKIP (https://nvbugs/5771838) unittest/llmapi/test_llm_pytorch.py::test_tinyllama_logits_processor[True] SKIP (https://nvbugs/5771838) -accuracy/test_cli_flow.py::TestPhi2::test_auto_dtype SKIP (https://nvbugs/5744293) -accuracy/test_cli_flow.py::TestPhi2::test_tp2 SKIP (https://nvbugs/5744293) -accuracy/test_cli_flow.py::TestPhi3Mini4kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293) -accuracy/test_cli_flow.py::TestPhi3Mini128kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293) -accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293) -accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype SKIP (https://nvbugs/5744293) -accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5744293) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377) cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979) -examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-RobertaForQuestionAnswering-bert/roberta-base-squad2] SKIP (https://nvbugs/5608979) -examples/test_bert.py::test_llm_bert_general[compare_hf-disable_remove_input_padding-use_attention_plugin-disable_context_fmha-tp:2-pp:1-float16-BertForQuestionAnswering-bert/bert-base-cased-squad2] SKIP (https://nvbugs/5608979) examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979) examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979) -examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) -examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) -examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) -examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2.5_7b_chat-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5754976) -examples/test_qwenvl.py::test_llm_qwenvl_single_gpu_summary[qwen-vl-chat] SKIP (https://nvbugs/5754976) -examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-int8-float16-nb:1-use_cpp_runtime] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype SKIP (https://nvbugs/5588376) unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698) triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118) @@ -492,11 +372,9 @@ unittest/_torch/ray_orchestrator/multi_gpu/test_multi_instance.py::test_multi_in disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin] SKIP (https://nvbugs/5776445) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[one_model] SKIP (https://nvbugs/5756028) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_vswa_reuse_4gpus[two_model] SKIP (https://nvbugs/5756028) -examples/test_gpt.py::test_llm_gpt2_parallel_embedding_2gpu[float16-0] SKIP (https://nvbugs/5784518) +accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] SKIP (https://nvbugs/5785206) accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5785465) accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 SKIP (https://nvbugs/5785485) -examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5787855) -examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b] SKIP (https://nvbugs/5787855) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2] SKIP (https://nvbugs/5787836) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] SKIP (https://nvbugs/5787836) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2] SKIP (https://nvbugs/5787836) @@ -515,4 +393,3 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (http accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892) accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5791839) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377)