Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

commit ff749bc0ef

Merge branch 'main' into fix_spec_gate

Signed-off-by: Zheyu Fu <zheyuf@nvidia.com>
@@ -10,7 +10,7 @@ tiktoken
einops

# optional dependencies
gradio==4.44.1
gradio==5.4.0
mdtex2html
sse_starlette
aiohttp_sse_client

@@ -40,6 +40,14 @@ TEST_LIST_PATH = (
    REPO_ROOT / "tests" / "integration" / "test_lists" / "qa" / "llm_config_database.yml"
)
ITERATIONS = 10
# Mapping from HuggingFace model IDs to MODEL_PATH_DICT keys used by the test framework
# in tests/integration/defs/perf/test_perf_sanity.py
MODEL_NAME_MAPPING = {
    "deepseek-ai/DeepSeek-R1-0528": "deepseek_r1_0528_fp8",
    "nvidia/DeepSeek-R1-0528-FP4-v2": "deepseek_r1_0528_fp4_v2",
    "openai/gpt-oss-120b": "gpt_oss_120b_fp4",
}
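As a quick illustration of how this table is meant to chain into MODEL_PATH_DICT on the test side (the path below is a made-up placeholder, not a real checkpoint location):

# HuggingFace ID taken from a recipe -> short key used by the perf-sanity framework
key = MODEL_NAME_MAPPING["openai/gpt-oss-120b"]   # -> "gpt_oss_120b_fp4"

# tests/integration/defs/perf/test_perf_sanity.py then resolves that key to an
# on-disk checkpoint via MODEL_PATH_DICT, roughly like this:
MODEL_PATH_DICT = {"gpt_oss_120b_fp4": "/models/gpt-oss-120b-fp4"}  # placeholder path
model_path = MODEL_PATH_DICT[key]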


# GPU type to condition wildcards mapping for test list
# Note: cpu is used to distinguish between e.g. H200_SXM and GH200
@@ -65,9 +73,13 @@ def generate_client_name(recipe: Recipe) -> str:

def recipe_to_server_config(recipe: Recipe, llm_api_config: dict) -> dict:
    """Convert a recipe + LLM API config to aggr_server format."""
    model_name = MODEL_NAME_MAPPING.get(recipe.model)
    if not model_name:
        raise ValueError(f"Model not found in MODEL_NAME_MAPPING: {recipe.model}")

    server_config = {
        "name": generate_server_name(recipe),
        "model_name": recipe.model,
        "model_name": model_name,
        "gpus": recipe.num_gpus,
        # Enable scenario-only matching for baseline comparison
        "match_mode": "scenario",
@@ -157,7 +169,7 @@ def generate_condition_entry(
    }

    tests = [
        f"perf/test_perf.py::test_perf[perf_sanity_upload-{config_name}-{name}]"
        f"perf/test_perf_sanity.py::test_e2e[aggr_upload-{config_name}-{name}]"
        for name in server_names
    ]
    return {"condition": condition, "tests": tests}
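To make the renamed parametrization concrete, this is the kind of entry the comprehension above now emits for one server name (example values only; they mirror the entries in the QA test list further down):

config_name = "config_database_b200_nvl"            # example config name
name = "openai_gpt_oss_120b_1024_1024_conc4_gpu1"   # example server name
test_id = f"perf/test_perf_sanity.py::test_e2e[aggr_upload-{config_name}-{name}]"
# -> perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1]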

@@ -560,7 +560,7 @@ class ReportUtility:
    else:
        backend_info = (
            "\n\n===========================================================\n"
            "= PYTORCH BACKEND\n"
            f"= {self.rt_cfg.backend.upper()} BACKEND\n"
            "===========================================================\n"
            f"Model:\t\t\t{engine['model']}\n"
            f"Model Path:\t\t{engine['model_path']}\n"

@@ -207,7 +207,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
        kwargs = self.get_default_kwargs()
        # TODO: multi-stream MOE seems to increase the memory usage
        kwargs["max_batch_size"] = 32
        kwargs["free_mem_ratio"] = 0.5
        kwargs["free_mem_ratio"] = 0.4
        sampling_params = self.get_default_sampling_params()
        with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                           tokenizer=self.MODEL_PATH_BF16,

@@ -226,9 +226,9 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
            # Manually set quant_config for FP8 model to get the accuracy threshold
            llm.args.quant_config.quant_algo = QuantAlgo.FP8
            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8

            # task = MMLU(self.MODEL_NAME)
            # task.evaluate(llm, sampling_params=sampling_params)
            sampling_params = self.get_default_sampling_params()
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm, sampling_params=sampling_params)
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)

@@ -260,6 +260,7 @@ class TestQwen3VL_MOE(LlmapiAccuracyTestHarness):
        max_tokens=MAX_NUM_TOKENS, truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, stop="<|endoftext|>"
    )

    @pytest.mark.skip_less_device_memory(140000)
    def test_auto_dtype(self):
        with LLM(
            self.MODEL_PATH,

@@ -81,6 +81,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass
@@ -89,7 +92,7 @@ worker_config:
- cuda_core
cache_transceiver_config:
max_tokens_in_buffer: 4608
backend: NIXLf
backend: NIXL
stream_interval: 20
num_postprocess_workers: 4
ctx:

@@ -81,6 +81,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

@@ -82,6 +82,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

@@ -82,6 +82,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

@@ -81,6 +81,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

@@ -81,6 +81,9 @@ worker_config:
moe_config:
backend: CUTEDSL
use_low_precision_moe_combine: true
load_balancer:
num_slots: 288
layer_updates_per_iter: 1
nvfp4_gemm_config:
allowed_backends:
- cutlass

@@ -78,6 +78,7 @@ PRE_MERGE_THRESHOLD = 0.1
# scenario, allowing the underlying config to change while still comparing against baselines
# for the same scenario.
SCENARIO_MATCH_FIELDS = [
    "s_gpu_type",
    "s_runtime",
    "s_model_name",
    "l_isl",

@@ -58,6 +58,7 @@ MODEL_PATH_DICT = {
}

SUPPORTED_GPU_TYPE = [
    "H200",
    "B200",
    "B300",
    "GB200",
@@ -226,6 +227,7 @@ class ServerConfig:
            "gpus_per_node",
            "match_mode",
            "client_configs",
            "match_mode",
        ]
        self.extra_llm_api_config_data = {
            k: v for k, v in server_config_data.items() if k not in exclude_keys
@@ -520,7 +522,9 @@ class AggrTestCmds(NamedTuple):
        )

        wait_for_endpoint_ready(
            f"http://{server_hostname}:{server_port}/health", timeout=self.timeout
            f"http://{server_hostname}:{server_port}/health",
            timeout=self.timeout,
            server_proc=server_proc,
        )

        # Run all clients for this server
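The added server_proc argument lets the readiness wait abort as soon as the server process dies instead of spinning until the timeout. A minimal sketch of that behaviour, assuming a helper along these lines (illustrative, not the repository's wait_for_endpoint_ready):

import subprocess
import time
import urllib.request


def wait_until_healthy(url: str, timeout: float, server_proc: subprocess.Popen) -> None:
    """Poll `url` until it answers, the timeout expires, or the server process exits."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if server_proc.poll() is not None:  # server died; no point in waiting further
            raise RuntimeError(f"server exited early with code {server_proc.returncode}")
        try:
            with urllib.request.urlopen(url, timeout=5):
                return  # endpoint responded; server is ready
        except OSError:
            time.sleep(2)  # not up yet; retry shortly
    raise TimeoutError(f"{url} not ready within {timeout} s")
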
@@ -1321,11 +1325,11 @@ class PerfSanityTestConfig:
                cmd_idx += 1

        if not match_keys:
            match_keys.extend(["s_gpu_type", "s_runtime"])
            if server_config.match_mode == "scenario":
                match_keys = SCENARIO_MATCH_FIELDS.copy()
                is_scenario_mode = True
            else:
                match_keys.extend(["s_gpu_type", "s_runtime"])
            match_keys.extend(server_config.to_match_keys())
            match_keys.extend(client_config.to_match_keys())
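Put together, scenario mode replaces the per-config match keys with the fixed SCENARIO_MATCH_FIELDS, so a baseline is looked up purely by scenario. A rough sketch of how such a key could be assembled (field values invented for illustration; the real field list is longer than the excerpt shown earlier):

# Subset of SCENARIO_MATCH_FIELDS from the diff above (truncated for brevity).
scenario_fields = ["s_gpu_type", "s_runtime", "s_model_name", "l_isl"]

# Hypothetical perf record for one run; only the scenario fields matter for matching.
record = {
    "s_gpu_type": "B200",
    "s_runtime": "pytorch",
    "s_model_name": "gpt_oss_120b_fp4",
    "l_isl": 1024,
}

# Baseline lookup key built only from scenario fields, so tuning knobs in the
# underlying server config can change without invalidating the comparison.
baseline_key = tuple(record.get(field) for field in scenario_fields)
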
@ -23,15 +23,15 @@ llm_config_database:
|
||||
system_gpu_count:
|
||||
gte: 1
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
|
||||
- condition:
|
||||
wildcards:
|
||||
gpu:
|
||||
@ -42,15 +42,15 @@ llm_config_database:
|
||||
system_gpu_count:
|
||||
gte: 2
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
|
||||
- condition:
|
||||
wildcards:
|
||||
gpu:
|
||||
@ -61,21 +61,21 @@ llm_config_database:
|
||||
system_gpu_count:
|
||||
gte: 4
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
|
||||
- condition:
|
||||
wildcards:
|
||||
gpu:
|
||||
@ -86,27 +86,27 @@ llm_config_database:
|
||||
system_gpu_count:
|
||||
gte: 8
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
|
||||
- condition:
|
||||
wildcards:
|
||||
gpu:
|
||||
@ -117,15 +117,15 @@ llm_config_database:
|
||||
system_gpu_count:
|
||||
gte: 1
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu1]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu1]
|
||||
- condition:
|
||||
wildcards:
|
||||
gpu:
|
||||
@ -136,15 +136,15 @@ llm_config_database:
|
||||
system_gpu_count:
|
||||
gte: 2
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu2]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu2]
|
||||
- condition:
|
||||
wildcards:
|
||||
gpu:
|
||||
@ -155,15 +155,15 @@ llm_config_database:
|
||||
system_gpu_count:
|
||||
gte: 4
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu4]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu4]
|
||||
- condition:
|
||||
wildcards:
|
||||
gpu:
|
||||
@ -174,18 +174,18 @@ llm_config_database:
|
||||
system_gpu_count:
|
||||
gte: 8
|
||||
tests:
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
|
||||
- perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu8]
|
||||
- perf/test_perf_sanity.py::test_e2e[aggr_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu8]
|
||||

@@ -9,7 +9,6 @@ llm_perf_sanity:
# 4: A100, L40S, H20, H100, H200, B200, B300, GB200, GB300, RTX6000-Server
# 5: L40S, H20, H100, H200, B200, B300, RTX6000-Server
# 6: H20, H100, H200, B200, B300, RTX6000-Server
# 7: H20, H100, H200, B200, B300
# ===============================================================================

# 1: All GPUs
@@ -31,6 +30,7 @@ llm_perf_sanity:
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:500,2000]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:512,32]
- perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]
- perf/test_perf.py::test_perf[starcoder2_7b-bench-pytorch-bfloat16-input_output_len:512,512]
# Phi-4-multimodal-instruct
- perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128]
# Bielik-11B-v2.2-Instruct
@@ -124,25 +124,9 @@ llm_perf_sanity:
# for chunked prefill cases
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200]
- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8] TIMEOUT(100)
- perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8] TIMEOUT(100)
- perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] TIMEOUT(60)
# disagg server cases
- perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-disagg_server-ctx_dp:4-gen_tp:4]
- perf/test_perf.py::test_perf[llama_v3.1_8b-disagg_server-ctx_dp:4-gen_tp:4]
# gpt_oss_20b_fp4
- perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-input_output_len:512,512]


# 7: H20, H100, H200, B200, B300
- condition:
ranges:
system_gpu_count:
gte: 8
compute_capability:
gte: 9.0
lt: 12.0

tests:
# chunked attention case
- perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:20000-kv_frac:0.6-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8]

@ -220,8 +220,6 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-f
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
|
||||
test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5647825)
|
||||
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] SKIP (https://nvbugs/5664904)
|
||||
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] SKIP (https://nvbugs/5664904)
|
||||
@ -233,7 +231,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quan
|
||||
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto] SKIP (https://nvbugs/5673610)
|
||||
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] SKIP (https://nvbugs/5756804)
|
||||
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16] SKIP (https://nvbugs/5451216)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype SKIP (https://nvbugs/5588376)
|
||||
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5673527)
|
||||
unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755)
|
||||
full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
|
||||
@ -256,9 +253,6 @@ accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/570
|
||||
accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2] SKIP (https://nvbugs/5705194)
|
||||
accuracy/test_cli_flow.py::TestLlama2_7B::test_smooth_quant_ootb_tp2 SKIP (https://nvbugs/5705195)
|
||||
accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8] SKIP (https://nvbugs/5666826)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype SKIP (https://nvbugs/5707087)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype SKIP (https://nvbugs/5707087)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype SKIP (https://nvbugs/5707087)
|
||||
disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5705199)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2 SKIP (https://nvbugs/5707145)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 SKIP (https://nvbugs/5707145)
|
||||
@ -293,7 +287,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_ep
|
||||
cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
|
||||
examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979)
|
||||
examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979)
|
||||
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen3VL_MOE::test_auto_dtype SKIP (https://nvbugs/5588376)
|
||||
unittest/_torch/speculative/test_dynamic_spec_decode.py::test_dynamic_spec_decode SKIP (https://nvbugs/5758449)
|
||||
unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698)
|
||||
triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118)
|
||||
@ -341,6 +334,7 @@ accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (
|
||||
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5596343)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=True] SKIP (https://nvbugs/5775326)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869)
|
||||
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869)
|
||||
@ -392,3 +386,10 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5791839)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=False] SKIP (https://nvbugs/5795918)
|
||||
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=2-overlap_scheduler=True] SKIP (https://nvbugs/5800591)
|
||||
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5800646)
|
||||
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)
|
||||
full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-overlap_scheduler] SKIP (https://nvbugs/5800679)
|
||||
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=False] SKIP (https://nvbugs/5741304)
|
||||
accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] SKIP (https://nvbugs/5800725)
|
||||
examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5802248)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
server_configs:
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -31,7 +31,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -62,7 +62,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -97,7 +97,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -128,7 +128,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -159,7 +159,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -194,7 +194,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -225,7 +225,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -256,7 +256,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -291,7 +291,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -322,7 +322,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -353,7 +353,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8
|
||||
model_name: nvidia/DeepSeek-R1-0528-FP4-v2
|
||||
model_name: deepseek_r1_0528_fp4_v2
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -388,7 +388,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8
|
||||
model_name: deepseek-ai/DeepSeek-R1-0528
|
||||
model_name: deepseek_r1_0528_fp8
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -419,7 +419,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8
|
||||
model_name: deepseek-ai/DeepSeek-R1-0528
|
||||
model_name: deepseek_r1_0528_fp8
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -450,7 +450,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8
|
||||
model_name: deepseek-ai/DeepSeek-R1-0528
|
||||
model_name: deepseek_r1_0528_fp8
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -481,7 +481,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8
|
||||
model_name: deepseek-ai/DeepSeek-R1-0528
|
||||
model_name: deepseek_r1_0528_fp8
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -512,7 +512,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8
|
||||
model_name: deepseek-ai/DeepSeek-R1-0528
|
||||
model_name: deepseek_r1_0528_fp8
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -543,7 +543,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8
|
||||
model_name: deepseek-ai/DeepSeek-R1-0528
|
||||
model_name: deepseek_r1_0528_fp8
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
cuda_graph_config:
|
||||
@ -578,7 +578,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 1
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -613,7 +613,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 1
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -648,7 +648,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 1
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -683,7 +683,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 2
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -718,7 +718,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 2
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -753,7 +753,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 2
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -788,7 +788,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -823,7 +823,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -858,7 +858,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 4
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -893,7 +893,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -928,7 +928,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -963,7 +963,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 8
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -998,7 +998,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 1
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -1033,7 +1033,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 1
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -1068,7 +1068,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 1
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -1103,7 +1103,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 2
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -1138,7 +1138,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
|
||||
gpus: 2
|
||||
match_mode: scenario
|
||||
env_overrides:
|
||||
@ -1173,7 +1173,7 @@ server_configs:
|
||||
backend: openai
|
||||
streaming: true
|
||||
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2
|
||||
model_name: openai/gpt-oss-120b
|
||||
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1208,7 +1208,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1243,7 +1243,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1278,7 +1278,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1313,7 +1313,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1348,7 +1348,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1383,7 +1383,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1418,7 +1418,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1453,7 +1453,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1488,7 +1488,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1523,7 +1523,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1558,7 +1558,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1593,7 +1593,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1628,7 +1628,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1663,7 +1663,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1698,7 +1698,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1733,7 +1733,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1768,7 +1768,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1803,7 +1803,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:

@ -1,6 +1,6 @@
server_configs:
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -31,7 +31,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -62,7 +62,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -93,7 +93,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -124,7 +124,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -155,7 +155,7 @@ server_configs:
backend: openai
streaming: true
- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8
model_name: deepseek-ai/DeepSeek-R1-0528
model_name: deepseek_r1_0528_fp8
gpus: 8
match_mode: scenario
cuda_graph_config:
@ -190,7 +190,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -224,7 +224,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -258,7 +258,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -292,7 +292,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -326,7 +326,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -360,7 +360,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -394,7 +394,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -428,7 +428,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -462,7 +462,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -496,7 +496,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -530,7 +530,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -564,7 +564,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -598,7 +598,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -632,7 +632,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -666,7 +666,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -700,7 +700,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -734,7 +734,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -768,7 +768,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -802,7 +802,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -836,7 +836,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -870,7 +870,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -904,7 +904,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -938,7 +938,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -972,7 +972,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1006,7 +1006,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1040,7 +1040,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1074,7 +1074,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 1
match_mode: scenario
env_overrides:
@ -1108,7 +1108,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1142,7 +1142,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1176,7 +1176,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 2
match_mode: scenario
env_overrides:
@ -1210,7 +1210,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1244,7 +1244,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1278,7 +1278,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 4
match_mode: scenario
env_overrides:
@ -1312,7 +1312,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1346,7 +1346,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:
@ -1380,7 +1380,7 @@ server_configs:
backend: openai
streaming: true
- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8
model_name: openai/gpt-oss-120b
model_name: gpt_oss_120b_fp4
gpus: 8
match_mode: scenario
env_overrides:

@ -1,11 +1,18 @@
import subprocess
import time

import requests


def wait_for_endpoint_ready(url: str, timeout: int = 300):
def wait_for_endpoint_ready(url: str, timeout: int = 300, server_proc: subprocess.Popen = None):
start = time.monotonic()
while time.monotonic() - start < timeout:
if server_proc is not None:
exit_code = server_proc.poll()
if exit_code is not None:
raise RuntimeError(
f"Server process exited with code {exit_code} before becoming ready."
)
try:
time.sleep(1)
if requests.get(url, timeout=5).status_code == 200:

@ -1,12 +1,16 @@
import json
import os
import time
from itertools import product
from pathlib import Path
from typing import Generator

import pytest
import torch
from utils.llm_data import llm_models_root

from tensorrt_llm import MultimodalEncoder
from tensorrt_llm._torch.shared_tensor import SharedTensorContainer
from tensorrt_llm.inputs import default_multimodal_input_loader
from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig
from tensorrt_llm.llmapi.llm import LLM, SamplingParams
@ -24,56 +28,127 @@ _QWEN_2_5_VL_DIR = llm_models_root() / "Qwen2.5-VL-3B-Instruct"
_QWEN_3_VL_DIR = llm_models_root() / "Qwen3" / "Qwen3-VL-2B-Instruct"


# TODO: Add multi-image in single chat test
@pytest.mark.parametrize("model_dir",
[_LLAVA_DIR, _QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR])
@pytest.mark.parametrize("pd_disagg", [False, True])
def test_single_image_chat(model_dir, pd_disagg):
"""Test processing single image using encoder (pass mm_embeddings) + LLM API.
@pytest.mark.parametrize(
"prompts,expected_num_duplicates",
[
# Full reuse: same media + same prompts
# All blocks are reused, thus no duplicates
(["Describe the natural environment in the image."] * 2, 0),
# Partial reuse: same media + different prompts
# Prefix blocks are reused, thus 2 duplicates
([
"Describe the natural environment in the image.",
"What objects can you see in the image?",
"Describe the weather in the image.",
], 2),
])
def test_kv_event_mm_keys_with_reuse(prompts, expected_num_duplicates):
"""Test mm_keys in KV cache events with cache reuse scenarios.

This test verifies that encoder (pass mm_embeddings) + LLM API produces identical
results to standard llm generation (pass raw image) by comparing outputs.
This test verifies:
1. KV cache events contain mm_keys for multimodal blocks
2. mm_keys have the expected structure (hash + start_offset)
3. Cache reuse behavior based on media and prompts:
- Same media + same prompts: full reuse (0 duplicate offsets)
- Same media + different prompts: partial reuse (prefix blocks reused)
"""
encoder_model_dir = _LLAVA_DIR

# Test configuration
max_tokens = 64
max_tokens = 16
free_gpu_memory_fraction = 0.2
max_batch_size = 1

# Test data - OpenAI chat completion format
prompts = ["Describe the natural environment in the image."]
media = [example_images[0]]
# Use same image for all prompts
media = [example_images[0]] * len(prompts)

# Sampling configuration
sampling_params = SamplingParams(max_tokens=max_tokens)
kv_cache_config = KvCacheConfig(
enable_block_reuse=False,
enable_block_reuse=True,
free_gpu_memory_fraction=free_gpu_memory_fraction,
event_buffer_max_size=1024,  # Enable KV cache events
)

llm = LLM(model=encoder_model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
max_batch_size=1)

inputs = _load_inputs(llm, prompts, media)

with llm:
# Generate for each input separately to test KV cache reuse
for inp in inputs:
_ = llm.generate([inp], sampling_params=sampling_params)

time.sleep(0.5)  # Wait for events to be dispatched
events = llm.get_kv_cache_events(10)

# Extract mm_keys offsets from stored events
mm_keys_offsets = []
for event in events:
if event and event.get("data", {}).get("type") == "stored":
for block in event["data"].get("blocks", []):
if block.get("mm_keys"):
for mm_key in block["mm_keys"]:
assert "hash" in mm_key, "mm_key should have 'hash' field"
assert "start_offset" in mm_key, "mm_key should have 'start_offset' field"
mm_keys_offsets.append(mm_key["start_offset"])

num_duplicates = len(mm_keys_offsets) - len(set(mm_keys_offsets))
assert num_duplicates == expected_num_duplicates, (
f"Expected {expected_num_duplicates} duplicate mm_keys offsets, "
f"got {num_duplicates}. Offsets: {mm_keys_offsets}")


@pytest.fixture(scope="module",
params=[_LLAVA_DIR, _QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR],
ids=["llava_7b", "qwen2.5_3b", "qwen3_2b"])
def model_dir(request) -> Path:
return request.param


@pytest.fixture(scope="module", params=[False, True])
def pd_disagg(request) -> bool:
return request.param


@pytest.fixture(scope="module")
def llms(model_dir: Path,
pd_disagg: bool) -> Generator[tuple[LLM, LLM | None], None, None]:
"""Get LLM for prefill and, if disagg, separate LLM for decode."""
free_gpu_memory_fraction = 0.2
disable_overlap_scheduler = pd_disagg
cache_transceiver_cfg = CacheTransceiverConfig(
backend="DEFAULT") if pd_disagg else None
kv_cache_config = KvCacheConfig(
enable_block_reuse=False,  # Disable for output 1:1 matching check
free_gpu_memory_fraction=free_gpu_memory_fraction,
)

# Process multimodal data using encoder (pass mm_embeddings)
encoder = MultimodalEncoder(model=model_dir, max_batch_size=max_batch_size)
llm = LLM(
model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
trust_remote_code=True,
cache_transceiver_config=cache_transceiver_cfg,
disable_overlap_scheduler=disable_overlap_scheduler,
max_batch_size=1,  # fix batch size to reduce non-determinism in tests
)
with llm:
if pd_disagg:
llm_decode = LLM(
model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
trust_remote_code=True,
cache_transceiver_config=cache_transceiver_cfg,
)
with llm_decode:
yield (llm, llm_decode)
else:
yield (llm, None)

cache_transceiver_cfg = CacheTransceiverConfig(
backend="DEFAULT") if pd_disagg else None

disable_overlap_scheduler = pd_disagg

llm = LLM(model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
trust_remote_code=True,
cache_transceiver_config=cache_transceiver_cfg,
disable_overlap_scheduler=disable_overlap_scheduler)

llm_decode = None
if pd_disagg:
llm_decode = LLM(model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
trust_remote_code=True,
cache_transceiver_config=cache_transceiver_cfg)

def _load_inputs(llm: LLM, prompts, media, mm_embeddings=None):
# Load model configuration
config_path = os.path.join(llm._hf_model_dir, 'config.json')
assert os.path.exists(
@ -90,11 +165,42 @@ def test_single_image_chat(model_dir, pd_disagg):
modality="image",
prompts=prompts,
media=media,
mm_embeddings=mm_embeddings,
image_data_format="pt")

# Validate inputs structure
assert len(inputs) == len(
prompts), f"Expected {len(prompts)} inputs, got {len(inputs)}"
return inputs


# TODO: Add multi-image in single chat test
@pytest.mark.threadleak(enabled=False)
def test_single_image_chat(
pd_disagg: bool,
model_dir: Path,
llms: tuple[LLM, LLM | None],
):
"""Test processing single image using encoder (pass mm_embeddings) + LLM API.

This test verifies that encoder (pass mm_embeddings) + LLM API produces identical
results to standard llm generation (pass raw image) by comparing outputs.
"""
llm, llm_decode = llms

# Test configuration
max_tokens = 64
max_batch_size = 1

# Test data - OpenAI chat completion format
prompts = ["Describe the natural environment in the image."]
media = [example_images[0]]

# Sampling configuration
sampling_params = SamplingParams(max_tokens=max_tokens)

# Prepare multimodal inputs
inputs = _load_inputs(llm, prompts, media)

# Generate reference output with raw multimodal inputs
outputs_ref = llm.generate(inputs, sampling_params=sampling_params)

@ -109,33 +215,35 @@ def test_single_image_chat(model_dir, pd_disagg):
) > 0, f"Reference generation has no output text for input {i}"

# Prepare inputs for llm (pass mm_embeddings)
encoder_outputs = encoder.generate(inputs)
# Process multimodal data using encoder (pass mm_embeddings)
encoder = MultimodalEncoder(model=model_dir, max_batch_size=max_batch_size)
with encoder:
encoder_outputs = encoder.generate(inputs)

# Generate output using llm (pass mm_embeddings)
ep_disaggregated_params = encoder_outputs[0].disaggregated_params
# Generate output using llm (pass mm_embeddings)
ep_disaggregated_params = encoder_outputs[0].disaggregated_params

assert ep_disaggregated_params is not None, "Encoder output disaggregated params is None"
ep_disaggregated_params.request_type = "context_and_generation" if not pd_disagg else "context_only"
assert ep_disaggregated_params is not None, "Encoder output disaggregated params is None"
ep_disaggregated_params.request_type = "context_and_generation" if not pd_disagg else "context_only"
outputs = llm.generate(inputs,
sampling_params=sampling_params,
disaggregated_params=ep_disaggregated_params)

outputs = llm.generate(inputs,
sampling_params=sampling_params,
disaggregated_params=ep_disaggregated_params)
if pd_disagg:
# Generation using llm_decode
assert len(outputs) == 1
pd_disaggregated_params = outputs[0].disaggregated_params
pd_disaggregated_params.request_type = "generation_only"
sampling_params = SamplingParams(max_tokens=max_tokens)
# remove multimodal data from input as decoder worker doesn't need it
inputs[0]['multi_modal_data'] = None
# use prompt token ids from encoder output
inputs[0]['prompt_token_ids'] = outputs[0].prompt_token_ids

if pd_disagg:
# Generation using llm_decode
assert len(outputs) == 1
pd_disaggregated_params = outputs[0].disaggregated_params
pd_disaggregated_params.request_type = "generation_only"
sampling_params = SamplingParams(max_tokens=max_tokens)
# remove multimodal data from input as decoder worker doesn't need it
inputs[0]['multi_modal_data'] = None
# use prompt token ids from encoder output
inputs[0]['prompt_token_ids'] = outputs[0].prompt_token_ids

outputs = llm_decode.generate(
inputs,
sampling_params=sampling_params,
disaggregated_params=pd_disaggregated_params)
outputs = llm_decode.generate(
inputs,
sampling_params=sampling_params,
disaggregated_params=pd_disaggregated_params)

# Validate outputs
assert len(outputs) == len(
@ -175,24 +283,37 @@ def test_single_image_chat(model_dir, pd_disagg):
f"Log probabilities don't match for output {i}, generation {j}"


@pytest.mark.parametrize(
"model_dir, encoder_max_batch_size",
[
(_LLAVA_DIR, 3),
# Qwen2.5 VL's vision encoder seems to output different embeddings based on this value.
# The test only passes with this set to 1.
(_QWEN_2_5_VL_DIR, 1),
(_QWEN_3_VL_DIR, 3),
],
)
def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
@pytest.mark.parametrize("use_mm_embeddings,pass_embeddings_through_loader",
product([False, True], [False, True]))
@pytest.mark.threadleak(enabled=False)
def test_multi_request_batch_chat(
model_dir: Path,
llms: tuple[LLM, LLM | None],
use_mm_embeddings: bool,
pass_embeddings_through_loader: bool,
):
"""Test batching multiple multimodal requests and verify encoder path matches raw path.

This mirrors test_single_image_chat but with a batch of size 3.
This mirrors test_single_image_chat but with a batch of size 3. It also tests passing
embeddings alongside the prompt ("multi_modal_embeddings"), as well as the embedding
handling within default_multimodal_input_loader.
"""
if use_mm_embeddings and model_dir in [_QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR]:
pytest.skip("Qwen does not implement attach_multimodal_embeddings")

# Qwen2.5/3 VL's vision encoder seems to output different embeddings based on this value.
# The test only passes with this set to 1.
encoder_max_batch_size = (1 if model_dir
in [_QWEN_2_5_VL_DIR, _QWEN_3_VL_DIR] else 3)

llm, llm_decode = llms
if llm_decode is not None:
pytest.skip("Disagg support not implemented in test case")

if pass_embeddings_through_loader and not use_mm_embeddings:
pytest.skip("Redundant test configuration")

max_tokens = 64
free_gpu_memory_fraction = 0.6

prompts = [
"Describe the natural environment in the image.",
@ -202,37 +323,8 @@ def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
media = [example_images[0], example_images[1], example_images[2]]

sampling_params = SamplingParams(max_tokens=max_tokens)
kv_cache_config = KvCacheConfig(
enable_block_reuse=
False,  # Disable block reuse for output 1-1 matching check
free_gpu_memory_fraction=free_gpu_memory_fraction,
)

encoder = MultimodalEncoder(model=model_dir,
max_batch_size=encoder_max_batch_size)
llm = LLM(
model=model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
max_batch_size=1,  # fix batch size to reduce non-determinism in tests
trust_remote_code=True)

config_path = os.path.join(llm._hf_model_dir, 'config.json')
assert os.path.exists(
config_path), f"Model config not found at {config_path}"
with open(config_path, 'r') as f:
model_config = json.load(f)
model_type = model_config['model_type']

inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
model_dir=llm._hf_model_dir,
model_type=model_type,
modality="image",
prompts=prompts,
media=media,
image_data_format="pt")
assert len(inputs) == len(
prompts), f"Expected {len(prompts)} inputs, got {len(inputs)}"
inputs = _load_inputs(llm, prompts, media)

# Reference with raw inputs
outputs_ref = llm.generate(inputs, sampling_params=sampling_params)
@ -242,107 +334,74 @@ def test_multi_request_batch_chat(model_dir, encoder_max_batch_size):
output.outputs
) > 0, f"Reference generation has no output text for input {i}"

# Encoder path
encoder_outputs = encoder.generate(inputs)
for eo in encoder_outputs:
eo.disaggregated_params.request_type = "context_and_generation"
outputs = llm.generate(inputs,
sampling_params=sampling_params,
disaggregated_params=[
eo.disaggregated_params for eo in encoder_outputs
])
encoder = MultimodalEncoder(model=model_dir,
max_batch_size=encoder_max_batch_size)
with encoder:
# Encoder path
encoder_outputs = encoder.generate(inputs)
if use_mm_embeddings:
for input, encoder_output in zip(inputs, encoder_outputs):
mm_embed_handle = encoder_output.mm_embedding_handle
assert mm_embed_handle is not None
mm_embed = SharedTensorContainer.from_dict(
mm_embed_handle).get_local_view()
input["multi_modal_embeddings"] = {"image": mm_embed}

assert len(outputs) == len(prompts)
for i, output in enumerate(outputs):
assert len(
output.outputs) > 0, f"generation has no output text for input {i}"
if pass_embeddings_through_loader:
# Test embedding support in default_multimodal_input_loader
inputs_with_embeddings = _load_inputs(
llm,
prompts,
media=None,
mm_embeddings=[
input["multi_modal_embeddings"]["image"]
for input in inputs
],
)
for input, input_with_embedding in zip(inputs,
inputs_with_embeddings):
assert isinstance(input, dict)
assert isinstance(input_with_embedding, dict)
assert list(
set(input.keys())
^ set(input_with_embedding.keys())) == [
"multi_modal_data"
]
assert set(input_with_embedding.keys()) == set(
["prompt", "multi_modal_embeddings"])
assert input["prompt"] == input_with_embedding["prompt"]
assert list(
input["multi_modal_embeddings"].keys()) == ["image"]
assert list(input_with_embedding["multi_modal_embeddings"].
keys()) == ["image"]
mm_embed, = input_with_embedding["multi_modal_embeddings"][
"image"]
torch.testing.assert_close(
mm_embed, input["multi_modal_embeddings"]["image"])
inputs = inputs_with_embeddings  # perform inference with embeddings returned by input loader

# Compare
for i, (ref_output, test_output) in enumerate(zip(outputs_ref, outputs)):
assert len(ref_output.outputs) == len(test_output.outputs), \
f"Number of generated outputs don't match for output {i}: {len(ref_output.outputs)} vs {len(test_output.outputs)}"
for j, (ref_gen, test_gen) in enumerate(
zip(ref_output.outputs, test_output.outputs)):
assert ref_gen.text == test_gen.text, \
f"Generated text doesn't match for output {i}, generation {j}:\nReference: {ref_gen.text!r}\nTest: {test_gen.text!r}"
extra_kwargs = {}
else:
for eo in encoder_outputs:
eo.disaggregated_params.request_type = "context_and_generation"
extra_kwargs = dict(disaggregated_params=[
eo.disaggregated_params for eo in encoder_outputs
])
outputs = llm.generate(inputs,
sampling_params=sampling_params,
**extra_kwargs)

assert len(outputs) == len(prompts)
for i, output in enumerate(outputs):
assert len(output.outputs
) > 0, f"generation has no output text for input {i}"

@pytest.mark.parametrize(
"prompts,expected_num_duplicates",
[
# Full reuse: same media + same prompts
# All blocks are reused, thus no duplicates
(["Describe the natural environment in the image."] * 2, 0),
# Partial reuse: same media + different prompts
# Prefix blocks are reused, thus 2 duplicates
([
"Describe the natural environment in the image.",
"What objects can you see in the image?",
"Describe the weather in the image.",
], 2),
])
def test_kv_event_mm_keys_with_reuse(prompts, expected_num_duplicates):
"""Test mm_keys in KV cache events with cache reuse scenarios.

This test verifies:
1. KV cache events contain mm_keys for multimodal blocks
2. mm_keys have the expected structure (hash + start_offset)
3. Cache reuse behavior based on media and prompts:
- Same media + same prompts: full reuse (0 duplicate offsets)
- Same media + different prompts: partial reuse (prefix blocks reused)
"""
encoder_model_dir = _LLAVA_DIR

max_tokens = 16
free_gpu_memory_fraction = 0.6

# Use same image for all prompts
media = [example_images[0]] * len(prompts)

sampling_params = SamplingParams(max_tokens=max_tokens)
kv_cache_config = KvCacheConfig(
enable_block_reuse=True,
free_gpu_memory_fraction=free_gpu_memory_fraction,
event_buffer_max_size=1024,  # Enable KV cache events
)

llm = LLM(model=encoder_model_dir,
backend='pytorch',
kv_cache_config=kv_cache_config,
max_batch_size=1)

config_path = os.path.join(llm._hf_model_dir, 'config.json')
with open(config_path, 'r') as f:
model_config = json.load(f)
model_type = model_config['model_type']

inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
model_dir=llm._hf_model_dir,
model_type=model_type,
modality="image",
prompts=prompts,
media=media,
image_data_format="pt")

# Generate for each input separately to test KV cache reuse
for inp in inputs:
_ = llm.generate([inp], sampling_params=sampling_params)

time.sleep(0.5)  # Wait for events to be dispatched
events = llm.get_kv_cache_events(10)

# Extract mm_keys offsets from stored events
mm_keys_offsets = []
for event in events:
if event and event.get("data", {}).get("type") == "stored":
for block in event["data"].get("blocks", []):
if block.get("mm_keys"):
for mm_key in block["mm_keys"]:
assert "hash" in mm_key, "mm_key should have 'hash' field"
assert "start_offset" in mm_key, "mm_key should have 'start_offset' field"
mm_keys_offsets.append(mm_key["start_offset"])

num_duplicates = len(mm_keys_offsets) - len(set(mm_keys_offsets))
assert num_duplicates == expected_num_duplicates, (
f"Expected {expected_num_duplicates} duplicate mm_keys offsets, "
f"got {num_duplicates}. Offsets: {mm_keys_offsets}")
# Compare
for i, (ref_output, test_output) in enumerate(zip(outputs_ref,
outputs)):
assert len(ref_output.outputs) == len(test_output.outputs), \
f"Number of generated outputs don't match for output {i}: {len(ref_output.outputs)} vs {len(test_output.outputs)}"
for j, (ref_gen, test_gen) in enumerate(
zip(ref_output.outputs, test_output.outputs)):
assert ref_gen.text == test_gen.text, \
f"Generated text doesn't match for output {i}, generation {j}:\nReference: {ref_gen.text!r}\nTest: {test_gen.text!r}"