Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
ci: [nvbugs/5280806] Unwaive unittests/_torch. (#4951)
Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
parent 9c012d5bf8
commit 88480197da
@@ -419,7 +419,7 @@ class RopeParams:
                 self.original_max_positions,
             })
         if rope_inv_freq is not None:
-            rope_inv_freq = torch.torch.tensor(
+            rope_inv_freq = torch.tensor(
                 rope_inv_freq,
                 dtype=torch.float32,
                 device='cuda',
@@ -428,7 +428,7 @@ class RopeParams:
         rope_cos_sin = rope_cos_sin.reshape(
             self.max_positions, -1,
             2)[:, :self.dim // 2, :].transpose(0, 2, 1).reshape(1, -1)
-        rope_cos_sin = torch.torch.tensor(
+        rope_cos_sin = torch.tensor(
             rope_cos_sin,
             dtype=torch.float32,
             device='cuda',
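The only functional content of these two hunks is dropping the doubled module prefix. A minimal sketch (not part of this commit, assuming a standard PyTorch install) of why both spellings resolve to the same call, so the change is purely cosmetic:

```python
# Sketch only; assumes a standard PyTorch install. `torch.torch` is the torch
# module exposed as an attribute of itself, so the old spelling already worked;
# the hunks above merely remove the redundant prefix.
import torch

assert torch.torch is torch                 # same module object
assert torch.torch.tensor is torch.tensor   # therefore the same function

x = torch.tensor([1.0, 2.0], dtype=torch.float32)  # idiomatic form kept by the diff
print(x.dtype)  # torch.float32
```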
@@ -373,7 +373,6 @@ full:B200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_
 accuracy/test_cli_flow.py::TestMixtral8x22B::test_int8_plugin_tp8[renormalize-tensor_parallel] SKIP (https://nvbugs/5273695)
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoder] SKIP (https://nvbugs/5144931)
 examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (https://nvbugs/5144931)
-unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" SKIP (https://nvbugs/5280806)
 examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-disable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/5244570)
 triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5240060)
 triton_server/test_triton.py::test_triton_extensive[triton-extensive] SKIP
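The waive entry removed above selects the `_torch` unit tests through a pytest `-k` keyword expression. A hypothetical way to reproduce that selection locally from Python; the path and the `pytest.main()` invocation are assumptions, not taken from the commit:

```python
# Hypothetical local reproduction of the unwaived selection; the "_torch" path
# below assumes the script is run from the repository's unit-test directory.
import pytest

if __name__ == "__main__":
    # Keep every test under _torch whose id does NOT mention modeling,
    # multi_gpu, or auto_deploy -- mirroring the -k expression in the entry.
    raise SystemExit(
        pytest.main(
            ["_torch", "-k", "not (modeling or multi_gpu or auto_deploy)"]))
```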
@@ -314,6 +314,7 @@ def test_fused_moe_nvfp4(dtype):
     torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1)
 
 
+@pytest.mark.skip(reason="https://nvbugs/5325653")
 @skip_neither_ada_nor_hopper_unittest
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_fused_moe_w4afp8(dtype):
@@ -1,6 +1,7 @@
 import unittest
 from unittest.mock import MagicMock, patch
 
+import pytest
 import torch
 from mpi4py import MPI
 
@@ -178,6 +179,7 @@ class TestMoeLoadBalancer(unittest.TestCase):
         # Verify the global state is cleaned up
         self.assertIsNone(get_moe_load_balancer())
 
+    @pytest.mark.skip(reason="https://nvbugs/5324229")
     @patch('tensorrt_llm.bindings.internal.runtime.MoeLoadBalancer')
     def test_single_layer_moe_load_balancer_methods(self,
                                                     mock_load_balancer_impl):
@@ -13,6 +13,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from utils.llm_data import llm_models_root
 
 
+@pytest.mark.skip(reason="https://nvbugs/5280806")
 @pytest.mark.parametrize("use_cuda_graph,attn_backend",
                          [[True, "TRTLLM"], [False, "TRTLLM"],
                           [True, "FLASHINFER"], [False, "FLASHINFER"]])
@@ -16,6 +16,7 @@ from utils.llm_data import llm_models_root
 # TODO: Add cuda graph enabled tests.
 # Cuda graph cannot currently be enabled for ngram because cuda graph requires
 # spec metadata and ngram does not have it.
+@pytest.mark.skip(reason="https://nvbugs/5324239")
 @pytest.mark.parametrize("use_cuda_graph,attn_backend",
                          [[False, "TRTLLM"], [False, "FLASHINFER"]])
 def test_llama_ngram(use_cuda_graph: bool, attn_backend: str):
@@ -43,6 +43,9 @@ from tensorrt_llm.quantization.utils.fp4_utils import (
     ["low-latency", "throughput"],
 )
 def test_fp8_block_scale_gemm(dtype, m, k, n, inference_mode):
+    if inference_mode == "low-latency" and dtype == torch.bfloat16:
+        pytest.skip("https://nvbugs/5328141")
+
     torch.random.manual_seed(0)
     a = torch.randn((m, k), device='cuda', dtype=torch.float)
     b = torch.randn((n, k), device='cuda', dtype=torch.float)
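The commit mixes two skip styles: a `@pytest.mark.skip` decorator that disables a test outright, and a runtime `pytest.skip()` call that, as in the hunk above, waives only the offending parameter combination. A minimal sketch of both, using illustrative names and a placeholder bug URL rather than anything from the repository:

```python
# Illustrative names and a placeholder bug URL; not from the repository.
import pytest


@pytest.mark.skip(reason="https://nvbugs/0000000")
def test_always_skipped():
    # Never executed: the decorator skips every run of this test.
    assert False


@pytest.mark.parametrize("inference_mode", ["low-latency", "throughput"])
def test_conditionally_skipped(inference_mode):
    if inference_mode == "low-latency":
        # Skips only this parametrization; "throughput" still runs.
        pytest.skip("https://nvbugs/0000000")
    assert inference_mode == "throughput"
```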
@@ -1,6 +1,7 @@
 import unittest
 from dataclasses import dataclass
 
+import pytest
 import torch
 
 import tensorrt_llm
@@ -126,6 +127,7 @@ def create_model_engine_and_kvcache(config: PyTorchConfig = None):
     return model_engine, kv_cache_manager
 
 
+@pytest.mark.skip(reason="https://nvbugs/5324248")
 class PyTorchModelEngineTestCase(unittest.TestCase):
 
     def test_pad_generation_requests(self) -> None:
@@ -5,6 +5,7 @@ import sys
 import unittest
 
 import numpy as np
+import pytest
 import torch
 
 import tensorrt_llm
@@ -338,6 +339,7 @@ class TestResourceManager(unittest.TestCase):
 
         self.assertEqual(len(peft_table), self.num_lora_modules)
 
+    @pytest.mark.skip(reason="https://nvbugs/5324252")
     def test_put_get(self):
         """Test adding a request with properly configured LoRA weights and config."""
         peft_cache_config = self.create_peft_cache_config()
@@ -54,6 +54,7 @@ def test_LlmResponse_pickle():
     assert pickle_result.log_probs == logprobs
 
 
+@pytest.mark.skip(reason="https://nvbugs/5327892")
 @force_ampere  # Save H100 resource
 @pytest.mark.parametrize("return_log_probs", [False, True])
 @pytest.mark.parametrize("gather_generation_logits", [False, True])
@@ -96,31 +97,35 @@ def test_generate_with_return_logits(disable_overlap_scheduler: bool,
         top_p=1,
         return_context_logits=gather_context_logits,
         return_generation_logits=gather_generation_logits,
-        logprobs=return_log_probs)
+        logprobs=return_log_probs,
+    )
 
-    for output in llm.generate(prompts, sampling_params=sampling_params):
-        if gather_context_logits:
-            assert output.context_logits is not None
-            assert len(prompts[0].split()) == output.context_logits.shape[0]
-        else:
-            assert output.context_logits is None
+    with llm:
+        for output in llm.generate(prompts, sampling_params=sampling_params):
+            if gather_context_logits:
+                assert output.context_logits is not None
+                assert len(prompts[0].split()) == output.context_logits.shape[0]
+            else:
+                assert output.context_logits is None
 
-        if gather_generation_logits:
-            gen_logits = output.outputs[0].generation_logits
-            assert gen_logits is not None
-            assert gen_logits.ndim == 2
-            assert gen_logits.shape[0] == sampling_params.max_tokens
-            assert torch.argmax(gen_logits,
-                                dim=1).tolist() == output.outputs[0].token_ids
-        else:
-            assert output.outputs[0].generation_logits is None
+            if gather_generation_logits:
+                gen_logits = output.outputs[0].generation_logits
+                assert gen_logits is not None
+                assert gen_logits.ndim == 2
+                assert gen_logits.shape[0] == sampling_params.max_tokens
+                assert torch.argmax(
+                    gen_logits, dim=1).tolist() == output.outputs[0].token_ids
+            else:
+                assert output.outputs[0].generation_logits is None
 
-        if return_log_probs:
-            assert len(output.outputs[0].logprobs) == sampling_params.max_tokens
-        else:
-            assert len(output.outputs[0].logprobs) == 0
+            if return_log_probs:
+                assert len(
+                    output.outputs[0].logprobs) == sampling_params.max_tokens
+            else:
+                assert len(output.outputs[0].logprobs) == 0
 
 
+@pytest.mark.skip(reason="https://nvbugs/5327892")
 @force_ampere  # Save H100 resource
 @pytest.mark.parametrize("return_log_probs", [False, True])
 @pytest.mark.parametrize("gather_generation_logits", [False, True])
@@ -164,28 +169,29 @@ def test_generate_async_with_return_logits(disable_overlap_scheduler: bool,
         return_generation_logits=gather_generation_logits,
         logprobs=return_log_probs)
 
-    for idx, output in enumerate(
-            llm.generate_async(prompts[0],
-                               sampling_params=sampling_params,
-                               streaming=True)):
-        if gather_context_logits:
-            assert output.context_logits is not None
-            assert len(prompts[0].split()) == output.context_logits.shape[0]
-        else:
-            assert output.context_logits is None
+    with llm:
+        for idx, output in enumerate(
+                llm.generate_async(prompts[0],
+                                   sampling_params=sampling_params,
+                                   streaming=True)):
+            if gather_context_logits:
+                assert output.context_logits is not None
+                assert len(prompts[0].split()) == output.context_logits.shape[0]
+            else:
+                assert output.context_logits is None
 
-        if gather_generation_logits:
-            gen_logits = output.outputs[0].generation_logits
-            assert gen_logits is not None
-            assert gen_logits.ndim == 2
-            assert gen_logits.shape[0] == 1
-            assert torch.argmax(
-                gen_logits,
-                dim=1).tolist()[0] == output.outputs[0].token_ids[-1]
-        else:
-            assert output.outputs[0].generation_logits is None
+            if gather_generation_logits:
+                gen_logits = output.outputs[0].generation_logits
+                assert gen_logits is not None
+                assert gen_logits.ndim == 2
+                assert gen_logits.shape[0] == 1
+                assert torch.argmax(
+                    gen_logits,
+                    dim=1).tolist()[0] == output.outputs[0].token_ids[-1]
+            else:
+                assert output.outputs[0].generation_logits is None
 
-        if return_log_probs:
-            assert len(output.outputs[0].logprobs) == idx + 1
-        else:
-            assert len(output.outputs[0].logprobs) == 0
+            if return_log_probs:
+                assert len(output.outputs[0].logprobs) == idx + 1
+            else:
+                assert len(output.outputs[0].logprobs) == 0
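The two hunks above move the generation loops inside a `with llm:` block. A small sketch of why that matters for tests, using a stand-in class rather than the TensorRT-LLM `LLM` API: the context manager's exit hook still runs when an assertion inside the loop fails, so the engine is torn down on failure as well as on success.

```python
# Stand-in class, not the TensorRT-LLM API: it only demonstrates that __exit__
# runs even when an assert inside the loop raises, so resources are released
# on test failure as well as on success.
class FakeLLM:

    def __enter__(self):
        print("engine started")
        return self

    def __exit__(self, exc_type, exc, tb):
        print("engine shut down")  # reached on both success and failure
        return False               # do not swallow the exception

    def generate(self, prompts):
        return [p.upper() for p in prompts]


llm = FakeLLM()
try:
    with llm:
        for output in llm.generate(["hello"]):
            assert output == "WRONG"  # simulated failing assertion
except AssertionError:
    pass  # "engine shut down" was still printed before the exception escaped
```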
@@ -41,6 +41,9 @@ from utils.util import getSMVersion
     [torch.bfloat16],
 )
 def test_fp8_block_scale_gemm(dtype, m, k, n):
+    if getSMVersion() == 89 and k == 7168 and n == 2112:
+        pytest.skip("https://nvbugs/5328184")
+
     torch.random.manual_seed(0)
     a = torch.randn((m, k), device='cuda', dtype=dtype) / k
     b = torch.randn((n, k), device='cuda', dtype=dtype) / k
@@ -235,7 +235,7 @@ class TestFunctional(unittest.TestCase):
     def test_selective_scan_v2(self, dim, headdim, ngroups, dstate, req_type,
                                dtype, batch_size, max_seq_len, has_z,
                                remove_padding):
-
+        pytest.skip("https://nvbugs/5324258")
         if dtype == 'float32' and req_type == 'context':
             pytest.skip(
                 "Mamba2 chunk scan kernel only support float16 and bfloat16")