diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md
index 3cff1f7398..b18cb0d3c6 100644
--- a/ATTRIBUTIONS-Python.md
+++ b/ATTRIBUTIONS-Python.md
@@ -5261,7 +5261,7 @@ For more information, please refer to
 - `Tracker`: https://github.com/tox-dev/py-filelock/issues
 
 
-## flashinfer-python (0.3.1.post1)
+## flashinfer-python (0.6.0)
 
 ### Licenses
 License: `Apache-2.0`
diff --git a/requirements.txt b/requirements.txt
index 2e789cbc7f..6980f8487f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -53,7 +53,7 @@ ordered-set
 peft
 patchelf
 einops
-flashinfer-python>=0.3.0,<0.4.0
+flashinfer-python~=0.6.0
 opencv-python-headless
 xgrammar==0.1.25
 llguidance==0.7.29
@@ -74,7 +74,7 @@ nvidia-cutlass-dsl==4.3.4; python_version >= "3.10"
 plotly
 numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
 partial_json_parser
-apache-tvm-ffi==0.1.4 # used for reduce nvidia-cutlass-dsl host overhead
+apache-tvm-ffi==0.1.6 # used for reduce nvidia-cutlass-dsl host overhead
 torch-c-dlpack-ext==0.1.3 # used for reduce nvidia-cutlass-dsl host overhead, optional package for improved torch tensor calling perf
 mistral-common==1.8.6
 torchao>=0.14.1
diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml
index 3279605a07..c0150ea753 100644
--- a/security_scanning/pyproject.toml
+++ b/security_scanning/pyproject.toml
@@ -57,7 +57,7 @@ ordered-set = "^4.1.0"
 peft = "^0.18.1"
 patchelf = "^0.17.2.4"
 einops = "^0.8.1"
-flashinfer-python = ">=0.3.0,<0.4.0"
+flashinfer-python = "^0.6.0"
 xgrammar = "0.1.25"
 llguidance = "0.7.29"
 jsonschema = "^4.26.0"
diff --git a/tensorrt_llm/_torch/attention_backend/flashinfer.py b/tensorrt_llm/_torch/attention_backend/flashinfer.py
index a6f52ee4a7..4766b49a6c 100644
--- a/tensorrt_llm/_torch/attention_backend/flashinfer.py
+++ b/tensorrt_llm/_torch/attention_backend/flashinfer.py
@@ -425,6 +425,8 @@ class FlashInferAttentionMetadata(AttentionMetadata):
                 paged_kv_indices_buffer=self._paged_kv_indices,
                 paged_kv_last_page_len_buffer=self._paged_kv_last_page_len,
                 use_tensor_cores=use_tensor_cores,
+                backend="fa2"
+                if torch.cuda.get_device_capability(0) == (9, 0) else "auto",
             )
 
         def decode_plan():
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
index c8c2f83515..ac530fd7ea 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
@@ -26,156 +26,6 @@ from .attention_interface import (
 )
 
 
-# TODO: remove this when flashinfer version is updated to >0.5
-def fast_decode_plan(
-    wrapper: flashinfer.BatchDecodeWithPagedKVCacheWrapper,
-    indptr: torch.Tensor,
-    indices: torch.Tensor,
-    last_page_len: torch.Tensor,
-    num_qo_heads: int,
-    num_kv_heads: int,
-    head_dim: int,
-    page_size: int,
-    pos_encoding_mode: str = "NONE",
-    window_left: int = -1,
-    logits_soft_cap: Optional[float] = None,
-    q_data_type: Optional[Union[str, torch.dtype]] = None,
-    kv_data_type: Optional[Union[str, torch.dtype]] = None,
-    data_type: Optional[Union[str, torch.dtype]] = None,
-    sm_scale: Optional[float] = None,
-    rope_scale: Optional[float] = None,
-    rope_theta: Optional[float] = None,
-    non_blocking: bool = True,
-    fixed_split_size: Optional[int] = None,
-    disable_split_kv: bool = False,
-    global_override_indptr_cpu: Optional[torch.Tensor] = None,
-) -> None:
-    """
-    Copied from flashinfer.decode.fast_decode_plan in flashinfer version >0.5.
-    Does not exist in flashinfer version 0.3.1, hence copied here.
-    """
-    batch_size = len(last_page_len)
-    if logits_soft_cap is None:
-        logits_soft_cap = 0.0
-
-    # Handle data types consistently
-    if data_type is not None:
-        if q_data_type is None:
-            q_data_type = data_type
-        if kv_data_type is None:
-            kv_data_type = data_type
-    elif q_data_type is None:
-        q_data_type = "float16"
-
-    if kv_data_type is None:
-        kv_data_type = q_data_type
-
-    if wrapper.use_tensor_cores:
-        qo_indptr_host = torch.arange(batch_size + 1, dtype=torch.int32, device="cpu")
-        # Here we set fixed_split_size to -1 to avoid the assertion error in flashinfer's plan function
-        if fixed_split_size is None:
-            fixed_split_size = -1
-
-    if wrapper.is_cuda_graph_enabled:
-        if batch_size != wrapper._fixed_batch_size:
-            raise ValueError(
-                "The batch size should be fixed in cudagraph mode, the runtime batch size {} "
-                " mismatches the batch size set during initialization {}".format(
-                    batch_size, wrapper._fixed_batch_size
-                )
-            )
-        if len(indices) > len(wrapper._paged_kv_indices_buf):
-            raise ValueError(
-                "The size of indices should be less than or equal to the allocated buffer"
-            )
-    else:
-        wrapper._paged_kv_indptr_buf = indptr
-        wrapper._paged_kv_indices_buf = indices
-        wrapper._paged_kv_last_page_len_buf = last_page_len
-        if wrapper.use_tensor_cores:
-            wrapper._qo_indptr_buf = qo_indptr_host.to(wrapper.device, non_blocking=non_blocking)
-
-    # Create empty tensors for dtype info if needed
-    empty_q_data = torch.empty(
-        0,
-        dtype=(getattr(torch, q_data_type) if isinstance(q_data_type, str) else q_data_type),
-        device=wrapper.device,
-    )
-
-    empty_kv_cache = torch.empty(
-        0,
-        dtype=(getattr(torch, kv_data_type) if isinstance(kv_data_type, str) else kv_data_type),
-        device=wrapper.device,
-    )
-
-    indptr_host = (
-        global_override_indptr_cpu if global_override_indptr_cpu is not None else indptr.cpu()
-    )
-
-    with torch.cuda.device(wrapper.device):
-        if wrapper.use_tensor_cores:
-            # ALSO convert last_page_len to CPU
-            if page_size == 1:
-                # When page size is 1, last_page_len is always 1.
-                # Directly construct the host tensor rather than executing a device-to-host copy.
-                last_page_len_host = torch.ones((batch_size,), dtype=torch.int32, device="cpu")
-            else:
-                last_page_len_host = last_page_len.cpu()
-
-            kv_lens_arr_host = flashinfer.get_seq_lens(indptr_host, last_page_len_host, page_size)
-
-            try:
-                # Make sure we pass exactly 15 arguments for tensor core version
-                wrapper._plan_info = wrapper._cached_module.plan(
-                    wrapper._float_workspace_buffer,
-                    wrapper._int_workspace_buffer,
-                    wrapper._pin_memory_int_workspace_buffer,
-                    qo_indptr_host,
-                    indptr_host,
-                    kv_lens_arr_host,
-                    batch_size,  # total_num_rows
-                    batch_size,
-                    num_qo_heads,
-                    num_kv_heads,
-                    page_size,
-                    wrapper.is_cuda_graph_enabled,
-                    head_dim,
-                    head_dim,
-                    False,  # causal
-                )
-            except Exception as e:
-                raise RuntimeError(f"Error in standard plan: {e}") from e
-        else:
-            try:
-                # Make sure we pass exactly 15 arguments for standard version
-                wrapper._plan_info = wrapper._cached_module.plan(
-                    wrapper._float_workspace_buffer,
-                    wrapper._int_workspace_buffer,
-                    wrapper._pin_memory_int_workspace_buffer,
-                    indptr_host,
-                    batch_size,
-                    num_qo_heads,
-                    num_kv_heads,
-                    page_size,
-                    wrapper.is_cuda_graph_enabled,
-                    window_left,
-                    logits_soft_cap,
-                    head_dim,
-                    head_dim,
-                    empty_q_data,
-                    empty_kv_cache,
-                )
-            except Exception as e:
-                raise RuntimeError(f"Error in standard plan: {e}") from e
-
-    wrapper._pos_encoding_mode = pos_encoding_mode
-    wrapper._window_left = window_left
-    wrapper._logits_soft_cap = logits_soft_cap
-    wrapper._sm_scale = sm_scale
-    wrapper._rope_scale = rope_scale
-    wrapper._rope_theta = rope_theta
-
-
 @dataclass
 class PlanParams:
     """Parameters that affect the flashinfer execution plan."""
@@ -233,12 +83,14 @@ class _FlashInferPlanner:
                 paged_kv_indices_buffer=indices,
                 paged_kv_last_page_len_buffer=last_page_len,
                 use_tensor_cores=True,
+                backend="fa2" if torch.cuda.get_device_capability(0) == (9, 0) else "auto",
             )
         else:
             return flashinfer.BatchDecodeWithPagedKVCacheWrapper(
                 self.workspace_buffer,
                 "NHD",
                 use_tensor_cores=True,
+                backend="fa2" if torch.cuda.get_device_capability(0) == (9, 0) else "auto",
             )
 
     def init_workspace(self, workspace_buffer: torch.Tensor):
@@ -268,7 +120,7 @@ class _FlashInferPlanner:
         for plan_params in self.cached_cuda_graph_decode_wrappers:
             if plan_params.num_seq == num_seq:
                 wrapper = self.cached_cuda_graph_decode_wrappers[plan_params]
-                fast_decode_plan(
+                flashinfer.decode.fast_decode_plan(
                     wrapper,
                     cu_num_pages,
                     cache_loc,
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 388cecba4f..5bf14f78a7 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -194,10 +194,10 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
         # TODO: multi-stream MOE seems to increase the memory usage
         kwargs["max_batch_size"] = 32
         kwargs["free_mem_ratio"] = 0.4
-        sampling_params = self.get_default_sampling_params()
         with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                            tokenizer=self.MODEL_PATH_BF16,
                            **kwargs) as llm:
+            sampling_params = self.get_default_sampling_params()
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
@@ -206,6 +206,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device_memory(32000)
     def test_fp8(self):
         kwargs = self.get_default_kwargs()
+        kwargs["max_batch_size"] = 64
         with AutoDeployLLM(model=self.MODEL_PATH_FP8,
                            tokenizer=self.MODEL_PATH_FP8,
                            **kwargs) as llm:
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index d0668658c0..2354ccfb3a 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -383,6 +383,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True] SKIP (https://nvbugs/5810980)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5814309)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646)
+unittest/_torch/thop/parallel/test_fp4_swizzle.py::test_swizzle_sf SKIP (https://nvbugs/5811159)
 unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py::test_mxfp4_mlp_ep_dtypes[1-4-6] SKIP (https://nvbugs/5814247)
 unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py::test_mxfp4_mlp_ep_dtypes[1-4-8] SKIP (https://nvbugs/5814247)
 unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py::test_allreduce_strategies[AUTO] SKIP (https://nvbugs/5814247)
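
Note (not part of the diff): the recurring `backend=...` argument added above pins FlashInfer's decode path to the FA2 backend on SM90 (Hopper, compute capability 9.0) and leaves backend selection to FlashInfer's "auto" mode on other architectures. Below is a minimal standalone sketch of that pattern, assuming a CUDA device and flashinfer-python ~=0.6 are installed; the helper name `pick_decode_backend` is illustrative and not part of this change.

```python
import torch
import flashinfer


def pick_decode_backend(device_index: int = 0) -> str:
    # torch.cuda.get_device_capability returns a (major, minor) tuple, e.g. (9, 0) on H100/H200.
    # Force the FA2 decode kernels on SM90; let FlashInfer auto-select everywhere else.
    return "fa2" if torch.cuda.get_device_capability(device_index) == (9, 0) else "auto"


# Workspace buffer for the wrapper (128 MB, the size commonly used in FlashInfer examples).
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")

decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
    workspace_buffer,
    "NHD",                      # KV-cache layout, matching the wrappers in this PR
    use_tensor_cores=True,
    backend=pick_decode_backend(),
)
```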