diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md
index 3cff1f7398..b18cb0d3c6 100644
--- a/ATTRIBUTIONS-Python.md
+++ b/ATTRIBUTIONS-Python.md
@@ -5261,7 +5261,7 @@ For more information, please refer to
 - `Tracker`: https://github.com/tox-dev/py-filelock/issues
 
 
-## flashinfer-python (0.3.1.post1)
+## flashinfer-python (0.6.0)
 
 ### Licenses
 License: `Apache-2.0`
diff --git a/requirements.txt b/requirements.txt
index 2e789cbc7f..6980f8487f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -53,7 +53,7 @@ ordered-set
 peft
 patchelf
 einops
-flashinfer-python>=0.3.0,<0.4.0
+flashinfer-python~=0.6.0
 opencv-python-headless
 xgrammar==0.1.25
 llguidance==0.7.29
@@ -74,7 +74,7 @@ nvidia-cutlass-dsl==4.3.4; python_version >= "3.10"
 plotly
 numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
 partial_json_parser
-apache-tvm-ffi==0.1.4 # used for reduce nvidia-cutlass-dsl host overhead
+apache-tvm-ffi==0.1.6 # used for reduce nvidia-cutlass-dsl host overhead
 torch-c-dlpack-ext==0.1.3 # used for reduce nvidia-cutlass-dsl host overhead, optional package for improved torch tensor calling perf
 mistral-common==1.8.6
 torchao>=0.14.1
diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml
index 3279605a07..c0150ea753 100644
--- a/security_scanning/pyproject.toml
+++ b/security_scanning/pyproject.toml
@@ -57,7 +57,7 @@ ordered-set = "^4.1.0"
 peft = "^0.18.1"
 patchelf = "^0.17.2.4"
 einops = "^0.8.1"
-flashinfer-python = ">=0.3.0,<0.4.0"
+flashinfer-python = "^0.6.0"
 xgrammar = "0.1.25"
 llguidance = "0.7.29"
 jsonschema = "^4.26.0"
diff --git a/tensorrt_llm/_torch/attention_backend/flashinfer.py b/tensorrt_llm/_torch/attention_backend/flashinfer.py
index a6f52ee4a7..4766b49a6c 100644
--- a/tensorrt_llm/_torch/attention_backend/flashinfer.py
+++ b/tensorrt_llm/_torch/attention_backend/flashinfer.py
@@ -425,6 +425,8 @@ class FlashInferAttentionMetadata(AttentionMetadata):
                 paged_kv_indices_buffer=self._paged_kv_indices,
                 paged_kv_last_page_len_buffer=self._paged_kv_last_page_len,
                 use_tensor_cores=use_tensor_cores,
+                backend="fa2"
+                if torch.cuda.get_device_capability(0) == (9, 0) else "auto",
             )
 
         def decode_plan():
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
index c8c2f83515..ac530fd7ea 100644
--- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
+++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py
@@ -26,156 +26,6 @@ from .attention_interface import (
 )
 
 
-# TODO: remove this when flashinfer version is updated to >0.5
-def fast_decode_plan(
-    wrapper: flashinfer.BatchDecodeWithPagedKVCacheWrapper,
-    indptr: torch.Tensor,
-    indices: torch.Tensor,
-    last_page_len: torch.Tensor,
-    num_qo_heads: int,
-    num_kv_heads: int,
-    head_dim: int,
-    page_size: int,
-    pos_encoding_mode: str = "NONE",
-    window_left: int = -1,
-    logits_soft_cap: Optional[float] = None,
-    q_data_type: Optional[Union[str, torch.dtype]] = None,
-    kv_data_type: Optional[Union[str, torch.dtype]] = None,
-    data_type: Optional[Union[str, torch.dtype]] = None,
-    sm_scale: Optional[float] = None,
-    rope_scale: Optional[float] = None,
-    rope_theta: Optional[float] = None,
-    non_blocking: bool = True,
-    fixed_split_size: Optional[int] = None,
-    disable_split_kv: bool = False,
-    global_override_indptr_cpu: Optional[torch.Tensor] = None,
-) -> None:
-    """
-    Copied from flashinfer.decode.fast_decode_plan in flashinfer version >0.5.
-    Does not exist in flashinfer version 0.3.1, hence copied here.
-    """
-    batch_size = len(last_page_len)
-    if logits_soft_cap is None:
-        logits_soft_cap = 0.0
-
-    # Handle data types consistently
-    if data_type is not None:
-        if q_data_type is None:
-            q_data_type = data_type
-        if kv_data_type is None:
-            kv_data_type = data_type
-    elif q_data_type is None:
-        q_data_type = "float16"
-
-    if kv_data_type is None:
-        kv_data_type = q_data_type
-
-    if wrapper.use_tensor_cores:
-        qo_indptr_host = torch.arange(batch_size + 1, dtype=torch.int32, device="cpu")
-        # Here we set fixed_split_size to -1 to avoid the assertion error in flashinfer's plan function
-        if fixed_split_size is None:
-            fixed_split_size = -1
-
-    if wrapper.is_cuda_graph_enabled:
-        if batch_size != wrapper._fixed_batch_size:
-            raise ValueError(
-                "The batch size should be fixed in cudagraph mode, the runtime batch size {} "
-                " mismatches the batch size set during initialization {}".format(
-                    batch_size, wrapper._fixed_batch_size
-                )
-            )
-        if len(indices) > len(wrapper._paged_kv_indices_buf):
-            raise ValueError(
-                "The size of indices should be less than or equal to the allocated buffer"
-            )
-    else:
-        wrapper._paged_kv_indptr_buf = indptr
-        wrapper._paged_kv_indices_buf = indices
-        wrapper._paged_kv_last_page_len_buf = last_page_len
-        if wrapper.use_tensor_cores:
-            wrapper._qo_indptr_buf = qo_indptr_host.to(wrapper.device, non_blocking=non_blocking)
-
-    # Create empty tensors for dtype info if needed
-    empty_q_data = torch.empty(
-        0,
-        dtype=(getattr(torch, q_data_type) if isinstance(q_data_type, str) else q_data_type),
-        device=wrapper.device,
-    )
-
-    empty_kv_cache = torch.empty(
-        0,
-        dtype=(getattr(torch, kv_data_type) if isinstance(kv_data_type, str) else kv_data_type),
-        device=wrapper.device,
-    )
-
-    indptr_host = (
-        global_override_indptr_cpu if global_override_indptr_cpu is not None else indptr.cpu()
-    )
-
-    with torch.cuda.device(wrapper.device):
-        if wrapper.use_tensor_cores:
-            # ALSO convert last_page_len to CPU
-            if page_size == 1:
-                # When page size is 1, last_page_len is always 1.
-                # Directly construct the host tensor rather than executing a device-to-host copy.
-                last_page_len_host = torch.ones((batch_size,), dtype=torch.int32, device="cpu")
-            else:
-                last_page_len_host = last_page_len.cpu()
-
-            kv_lens_arr_host = flashinfer.get_seq_lens(indptr_host, last_page_len_host, page_size)
-
-            try:
-                # Make sure we pass exactly 15 arguments for tensor core version
-                wrapper._plan_info = wrapper._cached_module.plan(
-                    wrapper._float_workspace_buffer,
-                    wrapper._int_workspace_buffer,
-                    wrapper._pin_memory_int_workspace_buffer,
-                    qo_indptr_host,
-                    indptr_host,
-                    kv_lens_arr_host,
-                    batch_size,  # total_num_rows
-                    batch_size,
-                    num_qo_heads,
-                    num_kv_heads,
-                    page_size,
-                    wrapper.is_cuda_graph_enabled,
-                    head_dim,
-                    head_dim,
-                    False,  # causal
-                )
-            except Exception as e:
-                raise RuntimeError(f"Error in standard plan: {e}") from e
-        else:
-            try:
-                # Make sure we pass exactly 15 arguments for standard version
-                wrapper._plan_info = wrapper._cached_module.plan(
-                    wrapper._float_workspace_buffer,
-                    wrapper._int_workspace_buffer,
-                    wrapper._pin_memory_int_workspace_buffer,
-                    indptr_host,
-                    batch_size,
-                    num_qo_heads,
-                    num_kv_heads,
-                    page_size,
-                    wrapper.is_cuda_graph_enabled,
-                    window_left,
-                    logits_soft_cap,
-                    head_dim,
-                    head_dim,
-                    empty_q_data,
-                    empty_kv_cache,
-                )
-            except Exception as e:
-                raise RuntimeError(f"Error in standard plan: {e}") from e
-
-    wrapper._pos_encoding_mode = pos_encoding_mode
-    wrapper._window_left = window_left
-    wrapper._logits_soft_cap = logits_soft_cap
-    wrapper._sm_scale = sm_scale
-    wrapper._rope_scale = rope_scale
-    wrapper._rope_theta = rope_theta
-
-
 @dataclass
 class PlanParams:
     """Parameters that affect the flashinfer execution plan."""
@@ -233,12 +83,14 @@ class _FlashInferPlanner:
                 paged_kv_indices_buffer=indices,
                 paged_kv_last_page_len_buffer=last_page_len,
                 use_tensor_cores=True,
+                backend="fa2" if torch.cuda.get_device_capability(0) == (9, 0) else "auto",
             )
         else:
             return flashinfer.BatchDecodeWithPagedKVCacheWrapper(
                 self.workspace_buffer,
                 "NHD",
                 use_tensor_cores=True,
+                backend="fa2" if torch.cuda.get_device_capability(0) == (9, 0) else "auto",
             )
 
     def init_workspace(self, workspace_buffer: torch.Tensor):
@@ -268,7 +120,7 @@ class _FlashInferPlanner:
         for plan_params in self.cached_cuda_graph_decode_wrappers:
             if plan_params.num_seq == num_seq:
                 wrapper = self.cached_cuda_graph_decode_wrappers[plan_params]
-                fast_decode_plan(
+                flashinfer.decode.fast_decode_plan(
                     wrapper,
                     cu_num_pages,
                     cache_loc,
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 388cecba4f..5bf14f78a7 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -194,10 +194,10 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
         # TODO: multi-stream MOE seems to increase the memory usage
         kwargs["max_batch_size"] = 32
         kwargs["free_mem_ratio"] = 0.4
-        sampling_params = self.get_default_sampling_params()
         with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                            tokenizer=self.MODEL_PATH_BF16,
                            **kwargs) as llm:
+            sampling_params = self.get_default_sampling_params()
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
@@ -206,6 +206,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device_memory(32000)
     def test_fp8(self):
         kwargs = self.get_default_kwargs()
+        kwargs["max_batch_size"] = 64
         with AutoDeployLLM(model=self.MODEL_PATH_FP8,
                            tokenizer=self.MODEL_PATH_FP8,
                            **kwargs) as llm:
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index d0668658c0..2354ccfb3a 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -383,6 +383,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True] SKIP (https://nvbugs/5810980)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5814309)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646)
+unittest/_torch/thop/parallel/test_fp4_swizzle.py::test_swizzle_sf SKIP (https://nvbugs/5811159)
 unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py::test_mxfp4_mlp_ep_dtypes[1-4-6] SKIP (https://nvbugs/5814247)
 unittest/_torch/auto_deploy/unit/multigpu/custom_ops/test_mxfp4_moe_ep.py::test_mxfp4_mlp_ep_dtypes[1-4-8] SKIP (https://nvbugs/5814247)
 unittest/_torch/auto_deploy/unit/multigpu/test_ad_allreduce_strategies.py::test_allreduce_strategies[AUTO] SKIP (https://nvbugs/5814247)
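
Note (not part of the diff): the recurring `backend=...` argument added above pins FlashInfer's decode path to the FA2 backend on SM90 (Hopper, compute capability 9.0) and leaves backend selection to FlashInfer's "auto" mode on other architectures. Below is a minimal standalone sketch of that pattern, assuming a CUDA device and flashinfer-python ~=0.6 are installed; the helper name `pick_decode_backend` is illustrative and not part of this change.

```python
import torch
import flashinfer


def pick_decode_backend(device_index: int = 0) -> str:
    # torch.cuda.get_device_capability returns a (major, minor) tuple, e.g. (9, 0) on H100/H200.
    # Force the FA2 decode kernels on SM90; let FlashInfer auto-select everywhere else.
    return "fa2" if torch.cuda.get_device_capability(device_index) == (9, 0) else "auto"


# Workspace buffer for the wrapper (128 MB, the size commonly used in FlashInfer examples).
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")

decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
    workspace_buffer,
    "NHD",                      # KV-cache layout, matching the wrappers in this PR
    use_tensor_cores=True,
    backend=pick_decode_backend(),
)
```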