diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd4a9c0b590..0b919e2314a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -590,8 +590,11 @@ endif()
 
 if (VLLM_GPU_LANG STREQUAL "HIP")
   # Add QuickReduce kernels (ROCm-only; not part of stable ABI migration).
+  # TODO: Remove the cuda_view when ROCm upgrade to torch 2.11.
   list(APPEND VLLM_EXT_SRC
     "csrc/custom_quickreduce.cu"
+    "csrc/cuda_view.cu"
+    "csrc/libtorch_stable/cuda_utils_kernels.cu"
   )
 # if ROCM endif
 endif()
@@ -621,7 +624,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   #
   set(VLLM_STABLE_EXT_SRC
     "csrc/libtorch_stable/torch_bindings.cpp"
-    "csrc/libtorch_stable/cuda_view.cu"
     "csrc/libtorch_stable/activation_kernels.cu"
     "csrc/libtorch_stable/quantization/w8a8/int8/scaled_quant.cu"
     "csrc/libtorch_stable/quantization/w8a8/fp8/common.cu"
@@ -649,6 +651,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
 
   if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND VLLM_STABLE_EXT_SRC
+      "csrc/libtorch_stable/cuda_view.cu"
       "csrc/libtorch_stable/cuda_utils_kernels.cu"
       "csrc/libtorch_stable/cutlass_extensions/common.cpp"
       "csrc/libtorch_stable/quantization/w8a8/cutlass/scaled_mm_entry.cu"
@@ -1071,20 +1074,25 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
     USE_SABI 3
     WITH_SOABI)
 
-  # Set TORCH_TARGET_VERSION for stable ABI compatibility.
-  # This ensures we only use C-shim APIs available in PyTorch 2.11.
-  # _C_stable_libtorch is abi compatible with PyTorch >= TORCH_TARGET_VERSION
-  # which is currently set to 2.11.
-  target_compile_definitions(_C_stable_libtorch PRIVATE
-    TORCH_TARGET_VERSION=0x020B000000000000ULL)
-
   # Needed to use cuda/hip APIs from C-shim
   if(VLLM_GPU_LANG STREQUAL "CUDA")
+    # Set TORCH_TARGET_VERSION for stable ABI compatibility.
+    # This ensures we only use C-shim APIs available in PyTorch 2.11.
+    # _C_stable_libtorch is abi compatible with PyTorch >= TORCH_TARGET_VERSION
+    # which is currently set to 2.11.
+    target_compile_definitions(_C_stable_libtorch PRIVATE
+      TORCH_TARGET_VERSION=0x020B000000000000ULL)
     target_compile_definitions(_C_stable_libtorch PRIVATE USE_CUDA)
     # Needed by CUTLASS kernels
     target_compile_definitions(_C_stable_libtorch PRIVATE
       CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
   elseif(VLLM_GPU_LANG STREQUAL "HIP")
+    # Set TORCH_TARGET_VERSION for stable ABI compatibility.
+    # This ensures we only use C-shim APIs available in PyTorch 2.10.
+    # _C_stable_libtorch is abi compatible with PyTorch >= TORCH_TARGET_VERSION
+    # which is currently set to 2.10.
+    target_compile_definitions(_C_stable_libtorch PRIVATE
+      TORCH_TARGET_VERSION=0x020A000000000000ULL)
     target_compile_definitions(_C_stable_libtorch PRIVATE USE_ROCM)
   endif()
 
diff --git a/csrc/cuda_view.cu b/csrc/cuda_view.cu
new file mode 100644
index 00000000000..00e06a9329f
--- /dev/null
+++ b/csrc/cuda_view.cu
@@ -0,0 +1,60 @@
+// TODO: Remove this once ROCm upgrade to torch 2.11.
+#include <torch/all.h>
+#include <torch/cuda.h>
+#include <cuda_runtime.h>
+
+// This function assumes that `cpu_tensor` is a CPU tensor,
+// and that UVA (Unified Virtual Addressing) is enabled.
+torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
+  TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
+
+  // handle empty tensor
+  if (cpu_tensor.numel() == 0) {
+    return torch::empty(cpu_tensor.sizes(),
+                        cpu_tensor.options().device(torch::kCUDA));
+  }
+
+  if (cpu_tensor.is_pinned()) {
+    // If CPU tensor is pinned, directly get the device pointer.
+    void* host_ptr = const_cast<void*>(cpu_tensor.data_ptr());
+    void* device_ptr = nullptr;
+    cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
+    TORCH_CHECK(err == cudaSuccess,
+                "cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
+
+    return torch::from_blob(
+        device_ptr, cpu_tensor.sizes(), cpu_tensor.strides(),
+        [base = cpu_tensor](void*) {},  // keep cpu tensor alive
+        cpu_tensor.options().device(torch::kCUDA));
+  }
+
+  // If CPU tensor is not pinned, allocate a new pinned memory buffer.
+  torch::Tensor contiguous_cpu = cpu_tensor.contiguous();
+  size_t nbytes = contiguous_cpu.nbytes();
+
+  void* host_ptr = nullptr;
+  cudaError_t err = cudaHostAlloc(&host_ptr, nbytes, cudaHostAllocMapped);
+  if (err != cudaSuccess) {
+    AT_ERROR("cudaHostAlloc failed: ", cudaGetErrorString(err));
+  }
+
+  err = cudaMemcpy(host_ptr, contiguous_cpu.data_ptr(), nbytes,
+                   cudaMemcpyDefault);
+  if (err != cudaSuccess) {
+    cudaFreeHost(host_ptr);
+    AT_ERROR("cudaMemcpy failed: ", cudaGetErrorString(err));
+  }
+
+  void* device_ptr = nullptr;
+  err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
+  if (err != cudaSuccess) {
+    cudaFreeHost(host_ptr);
+    AT_ERROR("cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
+  }
+
+  auto deleter = [host_ptr](void*) { cudaFreeHost(host_ptr); };
+
+  return torch::from_blob(device_ptr, contiguous_cpu.sizes(),
+                          contiguous_cpu.strides(), deleter,
+                          contiguous_cpu.options().device(torch::kCUDA));
+}
\ No newline at end of file
diff --git a/csrc/libtorch_stable/ops.h b/csrc/libtorch_stable/ops.h
index 536693fee96..3571726f617 100644
--- a/csrc/libtorch_stable/ops.h
+++ b/csrc/libtorch_stable/ops.h
@@ -162,12 +162,13 @@ torch::stable::Tensor awq_dequantize(torch::stable::Tensor _kernel,
 // AllSpark ops: declarations are in the source files
 // (allspark_repack.cu and allspark_qgemm_w8a16.cu)
 
-#endif
-
-// CPU tensor -> CUDA UVA view (shared CUDA/ROCm)
+// TODO: Move this out once ROCm upgrade their torch to 2.11.
+// CPU tensor -> CUDA UVA view (shared CUDA)
 torch::stable::Tensor get_cuda_view_from_cpu_tensor(
     torch::stable::Tensor& cpu_tensor);
 
+#endif
+
 // Attention kernels (shared CUDA/ROCm)
 void merge_attn_states(
     torch::stable::Tensor& output,
diff --git a/csrc/libtorch_stable/torch_bindings.cpp b/csrc/libtorch_stable/torch_bindings.cpp
index d376fd43aa7..6c699ffd981 100644
--- a/csrc/libtorch_stable/torch_bindings.cpp
+++ b/csrc/libtorch_stable/torch_bindings.cpp
@@ -28,9 +28,11 @@ STABLE_TORCH_LIBRARY_FRAGMENT(_C, ops) {
       "output_s, int group_size, float eps, float int8_min, float int8_max) -> "
       "()");
 
+#ifndef USE_ROCM
+
+  // TODO: Remove this once ROCm upgrade to torch 2.11.
   ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor");
 
-#ifndef USE_ROCM
   ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
 #endif
 
@@ -676,11 +678,28 @@ STABLE_TORCH_LIBRARY_IMPL(_C, CUDA, ops) {
   ops.impl("paged_attention_v2", TORCH_BOX(&paged_attention_v2));
 }
 
+// TODO: Remove this once ROCm upgrade to torch 2.11.
+#ifndef USE_ROCM
 STABLE_TORCH_LIBRARY_IMPL(_C, CPU, ops) {
   ops.impl("get_cuda_view_from_cpu_tensor",
            TORCH_BOX(&get_cuda_view_from_cpu_tensor));
 }
 
+STABLE_TORCH_LIBRARY_FRAGMENT(_C_cuda_utils, cuda_utils) {
+  cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
+  cuda_utils.def(
+      "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(_C_cuda_utils, CompositeExplicitAutograd,
+                          cuda_utils) {
+  cuda_utils.impl("get_device_attribute", TORCH_BOX(&get_device_attribute));
+  cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
+                  TORCH_BOX(&get_max_shared_memory_per_block_device_attribute));
+}
+
+#endif
+
 // These capability-check functions take only primitive args (no tensors), so
 // there is no device to dispatch on. CompositeExplicitAutograd makes them
 // available for all backends. This is the stable ABI equivalent of calling
@@ -701,19 +720,6 @@ STABLE_TORCH_LIBRARY_IMPL(_C, CompositeExplicitAutograd, ops) {
   ops.impl("ggml_moe_get_block_size", TORCH_BOX(&ggml_moe_get_block_size));
 }
 
-STABLE_TORCH_LIBRARY_FRAGMENT(_C_cuda_utils, cuda_utils) {
-  cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
-  cuda_utils.def(
-      "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(_C_cuda_utils, CompositeExplicitAutograd,
-                          cuda_utils) {
-  cuda_utils.impl("get_device_attribute", TORCH_BOX(&get_device_attribute));
-  cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
-                  TORCH_BOX(&get_max_shared_memory_per_block_device_attribute));
-}
-
 // Cache ops
 STABLE_TORCH_LIBRARY_FRAGMENT(_C_cache_ops, ops) {
   // Swap in (out) the cache blocks from src to dst.
diff --git a/csrc/ops.h b/csrc/ops.h
index 11b704aff29..e39bae08f19 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -102,4 +102,7 @@ void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
 void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                    int64_t quant_level, bool cast_bf2half = false);
 int64_t qr_max_size();
+
+// TODO: Remove this once ROCm upgrade to torch 2.11.
+torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
 #endif
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index f29b941affd..c63e59c3b03 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -2,6 +2,7 @@
 // cache.h, which is no longer included here after cache ops moved to
 // _C_stable_libtorch).
 #include <torch/all.h>
+#include "cuda_utils.h"
 #include "ops.h"
 #include "core/registration.h"
 #include <torch/library.h>
@@ -31,6 +32,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("weak_ref_tensor(Tensor input) -> Tensor");
   ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
 
+#ifdef USE_ROCM
+  // TODO: Remove this once we upgrade to torch 2.11.
+  // ROCm still uses torch 2.10,
+  // So we still need to use unstable torch ABI for now.
+  ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor");
+  ops.impl("get_cuda_view_from_cpu_tensor", torch::kCPU,
+           &get_cuda_view_from_cpu_tensor);
+#endif
+
   // Activation ops (quantized only — basic ops moved to _C_stable_libtorch)
   ops.def(
       "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
@@ -159,6 +169,20 @@ TORCH_LIBRARY_FRAGMENT(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
 
   custom_ar.def("qr_max_size", &qr_max_size);
 }
+
+// TODO: Remove this once ROCm upgrade to torch 2.11.
+TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
+  // Cuda utils
+  // Gets the specified device attribute.
+  cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
+  cuda_utils.impl("get_device_attribute", &get_device_attribute);
+
+  // Gets the maximum shared memory per block device attribute.
+  cuda_utils.def(
+      "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
+  cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
+                  &get_max_shared_memory_per_block_device_attribute);
+}
 #endif
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)