diff --git a/=4.5.1 b/=4.5.1
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 78825060559..6f836ff5354 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -423,6 +423,12 @@ if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
         ${VLLM_EXT_SRC})
 endif()
 
+if (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)  # SHM ops on POWER9+
+    set(VLLM_EXT_SRC
+        "csrc/cpu/shm.cpp"
+        ${VLLM_EXT_SRC})
+endif()
+
 if(USE_ONEDNN)
     set(VLLM_EXT_SRC
         "csrc/cpu/dnnl_kernels.cpp"
diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp
index 87c7a9dd51f..ba65e27a15e 100644
--- a/csrc/cpu/cpu_types_vsx.hpp
+++ b/csrc/cpu/cpu_types_vsx.hpp
@@ -89,6 +89,35 @@ struct BF16Vec8 : public Vec<BF16Vec8> {
   }
 };
 
+struct FP16Vec16 : public Vec<FP16Vec16> {  // 16 packed FP16 elements
+  constexpr static int VEC_ELEM_NUM = 16;
+  ss16x8x2_t reg;  // raw 16-bit lanes: val[0] = elems 0-7, val[1] = 8-15
+  // Loads 32 bytes from ptr: byte offset 0 -> val[0], offset 16 -> val[1].
+  explicit FP16Vec16(const void* ptr) {
+    reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr);
+    reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr);
+  }
+  // Same load as above; the leading bool argument is accepted but not used.
+  explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {}
+  // Converts 16 FP32 lanes to FP16; declared here, defined out of line.
+  explicit FP16Vec16(const FP32Vec16&);
+  // Stores all 16 elements (32 bytes) to ptr.
+  void save(void* ptr) const {
+    vec_xst(reg.val[0], 0, (signed short*)ptr);
+    vec_xst(reg.val[1], 16, (signed short*)ptr);
+  }
+  // Stores up to elem_num elements; elem_num is clamped to [0, 16].
+  void save(void* ptr, int elem_num) const {
+    int num = std::max(0, std::min(elem_num, VEC_ELEM_NUM));
+    if (num <= 8) {
+      vec_xst_len(reg.val[0], (signed short*)ptr, num * 2);  // length in bytes
+    } else {
+      vec_xst(reg.val[0], 0, (signed short*)ptr);
+      vec_xst_len(reg.val[1], (signed short*)ptr + 8, (num - 8) * 2);
+    }
+  }
+};
+
 struct BF16Vec16 : public Vec<BF16Vec16> {
   constexpr static int VEC_ELEM_NUM = 16;
 
@@ -100,6 +129,8 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
     reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr);
   }
 
+  explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {}  // bool unused
+
   explicit BF16Vec16(const FP32Vec16&);
 
   void save(void* ptr) const {
@@ -379,6 +410,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     reg.val[3] = vec_xl(48, ptr);
   }
 
+  explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {}  // bool unused
+
   explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
 
   explicit FP32Vec16(const FP32Vec16& data) {
@@ -402,6 +435,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
     reg.val[3] = data.reg.val[1];
   }
 
+  explicit FP32Vec16(const FP16Vec16& v);  // FP16 -> FP32, defined out of line
   explicit FP32Vec16(const BF16Vec16& v) {
     reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
     reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
@@ -735,6 +769,40 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
 #endif
 }
 
+inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {  // FP32 -> FP16 narrowing
+  alignas(16) float temp_fp32[16];
+  alignas(16) c10::Half temp_fp16[16];
+  // Spill the four FP32 registers to temp_fp32 at byte offsets 0/16/32/48.
+  vec_xst(v.reg.val[0], 0, temp_fp32);
+  vec_xst(v.reg.val[1], 16, temp_fp32);
+  vec_xst(v.reg.val[2], 32, temp_fp32);
+  vec_xst(v.reg.val[3], 48, temp_fp32);
+  // Convert lane by lane through c10::Half's constructor (scalar loop).
+  for (int i = 0; i < 16; i++) {
+    temp_fp16[i] = c10::Half(temp_fp32[i]);
+  }
+  // Reload the 16 converted halves (32 bytes) as two 8-lane registers.
+  reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)temp_fp16);
+  reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)temp_fp16);
+}
+
+inline FP32Vec16::FP32Vec16(const FP16Vec16& v) {  // FP16 -> FP32 widening
+  alignas(16) c10::Half temp_fp16[16];
+  alignas(16) float temp_fp32[16];
+  // Spill both FP16 registers (32 bytes) to temp_fp16.
+  vec_xst(v.reg.val[0], 0, (signed short*)temp_fp16);
+  vec_xst(v.reg.val[1], 16, (signed short*)temp_fp16);
+  // Convert lane by lane through c10::Half's float conversion (scalar loop).
+  for (int i = 0; i < 16; i++) {
+    temp_fp32[i] = float(temp_fp16[i]);
+  }
+  // Reload the 16 floats (64 bytes) as four 4-lane registers.
+  reg.val[0] = vec_xl(0, temp_fp32);
+  reg.val[1] = vec_xl(16, temp_fp32);
+  reg.val[2] = vec_xl(32, temp_fp32);
+  reg.val[3] = vec_xl(48, temp_fp32);
+}
+
 inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
 #ifdef _ARCH_PWR10
   __vector signed short ret[4];
@@ -794,6 +862,43 @@ inline void prefetch(const void* addr) {
   __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
 }
 
-};  // namespace vec_op
+struct INT8Vec64 {  // 64 int8 elements in four 16-byte vectors
+  __vector signed char data[4];
+  // Defaulted; does not explicitly initialize data.
+  INT8Vec64() = default;
+  // Loads 64 bytes from ptr in four 16-byte chunks.
+  explicit INT8Vec64(const int8_t* ptr) {
+    data[0] = vec_xl(0, ptr);
+    data[1] = vec_xl(16, ptr);
+    data[2] = vec_xl(32, ptr);
+    data[3] = vec_xl(48, ptr);
+  }
+  // Same load as above; the leading bool argument is accepted but not used.
+  explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {}
+  // Stores all 64 bytes to ptr.
+  void save(int8_t* ptr) const {
+    vec_xst(data[0], 0, ptr);
+    vec_xst(data[1], 16, ptr);
+    vec_xst(data[2], 32, ptr);
+    vec_xst(data[3], 48, ptr);
+  }
+  // Stores the first elem_num bytes, capped at 64; no-op if elem_num <= 0.
+  void save(int8_t* ptr, int elem_num) const {
+    if (elem_num <= 0) return;
+    // Whole 16-byte vectors first (at most four).
+    int full_vecs = elem_num / 16;
+    for (int i = 0; i < full_vecs && i < 4; i++) {
+      vec_xst(data[i], i * 16, ptr);
+    }
+    // Then the trailing partial vector, if fewer than four were written.
+    int remaining = elem_num % 16;
+    if (remaining > 0 && full_vecs < 4) {
+      vec_xst_len(data[full_vecs], ptr + full_vecs * 16, remaining);
+    }
+  }
+  // nt_save simply forwards to save(ptr).
+  void nt_save(int8_t* ptr) const { save(ptr); }
+};
+}  // namespace vec_op
 
 #endif
diff --git a/csrc/cpu/shm.cpp b/csrc/cpu/shm.cpp
index a7fdd0c9d9d..f1538d27646 100644
--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@@ -5,7 +5,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
   #include <atomic>
 #endif
 
@@ -38,7 +38,7 @@ struct KernelVecType<c10::Half> {
 };
 
 struct ThreadSHMContext {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
   // memory model is weaker on AArch64, so we use atomic variables for
   // consumer (load-acquire) and producer (store-release) to make sure
   // that a stamp cannot be ready before the corresponding data is ready.
@@ -75,7 +75,7 @@ struct ThreadSHMContext {
     TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
     TORCH_CHECK((size_t)this % 64 == 0);
     TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
     _curr_thread_stamp[0].store(1, std::memory_order_relaxed);
     _curr_thread_stamp[1].store(1, std::memory_order_relaxed);
     _ready_thread_stamp[0].store(0, std::memory_order_relaxed);
@@ -124,7 +124,7 @@ struct ThreadSHMContext {
   }
 
   char get_curr_stamp(int idx) const {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
     return _curr_thread_stamp[idx].load(std::memory_order_acquire);
 #else
     return _curr_thread_stamp[idx];
@@ -132,7 +132,7 @@ struct ThreadSHMContext {
   }
 
   char get_ready_stamp(int idx) const {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
     return _ready_thread_stamp[idx].load(std::memory_order_acquire);
 #else
     return _ready_thread_stamp[idx];
@@ -140,7 +140,7 @@ struct ThreadSHMContext {
   }
 
   void next_stamp() {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
     _curr_thread_stamp[local_stamp_buffer_idx].fetch_add(
         1, std::memory_order_release);
 #else
@@ -150,7 +150,7 @@ struct ThreadSHMContext {
   }
 
   void commit_ready_stamp() {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
     _ready_thread_stamp[local_stamp_buffer_idx].store(
         _curr_thread_stamp[local_stamp_buffer_idx].load(
             std::memory_order_relaxed),
@@ -186,8 +186,10 @@ struct ThreadSHMContext {
         break;
       }
       ++_spinning_count;
-#ifdef __aarch64__
+#if defined(__aarch64__)
       __asm__ __volatile__("yield");
+#elif defined(__powerpc64__)  // SMT priority nops: drop, then restore
+      __asm__ __volatile__("or 1,1,1\n\tor 2,2,2");  // low prio, then medium
 #else
       _mm_pause();
 #endif  // __aarch64__
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index f3119aec80d..7a8188b8c8c 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -378,7 +378,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif
 
 // SHM CCL
-#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
+    defined(__powerpc64__)
   ops.def(
       "init_shm_manager(str name, int group_size, int rank, int thread_num) -> "
       "int",
diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py
index 067cdad7348..b8d9d6c53d5 100644
--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
@@ -32,6 +32,7 @@ class CpuCommunicator(DeviceCommunicatorBase):
             (
                 current_platform.get_cpu_architecture() == CpuArchEnum.X86
                 or current_platform.get_cpu_architecture() == CpuArchEnum.ARM
+                or current_platform.get_cpu_architecture() == CpuArchEnum.POWERPC
             )
             and hasattr(torch.ops._C, "init_shm_manager")
             and (unique_name.startswith("tp") or unique_name.startswith("pp"))