[HARDWARE][POWER] Enable SHM communicator support for PowerPC (#43754)

Signed-off-by: Rukhaiya <rukhaiya@c643n08aix1-lp1.pok.stglabs.ibm.com> Signed-off-by: Rukhaiya <bibirukhaiya123@gmail.com> Co-authored-by: Rukhaiya <rukhaiya@c643n08aix1-lp1.pok.stglabs.ibm.com> Co-authored-by: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com> Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2026-06-06 00:16:14 +00:00 · 2026-06-02 15:36:32 +05:30
parent f8e9c56d15
commit 689b0eeb9e
6 changed files with 125 additions and 10 deletions
@@ -423,6 +423,12 @@ if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
        ${VLLM_EXT_SRC})
 endif()

+# Add the SHM communicator source when targeting POWER9, POWER10 or POWER11.
+if (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+    set(VLLM_EXT_SRC
+        "csrc/cpu/shm.cpp"
+        ${VLLM_EXT_SRC})
+endif()
+
 if(USE_ONEDNN)
    set(VLLM_EXT_SRC
        "csrc/cpu/dnnl_kernels.cpp"
@@ -89,6 +89,35 @@ struct BF16Vec8 : public Vec<BF16Vec8> {
  }
 };

+// Sixteen 16-bit lanes holding FP16 bit patterns, kept in two 8-lane vector
+// registers: reg.val[0] carries lanes 0-7 and reg.val[1] carries lanes 8-15.
+struct FP16Vec16 : public Vec<FP16Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  ss16x8x2_t reg;
+
+  // Loads 16 consecutive 16-bit elements (32 bytes) starting at ptr.
+  explicit FP16Vec16(const void* ptr) {
+    signed short* src = (signed short*)ptr;
+    reg.val[0] = (__vector signed short)vec_xl(0, src);
+    reg.val[1] = (__vector signed short)vec_xl(16, src);
+  }
+
+  // Same load as above; the leading bool argument is ignored.
+  explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {}
+
+  // Conversion from 16 FP32 lanes; defined out of line.
+  explicit FP16Vec16(const FP32Vec16&);
+
+  // Stores all 16 elements (32 bytes) starting at ptr.
+  void save(void* ptr) const {
+    signed short* dst = (signed short*)ptr;
+    vec_xst(reg.val[0], 0, dst);
+    vec_xst(reg.val[1], 16, dst);
+  }
+
+  // Stores only the first elem_num elements; elem_num is clamped to
+  // [0, VEC_ELEM_NUM]. The length passed to vec_xst_len is a byte count,
+  // hence the factor of 2 per 16-bit element.
+  void save(void* ptr, int elem_num) const {
+    signed short* dst = (signed short*)ptr;
+    const int count = std::min(std::max(elem_num, 0), VEC_ELEM_NUM);
+    if (count > 8) {
+      // Lower half is complete; only the upper half is partially written.
+      vec_xst(reg.val[0], 0, dst);
+      vec_xst_len(reg.val[1], dst + 8, (count - 8) * 2);
+      return;
+    }
+    vec_xst_len(reg.val[0], dst, count * 2);
+  }
+};
+
 struct BF16Vec16 : public Vec<BF16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

@@ -100,6 +129,8 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
    reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr);
  }

+  explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {}
+
  explicit BF16Vec16(const FP32Vec16&);

  void save(void* ptr) const {
@@ -379,6 +410,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    reg.val[3] = vec_xl(48, ptr);
  }

+  explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {}
+
  explicit FP32Vec16(f32x4x4_t data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec16& data) {
@@ -402,6 +435,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    reg.val[3] = data.reg.val[1];
  }

+  explicit FP32Vec16(const FP16Vec16& v);
  explicit FP32Vec16(const BF16Vec16& v) {
    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
    reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
@@ -735,6 +769,40 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
 #endif
 }

+// Converts 16 FP32 lanes to FP16 by spilling to a stack buffer, converting
+// each element through c10::Half, and reloading the packed 16-bit results.
+inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
+  alignas(16) float fp32_lanes[16];
+  alignas(16) c10::Half fp16_lanes[16];
+
+  // Each of the four source registers covers 16 bytes (4 floats).
+  for (int q = 0; q < 4; ++q) {
+    vec_xst(v.reg.val[q], q * 16, fp32_lanes);
+  }
+
+  for (int lane = 0; lane < 16; ++lane) {
+    fp16_lanes[lane] = c10::Half(fp32_lanes[lane]);
+  }
+
+  signed short* packed = (signed short*)fp16_lanes;
+  reg.val[0] = (__vector signed short)vec_xl(0, packed);
+  reg.val[1] = (__vector signed short)vec_xl(16, packed);
+}
+
+// Widens 16 FP16 lanes to FP32 by spilling the raw 16-bit patterns to a stack
+// buffer, converting each element through c10::Half, and reloading as floats.
+inline FP32Vec16::FP32Vec16(const FP16Vec16& v) {
+  alignas(16) c10::Half fp16_lanes[16];
+  alignas(16) float fp32_lanes[16];
+
+  signed short* packed = (signed short*)fp16_lanes;
+  vec_xst(v.reg.val[0], 0, packed);
+  vec_xst(v.reg.val[1], 16, packed);
+
+  for (int lane = 0; lane < 16; ++lane) {
+    fp32_lanes[lane] = float(fp16_lanes[lane]);
+  }
+
+  // Each of the four destination registers covers 16 bytes (4 floats).
+  for (int q = 0; q < 4; ++q) {
+    reg.val[q] = vec_xl(q * 16, fp32_lanes);
+  }
+}
+
 inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
 #ifdef _ARCH_PWR10
  __vector signed short ret[4];
@@ -794,6 +862,43 @@ inline void prefetch(const void* addr) {
  __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
 }

-};  // namespace vec_op
+// Sixty-four signed 8-bit lanes stored as four 16-byte vectors; data[i] holds
+// bytes [16 * i, 16 * i + 15].
+struct INT8Vec64 {
+  __vector signed char data[4];
+
+  INT8Vec64() = default;
+
+  // Loads 64 consecutive bytes starting at ptr.
+  explicit INT8Vec64(const int8_t* ptr) {
+    for (int v = 0; v < 4; ++v) {
+      data[v] = vec_xl(v * 16, ptr);
+    }
+  }
+
+  // Same load as above; the leading bool argument is ignored.
+  explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {}
+
+  // Stores all 64 bytes starting at ptr.
+  void save(int8_t* ptr) const {
+    for (int v = 0; v < 4; ++v) {
+      vec_xst(data[v], v * 16, ptr);
+    }
+  }
+
+  // Stores the first elem_num bytes. Nothing is written when elem_num <= 0,
+  // and at most 64 bytes are written regardless of how large elem_num is.
+  void save(int8_t* ptr, int elem_num) const {
+    if (elem_num <= 0) return;
+
+    const int whole = elem_num / 16;
+    const int tail = elem_num % 16;
+
+    // Write every fully covered 16-byte vector first.
+    int idx = 0;
+    while (idx < 4 && idx < whole) {
+      vec_xst(data[idx], idx * 16, ptr);
+      ++idx;
+    }
+
+    // No partial vector is left if all four were written or nothing remains.
+    if (idx == 4 || tail == 0) return;
+    vec_xst_len(data[idx], ptr + idx * 16, tail);
+  }
+
+  // Forwards to save(); no separate store path is used here.
+  void nt_save(int8_t* ptr) const { save(ptr); }
+};
+}  // namespace vec_op

 #endif
@@ -5,7 +5,7 @@
 #include <sys/stat.h>
 #include <unistd.h>

-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
  #include <atomic>
 #endif

@@ -38,7 +38,7 @@ struct KernelVecType<c10::Half> {
 };

 struct ThreadSHMContext {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
  // memory model is weaker on AArch64 and PowerPC, so we use atomic variables
  // for consumer (load-acquire) and producer (store-release) to make sure
  // that a stamp cannot be ready before the corresponding data is ready.
@@ -75,7 +75,7 @@ struct ThreadSHMContext {
    TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
    TORCH_CHECK((size_t)this % 64 == 0);
    TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
    _curr_thread_stamp[0].store(1, std::memory_order_relaxed);
    _curr_thread_stamp[1].store(1, std::memory_order_relaxed);
    _ready_thread_stamp[0].store(0, std::memory_order_relaxed);
@@ -124,7 +124,7 @@ struct ThreadSHMContext {
  }

  char get_curr_stamp(int idx) const {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
    return _curr_thread_stamp[idx].load(std::memory_order_acquire);
 #else
    return _curr_thread_stamp[idx];
@@ -132,7 +132,7 @@ struct ThreadSHMContext {
  }

  char get_ready_stamp(int idx) const {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
    return _ready_thread_stamp[idx].load(std::memory_order_acquire);
 #else
    return _ready_thread_stamp[idx];
@@ -140,7 +140,7 @@ struct ThreadSHMContext {
  }

  void next_stamp() {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
    _curr_thread_stamp[local_stamp_buffer_idx].fetch_add(
        1, std::memory_order_release);
 #else
@@ -150,7 +150,7 @@ struct ThreadSHMContext {
  }

  void commit_ready_stamp() {
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__powerpc64__)
    _ready_thread_stamp[local_stamp_buffer_idx].store(
        _curr_thread_stamp[local_stamp_buffer_idx].load(
            std::memory_order_relaxed),
@@ -186,8 +186,10 @@ struct ThreadSHMContext {
        break;
      }
      ++_spinning_count;
-#ifdef __aarch64__
+#if defined(__aarch64__)
      __asm__ __volatile__("yield");
+#elif defined(__powerpc64__)
+      __asm__ __volatile__("or 1,1,1");
 #else
      _mm_pause();
 #endif  // __aarch64__
@@ -378,7 +378,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif

 // SHM CCL
-#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
+    defined(__powerpc64__)
  ops.def(
      "init_shm_manager(str name, int group_size, int rank, int thread_num) -> "
      "int",
@@ -32,6 +32,7 @@ class CpuCommunicator(DeviceCommunicatorBase):
            (
                current_platform.get_cpu_architecture() == CpuArchEnum.X86
                or current_platform.get_cpu_architecture() == CpuArchEnum.ARM
+                or current_platform.get_cpu_architecture() == CpuArchEnum.POWERPC
            )
            and hasattr(torch.ops._C, "init_shm_manager")
            and (unique_name.startswith("tp") or unique_name.startswith("pp"))