[HARDWARE][POWER] Enable SHM communicator support for PowerPC (#43754)

Signed-off-by: Rukhaiya <rukhaiya@c643n08aix1-lp1.pok.stglabs.ibm.com>
Signed-off-by: Rukhaiya <bibirukhaiya123@gmail.com>
Co-authored-by: Rukhaiya <rukhaiya@c643n08aix1-lp1.pok.stglabs.ibm.com>
Co-authored-by: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
This commit is contained in:
Rukhaiya2004
2026-06-02 15:36:32 +05:30
committed by GitHub
parent f8e9c56d15
commit 689b0eeb9e
6 changed files with 125 additions and 10 deletions
View File
+6
View File
@@ -423,6 +423,12 @@ if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND)
${VLLM_EXT_SRC})
endif()
# On POWER9/POWER10/POWER11 targets, prepend the SHM communicator source
# (csrc/cpu/shm.cpp) to the list of extension sources.
if (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC})
endif()
if(USE_ONEDNN)
set(VLLM_EXT_SRC
"csrc/cpu/dnnl_kernels.cpp"
+106 -1
View File
@@ -89,6 +89,35 @@ struct BF16Vec8 : public Vec<BF16Vec8> {
}
};
// Sixteen 16-bit lanes kept as raw bit patterns in two
// `__vector signed short` registers (reg.val[0] holds lanes 0-7,
// reg.val[1] holds lanes 8-15).
struct FP16Vec16 : public Vec<FP16Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  ss16x8x2_t reg;

  // Loads 32 bytes (16 lanes) starting at `ptr`.
  explicit FP16Vec16(const void* ptr) {
    signed short* src = (signed short*)ptr;
    reg.val[0] = (__vector signed short)vec_xl(0, src);
    reg.val[1] = (__vector signed short)vec_xl(16, src);
  }

  // Same load as the single-argument constructor; the bool tag is ignored.
  explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {}

  // Conversion from 16 floats; defined out of line.
  explicit FP16Vec16(const FP32Vec16&);

  // Stores all 16 lanes (32 bytes) to `ptr`.
  void save(void* ptr) const {
    signed short* dst = static_cast<signed short*>(ptr);
    vec_xst(reg.val[0], 0, dst);
    vec_xst(reg.val[1], 16, dst);
  }

  // Stores only the first `elem_num` lanes to `ptr`. `elem_num` is clamped
  // to [0, VEC_ELEM_NUM], so out-of-range requests never write past 16 lanes.
  void save(void* ptr, int elem_num) const {
    signed short* dst = static_cast<signed short*>(ptr);
    const int lanes = std::min(std::max(elem_num, 0), VEC_ELEM_NUM);

    if (lanes > 8) {
      // Lower register is complete; only the upper register is partial.
      vec_xst(reg.val[0], 0, dst);
      // vec_xst_len takes its length in bytes: 2 bytes per lane.
      vec_xst_len(reg.val[1], dst + 8, (lanes - 8) * 2);
      return;
    }

    // Everything requested fits in the lower register.
    vec_xst_len(reg.val[0], dst, lanes * 2);
  }
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
@@ -100,6 +129,8 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr);
}
explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {}
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const {
@@ -379,6 +410,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
reg.val[3] = vec_xl(48, ptr);
}
explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {}
explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
explicit FP32Vec16(const FP32Vec16& data) {
@@ -402,6 +435,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
reg.val[3] = data.reg.val[1];
}
explicit FP32Vec16(const FP16Vec16& v);
explicit FP32Vec16(const BF16Vec16& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
@@ -735,6 +769,40 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
#endif
}
// Converts 16 floats to 16-bit halves one element at a time through
// c10::Half, staging the data in aligned stack buffers.
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
  alignas(16) float fp32_staging[16];
  alignas(16) c10::Half fp16_staging[16];

  // Spill the four float registers; the vec_xst offset is in bytes.
  for (int r = 0; r < 4; ++r) {
    vec_xst(v.reg.val[r], r * 16, fp32_staging);
  }

  // Scalar float -> half conversion, lane by lane.
  for (int lane = 0; lane < 16; ++lane) {
    fp16_staging[lane] = c10::Half(fp32_staging[lane]);
  }

  // Reload the converted bit patterns as two 8 x int16 vectors.
  signed short* packed = reinterpret_cast<signed short*>(fp16_staging);
  reg.val[0] = (__vector signed short)vec_xl(0, packed);
  reg.val[1] = (__vector signed short)vec_xl(16, packed);
}
// Widens 16 half-precision values to floats one element at a time through
// c10::Half, staging the data in aligned stack buffers.
inline FP32Vec16::FP32Vec16(const FP16Vec16& v) {
  alignas(16) c10::Half fp16_staging[16];
  alignas(16) float fp32_staging[16];

  // Spill the two 8 x int16 registers as raw half bit patterns.
  signed short* raw = reinterpret_cast<signed short*>(fp16_staging);
  vec_xst(v.reg.val[0], 0, raw);
  vec_xst(v.reg.val[1], 16, raw);

  // Scalar half -> float conversion, lane by lane.
  for (int lane = 0; lane < 16; ++lane) {
    fp32_staging[lane] = static_cast<float>(fp16_staging[lane]);
  }

  // Reload the floats into the four registers; the vec_xl offset is in bytes.
  for (int r = 0; r < 4; ++r) {
    reg.val[r] = vec_xl(r * 16, fp32_staging);
  }
}
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
#ifdef _ARCH_PWR10
__vector signed short ret[4];
@@ -794,6 +862,43 @@ inline void prefetch(const void* addr) {
__asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
}
}; // namespace vec_op
// Sixty-four signed 8-bit lanes stored as four 16-byte vectors.
struct INT8Vec64 {
  __vector signed char data[4];

  INT8Vec64() = default;

  // Loads 64 bytes starting at `ptr`, 16 bytes per vector.
  explicit INT8Vec64(const int8_t* ptr) {
    for (int v = 0; v < 4; ++v) {
      data[v] = vec_xl(v * 16, ptr);
    }
  }

  // Same load as the single-argument constructor; the bool tag is ignored.
  explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {}

  // Stores all 64 bytes to `ptr`.
  void save(int8_t* ptr) const {
    for (int v = 0; v < 4; ++v) {
      vec_xst(data[v], v * 16, ptr);
    }
  }

  // Stores only the first `elem_num` bytes to `ptr`. Non-positive counts
  // write nothing; counts of 64 or more write exactly 64 bytes.
  void save(int8_t* ptr, int elem_num) const {
    if (elem_num <= 0) {
      return;
    }

    // Number of complete 16-byte vectors to write, capped at the four held.
    const int whole = (elem_num >= 64) ? 4 : elem_num / 16;
    for (int v = 0; v < whole; ++v) {
      vec_xst(data[v], v * 16, ptr);
    }

    // Trailing bytes that do not fill a vector; skipped once all four
    // vectors have already been written.
    const int tail = elem_num % 16;
    if (whole < 4 && tail > 0) {
      vec_xst_len(data[whole], ptr + whole * 16, tail);
    }
  }

  // Forwards to the regular full store.
  void nt_save(int8_t* ptr) const { save(ptr); }
};
} // namespace vec_op
#endif
+10 -8
View File
@@ -5,7 +5,7 @@
#include <sys/stat.h>
#include <unistd.h>
#ifdef __aarch64__
#if defined(__aarch64__) || defined(__powerpc64__)
#include <atomic>
#endif
@@ -38,7 +38,7 @@ struct KernelVecType<c10::Half> {
};
struct ThreadSHMContext {
#ifdef __aarch64__
#if defined(__aarch64__) || defined(__powerpc64__)
// memory model is weaker on AArch64 and PowerPC, so we use atomic variables
// for consumer (load-acquire) and producer (store-release) to make sure
// that a stamp cannot be ready before the corresponding data is ready.
@@ -75,7 +75,7 @@ struct ThreadSHMContext {
TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
TORCH_CHECK((size_t)this % 64 == 0);
TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
#ifdef __aarch64__
#if defined(__aarch64__) || defined(__powerpc64__)
_curr_thread_stamp[0].store(1, std::memory_order_relaxed);
_curr_thread_stamp[1].store(1, std::memory_order_relaxed);
_ready_thread_stamp[0].store(0, std::memory_order_relaxed);
@@ -124,7 +124,7 @@ struct ThreadSHMContext {
}
char get_curr_stamp(int idx) const {
#ifdef __aarch64__
#if defined(__aarch64__) || defined(__powerpc64__)
return _curr_thread_stamp[idx].load(std::memory_order_acquire);
#else
return _curr_thread_stamp[idx];
@@ -132,7 +132,7 @@ struct ThreadSHMContext {
}
char get_ready_stamp(int idx) const {
#ifdef __aarch64__
#if defined(__aarch64__) || defined(__powerpc64__)
return _ready_thread_stamp[idx].load(std::memory_order_acquire);
#else
return _ready_thread_stamp[idx];
@@ -140,7 +140,7 @@ struct ThreadSHMContext {
}
void next_stamp() {
#ifdef __aarch64__
#if defined(__aarch64__) || defined(__powerpc64__)
_curr_thread_stamp[local_stamp_buffer_idx].fetch_add(
1, std::memory_order_release);
#else
@@ -150,7 +150,7 @@ struct ThreadSHMContext {
}
void commit_ready_stamp() {
#ifdef __aarch64__
#if defined(__aarch64__) || defined(__powerpc64__)
_ready_thread_stamp[local_stamp_buffer_idx].store(
_curr_thread_stamp[local_stamp_buffer_idx].load(
std::memory_order_relaxed),
@@ -186,8 +186,10 @@ struct ThreadSHMContext {
break;
}
++_spinning_count;
#ifdef __aarch64__
#if defined(__aarch64__)
__asm__ __volatile__("yield");
#elif defined(__powerpc64__)
__asm__ __volatile__("or 1,1,1");
#else
_mm_pause();
#endif // __aarch64__
+2 -1
View File
@@ -378,7 +378,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#endif
// SHM CCL
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
defined(__powerpc64__)
ops.def(
"init_shm_manager(str name, int group_size, int rank, int thread_num) -> "
"int",
@@ -32,6 +32,7 @@ class CpuCommunicator(DeviceCommunicatorBase):
(
current_platform.get_cpu_architecture() == CpuArchEnum.X86
or current_platform.get_cpu_architecture() == CpuArchEnum.ARM
or current_platform.get_cpu_architecture() == CpuArchEnum.POWERPC
)
and hasattr(torch.ops._C, "init_shm_manager")
and (unique_name.startswith("tp") or unique_name.startswith("pp"))