diff --git a/=4.5.1 b/=4.5.1 new file mode 100644 index 00000000000..e69de29bb2d diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 78825060559..6f836ff5354 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -423,6 +423,12 @@ if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) ${VLLM_EXT_SRC}) endif() +if (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) + set(VLLM_EXT_SRC + "csrc/cpu/shm.cpp" + ${VLLM_EXT_SRC}) +endif() + if(USE_ONEDNN) set(VLLM_EXT_SRC "csrc/cpu/dnnl_kernels.cpp" diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp index 87c7a9dd51f..ba65e27a15e 100644 --- a/csrc/cpu/cpu_types_vsx.hpp +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -89,6 +89,35 @@ struct BF16Vec8 : public Vec { } }; +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + ss16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); + } + + explicit FP16Vec16(bool, const void* ptr) : FP16Vec16(ptr) {} + + explicit FP16Vec16(const FP32Vec16&); + + void save(void* ptr) const { + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); + } + + void save(void* ptr, int elem_num) const { + int num = std::max(0, std::min(elem_num, VEC_ELEM_NUM)); + if (num <= 8) { + vec_xst_len(reg.val[0], (signed short*)ptr, num * 2); + } else { + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst_len(reg.val[1], (signed short*)ptr + 8, (num - 8) * 2); + } + } +}; + struct BF16Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; @@ -100,6 +129,8 @@ struct BF16Vec16 : public Vec { reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); } + explicit BF16Vec16(bool, const void* ptr) : BF16Vec16(ptr) {} + explicit BF16Vec16(const FP32Vec16&); void save(void* ptr) const { @@ -379,6 +410,8 @@ struct FP32Vec16 : public Vec { reg.val[3] = vec_xl(48, ptr); } + explicit FP32Vec16(bool, const float* ptr) : FP32Vec16(ptr) {} + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} explicit FP32Vec16(const FP32Vec16& data) { @@ -402,6 +435,7 @@ struct FP32Vec16 : public Vec { reg.val[3] = data.reg.val[1]; } + explicit FP32Vec16(const FP16Vec16& v); explicit FP32Vec16(const BF16Vec16& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); @@ -735,6 +769,40 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { #endif } +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + alignas(16) float temp_fp32[16]; + alignas(16) c10::Half temp_fp16[16]; + + vec_xst(v.reg.val[0], 0, temp_fp32); + vec_xst(v.reg.val[1], 16, temp_fp32); + vec_xst(v.reg.val[2], 32, temp_fp32); + vec_xst(v.reg.val[3], 48, temp_fp32); + + for (int i = 0; i < 16; i++) { + temp_fp16[i] = c10::Half(temp_fp32[i]); + } + + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)temp_fp16); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)temp_fp16); +} + +inline FP32Vec16::FP32Vec16(const FP16Vec16& v) { + alignas(16) c10::Half temp_fp16[16]; + alignas(16) float temp_fp32[16]; + + vec_xst(v.reg.val[0], 0, (signed short*)temp_fp16); + vec_xst(v.reg.val[1], 16, (signed short*)temp_fp16); + + for (int i = 0; i < 16; i++) { + temp_fp32[i] = float(temp_fp16[i]); + } + + reg.val[0] = vec_xl(0, temp_fp32); + reg.val[1] = vec_xl(16, temp_fp32); + reg.val[2] = vec_xl(32, temp_fp32); + reg.val[3] = vec_xl(48, temp_fp32); +} + inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { #ifdef _ARCH_PWR10 __vector signed short ret[4]; @@ -794,6 +862,43 @@ inline void prefetch(const void* addr) { __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); } -}; // namespace vec_op +struct INT8Vec64 { + __vector signed char data[4]; + + INT8Vec64() = default; + + explicit INT8Vec64(const int8_t* ptr) { + data[0] = vec_xl(0, ptr); + data[1] = vec_xl(16, ptr); + data[2] = vec_xl(32, ptr); + data[3] = vec_xl(48, ptr); + } + + explicit INT8Vec64(bool, const int8_t* ptr) : INT8Vec64(ptr) {} + + void save(int8_t* ptr) const { + vec_xst(data[0], 0, ptr); + vec_xst(data[1], 16, ptr); + vec_xst(data[2], 32, ptr); + vec_xst(data[3], 48, ptr); + } + + void save(int8_t* ptr, int elem_num) const { + if (elem_num <= 0) return; + + int full_vecs = elem_num / 16; + for (int i = 0; i < full_vecs && i < 4; i++) { + vec_xst(data[i], i * 16, ptr); + } + + int remaining = elem_num % 16; + if (remaining > 0 && full_vecs < 4) { + vec_xst_len(data[full_vecs], ptr + full_vecs * 16, remaining); + } + } + + void nt_save(int8_t* ptr) const { save(ptr); } +}; +} // namespace vec_op #endif diff --git a/csrc/cpu/shm.cpp b/csrc/cpu/shm.cpp index a7fdd0c9d9d..f1538d27646 100644 --- a/csrc/cpu/shm.cpp +++ b/csrc/cpu/shm.cpp @@ -5,7 +5,7 @@ #include #include -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__powerpc64__) #include #endif @@ -38,7 +38,7 @@ struct KernelVecType { }; struct ThreadSHMContext { -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__powerpc64__) // memory model is weaker on AArch64, so we use atomic variables for // consumer (load-acquire) and producer (store-release) to make sure // that a stamp cannot be ready before the corresponding data is ready. @@ -75,7 +75,7 @@ struct ThreadSHMContext { TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); TORCH_CHECK((size_t)this % 64 == 0); TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0); -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__powerpc64__) _curr_thread_stamp[0].store(1, std::memory_order_relaxed); _curr_thread_stamp[1].store(1, std::memory_order_relaxed); _ready_thread_stamp[0].store(0, std::memory_order_relaxed); @@ -124,7 +124,7 @@ struct ThreadSHMContext { } char get_curr_stamp(int idx) const { -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__powerpc64__) return _curr_thread_stamp[idx].load(std::memory_order_acquire); #else return _curr_thread_stamp[idx]; @@ -132,7 +132,7 @@ struct ThreadSHMContext { } char get_ready_stamp(int idx) const { -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__powerpc64__) return _ready_thread_stamp[idx].load(std::memory_order_acquire); #else return _ready_thread_stamp[idx]; @@ -140,7 +140,7 @@ struct ThreadSHMContext { } void next_stamp() { -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__powerpc64__) _curr_thread_stamp[local_stamp_buffer_idx].fetch_add( 1, std::memory_order_release); #else @@ -150,7 +150,7 @@ struct ThreadSHMContext { } void commit_ready_stamp() { -#ifdef __aarch64__ +#if defined(__aarch64__) || defined(__powerpc64__) _ready_thread_stamp[local_stamp_buffer_idx].store( _curr_thread_stamp[local_stamp_buffer_idx].load( std::memory_order_relaxed), @@ -186,8 +186,10 @@ struct ThreadSHMContext { break; } ++_spinning_count; -#ifdef __aarch64__ +#if defined(__aarch64__) __asm__ __volatile__("yield"); +#elif defined(__powerpc64__) + __asm__ __volatile__("or 1,1,1"); #else _mm_pause(); #endif // __aarch64__ diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index f3119aec80d..7a8188b8c8c 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -378,7 +378,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { #endif // SHM CCL -#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) +#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ + defined(__powerpc64__) ops.def( "init_shm_manager(str name, int group_size, int rank, int thread_num) -> " "int", diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py index 067cdad7348..b8d9d6c53d5 100644 --- a/vllm/distributed/device_communicators/cpu_communicator.py +++ b/vllm/distributed/device_communicators/cpu_communicator.py @@ -32,6 +32,7 @@ class CpuCommunicator(DeviceCommunicatorBase): ( current_platform.get_cpu_architecture() == CpuArchEnum.X86 or current_platform.get_cpu_architecture() == CpuArchEnum.ARM + or current_platform.get_cpu_architecture() == CpuArchEnum.POWERPC ) and hasattr(torch.ops._C, "init_shm_manager") and (unique_name.startswith("tp") or unique_name.startswith("pp"))