sycl : support MUL_MAT and OUT_PROD with Q1_0 (#24721)

This commit is contained in:
Neo Zhang
2026-06-18 16:17:37 +08:00
committed by GitHub
parent 6ec59ddaea
commit dd69db2924
7 changed files with 247 additions and 18 deletions
+6
View File
@@ -642,6 +642,8 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
switch (type) {
case GGML_TYPE_Q1_0:
return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
@@ -724,6 +726,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
switch (type) {
case GGML_TYPE_Q1_0:
return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
@@ -830,6 +834,8 @@ to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
case GGML_TYPE_BF16:
return convert_unary_nc_sycl<sycl::ext::oneapi::bfloat16>;
#endif
case GGML_TYPE_Q1_0:
return dequantize_block_nc_sycl<QK1_0, QR1_0, dequantize_q1_0>;
case GGML_TYPE_Q4_0:
return dequantize_block_nc_sycl<QK4_0, QR4_0, dequantize_q4_0>;
case GGML_TYPE_Q4_1:
+15
View File
@@ -70,6 +70,21 @@ static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int
#endif // GGML_SYCL_F16
}
static __dpct_inline__ void dequantize_q1_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
const int iqs, dfloat2 &v) {
// Q1_0 reorder layout: scale values followed by quantized bits
const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib);
const int bit_index_0 = iqs + 0;
const int bit_index_1 = iqs + 1;
const int bit_0 = (*((const uint8_t *)qs + bit_index_0 / 8) >> (bit_index_0 % 8)) & 1;
const int bit_1 = (*((const uint8_t *)qs + bit_index_1 / 8) >> (bit_index_1 % 8)) & 1;
v.x() = (2 * bit_0 - 1) * d;
v.y() = (2 * bit_1 - 1) * d;
}
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
const int iqs, dfloat2 &v) {
const block_q4_1 * x = (const block_q4_1 *) vx;
+53
View File
@@ -1423,6 +1423,50 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
}
}
static void dequantize_mul_mat_vec_q1_0_sycl_reorder(const void *vx, const dfloat *y,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
dequantize_mul_mat_vec_reorder<QK1_0, QR1_0, dequantize_q1_0_reorder>(
vx, y, dst, ncols, nrows, item_ct1);
});
}
}
static void dequantize_mul_mat_vec_q1_0_sycl(const void *vx, const dfloat *y,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
dequantize_mul_mat_vec<QK1_0, QR1_0, dequantize_q1_0>(
vx, y, dst, ncols, nrows, item_ct1);
});
}
}
static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
float *dst, const int ncols,
const int nrows,
@@ -1759,6 +1803,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
sycl::half *src1_dfloat = nullptr; // dfloat == half
bool src1_convert_f16 =
src0->type == GGML_TYPE_Q1_0 ||
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
@@ -1777,6 +1822,14 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
#endif // GGML_SYCL_F16
switch (src0->type) {
case GGML_TYPE_Q1_0:
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
dequantize_mul_mat_vec_q1_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
} else {
dequantize_mul_mat_vec_q1_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q4_0:
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+19 -9
View File
@@ -976,6 +976,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
}
switch(type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
return max_compute_capability >= VER_GEN9 ? 128 : 64;
@@ -3507,6 +3508,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
@@ -3522,6 +3524,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
@@ -3532,6 +3535,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q3_K:
@@ -3546,6 +3550,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -5385,7 +5390,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
return nullptr;
}
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
static bool do_ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_sycl_device_context *sycl_ctx =
(ggml_backend_sycl_device_context *)dev->context;
int device = sycl_ctx->device;
@@ -5450,19 +5455,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
struct ggml_tensor * a = op->src[0];
struct ggml_tensor * b = op->src[1];
// disable Q1_0 until implementation
if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
return false;
}
if (a->ne[3] != b->ne[3]) {
return false;
}
ggml_type src0_type = op->src[0]->type;
// TODO: The configuration below needs more work to be supported with oneDNN
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
@@ -5472,12 +5470,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
// TODO: This specific configuration can fail with oneDNN and needs more debugging
if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
printf("zjy 2\n");
return false;
}
return true;
}
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
return op->type == GGML_TYPE_F32 &&
(op->src[0]->type == GGML_TYPE_F32 ||
(op->src[0]->type == GGML_TYPE_Q1_0 && op->src[0]->ne[2] == op->src[1]->ne[2] &&
op->src[0]->ne[3] == op->src[1]->ne[3])) &&
op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_GET_ROWS:
{
switch (op->src[0]->type) {
@@ -5734,6 +5737,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
GGML_UNUSED(dev);
}
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
bool res = do_ggml_backend_sycl_device_supports_op(dev, op);
GGML_SYCL_DEBUG("[SYCL] call %s op->op=%s op->type=%s -> %s\n", __func__, ggml_op_name(op->op),
ggml_type_name(op->type), res ? "true" : "false");
return res;
}
static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) {
return false;
+74
View File
@@ -1194,6 +1194,66 @@ static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols(
}
}
static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy,
float * dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK1_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q<QK1_0, QI1_0, block_q1_0,
VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
});
}
template <int ncols_dst>
static void mul_mat_vec_q1_0_q8_1_sycl_ncols(
const void * vx, const void * vy, float * dst,
const int ncols, const int nrows,
const int stride_col_y, const int stride_col_dst,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK1_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_ncols<QK1_0, QI1_0, block_q1_0,
VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1, ncols_dst>(
vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1);
});
});
}
static void mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
const void * vx, const void * vy, float * dst,
const int ncols, const int nrows, const int ncols_dst,
const int stride_col_y, const int stride_col_dst,
dpct::queue_ptr stream) {
switch (ncols_dst) {
case 1: mul_mat_vec_q1_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break;
case 2: mul_mat_vec_q1_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 3: mul_mat_vec_q1_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 4: mul_mat_vec_q1_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 5: mul_mat_vec_q1_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 6: mul_mat_vec_q1_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 7: mul_mat_vec_q1_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
case 8: mul_mat_vec_q1_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
default: GGML_ABORT("unsupported ncols_dst=%d for Q1_0 multi-col MMVQ", ncols_dst);
}
}
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
@@ -2120,6 +2180,20 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q1_0:
if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
const int stride_col_y = src1_padded_col_size / QK8_1;
const int stride_col_dst = dst->ne[0];
GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols);
mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff,
src1_ncols, stride_col_y, stride_col_dst, stream);
return;
} else if (i == 0 || src1_ncols == 1) {
GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl\n");
mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q2_K:
if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
const int stride_col_y = src1_padded_col_size / QK8_1;
+45 -9
View File
@@ -1,11 +1,12 @@
#include "outprod.hpp"
#include "convert.hpp"
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q1_0);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
@@ -20,11 +21,31 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
GGML_ASSERT(ne01 == ne11); // Inner dimensions must match
GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows
GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
GGML_ASSERT(ne2 % ne02 == 0);
GGML_ASSERT(ne3 % ne03 == 0);
// Get data pointers
const float* src0_d = (const float*)src0->data;
const float* src1_d = (const float*)src1->data;
float* dst_d = (float*)dst->data;
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
ggml_sycl_pool_alloc<float> src0_as_f32(ctx.pool());
int64_t src0_nb02 = nb02;
int64_t src0_nb03 = nb03;
if (src0->type == GGML_TYPE_Q1_0) {
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting src0 Q1_0 to fp32");
src0_d = src0_as_f32.alloc(ne00 * ne01 * ne02 * ne03);
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
GGML_ASSERT(to_fp32_sycl != nullptr);
to_fp32_sycl(src0->data, const_cast<float *>(src0_d), ne00 * ne01 * ne02 * ne03, stream);
// Dequantized src0 buffer is contiguous fp32 [ne00, ne01, ne02, ne03].
src0_nb02 = ne00 * ne01 * (int64_t) sizeof(float);
src0_nb03 = ne00 * ne01 * ne02 * (int64_t) sizeof(float);
}
// GEMM parameters
const float alpha = 1.0f;
@@ -35,12 +56,27 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
const int64_t r2 = ne2 / ne02;
const int64_t r3 = ne3 / ne03;
try {
// Perform matrix multiplication using oneMKL GEMM
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
}
catch (sycl::exception const& exc) {
// OUT_PROD applies independently to each (i2, i3) destination plane.
for (int64_t i3 = 0; i3 < ne3; ++i3) {
for (int64_t i2 = 0; i2 < ne2; ++i2) {
const int64_t i03 = i3 / r3;
const int64_t i02 = i2 / r2;
const float * src0_plane = (const float *) ((const char *) src0_d + i02 * src0_nb02 + i03 * src0_nb03);
const float * src1_plane = (const float *) ((const char *) src1_d + i2 * nb12 + i3 * nb13);
float * dst_plane = (float *) ((char *) dst_d + i2 * nb2 + i3 * nb3);
// Perform matrix multiplication using oneMKL GEMM
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
ne0, ne1, ne01, alpha, src0_plane, ne00,
src1_plane, ldb, beta, dst_plane, ne0);
}
}
} catch (sycl::exception const& exc) {
std::cerr << exc.what() << std::endl;
GGML_ASSERT(false);
}
+35
View File
@@ -309,6 +309,41 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
}
#define VDR_Q1_0_Q8_1_MMVQ 1
#define VDR_Q1_0_Q8_1_MMQ 4
static __dpct_inline__ float
vec_dot_q1_0_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq;
const block_q8_1 * bq8_1_chunk = bq8_1 + iqs;
const float d1 = bq1_0->d;
const int v = get_int_from_uint8_aligned(bq1_0->qs, iqs);
int vi_bytes[8];
#pragma unroll
for (int j = 0; j < 8; ++j) {
const int shift = j * 4;
const int bits4 = (v >> shift) & 0x0F;
const int b0 = (bits4 & 0x01) ? 1 : -1;
const int b1 = (bits4 & 0x02) ? 1 : -1;
const int b2 = (bits4 & 0x04) ? 1 : -1;
const int b3 = (bits4 & 0x08) ? 1 : -1;
vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
}
int sumi = 0;
#pragma unroll
for (int j = 0; j < 8; ++j) {
const int u = get_int_from_int8_aligned(bq8_1_chunk->qs, j);
sumi = ggml_sycl_dp4a(vi_bytes[j], u, sumi);
}
return d1 * bq8_1_chunk->ds[0] * sumi;
}
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q