diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index 65593402e7..060d0aca2d 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -642,6 +642,8 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { switch (type) { + case GGML_TYPE_Q1_0: + return dequantize_block_sycl; case GGML_TYPE_Q4_0: if (dst->src[0]->extra && ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { @@ -724,6 +726,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) { switch (type) { + case GGML_TYPE_Q1_0: + return dequantize_block_sycl; case GGML_TYPE_Q4_0: if (dst->src[0]->extra && ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { @@ -830,6 +834,8 @@ to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) { case GGML_TYPE_BF16: return convert_unary_nc_sycl; #endif + case GGML_TYPE_Q1_0: + return dequantize_block_nc_sycl; case GGML_TYPE_Q4_0: return dequantize_block_nc_sycl; case GGML_TYPE_Q4_1: diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index ca8cd96c08..7b66c73b0c 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -70,6 +70,21 @@ static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int #endif // GGML_SYCL_F16 } +static __dpct_inline__ void dequantize_q1_0_reorder(const void *d_ptr, const int64_t ib, const void *qs, + const int iqs, dfloat2 &v) { + // Q1_0 reorder layout: scale values followed by quantized bits + const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib); + + const int bit_index_0 = iqs + 0; + const int bit_index_1 = iqs + 1; + + const int bit_0 = (*((const uint8_t *)qs + bit_index_0 / 8) >> (bit_index_0 % 8)) & 1; + const int bit_1 = (*((const uint8_t *)qs + bit_index_1 / 8) >> (bit_index_1 % 8)) & 1; + + v.x() = (2 * bit_0 - 1) * d; + v.y() = (2 * bit_1 - 1) * d; +} + static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib, const int iqs, dfloat2 &v) { const block_q4_1 * x = (const block_q4_1 *) vx; diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index e091e52356..fb8a1757f1 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -1423,6 +1423,50 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, } } +static void dequantize_mul_mat_vec_q1_0_sycl_reorder(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec_reorder( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q1_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, float *dst, const int ncols, const int nrows, @@ -1759,6 +1803,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec( sycl::half *src1_dfloat = nullptr; // dfloat == half bool src1_convert_f16 = + src0->type == GGML_TYPE_Q1_0 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 || @@ -1777,6 +1822,14 @@ void ggml_sycl_op_dequantize_mul_mat_vec( #endif // GGML_SYCL_F16 switch (src0->type) { + case GGML_TYPE_Q1_0: + if ((ggml_tensor_extra_gpu*)dst->src[0]->extra && + ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { + dequantize_mul_mat_vec_q1_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } else { + dequantize_mul_mat_vec_q1_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } + break; case GGML_TYPE_Q4_0: if ((ggml_tensor_extra_gpu*)dst->src[0]->extra && ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 77d1458904..0aebdc4413 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -976,6 +976,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= VER_GEN9 ? 128 : 64; @@ -3507,6 +3508,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) { inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) { switch (type) { + case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: return true; @@ -3522,6 +3524,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) { inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) { switch (type) { + case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: return true; @@ -3532,6 +3535,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) { inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { switch (type) { + case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: case GGML_TYPE_Q3_K: @@ -3546,6 +3550,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { static bool ggml_sycl_supports_dmmv(enum ggml_type type) { switch (type) { + case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -5385,7 +5390,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_ return nullptr; } -static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +static bool do_ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { ggml_backend_sycl_device_context *sycl_ctx = (ggml_backend_sycl_device_context *)dev->context; int device = sycl_ctx->device; @@ -5450,19 +5455,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g struct ggml_tensor * a = op->src[0]; struct ggml_tensor * b = op->src[1]; - // disable Q1_0 until implementation - if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) { - return false; - } - if (a->ne[3] != b->ne[3]) { return false; } ggml_type src0_type = op->src[0]->type; - - // TODO: The configuration below needs more work to be supported with oneDNN if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) { @@ -5472,12 +5470,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g // TODO: This specific configuration can fail with oneDNN and needs more debugging if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 && a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) { + printf("zjy 2\n"); return false; } return true; } case GGML_OP_OUT_PROD: - return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1; + return op->type == GGML_TYPE_F32 && + (op->src[0]->type == GGML_TYPE_F32 || + (op->src[0]->type == GGML_TYPE_Q1_0 && op->src[0]->ne[2] == op->src[1]->ne[2] && + op->src[0]->ne[3] == op->src[1]->ne[3])) && + op->src[1]->type == GGML_TYPE_F32; case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { @@ -5734,6 +5737,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g GGML_UNUSED(dev); } +static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + bool res = do_ggml_backend_sycl_device_supports_op(dev, op); + GGML_SYCL_DEBUG("[SYCL] call %s op->op=%s op->type=%s -> %s\n", __func__, ggml_op_name(op->op), + ggml_type_name(op->type), res ? "true" : "false"); + return res; +} + static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) { return false; diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 909c7aeb26..7b1b3d467f 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1194,6 +1194,66 @@ static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( } } +static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy, + float * dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK1_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); +} + +template +static void mul_mat_vec_q1_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK1_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q1_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q1_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q1_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q1_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q1_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q1_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q1_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q1_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q1_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q1_0 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -2120,6 +2180,20 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } break; + case GGML_TYPE_Q1_0: + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q1_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl\n"); + mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + break; case GGML_TYPE_Q2_K: if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { const int stride_col_y = src1_padded_col_size / QK8_1; diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index f52b11f0d6..8d10dad8c9 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -1,11 +1,12 @@ #include "outprod.hpp" +#include "convert.hpp" void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q1_0); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(ggml_is_contiguous(src0)); @@ -20,11 +21,31 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { GGML_ASSERT(ne01 == ne11); // Inner dimensions must match GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne2 % ne02 == 0); + GGML_ASSERT(ne3 % ne03 == 0); // Get data pointers - const float* src0_d = (const float*)src0->data; - const float* src1_d = (const float*)src1->data; - float* dst_d = (float*)dst->data; + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + ggml_sycl_pool_alloc src0_as_f32(ctx.pool()); + int64_t src0_nb02 = nb02; + int64_t src0_nb03 = nb03; + if (src0->type == GGML_TYPE_Q1_0) { + scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, + " : converting src0 Q1_0 to fp32"); + src0_d = src0_as_f32.alloc(ne00 * ne01 * ne02 * ne03); + const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst); + GGML_ASSERT(to_fp32_sycl != nullptr); + to_fp32_sycl(src0->data, const_cast(src0_d), ne00 * ne01 * ne02 * ne03, stream); + + // Dequantized src0 buffer is contiguous fp32 [ne00, ne01, ne02, ne03]. + src0_nb02 = ne00 * ne01 * (int64_t) sizeof(float); + src0_nb03 = ne00 * ne01 * ne02 * (int64_t) sizeof(float); + } // GEMM parameters const float alpha = 1.0f; @@ -35,12 +56,27 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans; const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); + const int64_t r2 = ne2 / ne02; + const int64_t r3 = ne3 / ne03; + try { - // Perform matrix multiplication using oneMKL GEMM - oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, - ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0); - } - catch (sycl::exception const& exc) { + // OUT_PROD applies independently to each (i2, i3) destination plane. + for (int64_t i3 = 0; i3 < ne3; ++i3) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + const int64_t i03 = i3 / r3; + const int64_t i02 = i2 / r2; + + const float * src0_plane = (const float *) ((const char *) src0_d + i02 * src0_nb02 + i03 * src0_nb03); + const float * src1_plane = (const float *) ((const char *) src1_d + i2 * nb12 + i3 * nb13); + float * dst_plane = (float *) ((char *) dst_d + i2 * nb2 + i3 * nb3); + + // Perform matrix multiplication using oneMKL GEMM + oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, + ne0, ne1, ne01, alpha, src0_plane, ne00, + src1_plane, ldb, beta, dst_plane, ne0); + } + } + } catch (sycl::exception const& exc) { std::cerr << exc.what() << std::endl; GGML_ASSERT(false); } diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp index 4b58b09ab2..765fb7f159 100644 --- a/ggml/src/ggml-sycl/vecdotq.hpp +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -309,6 +309,41 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]); } +#define VDR_Q1_0_Q8_1_MMVQ 1 +#define VDR_Q1_0_Q8_1_MMQ 4 + +static __dpct_inline__ float +vec_dot_q1_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq; + + const block_q8_1 * bq8_1_chunk = bq8_1 + iqs; + const float d1 = bq1_0->d; + const int v = get_int_from_uint8_aligned(bq1_0->qs, iqs); + + int vi_bytes[8]; +#pragma unroll + for (int j = 0; j < 8; ++j) { + const int shift = j * 4; + const int bits4 = (v >> shift) & 0x0F; + const int b0 = (bits4 & 0x01) ? 1 : -1; + const int b1 = (bits4 & 0x02) ? 1 : -1; + const int b2 = (bits4 & 0x04) ? 1 : -1; + const int b3 = (bits4 & 0x08) ? 1 : -1; + vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24); + } + + int sumi = 0; +#pragma unroll + for (int j = 0; j < 8; ++j) { + const int u = get_int_from_int8_aligned(bq8_1_chunk->qs, j); + sumi = ggml_sycl_dp4a(vi_bytes[j], u, sumi); + } + + return d1 * bq8_1_chunk->ds[0] * sumi; +} + // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q