mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-26 06:10:19 +00:00
sycl : support MUL_MAT and OUT_PROD with Q1_0 (#24721)
This commit is contained in:
@@ -642,6 +642,8 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct
|
||||
|
||||
to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
@@ -724,6 +726,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
|
||||
to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_sycl<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
if (dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
@@ -830,6 +834,8 @@ to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) {
|
||||
case GGML_TYPE_BF16:
|
||||
return convert_unary_nc_sycl<sycl::ext::oneapi::bfloat16>;
|
||||
#endif
|
||||
case GGML_TYPE_Q1_0:
|
||||
return dequantize_block_nc_sycl<QK1_0, QR1_0, dequantize_q1_0>;
|
||||
case GGML_TYPE_Q4_0:
|
||||
return dequantize_block_nc_sycl<QK4_0, QR4_0, dequantize_q4_0>;
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
||||
@@ -70,6 +70,21 @@ static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int
|
||||
#endif // GGML_SYCL_F16
|
||||
}
|
||||
|
||||
static __dpct_inline__ void dequantize_q1_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
|
||||
const int iqs, dfloat2 &v) {
|
||||
// Q1_0 reorder layout: scale values followed by quantized bits
|
||||
const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib);
|
||||
|
||||
const int bit_index_0 = iqs + 0;
|
||||
const int bit_index_1 = iqs + 1;
|
||||
|
||||
const int bit_0 = (*((const uint8_t *)qs + bit_index_0 / 8) >> (bit_index_0 % 8)) & 1;
|
||||
const int bit_1 = (*((const uint8_t *)qs + bit_index_1 / 8) >> (bit_index_1 % 8)) & 1;
|
||||
|
||||
v.x() = (2 * bit_0 - 1) * d;
|
||||
v.y() = (2 * bit_1 - 1) * d;
|
||||
}
|
||||
|
||||
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
|
||||
const int iqs, dfloat2 &v) {
|
||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||
|
||||
@@ -1423,6 +1423,50 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q1_0_sycl_reorder(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_reorder<QK1_0, QR1_0, dequantize_q1_0_reorder>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q1_0_sycl(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
{
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec<QK1_0, QR1_0, dequantize_q1_0>(
|
||||
vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
@@ -1759,6 +1803,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
sycl::half *src1_dfloat = nullptr; // dfloat == half
|
||||
|
||||
bool src1_convert_f16 =
|
||||
src0->type == GGML_TYPE_Q1_0 ||
|
||||
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
||||
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
||||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
|
||||
@@ -1777,6 +1822,14 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
#endif // GGML_SYCL_F16
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
dequantize_mul_mat_vec_q1_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
} else {
|
||||
dequantize_mul_mat_vec_q1_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
if ((ggml_tensor_extra_gpu*)dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
|
||||
@@ -976,6 +976,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
|
||||
}
|
||||
|
||||
switch(type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return max_compute_capability >= VER_GEN9 ? 128 : 64;
|
||||
@@ -3507,6 +3508,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
|
||||
|
||||
inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
@@ -3522,6 +3524,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
||||
|
||||
inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
@@ -3532,6 +3535,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
|
||||
|
||||
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q3_K:
|
||||
@@ -3546,6 +3550,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
||||
|
||||
static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q1_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
@@ -5385,7 +5390,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
static bool do_ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
ggml_backend_sycl_device_context *sycl_ctx =
|
||||
(ggml_backend_sycl_device_context *)dev->context;
|
||||
int device = sycl_ctx->device;
|
||||
@@ -5450,19 +5455,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
struct ggml_tensor * a = op->src[0];
|
||||
struct ggml_tensor * b = op->src[1];
|
||||
|
||||
// disable Q1_0 until implementation
|
||||
if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (a->ne[3] != b->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
|
||||
|
||||
|
||||
// TODO: The configuration below needs more work to be supported with oneDNN
|
||||
if (ggml_is_permuted(a) && !ggml_is_contiguous(a) &&
|
||||
a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) {
|
||||
@@ -5472,12 +5470,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
// TODO: This specific configuration can fail with oneDNN and needs more debugging
|
||||
if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 &&
|
||||
a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) {
|
||||
printf("zjy 2\n");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_OUT_PROD:
|
||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
|
||||
return op->type == GGML_TYPE_F32 &&
|
||||
(op->src[0]->type == GGML_TYPE_F32 ||
|
||||
(op->src[0]->type == GGML_TYPE_Q1_0 && op->src[0]->ne[2] == op->src[1]->ne[2] &&
|
||||
op->src[0]->ne[3] == op->src[1]->ne[3])) &&
|
||||
op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
@@ -5734,6 +5737,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
bool res = do_ggml_backend_sycl_device_supports_op(dev, op);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s op->op=%s op->type=%s -> %s\n", __func__, ggml_op_name(op->op),
|
||||
ggml_type_name(op->type), res ? "true" : "false");
|
||||
return res;
|
||||
}
|
||||
|
||||
static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) {
|
||||
return false;
|
||||
|
||||
@@ -1194,6 +1194,66 @@ static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols(
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy,
|
||||
float * dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK1_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK1_0, QI1_0, block_q1_0,
|
||||
VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <int ncols_dst>
|
||||
static void mul_mat_vec_q1_0_q8_1_sycl_ncols(
|
||||
const void * vx, const void * vy, float * dst,
|
||||
const int ncols, const int nrows,
|
||||
const int stride_col_y, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK1_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_ncols<QK1_0, QI1_0, block_q1_0,
|
||||
VDR_Q1_0_Q8_1_MMVQ, vec_dot_q1_0_q8_1, ncols_dst>(
|
||||
vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
|
||||
const void * vx, const void * vy, float * dst,
|
||||
const int ncols, const int nrows, const int ncols_dst,
|
||||
const int stride_col_y, const int stride_col_dst,
|
||||
dpct::queue_ptr stream) {
|
||||
switch (ncols_dst) {
|
||||
case 1: mul_mat_vec_q1_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break;
|
||||
case 2: mul_mat_vec_q1_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 3: mul_mat_vec_q1_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 4: mul_mat_vec_q1_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 5: mul_mat_vec_q1_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 6: mul_mat_vec_q1_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 7: mul_mat_vec_q1_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
case 8: mul_mat_vec_q1_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break;
|
||||
default: GGML_ABORT("unsupported ncols_dst=%d for Q1_0 multi-col MMVQ", ncols_dst);
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
@@ -2120,6 +2180,20 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q1_0:
|
||||
if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
|
||||
const int stride_col_y = src1_padded_col_size / QK8_1;
|
||||
const int stride_col_dst = dst->ne[0];
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols);
|
||||
mul_mat_vec_q1_0_q8_1_sycl_switch_ncols(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff,
|
||||
src1_ncols, stride_col_y, stride_col_dst, stream);
|
||||
return;
|
||||
} else if (i == 0 || src1_ncols == 1) {
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl\n");
|
||||
mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) {
|
||||
const int stride_col_y = src1_padded_col_size / QK8_1;
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#include "outprod.hpp"
|
||||
#include "convert.hpp"
|
||||
|
||||
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q1_0);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
@@ -20,11 +21,31 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
GGML_ASSERT(ne01 == ne11); // Inner dimensions must match
|
||||
GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows
|
||||
GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols
|
||||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
GGML_ASSERT(ne2 % ne02 == 0);
|
||||
GGML_ASSERT(ne3 % ne03 == 0);
|
||||
|
||||
// Get data pointers
|
||||
const float* src0_d = (const float*)src0->data;
|
||||
const float* src1_d = (const float*)src1->data;
|
||||
float* dst_d = (float*)dst->data;
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
ggml_sycl_pool_alloc<float> src0_as_f32(ctx.pool());
|
||||
int64_t src0_nb02 = nb02;
|
||||
int64_t src0_nb03 = nb03;
|
||||
if (src0->type == GGML_TYPE_Q1_0) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
||||
" : converting src0 Q1_0 to fp32");
|
||||
src0_d = src0_as_f32.alloc(ne00 * ne01 * ne02 * ne03);
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
|
||||
GGML_ASSERT(to_fp32_sycl != nullptr);
|
||||
to_fp32_sycl(src0->data, const_cast<float *>(src0_d), ne00 * ne01 * ne02 * ne03, stream);
|
||||
|
||||
// Dequantized src0 buffer is contiguous fp32 [ne00, ne01, ne02, ne03].
|
||||
src0_nb02 = ne00 * ne01 * (int64_t) sizeof(float);
|
||||
src0_nb03 = ne00 * ne01 * ne02 * (int64_t) sizeof(float);
|
||||
}
|
||||
|
||||
// GEMM parameters
|
||||
const float alpha = 1.0f;
|
||||
@@ -35,12 +56,27 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
|
||||
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
|
||||
|
||||
const int64_t r2 = ne2 / ne02;
|
||||
const int64_t r3 = ne3 / ne03;
|
||||
|
||||
try {
|
||||
// Perform matrix multiplication using oneMKL GEMM
|
||||
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
|
||||
ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
|
||||
}
|
||||
catch (sycl::exception const& exc) {
|
||||
// OUT_PROD applies independently to each (i2, i3) destination plane.
|
||||
for (int64_t i3 = 0; i3 < ne3; ++i3) {
|
||||
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
||||
const int64_t i03 = i3 / r3;
|
||||
const int64_t i02 = i2 / r2;
|
||||
|
||||
const float * src0_plane = (const float *) ((const char *) src0_d + i02 * src0_nb02 + i03 * src0_nb03);
|
||||
const float * src1_plane = (const float *) ((const char *) src1_d + i2 * nb12 + i3 * nb13);
|
||||
float * dst_plane = (float *) ((char *) dst_d + i2 * nb2 + i3 * nb3);
|
||||
|
||||
// Perform matrix multiplication using oneMKL GEMM
|
||||
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
|
||||
ne0, ne1, ne01, alpha, src0_plane, ne00,
|
||||
src1_plane, ldb, beta, dst_plane, ne0);
|
||||
}
|
||||
}
|
||||
} catch (sycl::exception const& exc) {
|
||||
std::cerr << exc.what() << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
@@ -309,6 +309,41 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
|
||||
vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]);
|
||||
}
|
||||
|
||||
#define VDR_Q1_0_Q8_1_MMVQ 1
|
||||
#define VDR_Q1_0_Q8_1_MMQ 4
|
||||
|
||||
static __dpct_inline__ float
|
||||
vec_dot_q1_0_q8_1(const void *__restrict__ vbq,
|
||||
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
|
||||
|
||||
const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq;
|
||||
|
||||
const block_q8_1 * bq8_1_chunk = bq8_1 + iqs;
|
||||
const float d1 = bq1_0->d;
|
||||
const int v = get_int_from_uint8_aligned(bq1_0->qs, iqs);
|
||||
|
||||
int vi_bytes[8];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int shift = j * 4;
|
||||
const int bits4 = (v >> shift) & 0x0F;
|
||||
const int b0 = (bits4 & 0x01) ? 1 : -1;
|
||||
const int b1 = (bits4 & 0x02) ? 1 : -1;
|
||||
const int b2 = (bits4 & 0x04) ? 1 : -1;
|
||||
const int b3 = (bits4 & 0x08) ? 1 : -1;
|
||||
vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24);
|
||||
}
|
||||
|
||||
int sumi = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
const int u = get_int_from_int8_aligned(bq8_1_chunk->qs, j);
|
||||
sumi = ggml_sycl_dp4a(vi_bytes[j], u, sumi);
|
||||
}
|
||||
|
||||
return d1 * bq8_1_chunk->ds[0] * sumi;
|
||||
}
|
||||
|
||||
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
|
||||
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
|
||||
|
||||
|
||||
Reference in New Issue
Block a user