sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs #12858
Changes from 13 commits
@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
+int g_ggml_sycl_prioritize_dmmv = 0;

 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -193,13 +194,15 @@ static void ggml_check_sycl() try {

     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
+        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
         g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
+        g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
         GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
         GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
         GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
+        GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
         GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
         GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
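Note (illustrative, not part of the diff): GGML_SYCL_PRIORITIZE_DMMV is a new runtime escape hatch; setting it to a non-zero value keeps the dequantize_mul_mat_vec (DMMV) path preferred over MMVQ in the dispatch further down. The backend reads it through its existing get_sycl_env helper; the snippet below is only a sketch of how such an integer toggle is typically parsed, with a hypothetical helper name, not the actual implementation.

#include <cstdlib>
#include <string>

// Sketch only: parse an integer environment toggle, falling back to a default
// when the variable is unset or not a number (hypothetical helper, not ggml code).
static int read_env_toggle(const char * name, int default_value) {
    const char * value = std::getenv(name);
    if (value == nullptr) {
        return default_value;
    }
    try {
        return std::stoi(value);
    } catch (...) {
        return default_value;
    }
}

// Usage mirroring the diff above:
//   int prioritize_dmmv = read_env_toggle("GGML_SYCL_PRIORITIZE_DMMV", 0);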
@@ -2834,6 +2837,33 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     return false;
 }

+inline bool ggml_sycl_supports_reorder_dequantize(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -2862,7 +2892,7 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

     stream->parallel_for(
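Note (illustrative, not part of the diff): the qs_ptr / d_ptr arithmetic above reflects what the reorder does to a Q4_0 tensor: the array-of-blocks layout (one scale plus 16 packed nibbles per block) is split into two contiguous regions, all packed nibbles first and all scales after them. A minimal host-side sketch of that transformation, assuming the standard 32-weight Q4_0 block, might look like this:

#include <cstdint>
#include <cstring>
#include <vector>

// Assumed Q4_0 block shape (32 weights per block: one fp16 scale plus
// 16 bytes of packed 4-bit quants), mirroring ggml's block_q4_0.
constexpr int QK4_0 = 32;
struct block_q4_0 {
    uint16_t d;              // fp16 scale, kept as raw bits for this sketch
    uint8_t  qs[QK4_0 / 2];  // 32 x 4-bit quants packed into 16 bytes
};

// Sketch of the reorder: [block0][block1]... becomes [all qs...][all d...].
static std::vector<uint8_t> reorder_q4_0(const block_q4_0 * blocks, size_t nblocks) {
    std::vector<uint8_t> out(nblocks * sizeof(block_q4_0));
    uint8_t * qs_dst = out.data();                          // nibbles, tightly packed
    uint8_t * d_dst  = out.data() + nblocks * (QK4_0 / 2);  // scales, after all nibbles
    for (size_t i = 0; i < nblocks; ++i) {
        std::memcpy(qs_dst + i * (QK4_0 / 2), blocks[i].qs, QK4_0 / 2);
        std::memcpy(d_dst + i * sizeof(uint16_t), &blocks[i].d, sizeof(uint16_t));
    }
    return out;
}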
@@ -2890,17 +2920,19 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
     reorder_qw(data_device, ncols, nrows, size, 0, stream);
 }

+static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
+    return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
+           ctx.opt_feature.reorder &&       //allow this device due to good perf, skip the devices with bad perf.
+           dst->op == GGML_OP_MUL_MAT &&    //limit to some supported cases of Q4_0, to do for more cases.
+           dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
+}
+
 /*
 * This function could be called when the OP (mul_mat) function support reorder optimizition.
 */
 static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
                             ggml_tensor * dst) {
-    if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
-        ctx->opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
-        dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
-        src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
-
+    if (should_reorder_tensor(*ctx, dst)) {
         ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
         if (!extra) return; //only happen in CI/UT permute case.
@@ -2917,7 +2949,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     int64_t min_compute_capability = INT_MAX;

     if (split) {
-        ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
+        ggml_backend_sycl_split_buffer_type_context * buft_ctx =
+            (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
         for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
             // skip devices that are not going to do any work:
@@ -2930,7 +2963,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
             }
         }
     } else {
-        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
+        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
     }

     // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -2952,9 +2985,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX

     // mmvq path is faster in the CUDA backend.
-    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+    if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
+        // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
+        // is enabled takes precedence over DMMV, the current if-else implementation
+        // requires disabling DMMV if both conditions are met
+        || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
         use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+    }

     if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // TODO: Refactor and cleanup of mul mat dispatching.
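Note (illustrative, not part of the diff): the comment and condition above boil down to a small precedence rule between the two vector kernels. The sketch below is a hypothetical restatement of that rule, not code from the PR; the parameter names mirror the flags visible in the diff.

// Which vector mul-mat kernel wins, assuming both the DMMV and MMVQ preconditions hold.
enum class vec_kernel { MMVQ, DMMV };

static vec_kernel pick_vec_kernel(bool prioritize_dmmv_env,    // GGML_SYCL_PRIORITIZE_DMMV != 0
                                  bool on_cuda_backend,        // sycl::backend::ext_oneapi_cuda
                                  bool reorder_mmvq_possible)  // should_reorder_tensor && supports_reorder_mmvq
{
    if (!prioritize_dmmv_env && (on_cuda_backend || reorder_mmvq_possible)) {
        return vec_kernel::MMVQ;  // MMVQ takes precedence: faster on CUDA, or able to use the reordered layout
    }
    return vec_kernel::DMMV;      // otherwise keep the DMMV-first behaviour
}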
@@ -2973,17 +3012,28 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
-        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-        // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
+        constexpr bool convert_src1_to_q8_1 = false;
+        if (ggml_sycl_supports_reorder_dmmv(src0->type)) {
+            opt_for_reorder(&ctx, src0, src1, dst);
+        }
Review thread on this line (resolved):
Reviewer: I suggest calling only opt_for_reorder() here, instead of calling 2 functions. Suggestion along the lines of: opt_for_reorder(&ctx, src0, src1, dst, mm_type = MM_TYPE_DMMV) { ... }. Same comment for the following similar code.
Author: Thanks for the suggestion, you're right, it's more readable now. See 48480c8 for the changes.
Reviewer: It's great!
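Note (illustrative, not part of the diff): the reviewer's one-liner above is pseudocode (C++ has no named arguments), so a consolidated opt_for_reorder would most likely take an extra enum argument and do the per-kernel capability check internally. The names below are hypothetical and not necessarily what commit 48480c8 ends up using:

// Hypothetical consolidated interface: the caller states which mul-mat kernel it is
// about to run, and opt_for_reorder decides internally whether reordering applies.
enum class mul_mat_algo { DMMV, MMVQ, MUL_MAT_SYCL };

static bool reorder_supported(mul_mat_algo /*algo*/, ggml_type type) {
    // In this PR, Q4_0 is the only type supported by all three reorder paths.
    return type == GGML_TYPE_Q4_0;
}

static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0,
                            const ggml_tensor * src1, ggml_tensor * dst, mul_mat_algo algo) {
    if (!reorder_supported(algo, src0->type) || !should_reorder_tensor(*ctx, dst)) {
        return;
    }
    // ... reorder src0's Q4_0 data exactly as the existing opt_for_reorder does ...
    (void) src1;
}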
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
     } else if (use_mul_mat_vec_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        if (ggml_sycl_supports_reorder_mmvq(src0->type)) {
+            opt_for_reorder(&ctx, src0, src1, dst);
+        }
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
     } else if (use_mul_mat_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
     } else {
-        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+        constexpr bool convert_src1_to_q8_1 = false;
+        if (ggml_sycl_supports_reorder_dequantize(src0->type)) {
+            opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
+        }
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
     }
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }