Skip to content

Commit ed0aab1

Browse files
committed
SYCL: disable faulty fp16 CPU exponent for now
1 parent a5d1fb6 commit ed0aab1

File tree

3 files changed

+2
-158
lines changed

3 files changed

+2
-158
lines changed

ggml/src/ggml-sycl/element_wise.cpp

Lines changed: 0 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -356,33 +356,6 @@ static void clamp(const T * x, T * dst, const float min, const float max, const
356356
}
357357
}
358358

359-
template<typename T>
360-
static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
361-
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
362-
const int64_t j0 = (i / n) * o0 + (i % n);
363-
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
364-
dst[i] = op_gelu(x[j0]) * g[j1];
365-
}
366-
}
367-
368-
template<typename T>
369-
static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
370-
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
371-
const int64_t j0 = (i / n) * o0 + (i % n);
372-
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
373-
dst[i] = op_relu(x[j0]) * g[j1];
374-
}
375-
}
376-
377-
template<typename T>
378-
static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
379-
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
380-
const int64_t j0 = (i / n) * o0 + (i % n);
381-
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
382-
dst[i] = op_silu(x[j0]) * g[j1];
383-
}
384-
}
385-
386359
namespace ggml_sycl_detail {
387360
static void acc_f32_sycl(const float *x, const float *y, float *dst,
388361
const int n_elements, const int ne10, const int ne11,
@@ -457,85 +430,6 @@ static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx,
457430
}
458431
}
459432

460-
template<typename KernelInvoker, typename... Args>
461-
static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
462-
#if defined (GGML_SYCL_F16)
463-
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
464-
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
465-
#else
466-
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
467-
GGML_ASSERT(dst->type == GGML_TYPE_F32);
468-
#endif
469-
GGML_ASSERT(dst->src[0]->type == dst->type);
470-
dpct::queue_ptr main_stream = ctx.stream();
471-
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
472-
const ggml_tensor * src0 = dst->src[0];
473-
const ggml_tensor * src1 = dst->src[1];
474-
const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;;
475-
GGML_ASSERT(dst->ne[0] == nc);
476-
GGML_ASSERT(ggml_is_contiguous_1(dst->src[0]));
477-
GGML_ASSERT(ggml_is_contiguous(dst));
478-
const int32_t swapped = ((const int32_t *) dst->op_params)[1];
479-
void * src0_d = src0->data;
480-
void * src1_d = src1 ? src1->data : src0->data;
481-
const int64_t src0_o = src0->nb[1];
482-
const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
483-
void * dst_d = dst->data;
484-
if (src1) {
485-
GGML_ASSERT(ggml_is_contiguous_1(src1));
486-
GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
487-
GGML_ASSERT(src1->ne[0] == nc);
488-
GGML_ASSERT(src0->type == src1->type);
489-
}
490-
switch (dst->type) {
491-
#if defined (GGML_SYCL_F16)
492-
case GGML_TYPE_F16:
493-
{
494-
sycl::half * src0_p = (sycl::half *) src0_d;
495-
sycl::half * src1_p = (sycl::half *) src1_d;
496-
497-
if (!src1) {
498-
src0_p += swapped ? nc : 0;
499-
src1_p += swapped ? 0 : nc;
500-
}
501-
kernel_invoker(src0_p,
502-
src1_p,
503-
(sycl::half *) dst_d,
504-
ggml_nelements(dst),
505-
nc,
506-
src0_o / sizeof(sycl::half),
507-
src1_o / sizeof(sycl::half),
508-
main_stream,
509-
std::forward<Args>(args)...);
510-
break;
511-
}
512-
#endif
513-
case GGML_TYPE_F32:
514-
{
515-
float * src0_p = (float *) src0_d;
516-
float * src1_p = (float *) src1_d;
517-
518-
if (!src1) {
519-
src0_p += swapped ? nc : 0;
520-
src1_p += swapped ? 0 : nc;
521-
}
522-
523-
kernel_invoker(src0_p,
524-
src1_p,
525-
(float *) dst_d,
526-
ggml_nelements(dst),
527-
nc,
528-
src0_o / sizeof(float),
529-
src1_o / sizeof(float),
530-
main_stream,
531-
std::forward<Args>(args)...);
532-
break;
533-
}
534-
default:
535-
GGML_ABORT("GGML tensor type not supported!\n");
536-
}
537-
}
538-
539433
template<typename KernelInvoker, typename... Args>
540434
static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
541435
#if defined (GGML_SYCL_F16)
@@ -945,40 +839,6 @@ static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor
945839
ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
946840
}
947841

948-
static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
949-
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
950-
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
951-
const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
952-
sycl_parallel_for(main_stream,
953-
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
954-
gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
955-
});
956-
});
957-
}
958-
959-
static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
960-
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
961-
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
962-
const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
963-
sycl_parallel_for(main_stream,
964-
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
965-
gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
966-
});
967-
});
968-
}
969-
970-
static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
971-
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
972-
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
973-
const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
974-
sycl_parallel_for(main_stream,
975-
sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
976-
gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
977-
});
978-
});
979-
}
980-
981-
982842
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
983843
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
984844
ggml_sycl_op_sqrt(ctx, dst);
@@ -1104,17 +964,3 @@ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1104964
ggml_sycl_op_elu(ctx, dst);
1105965
}
1106966

1107-
void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1108-
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
1109-
ggml_sycl_op_geglu(ctx, dst);
1110-
}
1111-
1112-
void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1113-
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
1114-
ggml_sycl_op_reglu(ctx, dst);
1115-
}
1116-
1117-
void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1118-
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
1119-
ggml_sycl_op_swiglu(ctx, dst);
1120-
}

ggml/src/ggml-sycl/element_wise.hpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,4 @@ void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
7777

7878
void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
7979

80-
void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
81-
void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
82-
void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
83-
8480
#endif // GGML_SYCL_ELEMENTWISE_HPP

ggml/src/ggml-sycl/ggml-sycl.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4216,6 +4216,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
42164216
case GGML_UNARY_OP_GELU_ERF:
42174217
case GGML_UNARY_OP_TANH:
42184218
case GGML_UNARY_OP_EXP:
4219+
// Disable FP16 until we find out the root cause of failing fp16 sycl::exp
4220+
return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type) && op->src[0]->type == GGML_TYPE_F32;
42194221
case GGML_UNARY_OP_SGN:
42204222
case GGML_UNARY_OP_ABS:
42214223
case GGML_UNARY_OP_ELU:

0 commit comments

Comments (0)