@@ -356,33 +356,6 @@ static void clamp(const T * x, T * dst, const float min, const float max, const
     }
 }
 
-template <typename T>
-static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        const int64_t j0 = (i / n) * o0 + (i % n);
-        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = op_gelu(x[j0]) * g[j1];
-    }
-}
-
-template <typename T>
-static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        const int64_t j0 = (i / n) * o0 + (i % n);
-        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = op_relu(x[j0]) * g[j1];
-    }
-}
-
-template <typename T>
-static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        const int64_t j0 = (i / n) * o0 + (i % n);
-        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = op_silu(x[j0]) * g[j1];
-    }
-}
-
 namespace ggml_sycl_detail {
 static void acc_f32_sycl(const float *x, const float *y, float *dst,
                          const int n_elements, const int ne10, const int ne11,
@@ -457,85 +430,6 @@ static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx,
     }
 }
 
-template <typename KernelInvoker, typename... Args>
-static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
-#if defined(GGML_SYCL_F16)
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-#else
-    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-#endif
-    GGML_ASSERT(dst->src[0]->type == dst->type);
-    dpct::queue_ptr main_stream = ctx.stream();
-    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;;
-    GGML_ASSERT(dst->ne[0] == nc);
-    GGML_ASSERT(ggml_is_contiguous_1(dst->src[0]));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    const int32_t swapped = ((const int32_t *) dst->op_params)[1];
-    void * src0_d = src0->data;
-    void * src1_d = src1 ? src1->data : src0->data;
-    const int64_t src0_o = src0->nb[1];
-    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
-    void * dst_d = dst->data;
-    if (src1) {
-        GGML_ASSERT(ggml_is_contiguous_1(src1));
-        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
-        GGML_ASSERT(src1->ne[0] == nc);
-        GGML_ASSERT(src0->type == src1->type);
-    }
-    switch (dst->type) {
-#if defined(GGML_SYCL_F16)
-        case GGML_TYPE_F16:
-            {
-                sycl::half * src0_p = (sycl::half *) src0_d;
-                sycl::half * src1_p = (sycl::half *) src1_d;
-
-                if (!src1) {
-                    src0_p += swapped ? nc : 0;
-                    src1_p += swapped ? 0 : nc;
-                }
-                kernel_invoker(src0_p,
-                               src1_p,
-                               (sycl::half *) dst_d,
-                               ggml_nelements(dst),
-                               nc,
-                               src0_o / sizeof(sycl::half),
-                               src1_o / sizeof(sycl::half),
-                               main_stream,
-                               std::forward<Args>(args)...);
-                break;
-            }
-#endif
-        case GGML_TYPE_F32:
-            {
-                float * src0_p = (float *) src0_d;
-                float * src1_p = (float *) src1_d;
-
-                if (!src1) {
-                    src0_p += swapped ? nc : 0;
-                    src1_p += swapped ? 0 : nc;
-                }
-
-                kernel_invoker(src0_p,
-                               src1_p,
-                               (float *) dst_d,
-                               ggml_nelements(dst),
-                               nc,
-                               src0_o / sizeof(float),
-                               src1_o / sizeof(float),
-                               main_stream,
-                               std::forward<Args>(args)...);
-                break;
-            }
-        default:
-            GGML_ABORT("GGML tensor type not supported!\n");
-    }
-}
-
 
 template <typename KernelInvoker, typename... Args>
 static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
 #if defined(GGML_SYCL_F16)
@@ -945,40 +839,6 @@ static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
 }
 
-static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
-        [](const auto * x_ptr, const auto * g_ptr, auto * dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
-            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(main_stream,
-                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
-                gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
-            });
-        });
-}
-
-static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
-        [](const auto * x_ptr, const auto * g_ptr, auto * dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
-            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
-            sycl_parallel_for(main_stream,
-                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
-                gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
-            });
-        });
-}
-
-static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
-        [](const auto * x_ptr, const auto * g_ptr, auto * dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
-            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
-            sycl_parallel_for(main_stream,
-                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
-                gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
-            });
-        });
-}
-
-
 void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_sqrt(ctx, dst);
@@ -1104,17 +964,3 @@ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_elu(ctx, dst);
 }
 
-void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
-    ggml_sycl_op_geglu(ctx, dst);
-}
-
-void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
-    ggml_sycl_op_reglu(ctx, dst);
-}
-
-void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
-    ggml_sycl_op_swiglu(ctx, dst);
-}
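
For reference, the removed kernels all share one pattern: each output element i is mapped back to a row in the value view and a row in the gate view via the strides o0/o1, and the activated value is multiplied by the gate. Below is a minimal host-side C++ sketch of that indexing for the SwiGLU case; the plain CPU loop stands in for the SYCL nd-range launch, and silu_ref together with the sample sizes are illustrative assumptions rather than part of the backend.

```cpp
// Host-side reference sketch of the gated-GLU indexing used by the removed fused kernels.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

static float silu_ref(float v) { return v / (1.0f + std::exp(-v)); }

// k  = total output elements, n = row width (nc),
// o0 = row stride (in elements) of the value view, o1 = row stride of the gate view.
// When value and gate live in one fused tensor, o0 == o1 and only the base pointers differ.
static void gated_silu_ref(const float * x, const float * g, float * dst,
                           uint64_t k, uint64_t n, uint64_t o0, uint64_t o1) {
    for (uint64_t i = 0; i < k; ++i) {
        const uint64_t j0 = (i / n) * o0 + (i % n);                  // element in the value half
        const uint64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);  // element in the gate half
        dst[i] = silu_ref(x[j0]) * g[j1];
    }
}

int main() {
    // One fused tensor with rows of width 2*n: first half is the value, second half the gate.
    const uint64_t rows = 2, n = 4, row_stride = 2 * n;
    std::vector<float> src(rows * row_stride), dst(rows * n);
    for (std::size_t i = 0; i < src.size(); ++i) {
        src[i] = 0.1f * (float) i - 0.5f;  // a mix of negative and positive inputs
    }

    // x points at the value half, g at the gate half; both use the fused row stride.
    gated_silu_ref(src.data(), src.data() + n, dst.data(), rows * n, n, row_stride, row_stride);

    for (float v : dst) {
        std::printf("%.4f ", v);
    }
    std::printf("\n");
    return 0;
}
```

Swapping silu_ref for a GELU or ReLU reference gives the GEGLU and REGLU variants; the index arithmetic is identical in all three.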