@@ -1720,15 +1720,15 @@ static __global__ void k_compute_batched_ptrs(
                                             size_t nb12, size_t nb13,
                                             size_t nbd2, size_t nbd3,
                                             int64_t r2,  int64_t r3) {
-    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
+    const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
-    int64_t i03 = i13 / r3;
-    int64_t i02 = i12 / r2;
+    const int64_t i03 = i13 / r3;
+    const int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
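For readers following along: r2 and r3 are the broadcast ratios of src1's batch dimensions over src0's (in ggml, r2 = ne12/ne02 and r3 = ne13/ne03), and the divisions i13/r3 and i12/r2 pick which src0 matrix each src1 batch is multiplied against. A minimal standalone sketch of that mapping, with made-up dimensions:

    // Sketch: how i13 maps to i03 when src0 is broadcast over src1's batch dims.
    // Dimensions here are illustrative, not taken from the diff.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne03 = 2, ne13 = 8;      // src0 has 2 outer batches, src1 has 8
        const int64_t r3   = ne13 / ne03;      // broadcast ratio, as passed to the kernel

        for (int64_t i13 = 0; i13 < ne13; ++i13) {
            const int64_t i03 = i13 / r3;      // same division the kernel performs
            printf("src1 batch %2ld -> src0 batch %ld\n", (long) i13, (long) i03);
        }
        return 0;
    }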
@@ -1742,6 +1742,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
+    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
+    // As long as dst is contiguous this does not matter though.
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
     GGML_TENSOR_BINARY_OP_LOCALS
 
     const int64_t ne_dst = ggml_nelements(dst);
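The new assert backs up the comment: later code addresses dst both through byte offsets (nbd2/nbd3) and through element counts (ne0, ne1*ne0), and the two only agree for a densely packed tensor. A simplified standalone sketch of that density condition for a plain float tensor (an illustration only, not ggml's actual ggml_is_contiguous implementation):

    // Sketch of the dense-layout property the assert relies on, for a plain float tensor.
    // ne[] = elements per dim, nb[] = byte strides per dim (ggml convention).
    #include <cstdint>
    #include <cstddef>

    static bool is_dense_f32(const int64_t ne[4], const size_t nb[4]) {
        size_t expected = sizeof(float);
        for (int i = 0; i < 4; ++i) {
            if (nb[i] != expected) {
                return false;  // a gap or permutation: byte offsets and element counts diverge
            }
            expected *= ne[i];
        }
        return true;
    }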
@@ -1750,21 +1754,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 
     CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
 
-    void  * src0_ddq = src0->data;
-    half  * src0_f16 = (half  *) src0_ddq;
-    float * src1_ddf = (float *) src1->data;
-    float * dst_ddf  = (float *) dst->data;
+    const half * src0_f16 = (const half *) src0->data;
+    float      * dst_ddf  = (float       *) dst->data;
 
-    // convert src1 to fp16
+    const half * src1_f16 = (const half *) src1->data;
+    const size_t ts_src1  = ggml_type_size(src1->type);
+    GGML_ASSERT(nb10 == ts_src1);
+    int64_t s11 = nb11 / ts_src1;
+    int64_t s12 = nb12 / ts_src1;
+    int64_t s13 = nb13 / ts_src1;
     ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
+
+    // convert src1 to fp16
     if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
         const int64_t ne_src1 = ggml_nelements(src1);
         src1_f16_alloc.alloc(ne_src1);
         GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+
+        to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
+
+        src1_f16 = src1_f16_alloc.get();
+        s11 = ne10;
+        s12 = ne11*s11;
+        s13 = ne12*s12;
     }
-    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
 
     ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
     char * dst_t;
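s11/s12/s13 are src1's byte strides divided by its element size; after the non-contiguous-aware fp16 conversion writes into the freshly allocated buffer, they are reset to the dense strides of the new layout. A standalone sketch with made-up shapes showing both stride sets:

    // Sketch: byte strides -> element strides, and the dense strides after repacking to fp16.
    // Shapes and strides are illustrative only.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne10 = 128, ne11 = 32, ne12 = 8;   // hypothetical src1 shape (f32)
        const size_t  ts   = sizeof(float);

        // Byte strides of a possibly padded, non-contiguous view:
        const size_t nb11 = 160*ts, nb12 = 160*32*ts, nb13 = 160*32*8*ts;

        // Element strides used to read the source view:
        int64_t s11 = nb11 / ts, s12 = nb12 / ts, s13 = nb13 / ts;
        printf("view strides:  s11=%ld s12=%ld s13=%ld\n", (long) s11, (long) s12, (long) s13);

        // After conversion into a dense fp16 buffer the strides collapse to the shape itself:
        s11 = ne10;
        s12 = ne11*s11;
        s13 = ne12*s12;
        printf("dense strides: s11=%ld s12=%ld s13=%ld\n", (long) s11, (long) s12, (long) s13);
        return 0;
    }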
@@ -1824,13 +1838,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 int i02 = i12 / r2;
 
                 CUBLAS_CHECK(
-                cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                    ne01, ne11, ne10,
-                    alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3],   CUDA_R_16F, nb01/sizeof(half),
-                           (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
-                    beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
-                    cu_compute_type,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+                cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
+                    ne01, ne11, ne10,
+                    alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
+                                          src1_f16 + i13*s13  + i12*s12,  CUDA_R_16F, s11,
+                    beta,  (      char *) dst_t    + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
+                    cu_compute_type,
+                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
             }
         }
     }
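In this per-batch fallback, each (i12, i13) pair issues one GEMM: with CUBLAS_OP_T on A the problem size is ne01 x ne11 x ne10, A (src0) has leading dimension nb01/sizeof(half), and B (src1) now has leading dimension s11 instead of nb11/sizeof(float). A naive column-major reference of one such op(A)^T * B product, using illustrative float types instead of fp16:

    // Naive reference for one op(A)^T * B product of the loop above; illustrative only.
    // A: k x m with leading dimension lda (column-major, as cuBLAS sees it),
    // B: k x n with leading dimension ldb, C: m x n with leading dimension ldc.
    #include <cstdint>

    void gemm_tn_ref(int64_t m, int64_t n, int64_t k,
                     const float * A, int64_t lda,
                     const float * B, int64_t ldb,
                     float * C, int64_t ldc) {
        for (int64_t j = 0; j < n; ++j) {
            for (int64_t i = 0; i < m; ++i) {
                float acc = 0.0f;
                for (int64_t l = 0; l < k; ++l) {
                    acc += A[l + i*lda] * B[l + j*ldb];  // A is read transposed
                }
                C[i + j*ldc] = acc;                      // alpha = 1, beta = 0
            }
        }
    }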
@@ -1841,15 +1855,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00,  // strideA
-                       (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10,  // strideB
-                beta,  (      char *)    dst_t, cu_data_type,    ne01,   nb2/nb0,  // strideC
+                alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00,  // strideA
+                       src1_f16, CUDA_R_16F,       s11,       s12,  // strideB
+                beta,     dst_t, cu_data_type,     ne0,   ne1*ne0,  // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        const int ne23 = ne12*ne13;
+        const int64_t ne23 = ne12*ne13;
 
         ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
         ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
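In the strided-batched path the per-batch pointers are implied by constant strides: as specified in the call above, batch i reads src0_f16 + i*(nb02/nb00) and src1_f16 + i*s12 and writes one dense ne0 x ne1 dst matrix at offset i*ne1*ne0. A plain host-side sketch of that addressing, with the GEMM itself elided and illustrative types:

    // Sketch of the batch addressing implied by cublasGemmStridedBatchedEx's stride arguments.
    // Types and names are illustrative; the actual GEMM is not reproduced here.
    #include <cstdint>

    void batched_addressing_sketch(const uint16_t * src0_f16, const uint16_t * src1_f16, float * dst,
                                   int64_t strideA, int64_t s12, int64_t ne0, int64_t ne1, int64_t batches) {
        for (int64_t i = 0; i < batches; ++i) {
            const uint16_t * A = src0_f16 + i*strideA;   // strideA = nb02/nb00 elements
            const uint16_t * B = src1_f16 + i*s12;       // strideB = s12 elements
            float          * C = dst      + i*ne1*ne0;   // strideC = one dense dst matrix
            (void) A; (void) B; (void) C;                // gemm(A, B, C) would run here
        }
    }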
@@ -1861,8 +1875,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
-                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
+                src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
+                src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
@@ -1871,8 +1885,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
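The pointer-table variant covers the cases a single constant stride cannot express (for example src0 broadcast over src1's batch dimensions), so k_compute_batched_ptrs fills ptrs_src/ptrs_dst on the GPU and cublasGemmBatchedEx consumes them. A host-side sketch of the same table layout, mirroring the kernel's indexing (illustrative, no CUDA launch):

    // Sketch of the ptrs_src/ptrs_dst layout consumed by cublasGemmBatchedEx:
    // table[0*ne23 + b] = A pointers, table[1*ne23 + b] = B pointers, b = i12 + i13*ne12.
    #include <cstdint>
    #include <cstddef>
    #include <vector>

    void fill_batched_ptrs_sketch(const char * src0, const char * src1, char * dst,
                                  std::vector<const void *> & ptrs_src, std::vector<void *> & ptrs_dst,
                                  int64_t ne12, int64_t ne13, int64_t r2, int64_t r3,
                                  size_t nb02, size_t nb03, size_t nb12, size_t nb13,
                                  size_t nbd2, size_t nbd3) {
        const int64_t ne23 = ne12*ne13;
        ptrs_src.resize(2*ne23);
        ptrs_dst.resize(1*ne23);
        for (int64_t i13 = 0; i13 < ne13; ++i13) {
            for (int64_t i12 = 0; i12 < ne12; ++i12) {
                const int64_t i03 = i13 / r3;   // broadcast src0 over src1's batch dims
                const int64_t i02 = i12 / r2;
                ptrs_src[0*ne23 + i12 + i13*ne12] = src0 + i02*nb02 + i03*nb03;
                ptrs_src[1*ne23 + i12 + i13*ne12] = src1 + i12*nb12 + i13*nb13;
                ptrs_dst[0*ne23 + i12 + i13*ne12] = dst  + i12*nbd2 + i13*nbd3;
            }
        }
    }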
@@ -1936,7 +1950,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (!split && use_mul_mat_vec_q) {
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-        dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {