-
Notifications
You must be signed in to change notification settings - Fork 378
Proof-of-concept: speeding up gemm reference kernel #863
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -157,6 +157,89 @@ INSERT_GENTFUNCR_BASIC( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) | |
// instructions via constant loop bounds + #pragma omp simd directives. | ||
// If compile-time MR/NR are not available (indicated by BLIS_[MN]R_x = -1), | ||
// then the non-unrolled version (above) is used. | ||
// first the fastest case, 4 macros for m==mr, n==nr, k>0 | ||
// cs_c = 1, beta != 0 (row major) | ||
// cs_c = 1, beta == 0 | ||
// rs_c = 1, beta != 0 (column major) | ||
// rs_c = 1, beta == 0 | ||
|
||
#define TAIL_NITER 5 // in units of 4x k iterations | ||
#define CACHELINE_SIZE 64 | ||
#define TAXPBYS_BETA0(ch1,ch2,ch3,ch4,ch5,alpha,ab,beta,c) bli_tscal2s(ch1,ch2,ch3,ch4,alpha,ab,c) | ||
#undef GENTFUNC | ||
#define GENTFUNC( ctype, ch, opname, arch, suf, taxpbys, i_or_j, j_or_i, mr_or_nr, nr_or_mr ) \ | ||
\ | ||
static void PASTEMAC(ch,ch,opname,arch,suf) \ | ||
( \ | ||
dim_t k, \ | ||
const ctype* alpha, \ | ||
const ctype* a, \ | ||
const ctype* b, \ | ||
const ctype* beta, \ | ||
ctype* c, inc_t s_c \ | ||
) \ | ||
{ \ | ||
const dim_t mr = PASTECH(BLIS_,mr_or_nr,_,ch); \ | ||
const dim_t nr = PASTECH(BLIS_,nr_or_mr,_,ch); \ | ||
\ | ||
const inc_t cs_a = PASTECH(BLIS_PACKMR_,ch); \ | ||
const inc_t rs_b = PASTECH(BLIS_PACKNR_,ch); \ | ||
\ | ||
char ab_[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))) = { 0 }; \ | ||
ctype* ab = (ctype*)ab_; \ | ||
const inc_t s_ab = nr; \ | ||
\ | ||
\ | ||
/* Initialize the accumulator elements in ab to zero. */ \ | ||
PRAGMA_SIMD \ | ||
for ( dim_t i = 0; i < mr * nr; ++i ) \ | ||
{ \ | ||
bli_tset0s( ch, ab[ i ] ); \ | ||
} \ | ||
\ | ||
/* Perform a series of k rank-1 updates into ab. */ \ | ||
dim_t l = 0; do \ | ||
{ \ | ||
dim_t i = l + TAIL_NITER*4 + mr - k; \ | ||
if ( i >= 0 && i < mr ) \ | ||
for ( dim_t j = 0; j < nr; j += CACHELINE_SIZE/sizeof(double) ) \ | ||
bli_prefetch( &c[ i*s_c + j ], 0, 3 ); \ | ||
for ( dim_t i = 0; i < mr; ++i ) \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought about unroll pragmas but its syntax is different between compilers, e.g. for GCC its
|
||
{ \ | ||
PRAGMA_SIMD \ | ||
for ( dim_t j = 0; j < nr; ++j ) \ | ||
{ \ | ||
bli_tdots \ | ||
( \ | ||
ch,ch,ch,ch, \ | ||
a[ i_or_j ], \ | ||
b[ j_or_i ], \ | ||
ab[ i*s_ab + j ] \ | ||
); \ | ||
} \ | ||
} \ | ||
\ | ||
a += cs_a; \ | ||
b += rs_b; \ | ||
} while ( ++l < k ); \ | ||
\ | ||
for ( dim_t i = 0; i < mr; ++i ) \ | ||
PRAGMA_SIMD \ | ||
for ( dim_t j = 0; j < nr; ++j ) \ | ||
taxpbys \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is probably what brought you some gain compared to the reference kernel. The scaling-by-alpha is done at the same time of accumulation-to-c by using AXBY (FMA). Now the point is: This reference kernel was written to be simple-and-stupid, easily comprehensible and not aimed to be fast. Do we really want to make it a little harder to new people to understand, in exchange of some percentage of performance, given it was not the original purpose of this kernel. BTW, maybe I was a bit paranoid, and perhaps a simple comment saying I would prefer some direct modification in the original reference kernel (PRAGMA_UNROLL, remove redundant ab-zero-init, AXBY-alpha-scaling-accumulation, or even There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. well there are already two original reference kernels; the slow version is the first one in the file called
If I try to make the original fast path faster I simply don't get the same speed ups because the whole C tile is spilled to memory and I might as well not change anything. An alternative also would be to have the 4 new kernels doing the only fast path, and let all other oddball cases use
to
or even flipping it around: the slow version is the main reference gemm and the fast version is called via that if statement flipped? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will have to think if the |
||
( \ | ||
ch,ch,ch,ch,ch, \ | ||
*alpha, \ | ||
ab[ i*s_ab + j ], \ | ||
*beta, \ | ||
c [ i*s_c + j ] \ | ||
); \ | ||
} | ||
|
||
INSERT_GENTFUNC_BASIC( gemm_vect_r_beta0, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, TAXPBYS_BETA0, i, j, MR, NR ) | ||
INSERT_GENTFUNC_BASIC( gemm_vect_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, bli_taxpbys, i, j, MR, NR ) | ||
INSERT_GENTFUNC_BASIC( gemm_vect_c_beta0, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, TAXPBYS_BETA0, j, i, NR, MR ) | ||
INSERT_GENTFUNC_BASIC( gemm_vect_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, bli_taxpbys, j, i, NR, MR ) | ||
|
||
#undef GENTFUNC | ||
#define GENTFUNC( ctype, ch, opname, arch, suf ) \ | ||
|
@@ -210,6 +293,36 @@ void PASTEMAC(ch,ch,opname,arch,suf) \ | |
); \ | ||
return; \ | ||
} \ | ||
\ | ||
if ( m == mr && n == nr && k > 0 ) \ | ||
{ \ | ||
if ( cs_c == 1 ) \ | ||
{ \ | ||
(bli_teq0s( ch, *beta ) ? PASTEMAC(ch,ch,gemm_vect_r_beta0,arch,suf) : PASTEMAC(ch,ch,gemm_vect_r,arch,suf)) \ | ||
( \ | ||
k, \ | ||
alpha, \ | ||
a, \ | ||
b, \ | ||
beta, \ | ||
c, rs_c \ | ||
); \ | ||
return; \ | ||
} \ | ||
if ( rs_c == 1 ) \ | ||
{ \ | ||
(bli_teq0s( ch, *beta ) ? PASTEMAC(ch,ch,gemm_vect_c_beta0,arch,suf) : PASTEMAC(ch,ch,gemm_vect_c,arch,suf)) \ | ||
( \ | ||
k, \ | ||
alpha, \ | ||
a, \ | ||
b, \ | ||
beta, \ | ||
c, cs_c \ | ||
); \ | ||
return; \ | ||
} \ | ||
} \ | ||
\ | ||
char ab_[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))) = { 0 }; \ | ||
ctype* ab = (ctype*)ab_; \ | ||
|
@@ -382,5 +495,3 @@ void PASTEMAC(chab,chc,opname,arch,suf) \ | |
} | ||
|
||
INSERT_GENTFUNC2_MIX_P( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know this is copied from the reference kernel, this zero-init is redundant to the just-following loop L195-198. I suggest you check in the assembler whether the compiler did eliminate one of them (and did manage to vectorize the zeroing as well).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The zero-init is actually required for certain versions of clang since it improperly optimizes out some of the later zero assignments. See #854.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes exactly I saw #854 as well because this puzzled me too. But the compiler generated optimal code like this (for my case with MR=32, NR=6 for Zen4, from
objdump -d
:and then continuing all the way up to
zmm28
, so 24 zero'd vectors of 8 doubles each, which is exactly 6*32.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh, I did not make the link to #854, hmm it looks a weird issue.