Skip to content

Commit aeb5f0c

Browse files
Omnibus PR - Oct 2023 (#678)
Details: - This is an "omnibus" commit, consisting of multiple medium-sized commits that affect non-trivial aspects of BLIS. The major highlights: - Relocated the pba, sba pool (from the rntm_t), and mem_t (from the cntl_t) to the thrinfo_t object. This allows the rntm_t to be effectively const (although it is sometimes copied internally and modified to reflect different ways of parallelism). Moving the mem_t sets the stage for sharing a global control tree amongst all threads. - De-templatized the macrokernels for gemmt, trmm, and trsm to match the macrokernel for gemm, which has been de-templatized since 54fa28b. - Reimplemented bli_l3_determine_kc() by separating out the logic for adjusting KC based on MR/NR for triangular A and/or B into a new function, bli_l3_adjust_kc(). For now, this function is still called from bli_l3_determine_kc(), but in the future we plan to have it called once when constructing the control tree. - Refactored the level-3 thread decorator into two parts: - One part deals only with launching threads, each one calling a generic thread entry function. This code resides in frame/thread and constitutes the definition of bli_thread_launch(). Note that it is specific to the threading implementation (OpenMP, pthreads, single, etc.). - The other part deals with passing the matrix operands and related information into bli_thread_launch(). This is the "l3 decorator" and now resides in frame/3. It is agnostic to the threading implementation. - Modified the "level" of the thread control tree passed in at each operation. Previously, each operation (e.g. bli_gemm_blk_var1()) was passed in a communicator representing the active thread teams which would share the available work. Now, the *parent* thread comm is passed in. The operation then grabs the child comm and uses it to partition the work. The difference is in bli_trsm_blk_var1(), where there are now two child nodes for this single operation (i.e. the thread control tree is split one level above where the control tree is). The sub-prenode is used for the trsm subproblem while the normal sub-node is used for the gemm part. Importantly, the parent comm is used for the barrier between them. - Removed cntl_t* arguments from bli_*_front() functions. These will be added back in the future when the control tree's creation is moved so that it happens much sooner (provided that bli_*_front() have not been absorbed into their respective bli_*_ex() functions). - Renamed various bli_thread_*() query functions to bli_thrinfo_*(), for consistency. This includes _num_threads(), _thread_id(), _n_way(), _work_id(), _sba_pool(), _pba(), _mem(), _barrier(), _broadcast(), and _am_chief(). - Removed extraneous barrier from _blk_var3() of gemm and trsm. - Fixed a typo in bli_type_defs.h where BLIS_BLAS_INT_TYPE_SIZE was misspelled.
1 parent c803b03 commit aeb5f0c

File tree

206 files changed

+5013
-11035
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

206 files changed

+5013
-11035
lines changed

addon/gemmd/attic/bao_gemmd_bp_var2.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -386,8 +386,8 @@ void PASTECH2(bao_,ch,varname) \
386386
/* Query the number of threads and thread ids for the JR loop.
387387
NOTE: These values are only needed when computing the next
388388
micropanel of B. */ \
389-
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
390-
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
389+
const dim_t jr_nt = bli_thrinfo_n_way( thread_jr ); \
390+
const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \
391391
\
392392
/* Compute number of primary and leftover components of the JR loop. */ \
393393
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
@@ -416,8 +416,8 @@ void PASTECH2(bao_,ch,varname) \
416416
/* Query the number of threads and thread ids for the IR loop.
417417
NOTE: These values are only needed when computing the next
418418
micropanel of A. */ \
419-
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
420-
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
419+
const dim_t ir_nt = bli_thrinfo_n_way( thread_ir ); \
420+
const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \
421421
\
422422
/* Compute number of primary and leftover components of the IR loop. */ \
423423
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
@@ -476,7 +476,7 @@ void PASTECH2(bao_,ch,varname) \
476476
/* This barrier is needed to prevent threads from starting to pack
477477
the next row panel of B before the current row panel is fully
478478
computed upon. */ \
479-
bli_thread_barrier( thread_pb ); \
479+
bli_thrinfo_barrier( thread_pb ); \
480480
} \
481481
} \
482482
\

addon/gemmd/bao_gemmd_bp_var1.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -370,8 +370,8 @@ void PASTECH2(bao_,ch,varname) \
370370
/* Query the number of threads and thread ids for the JR loop.
371371
NOTE: These values are only needed when computing the next
372372
micropanel of B. */ \
373-
const dim_t jr_nt = bli_thread_n_way( thread_jr ); \
374-
const dim_t jr_tid = bli_thread_work_id( thread_jr ); \
373+
const dim_t jr_nt = bli_thrinfo_n_way( thread_jr ); \
374+
const dim_t jr_tid = bli_thrinfo_work_id( thread_jr ); \
375375
\
376376
/* Compute number of primary and leftover components of the JR loop. */ \
377377
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
@@ -400,8 +400,8 @@ void PASTECH2(bao_,ch,varname) \
400400
/* Query the number of threads and thread ids for the IR loop.
401401
NOTE: These values are only needed when computing the next
402402
micropanel of A. */ \
403-
const dim_t ir_nt = bli_thread_n_way( thread_ir ); \
404-
const dim_t ir_tid = bli_thread_work_id( thread_ir ); \
403+
const dim_t ir_nt = bli_thrinfo_n_way( thread_ir ); \
404+
const dim_t ir_tid = bli_thrinfo_work_id( thread_ir ); \
405405
\
406406
/* Compute number of primary and leftover components of the IR loop. */ \
407407
dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
@@ -458,7 +458,7 @@ void PASTECH2(bao_,ch,varname) \
458458
/* This barrier is needed to prevent threads from starting to pack
459459
the next row panel of B before the current row panel is fully
460460
computed upon. */ \
461-
bli_thread_barrier( rntm, thread_pb ); \
461+
bli_thrinfo_barrier( thread_pb ); \
462462
} \
463463
} \
464464
\

addon/gemmd/bao_l3_packm_a.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \
6161
\
6262
/* Barrier to make sure all threads are caught up and ready to begin the
6363
packm stage. */ \
64-
bli_thread_barrier( rntm, thread ); \
64+
bli_thrinfo_barrier( thread ); \
6565
\
6666
/* Compute the size of the memory block needed. */ \
6767
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \
9090
\
9191
/* Broadcast the address of the chief thread's passed-in mem_t to all
9292
threads. */ \
93-
mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
93+
mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
9494
\
9595
/* Non-chief threads: Copy the contents of the chief thread's
9696
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \
139139
\
140140
/* Broadcast the address of the chief thread's passed-in mem_t
141141
to all threads. */ \
142-
mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
142+
mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
143143
\
144144
/* Non-chief threads: Copy the contents of the chief thread's
145145
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -313,13 +313,13 @@ void PASTECH2(bao_,ch,opname) \
313313
d, incd, \
314314
a, rs_a, cs_a, \
315315
*p, *rs_p, *cs_p, \
316-
pd_p, *ps_p, \
316+
pd_p, *ps_p, \
317317
cntx, \
318318
thread \
319319
); \
320320
\
321321
/* Barrier so that packing is done before computation. */ \
322-
bli_thread_barrier( rntm, thread ); \
322+
bli_thrinfo_barrier( thread ); \
323323
}
324324

325325
//INSERT_GENTFUNC_BASIC0( packm_a )

addon/gemmd/bao_l3_packm_b.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ void PASTECH2(bao_,ch,opname) \
6161
\
6262
/* Barrier to make sure all threads are caught up and ready to begin the
6363
packm stage. */ \
64-
bli_thread_barrier( rntm, thread ); \
64+
bli_thrinfo_barrier( thread ); \
6565
\
6666
/* Compute the size of the memory block needed. */ \
6767
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
@@ -90,7 +90,7 @@ void PASTECH2(bao_,ch,opname) \
9090
\
9191
/* Broadcast the address of the chief thread's passed-in mem_t to all
9292
threads. */ \
93-
mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
93+
mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
9494
\
9595
/* Non-chief threads: Copy the contents of the chief thread's
9696
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -139,7 +139,7 @@ void PASTECH2(bao_,ch,opname) \
139139
\
140140
/* Broadcast the address of the chief thread's passed-in mem_t
141141
to all threads. */ \
142-
mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \
142+
mem_t* mem_p = bli_thrinfo_broadcast( thread, mem ); \
143143
\
144144
/* Non-chief threads: Copy the contents of the chief thread's
145145
passed-in mem_t to the passed-in mem_t for this thread. (The
@@ -313,13 +313,13 @@ void PASTECH2(bao_,ch,opname) \
313313
d, incd, \
314314
b, rs_b, cs_b, \
315315
*p, *rs_p, *cs_p, \
316-
pd_p, *ps_p, \
316+
pd_p, *ps_p, \
317317
cntx, \
318318
thread \
319319
); \
320320
\
321321
/* Barrier so that packing is done before computation. */ \
322-
bli_thread_barrier( rntm, thread ); \
322+
bli_thrinfo_barrier( thread ); \
323323
}
324324

325325
//INSERT_GENTFUNC_BASIC0( packm_b )

addon/gemmd/bao_l3_packm_var1.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ void PASTECH2(bao_,ch,varname) \
127127
\
128128
/* Query the number of threads and thread ids from the current thread's
129129
packm thrinfo_t node. */ \
130-
const dim_t nt = bli_thread_n_way( thread ); \
131-
const dim_t tid = bli_thread_work_id( thread ); \
130+
const dim_t nt = bli_thrinfo_n_way( thread ); \
131+
const dim_t tid = bli_thrinfo_work_id( thread ); \
132132
\
133133
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
134134
( void )nt; \

addon/gemmd/bao_l3_packm_var2.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,8 @@ void PASTECH2(bao_,ch,varname) \
127127
\
128128
/* Query the number of threads and thread ids from the current thread's
129129
packm thrinfo_t node. */ \
130-
const dim_t nt = bli_thread_n_way( thread ); \
131-
const dim_t tid = bli_thread_work_id( thread ); \
130+
const dim_t nt = bli_thrinfo_n_way( thread ); \
131+
const dim_t tid = bli_thrinfo_work_id( thread ); \
132132
\
133133
/* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
134134
( void )nt; \

0 commit comments

Comments
 (0)