Skip to content

Tile-level partitioning in jr/ir loops (ex-trsm). #695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 11, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions build/bli_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@
#define BLIS_ENABLE_JRIR_RR
#endif

// The @enable_jrir_tlb@ placeholder is substituted by configure with 1 when
// tile-level load balancing ('tlb') was requested for the jr/ir loops, and
// with 0 otherwise; BLIS_ENABLE_JRIR_TLB is defined only in the former case.
#if @enable_jrir_tlb@
#define BLIS_ENABLE_JRIR_TLB
#endif

#if @enable_pba_pools@
#define BLIS_ENABLE_PBA_POOLS
#else
Expand Down
63 changes: 44 additions & 19 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -340,16 +340,36 @@ print_usage()
echo " "
echo " -r METHOD, --thread-part-jrir=METHOD"
echo " "
echo " Request a method of assigning micropanels to threads in"
echo " the JR and IR loops. Valid values for METHOD are 'slab'"
echo " and 'rr'. Using 'slab' assigns (as much as possible)"
echo " contiguous regions of micropanels to each thread while"
echo " using 'rr' assigns micropanels to threads in a round-"
echo " robin fashion. The chosen method also applies during"
echo " the packing of A and B. The default method is 'slab'."
echo " NOTE: Specifying this option constitutes a request,"
echo " which may be ignored in select situations if the"
echo " implementation has a good reason to do so."
echo " Select a strategy for partitioning computation in JR and"
echo " IR loops and assigning that computation to threads. Valid"
echo " values for METHOD are 'rr', 'slab', and 'tlb':"
echo " 'rr': Assign the computation associated with whole"
echo " columns of microtiles to threads in a round-"
echo " robin fashion. When selected, round-robin"
echo " assignment is also employed during packing."
echo " 'slab': Partition the computation into N contiguous"
echo " regions, where each region contains a whole"
echo " number of microtile columns, and assign one"
echo " region to each thread. For some operations, the"
echo " number of microtile columns contained within a"
echo " given region may differ from that of other"
echo " regions, depending on how much work is implied"
echo " by each region. When selected, slab assignment"
echo " is also employed during packing."
echo " 'tlb': Tile-level load balancing is similar to slab,"
echo " except that regions will be divided at a more"
echo " granular level (individual microtiles instead"
echo " of whole columns of microtiles) to ensure more"
echo " equitable assignment of work to threads. When"
echo " selected, tlb will only be employed for level-3"
echo " operations except trsm; due to practical and"
echo " algorithmic limitations, slab partitioning will"
echo " be used instead during packing and for trsm."
echo " The default strategy is 'tlb'. NOTE: Specifying this"
echo " option constitutes a request, which may be ignored in"
echo " select situations if the implementation has a good reason to"
echo " do so. (See description of 'tlb' above for an example of"
echo " this.)"
echo " "
echo " --disable-trsm-preinversion, --enable-trsm-preinversion"
echo " "
Expand Down Expand Up @@ -2490,7 +2510,7 @@ main()
threading_model='off'

# The method of assigning micropanels to threads in the JR and IR loops.
thread_part_jrir='slab'
thread_part_jrir='tlb'

# Option variables.
quiet_flag=''
Expand Down Expand Up @@ -3731,16 +3751,20 @@ main()

# Check the method of assigning micropanels to threads in the JR and IR
# loops.
enable_jrir_slab_01=0
enable_jrir_rr_01=0
if [ "x${thread_part_jrir}" = "xslab" ]; then
echo "${script_name}: requesting slab threading in jr and ir loops."
enable_jrir_slab_01=1
elif [ "x${thread_part_jrir}" = "xrr" ]; then
echo "${script_name}: requesting round-robin threading in jr and ir loops."
enable_jrir_slab_01=0
enable_jrir_tlb_01=0
if [ "x${thread_part_jrir}" = "xrr" ]; then
echo "${script_name}: requesting round-robin (rr) work partitioning in jr and/or ir loops."
enable_jrir_rr_01=1
elif [ "x${thread_part_jrir}" = "xslab" ]; then
echo "${script_name}: requesting slab work partitioning in jr and/or ir loops."
enable_jrir_slab_01=1
elif [ "x${thread_part_jrir}" = "xtlb" ]; then
echo "${script_name}: requesting tile-level load balancing (tlb) in unified jr+ir loop."
enable_jrir_tlb_01=1
else
echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}."
echo "${script_name}: *** Unsupported method of work partitioning in jr/ir loops: ${thread_part_jrir}."
exit 1
fi

Expand Down Expand Up @@ -4177,8 +4201,9 @@ main()
| sed -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \
| sed -e "s/@enable_hpx@/${enable_hpx_01}/g" \
| sed -e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g" \
| sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
| sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \
| sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
| sed -e "s/@enable_jrir_tlb@/${enable_jrir_tlb_01}/g" \
| sed -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \
| sed -e "s/@enable_sba_pools@/${enable_sba_pools_01}/g" \
| sed -e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g" \
Expand Down
1 change: 0 additions & 1 deletion frame/1m/packm/bli_packm.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
#include "bli_packm_init.h"
#include "bli_packm_int.h"
#include "bli_packm_scalar.h"
#include "bli_packm_thrinfo.h"

#include "bli_packm_part.h"

Expand Down
12 changes: 6 additions & 6 deletions frame/1m/packm/bli_packm_blk_var1.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,11 @@ void bli_packm_blk_var1
const dim_t tid = bli_thrinfo_work_id( thread );

// Determine the thread range and increment using the current thread's
// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
// packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
// will depend on whether slab or round-robin partitioning was requested
// at configure-time.
dim_t it_start, it_end, it_inc;
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );

char* p_begin = p_cast;

Expand All @@ -190,15 +190,15 @@ void bli_packm_blk_var1

inc_t p_inc = ps_p;

// NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr())
// NOTE: We MUST use round-robin work allocation (bli_is_my_iter_rr())
// when packing micropanels of a triangular matrix. Hermitian/symmetric
// and general packing may use slab or round-robin (bli_packm_my_iter()),
// and general packing may use slab or round-robin (bli_is_my_iter()),
// depending on which was selected at configure-time.
bool my_iter = ( bli_is_triangular( strucc ) &&
bli_intersects_diag_n( diagoffc_i, panel_dim_i,
panel_len_full )
? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
: bli_packm_my_iter ( it, it_start, it_end, tid, nt )
? bli_is_my_iter_rr( it, tid, nt )
: bli_is_my_iter ( it, it_start, it_end, tid, nt )
);

if ( bli_is_triangular( strucc ) &&
Expand Down
16 changes: 8 additions & 8 deletions frame/3/bli_l3_sup_packm_var.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,10 @@ void PASTEMAC(ch,varname) \
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( ic = ic0, it = 0; it < n_iter; \
Expand All @@ -175,9 +175,9 @@ void PASTEMAC(ch,varname) \
panel_len_i = panel_len_full; \
panel_len_max_i = panel_len_max; \
\
/* The definition of bli_packm_my_iter() will depend on whether slab
/* The definition of bli_is_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
f \
( \
Expand Down Expand Up @@ -398,10 +398,10 @@ void PASTEMAC(ch,varname) \
dim_t it_start, it_end, it_inc; \
\
/* Determine the thread range and increment using the current thread's
packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
will depend on whether slab or round-robin partitioning was requested
at configure-time. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
/* Iterate over every logical micropanel in the source matrix. */ \
for ( it = 0; it < n_iter; it += 1 ) \
Expand All @@ -412,9 +412,9 @@ void PASTEMAC(ch,varname) \
ctype* p_use = p_begin; \
\
{ \
/* The definition of bli_packm_my_iter() will depend on whether slab
/* The definition of bli_is_my_iter() will depend on whether slab
or round-robin partitioning was requested at configure-time. */ \
if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \
{ \
PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \
( \
Expand Down
4 changes: 2 additions & 2 deletions frame/3/bli_l3_sup_var12.c
Original file line number Diff line number Diff line change
Expand Up @@ -357,11 +357,11 @@ void PASTEMAC(ch,varname) \
object. */ \
/*
ctype* a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \
if ( bli_is_last_iter_slrr( i, ir_iter, 0, 1 ) ) \
{ \
a2 = a_00; \
b2 = bli_gemm_get_next_b_upanel( b_jr, jrstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_iter, 0, 1 ) ) \
if ( bli_is_last_iter_slrr( j, jr_iter, 0, 1 ) ) \
b2 = b_00; \
} \
\
Expand Down
12 changes: 6 additions & 6 deletions frame/3/bli_l3_thrinfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,22 @@

// gemm

// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
// Advance the pointer a1 (or b1) by step * inc. Each argument is wrapped in
// parentheses so that a compound expression passed as step or inc (e.g.
// "rs + cs") is multiplied as a unit rather than being split by operator
// precedence during macro expansion.
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( (a1) + (step) * (inc) )
#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( (b1) + (step) * (inc) )

// gemmt

// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
// Advance the pointer a1 (or b1) by step * inc. Each argument is wrapped in
// parentheses so that a compound expression passed as step or inc is
// multiplied as a unit rather than being split by operator precedence
// during macro expansion.
#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( (a1) + (step) * (inc) )
#define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( (b1) + (step) * (inc) )

// NOTE: Here, we assume NO parallelism in the IR loop.
// The "l" variant offsets a0 by ( ( -doff_j + nr ) / mr ) * step; the "u"
// variant simply yields a0 and ignores its remaining arguments. Every
// argument is parenthesized so that compound expressions (for example a
// doff_j of "d - k") are negated, divided, and multiplied as a unit.
#define bli_gemmt_l_wrap_a_upanel( a0, step, doff_j, mr, nr ) \
( (a0) + ( ( -(doff_j) + 1*(nr) ) / (mr) ) * (step) )
#define bli_gemmt_u_wrap_a_upanel( a0, step, doff_j, mr, nr ) \
( (a0) )

// trmm

// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
// Advance the pointer a1 (or b1) by step * inc. Each argument is wrapped in
// parentheses so that a compound expression passed as step or inc is
// multiplied as a unit rather than being split by operator precedence
// during macro expansion.
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( (a1) + (step) * (inc) )
#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( (b1) + (step) * (inc) )

Expand Down
23 changes: 19 additions & 4 deletions frame/3/gemm/bli_gemm_cntl.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,25 @@ cntl_t* bli_gemmbp_cntl_create
void_fp macro_kernel_fp;

// Choose the default macrokernel based on the operation family...
if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2;
else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2;
else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2;
else /* should never execute */ macro_kernel_fp = NULL;
if ( family == BLIS_GEMM ) macro_kernel_fp =
#ifdef BLIS_ENABLE_JRIR_TLB
bli_gemm_ker_var2b;
#else // ifdef ( _SLAB || _RR )
bli_gemm_ker_var2;
#endif
else if ( family == BLIS_GEMMT ) macro_kernel_fp =
#ifdef BLIS_ENABLE_JRIR_TLB
bli_gemmt_x_ker_var2b;
#else // ifdef ( _SLAB || _RR )
bli_gemmt_x_ker_var2;
#endif
else if ( family == BLIS_TRMM ) macro_kernel_fp =
#ifdef BLIS_ENABLE_JRIR_TLB
bli_trmm_xx_ker_var2b;
#else // ifdef ( _SLAB || _RR )
bli_trmm_xx_ker_var2;
#endif
else /* should never execute */ macro_kernel_fp = NULL;

// ...unless a non-NULL kernel function pointer is passed in, in which
// case we use that instead.
Expand Down
Loading