Skip to content

BLIS-DMA support #563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions build/bli_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@
#define BLIS_DISABLE_MEM_TRACING
#endif

#if @enable_dma@
#define BLIS_ENABLE_DMA
#endif

#if @int_type_size@ == 64
#define BLIS_INT_TYPE_SIZE 64
#elif @int_type_size@ == 32
Expand Down
1 change: 1 addition & 0 deletions build/libblis-symbols.def
Original file line number Diff line number Diff line change
Expand Up @@ -1192,6 +1192,7 @@ bli_info_get_enable_openmp
bli_info_get_enable_pba_pools
bli_info_get_enable_pthreads
bli_info_get_enable_sandbox
bli_info_get_enable_dma
bli_info_get_enable_sba_pools
bli_info_get_enable_stay_auto_init
bli_info_get_enable_threading
Expand Down
13 changes: 13 additions & 0 deletions common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ get-noopt-cflags-for = $(strip $(CFLAGS_PRESET) \
$(call load-var-for,CLANGFLAGS,$(1)) \
$(call load-var-for,CPPROCFLAGS,$(1)) \
$(CTHREADFLAGS) \
$(CSANITIZEFLAGS) \
$(CINCFLAGS) $(VERS_DEF) \
)

Expand All @@ -112,6 +113,7 @@ get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \
$(call load-var-for,CXXLANGFLAGS,$(1)) \
$(call load-var-for,CPPROCFLAGS,$(1)) \
$(CTHREADFLAGS) \
$(CSANITIZEFLAGS) \
$(CINCFLAGS) $(VERS_DEF) \
)

Expand Down Expand Up @@ -513,6 +515,17 @@ ifeq ($(DEBUG_TYPE),sde)
LDFLAGS := $(filter-out $(LIBMEMKIND),$(LDFLAGS))
endif

# AddressSanitizer build: CSANITIZEFLAGS is picked up by the compile-flag
# helpers (get-noopt-cflags-for / get-noopt-cxxflags-for), and the linker is
# asked to instrument with ASan and link its runtime statically.
ifeq ($(DEBUG_TYPE),address)
CSANITIZEFLAGS := -fsanitize=address
LDFLAGS += -fsanitize=address -static-libasan
endif

# ThreadSanitizer build: CSANITIZEFLAGS is picked up by the compile-flag
# helpers (get-noopt-cflags-for / get-noopt-cxxflags-for), and the linker is
# asked to instrument with TSan and link its runtime statically.
# NOTE: the static-runtime flag that pairs with -fsanitize=thread is
# -static-libtsan; -static-libasan only affects the AddressSanitizer runtime.
ifeq ($(DEBUG_TYPE),thread)
CSANITIZEFLAGS := -fsanitize=thread
LDFLAGS += -fsanitize=thread -static-libtsan
endif


# Specify the shared library's 'soname' field.
# NOTE: The flag for creating shared objects is different for Linux and OS X.
ifeq ($(OS_NAME),Darwin)
Expand Down
29 changes: 29 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ print_usage()
echo " kept in the framework, otherwise optimization is"
echo " turned off."
echo " "
echo " If DEBUG is 'address', then -fsanitize=address is added."
echo " If DEBUG is 'thread', then -fsanitize=thread is added."
echo " "
echo " --disable-static, --enable-static"
echo " "
echo " Disable (enabled by default) building BLIS as a static"
Expand Down Expand Up @@ -217,6 +220,11 @@ print_usage()
echo " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE."
echo " Please use only for informational/debugging purposes."
echo " "
echo " --enable-dma, --disable-dma"
echo " "
echo " Enable (disabled by default) DMA support."
echo " [Experimental] Only useful on DMA-based architectures."
echo " "
echo " -i SIZE, --int-size=SIZE"
echo " "
echo " Set the size (in bits) of internal BLIS integers and"
Expand Down Expand Up @@ -2076,6 +2084,7 @@ main()
enable_pba_pools='yes'
enable_sba_pools='yes'
enable_mem_tracing='no'
enable_dma='no'
int_type_size=0
blas_int_type_size=32
enable_blas='yes'
Expand Down Expand Up @@ -2239,6 +2248,12 @@ main()
disable-mem-tracing)
enable_mem_tracing='no'
;;
enable-dma)
enable_dma='yes'
;;
disable-dma)
enable_dma='no'
;;
enable-sandbox=*)
sandbox_flag=1
sandbox=${OPTARG#*=}
Expand Down Expand Up @@ -2926,6 +2941,12 @@ main()
elif [ "x${debug_type}" = "xsde" ]; then
debug_type='sde'
echo "${script_name}: enabling SDE processor emulation."
elif [ "x${debug_type}" = "xaddress" ]; then
debug_type='address'
echo "${script_name}: enabling debug symbols; -fsanitize=address."
elif [ "x${debug_type}" = "xthread" ]; then
debug_type='thread'
echo "${script_name}: enabling debug symbols; -fsanitize=thread."
else
debug_type='noopt'
echo "${script_name}: enabling debug symbols; optimizations disabled."
Expand Down Expand Up @@ -3062,6 +3083,13 @@ main()
echo "${script_name}: memory tracing output is disabled."
enable_mem_tracing_01=0
fi
if [ "x${enable_dma}" = "xyes" ]; then
echo "${script_name}: DMA is enabled."
enable_dma_01=1
else
echo "${script_name}: DMA is disabled."
enable_dma_01=0
fi
if [ "x${has_memkind}" = "xyes" ]; then
if [ "x${enable_memkind}" = "x" ]; then
# If no explicit option was given for libmemkind one way or the other,
Expand Down Expand Up @@ -3402,6 +3430,7 @@ main()
| sed -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \
| sed -e "s/@enable_sba_pools@/${enable_sba_pools_01}/g" \
| sed -e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g" \
| sed -e "s/@enable_dma@/${enable_dma_01}/g" \
| sed -e "s/@int_type_size@/${int_type_size}/g" \
| sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \
| sed -e "s/@enable_blas@/${enable_blas_01}/g" \
Expand Down
6 changes: 6 additions & 0 deletions frame/1m/packm/bli_packm_cntl.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ cntl_t* bli_packm_cntl_create_node
params->rev_iter_if_lower = rev_iter_if_lower;
params->pack_schema = pack_schema;
params->pack_buf_type = pack_buf_type;
#ifdef BLIS_ENABLE_DMA
params->a_dma = NULL;
params->p_dma = NULL;
params->mem_p_dma = NULL;
params->event_dma = NULL;
#endif // BLIS_ENABLE_DMA

#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_packm_cntl_create_node(): " );
Expand Down
61 changes: 61 additions & 0 deletions frame/1m/packm/bli_packm_cntl.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,25 @@ struct packm_params_s
bool rev_iter_if_lower;
pack_t pack_schema;
packbuf_t pack_buf_type;

#ifdef BLIS_ENABLE_DMA
// Extra information to trigger a DMA-prefetch right after each packing.
// The idea is to recycle the input buffer of packm (which is originally
// a DMA-buffer) to trigger a new DMA copy immediately after the end of
// packing and before the subsequent computation, since the packed submatrix
// has been written to another buffer:
//
// DDR -> SMEM -> SMEM
// 1. DMA : a_global -> a_dma
// 2. Packing : a_dma -> a_packed
// 3. DMA next block : a_global' -> a_dma
// 4. Computation : a_packed -> bli_gemm_int() ...
obj_t* a_dma;
obj_t* p_dma;
mem_t* mem_p_dma;
dma_event_t* event_dma;
#endif // BLIS_ENABLE_DMA

};
typedef struct packm_params_s packm_params_t;

Expand Down Expand Up @@ -87,6 +106,48 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

#ifdef BLIS_ENABLE_DMA
// Query the a_dma field of the packm_params_t that cntl->params refers to.
BLIS_INLINE obj_t* bli_cntl_packm_params_a_dma( cntl_t* cntl )
{
	return ( ( packm_params_t* )cntl->params )->a_dma;
}

// Query the p_dma field of the packm_params_t that cntl->params refers to.
BLIS_INLINE obj_t* bli_cntl_packm_params_p_dma( cntl_t* cntl )
{
	return ( ( packm_params_t* )cntl->params )->p_dma;
}

// Query the mem_p_dma field of the packm_params_t that cntl->params refers to.
BLIS_INLINE mem_t* bli_cntl_packm_params_mem_p_dma( cntl_t* cntl )
{
	return ( ( packm_params_t* )cntl->params )->mem_p_dma;
}

// Query the event_dma field of the packm_params_t that cntl->params refers to.
BLIS_INLINE dma_event_t* bli_cntl_packm_params_event_dma( cntl_t* cntl )
{
	return ( ( packm_params_t* )cntl->params )->event_dma;
}

// Store a_dma into the a_dma field of the packm_params_t that cntl->params
// refers to. Only the pointer is recorded; nothing is copied.
BLIS_INLINE void bli_cntl_packm_params_set_a_dma( obj_t* a_dma, cntl_t* cntl )
{
	( ( packm_params_t* )cntl->params )->a_dma = a_dma;
}

// Store p_dma into the p_dma field of the packm_params_t that cntl->params
// refers to. Only the pointer is recorded; nothing is copied.
BLIS_INLINE void bli_cntl_packm_params_set_p_dma( obj_t* p_dma, cntl_t* cntl )
{
	( ( packm_params_t* )cntl->params )->p_dma = p_dma;
}

// Store mem_p_dma into the mem_p_dma field of the packm_params_t that
// cntl->params refers to. Only the pointer is recorded; nothing is copied.
BLIS_INLINE void bli_cntl_packm_params_set_mem_p_dma( mem_t* mem_p_dma, cntl_t* cntl )
{
	( ( packm_params_t* )cntl->params )->mem_p_dma = mem_p_dma;
}

// Store event_dma into the event_dma field of the packm_params_t that
// cntl->params refers to. Only the pointer is recorded; nothing is copied.
BLIS_INLINE void bli_cntl_packm_params_set_event_dma( dma_event_t* event_dma, cntl_t* cntl )
{
	( ( packm_params_t* )cntl->params )->event_dma = event_dma;
}
#endif // BLIS_ENABLE_DMA

// -----------------------------------------------------------------------------

cntl_t* bli_packm_cntl_create_node
Expand Down
61 changes: 61 additions & 0 deletions frame/3/bli_l3_blocksize.c
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,50 @@ dim_t PASTEMAC0(opname) \
if ( bli_obj_root_is_herm_or_symm( a ) ) \
{ \
mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
/* When DMA is enabled on herm_or_symm, and in the case where KC is
larger than the other dimension (MC or NC), the copied DMA panel
should be extended to cover the stored region, used to "symmetrize"
the unstored one. This extension requires implicit extra local
memory allocation that the developer might not anticipate. We call
this increase of KC "upper-squarization".

On some embedded platforms, the developer may not even be able to
afford this increase in memory footprint because of resource limits;
exceeding the hardware capacity is likely to cause a failure at run time.
In such situation, we prefer to reduce KC to be equal to MC (or NC),
so that the DMA transfer is not to be extended, as the mirror-region
is now entirely covered by the DMA panel. We call this
"Lower-squarization".

We are aware that this choice is sub-optimal. The best solution
would be to compute a trade-off between KC and MC (or NC) so that
the DMA panel does not exceed the initially allocated buffer.
We call this "Mid-squarization". Contributions are welcome.

Reference: Section 9.3.4. Special cases handling in
https://tel.archives-ouvertes.fr/tel-02426014/document
*/ \
if( bli_info_get_enable_dma() ) \
{ \
dim_t mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
b_alg = bli_min( b_alg, mnc+1-mnr ); \
b_max = bli_min( b_max, mnc+1-mnr ); \
} \
\
b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
b_max = bli_align_dim_to_mult( b_max, mnr ); \
} \
else if ( bli_obj_root_is_herm_or_symm( b ) ) \
{ \
mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
/* DMA: same reason as above. */ \
if( bli_info_get_enable_dma() ) \
{ \
dim_t mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
b_alg = bli_min( b_alg, mnc+1-mnr ); \
b_max = bli_min( b_max, mnc+1-mnr ); \
} \
\
b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
b_max = bli_align_dim_to_mult( b_max, mnr ); \
} \
Expand Down Expand Up @@ -232,6 +270,7 @@ dim_t PASTEMAC0(opname) \
num_t dt; \
blksz_t* bsize; \
dim_t mnr; \
dim_t mnc; \
dim_t b_alg, b_max; \
dim_t b_use; \
\
Expand All @@ -258,9 +297,22 @@ dim_t PASTEMAC0(opname) \
multiple of MR if the triangular matrix is on the left, or NR
if the triangular matrix is on the right. */ \
if ( bli_obj_root_is_triangular( a ) ) \
{ \
mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
} \
else \
{ \
mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \
} \
\
/* DMA: lower-squarization to minimize footprint */ \
if( bli_info_get_enable_dma() ) \
{ \
b_alg = bli_min( b_alg, mnc+1-mnr ); \
b_max = bli_min( b_max, mnc+1-mnr ); \
} \
\
b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
b_max = bli_align_dim_to_mult( b_max, mnr ); \
Expand Down Expand Up @@ -321,6 +373,15 @@ dim_t PASTEMAC0(opname) \
matrix uses MR, since only left-side trsm micro-kernels are
supported. */ \
mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
\
/* DMA: lower-squarization to minimize footprint */ \
if( bli_info_get_enable_dma() ) \
{ \
dim_t mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \
b_alg = bli_min( b_alg, mnc+1-mnr ); \
b_max = bli_min( b_max, mnc+1-mnr ); \
} \
\
b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
b_max = bli_align_dim_to_mult( b_max, mnr ); \
\
Expand Down
33 changes: 18 additions & 15 deletions frame/3/bli_l3_packm.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,14 @@ void bli_l3_packm
// Query the address of the mem_t entry within the control tree node.
cntl_mem_p = bli_cntl_pack_mem( cntl );

mem_t* local_mem_p;
mem_t local_mem_s;

// Check the mem_t field in the control tree. If it is unallocated, then
// we need to acquire a block from the memory broker and broadcast it to
// all threads in the chief's thread group.
if ( bli_mem_is_unalloc( cntl_mem_p ) )
{
mem_t* local_mem_p;
mem_t local_mem_s;

if ( bli_thread_am_ochief( thread ) )
{
#ifdef BLIS_ENABLE_MEM_TRACING
Expand Down Expand Up @@ -110,9 +110,6 @@ void bli_l3_packm
}
else // ( bli_mem_is_alloc( cntl_mem_p ) )
{
mem_t* local_mem_p;
mem_t local_mem_s;

// If the mem_t entry in the control tree does NOT contain a NULL
// buffer, then a block has already been acquired from the memory
// broker and cached in the control tree.
Expand Down Expand Up @@ -152,17 +149,11 @@ void bli_l3_packm
// this thread's control tree node.
*cntl_mem_p = *local_mem_p;
}
else
{
// If the mem_t entry is already allocated and sufficiently large,
// then we use it as-is. No action is needed, because all threads
// will already have the cached values in their local control
// trees' mem_t entries, currently pointed to by cntl_mem_p.

bli_thread_barrier( thread );
}
}

// Barrier so that all threads have read the content of local_mem_p,
// located in the stack of the chief thread.
bli_thread_barrier( thread );

// Update the buffer address in x_pack to point to the buffer associated
// with the mem_t entry acquired from the memory broker (now cached in
Expand All @@ -183,5 +174,17 @@ void bli_l3_packm

// Barrier so that packing is done before computation.
bli_thread_barrier( thread );

#ifdef BLIS_ENABLE_DMA
// After packing, recycle the DMA buffer to prefetch next block
obj_t* a_dma = bli_cntl_packm_params_a_dma( cntl );
obj_t* p_dma = bli_cntl_packm_params_p_dma( cntl );
mem_t* mem_p_dma = bli_cntl_packm_params_mem_p_dma( cntl );
dma_event_t* event_dma = bli_cntl_packm_params_event_dma( cntl );
if ( a_dma && p_dma && mem_p_dma )
{
bli_dma_get( a_dma, p_dma, mem_p_dma, event_dma, rntm, thread );
}
#endif // BLIS_ENABLE_DMA
}

Loading