diff --git a/build/bli_config.h.in b/build/bli_config.h.in index fa6bbbe12e..ef45a1e87e 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -85,6 +85,10 @@ #define BLIS_DISABLE_MEM_TRACING #endif +#if @enable_dma@ +#define BLIS_ENABLE_DMA +#endif + #if @int_type_size@ == 64 #define BLIS_INT_TYPE_SIZE 64 #elif @int_type_size@ == 32 diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index 97146a7861..02ce8f9cee 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1192,6 +1192,7 @@ bli_info_get_enable_openmp bli_info_get_enable_pba_pools bli_info_get_enable_pthreads bli_info_get_enable_sandbox +bli_info_get_enable_dma bli_info_get_enable_sba_pools bli_info_get_enable_stay_auto_init bli_info_get_enable_threading diff --git a/common.mk b/common.mk index 2da306d792..a2d1930261 100644 --- a/common.mk +++ b/common.mk @@ -101,6 +101,7 @@ get-noopt-cflags-for = $(strip $(CFLAGS_PRESET) \ $(call load-var-for,CLANGFLAGS,$(1)) \ $(call load-var-for,CPPROCFLAGS,$(1)) \ $(CTHREADFLAGS) \ + $(CSANITIZEFLAGS) \ $(CINCFLAGS) $(VERS_DEF) \ ) @@ -112,6 +113,7 @@ get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \ $(call load-var-for,CXXLANGFLAGS,$(1)) \ $(call load-var-for,CPPROCFLAGS,$(1)) \ $(CTHREADFLAGS) \ + $(CSANITIZEFLAGS) \ $(CINCFLAGS) $(VERS_DEF) \ ) @@ -513,6 +515,17 @@ ifeq ($(DEBUG_TYPE),sde) LDFLAGS := $(filter-out $(LIBMEMKIND),$(LDFLAGS)) endif +ifeq ($(DEBUG_TYPE),address) +CSANITIZEFLAGS := -fsanitize=address +LDFLAGS += -fsanitize=address -static-libasan +endif +# ThreadSanitizer links against its own runtime (libtsan), so request that one statically. +ifeq ($(DEBUG_TYPE),thread) +CSANITIZEFLAGS := -fsanitize=thread +LDFLAGS += -fsanitize=thread -static-libtsan +endif + + # Specify the shared library's 'soname' field. # NOTE: The flag for creating shared objects is different for Linux and OS X. 
ifeq ($(OS_NAME),Darwin) diff --git a/configure b/configure index 3c865dad90..7a69142996 100755 --- a/configure +++ b/configure @@ -132,6 +132,9 @@ print_usage() echo " kept in the framework, otherwise optimization is" echo " turned off." echo " " + echo " If DEBUG is 'address', then -fsanitize=address is added." + echo " If DEBUG is 'thread', then -fsanitize=thread is added." + echo " " echo " --disable-static, --enable-static" echo " " echo " Disable (enabled by default) building BLIS as a static" @@ -217,6 +220,11 @@ print_usage() echo " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE." echo " Please use only for informational/debugging purposes." echo " " + echo " --enable-dma, --disable-dma" + echo " " + echo " Enable (disabled by default) DMA support." + echo " [Experimental] Only useful on DMA-based architectures." + echo " " echo " -i SIZE, --int-size=SIZE" echo " " echo " Set the size (in bits) of internal BLIS integers and" @@ -2076,6 +2084,7 @@ main() enable_pba_pools='yes' enable_sba_pools='yes' enable_mem_tracing='no' + enable_dma='no' int_type_size=0 blas_int_type_size=32 enable_blas='yes' @@ -2239,6 +2248,12 @@ main() disable-mem-tracing) enable_mem_tracing='no' ;; + enable-dma) + enable_dma='yes' + ;; + disable-dma) + enable_dma='no' + ;; enable-sandbox=*) sandbox_flag=1 sandbox=${OPTARG#*=} @@ -2926,6 +2941,12 @@ main() elif [ "x${debug_type}" = "xsde" ]; then debug_type='sde' echo "${script_name}: enabling SDE processor emulation." + elif [ "x${debug_type}" = "xaddress" ]; then + debug_type='address' + echo "${script_name}: enabling debug symbols; -fsanitize=address." + elif [ "x${debug_type}" = "xthread" ]; then + debug_type='thread' + echo "${script_name}: enabling debug symbols; -fsanitize=thread." else debug_type='noopt' echo "${script_name}: enabling debug symbols; optimizations disabled." @@ -3062,6 +3083,13 @@ main() echo "${script_name}: memory tracing output is disabled." 
enable_mem_tracing_01=0 fi + if [ "x${enable_dma}" = "xyes" ]; then + echo "${script_name}: DMA is enabled." + enable_dma_01=1 + else + echo "${script_name}: DMA is disabled." + enable_dma_01=0 + fi if [ "x${has_memkind}" = "xyes" ]; then if [ "x${enable_memkind}" = "x" ]; then # If no explicit option was given for libmemkind one way or the other, @@ -3402,6 +3430,7 @@ main() | sed -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \ | sed -e "s/@enable_sba_pools@/${enable_sba_pools_01}/g" \ | sed -e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g" \ + | sed -e "s/@enable_dma@/${enable_dma_01}/g" \ | sed -e "s/@int_type_size@/${int_type_size}/g" \ | sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \ | sed -e "s/@enable_blas@/${enable_blas_01}/g" \ diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c index fc6ba8052c..f0d07d6f72 100644 --- a/frame/1m/packm/bli_packm_cntl.c +++ b/frame/1m/packm/bli_packm_cntl.c @@ -70,6 +70,12 @@ cntl_t* bli_packm_cntl_create_node params->rev_iter_if_lower = rev_iter_if_lower; params->pack_schema = pack_schema; params->pack_buf_type = pack_buf_type; + #ifdef BLIS_ENABLE_DMA + params->a_dma = NULL; + params->p_dma = NULL; + params->mem_p_dma = NULL; + params->event_dma = NULL; + #endif // BLIS_ENABLE_DMA #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_packm_cntl_create_node(): " ); diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 17aa196e8d..d6dfa9fce8 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -44,6 +44,25 @@ struct packm_params_s bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; + +#ifdef BLIS_ENABLE_DMA + // Extra information to trigger a DMA-prefetch right after each packing. 
+ // The idea is to recycle the input buffer of packm (which is originally + // a DMA-buffer) to trigger a new DMA copy immediately after the end of + // packing and before the subsequent computation, since the packed submatrix + // has been written to another buffer: + // + // DDR -> SMEM -> SMEM + // 1. DMA : a_global -> a_dma + // 2. Packing : a_dma -> a_packed + // 3. DMA next block : a_global' -> a_dma + // 4. Computation : a_packed -> bli_gemm_int() ... + obj_t* a_dma; // 1st argument handed to bli_dma_get() after packing (callers store the global partition here) + obj_t* p_dma; // 2nd argument handed to bli_dma_get() (callers store the DMA-side object here) + mem_t* mem_p_dma; // mem_t handed to bli_dma_get() together with p_dma + dma_event_t* event_dma; // event handed to bli_dma_get(); callers wait on it before computing +#endif // BLIS_ENABLE_DMA + }; typedef struct packm_params_s packm_params_t; @@ -87,6 +106,48 @@ BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } +#ifdef BLIS_ENABLE_DMA // Getters for the DMA-prefetch fields stored in the node's packm_params_t. +BLIS_INLINE obj_t* bli_cntl_packm_params_a_dma( cntl_t* cntl ) +{ + packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->a_dma; +} + +BLIS_INLINE obj_t* bli_cntl_packm_params_p_dma( cntl_t* cntl ) +{ + packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->p_dma; +} + +BLIS_INLINE mem_t* bli_cntl_packm_params_mem_p_dma( cntl_t* cntl ) +{ + packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->mem_p_dma; +} + +BLIS_INLINE dma_event_t* bli_cntl_packm_params_event_dma( cntl_t* cntl ) +{ + packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->event_dma; +} +// Setters for the same fields; note the argument order (value first, control-tree node second). +BLIS_INLINE void bli_cntl_packm_params_set_a_dma( obj_t* a_dma, cntl_t* cntl ) +{ + packm_params_t* ppp = ( packm_params_t* )cntl->params; ppp->a_dma = a_dma; +} + +BLIS_INLINE void bli_cntl_packm_params_set_p_dma( obj_t* p_dma, cntl_t* cntl ) +{ + packm_params_t* ppp = ( packm_params_t* )cntl->params; ppp->p_dma = p_dma; +} + +BLIS_INLINE void bli_cntl_packm_params_set_mem_p_dma( mem_t* mem_p_dma, cntl_t* cntl ) +{ + packm_params_t* ppp = ( packm_params_t* )cntl->params; ppp->mem_p_dma = mem_p_dma; +} + +BLIS_INLINE void 
bli_cntl_packm_params_set_event_dma( dma_event_t* event_dma, cntl_t* cntl ) +{ + packm_params_t* ppp = ( packm_params_t* )cntl->params; ppp->event_dma = event_dma; +} +#endif // BLIS_ENABLE_DMA + // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 58b658d1d8..7fef44f1bb 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -142,12 +142,50 @@ dim_t PASTEMAC0(opname) \ if ( bli_obj_root_is_herm_or_symm( a ) ) \ { \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + /* When DMA is enabled on herm_or_symm, and in the case where KC is + larger than the other dimension (MC or NC), the copied DMA panel + should be extended to cover the stored region, used to "symmetrize" + the unstored one. This extension requires implicit extra local + memory allocation that the developer might not anticipate. We call + this increase of KC "Upper-squarization". + + On some embedded platforms, the developer may not even be able to afford this + increase in memory footprint due to resource limitations, and + exceeding the HW capacity is likely to fail at execution. + In such a situation, we prefer to reduce KC to be equal to MC (or NC), + so that the DMA transfer need not be extended, as the mirror-region + is now entirely covered by the DMA panel. We call this + "Lower-squarization". + + We are aware that this choice is sub-optimal; the best solution + would be to compute a trade-off between KC and MC (or NC) so that + the DMA panel does not exceed the initially allocated buffer. + We call this "Mid-squarization". Contributions are welcome. + + Reference: Section 9.3.4. 
Special cases handling in + https://tel.archives-ouvertes.fr/tel-02426014/document + */ \ + if( bli_info_get_enable_dma() ) \ + { \ + dim_t mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + b_alg = bli_min( b_alg, mnc+1-mnr ); \ + b_max = bli_min( b_max, mnc+1-mnr ); \ + } \ +\ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ } \ else if ( bli_obj_root_is_herm_or_symm( b ) ) \ { \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + /* DMA: same reason as above. */ \ + if( bli_info_get_enable_dma() ) \ + { \ + dim_t mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \ + b_alg = bli_min( b_alg, mnc+1-mnr ); \ + b_max = bli_min( b_max, mnc+1-mnr ); \ + } \ +\ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ } \ @@ -232,6 +270,7 @@ dim_t PASTEMAC0(opname) \ num_t dt; \ blksz_t* bsize; \ dim_t mnr; \ + dim_t mnc; \ dim_t b_alg, b_max; \ dim_t b_use; \ \ @@ -258,9 +297,22 @@ dim_t PASTEMAC0(opname) \ multiple of MR if the triangular matrix is on the left, or NR if the triangular matrix is one the right. */ \ if ( bli_obj_root_is_triangular( a ) ) \ + { \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + } \ else \ + { \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ + mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_NC, cntx ); \ + } \ +\ + /* DMA: lower-squarization to minimize footprint */ \ + if( bli_info_get_enable_dma() ) \ + { \ + b_alg = bli_min( b_alg, mnc+1-mnr ); \ + b_max = bli_min( b_max, mnc+1-mnr ); \ + } \ \ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ @@ -321,6 +373,15 @@ dim_t PASTEMAC0(opname) \ matrix uses MR, since only left-side trsm micro-kernels are supported. 
*/ \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ +\ + /* DMA: lower-squarization to minimize footprint */ \ + if( bli_info_get_enable_dma() ) \ + { \ + dim_t mnc = bli_cntx_get_blksz_def_dt( dt, BLIS_MC, cntx ); \ + b_alg = bli_min( b_alg, mnc+1-mnr ); \ + b_max = bli_min( b_max, mnc+1-mnr ); \ + } \ +\ b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c index 48f55c3602..d8a20aaab6 100644 --- a/frame/3/bli_l3_packm.c +++ b/frame/3/bli_l3_packm.c @@ -75,14 +75,14 @@ void bli_l3_packm // Query the address of the mem_t entry within the control tree node. cntl_mem_p = bli_cntl_pack_mem( cntl ); + mem_t* local_mem_p; + mem_t local_mem_s; + // Check the mem_t field in the control tree. If it is unallocated, then // we need to acquire a block from the memory broker and broadcast it to // all threads in the chief's thread group. if ( bli_mem_is_unalloc( cntl_mem_p ) ) { - mem_t* local_mem_p; - mem_t local_mem_s; - if ( bli_thread_am_ochief( thread ) ) { #ifdef BLIS_ENABLE_MEM_TRACING @@ -110,9 +110,6 @@ void bli_l3_packm } else // ( bli_mem_is_alloc( cntl_mem_p ) ) { - mem_t* local_mem_p; - mem_t local_mem_s; - // If the mem_t entry in the control tree does NOT contain a NULL // buffer, then a block has already been acquired from the memory // broker and cached in the control tree. @@ -152,17 +149,11 @@ void bli_l3_packm // this thread's control tree node. *cntl_mem_p = *local_mem_p; } - else - { - // If the mem_t entry is already allocated and sufficiently large, - // then we use it as-is. No action is needed, because all threads - // will already have the cached values in their local control - // trees' mem_t entries, currently pointed to by cntl_mem_p. - - bli_thread_barrier( thread ); - } } + // Barrier so that all threads have read the content of local_mem_p, + // located in the stack of the chief thread. 
+ bli_thread_barrier( thread ); // Update the buffer address in x_pack to point to the buffer associated // with the mem_t entry acquired from the memory broker (now cached in @@ -183,5 +174,17 @@ void bli_l3_packm // Barrier so that packing is done before computation. bli_thread_barrier( thread ); + +#ifdef BLIS_ENABLE_DMA + // After packing, recycle the DMA buffer to prefetch next block + obj_t* a_dma = bli_cntl_packm_params_a_dma( cntl ); + obj_t* p_dma = bli_cntl_packm_params_p_dma( cntl ); + mem_t* mem_p_dma = bli_cntl_packm_params_mem_p_dma( cntl ); + dma_event_t* event_dma = bli_cntl_packm_params_event_dma( cntl ); + if ( a_dma && p_dma && mem_p_dma ) + { + bli_dma_get( a_dma, p_dma, mem_p_dma, event_dma, rntm, thread ); + } +#endif // BLIS_ENABLE_DMA } diff --git a/frame/3/gemm/bli_gemm_blk_var1_dma.c b/frame/3/gemm/bli_gemm_blk_var1_dma.c new file mode 100644 index 0000000000..88825777e8 --- /dev/null +++ b/frame/3/gemm/bli_gemm_blk_var1_dma.c @@ -0,0 +1,247 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_DMA +// DMA flavor of gemm blocked variant 1: partitions A and C along m, fetching A1 and triple-buffered C1 blocks through DMA buffers and putting each C1 back after its subproblem. +void bli_gemm_blk_var1_dma + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + dim_t i; + obj_t a1, c1; + dim_t my_start, my_end; + + // Determine the direction in which to partition (forwards or backwards). + dir_t direct = bli_l3_direct( a, b, c, cntl ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_m( a, b, c, cntl ); + + // Determine the current thread's subpartition range. 
+ bli_thread_range_mdim + ( + direct, thread, a, b, c, cntl, cntx, + &my_start, &my_end + ); + + // ======================================================================== + // DMA settings + // ======================================================================== + dim_t b_alg = 0; + dim_t b_alg_next = 0; + + // A-DMA + obj_t a1_dma; + dma_event_t event_a1_dma; + mem_t mem_a1_dma = BLIS_MEM_INITIALIZER; + + // Triple-buffering on C-DMA + // - one for computing + // - one for putting + // - one for getting + obj_t c1_dma [3]; + dma_event_t event_c1_dma[3]; + mem_t mem_c1_dma [3] = { BLIS_MEM_INITIALIZER, BLIS_MEM_INITIALIZER, BLIS_MEM_INITIALIZER }; + // Track if a put is outstanding on any slot, to avoid calling wait twice + // on any slot + bool putting_c1_dma[3] = { FALSE }; + + dim_t c1_counter; + obj_t c1_next; + + // Initialize mem_t for A-DMA and C-DMA + bli_mem_set_buf_type( BLIS_BUFFER_FOR_A_BLOCK, &mem_a1_dma ); + + bli_mem_set_buf_type( BLIS_BUFFER_FOR_C_PANEL, &mem_c1_dma[0] ); + bli_mem_set_buf_type( BLIS_BUFFER_FOR_C_PANEL, &mem_c1_dma[1] ); + bli_mem_set_buf_type( BLIS_BUFFER_FOR_C_PANEL, &mem_c1_dma[2] ); + + // Setup next A-DMA prefetch to the packm subnode + bli_cntl_packm_params_set_a_dma( &a1, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_p_dma( &a1_dma, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_mem_p_dma( &mem_a1_dma, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_event_dma( &event_a1_dma, bli_cntl_sub_node( cntl ) ); + + // ======================================================================== + // PROLOG DMA: Get the first panel A and block C + // ======================================================================== + i = my_start; + b_alg_next = bli_determine_blocksize( direct, i, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + #ifdef BLIS_DMA_DEBUG + fprintf( stdout, "\n" ); + fprintf( stdout, " %s(): b_alg %d b_alg_next %d\n", __FUNCTION__, (int)b_alg, (int)b_alg_next ); + #endif // BLIS_DMA_DEBUG + // NOTE(review): if my_start >= my_end the loop never runs, so the two gets below are never waited on and the packm DMA params keep pointing at this frame's locals -- confirm this is safe. + // Acquire partitions for A1 and C1. 
+ bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg_next, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg_next, c, &c1_next ); + + bli_dma_get( &a1, &a1_dma, &mem_a1_dma, &event_a1_dma, + rntm, bli_thrinfo_sub_node( thread ) ); + bli_dma_get( &c1_next, &c1_dma[0], &mem_c1_dma[0], &event_c1_dma[0], + rntm, bli_thrinfo_sub_node( thread ) ); + + // ======================================================================== + // Loop: Partition along the m dimension. + // ======================================================================== + // Partition along the m dimension. + c1_counter = 0; + for ( i = my_start; i < my_end; i += b_alg ) + { + // Update current b_alg with b_alg_next of the previous iteration + b_alg = b_alg_next; + + // Update c1 with the c1_next of the previous iteration + c1 = c1_next; + + // Determine current ic slot + dim_t ic = c1_counter % 3; + + // Increment counter of number of C blocks + ++c1_counter; + + // Determine next ic slot + dim_t ic_next = c1_counter % 3; + + // Determine the next algorithmic blocksize. + b_alg_next = bli_determine_blocksize( direct, i+b_alg, my_end, a, + bli_cntl_bszid( cntl ), cntx ); + + #ifdef BLIS_DMA_DEBUG + fprintf( stdout, " %s(): b_alg %d b_alg_next %d\n", __FUNCTION__, (int)b_alg, (int)b_alg_next ); + #endif // BLIS_DMA_DEBUG + + // DMA: get next block (if any) + if ( b_alg_next > 0 ) + { + // Sanity: Before triggering get on slot c1[ic_next], we must + // wait for its previous put (if any) to finish. This prevents the + // DMA-get from overriding the "being put" data on the same slot. + // This wait is needed from the 3rd iteration (i.e c1_counter >= 3, + // or putting_c1_dma[ic_next] is TRUE). + if ( putting_c1_dma[ic_next] ) + { + bli_dma_wait( &event_c1_dma[ic_next], bli_thrinfo_sub_node( thread ) ); + putting_c1_dma[ic_next] = FALSE; + } + + // Acquire next partitions for A1 and C1. 
+ bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i+b_alg, b_alg_next, a, &a1 ); + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i+b_alg, b_alg_next, c, &c1_next ); + + // get next C + bli_dma_get( &c1_next, &c1_dma[ic_next], &mem_c1_dma[ic_next], + &event_c1_dma[ic_next], rntm, bli_thrinfo_sub_node( thread ) ); + } + else + { + // If no more block, stop DMA-prefetching of A after packm by setting + // obj_t* and mem_t* to NULL + bli_cntl_packm_params_set_a_dma( NULL, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_p_dma( NULL, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_mem_p_dma( NULL, bli_cntl_sub_node( cntl ) ); + } + + // DMA: wait for arrival of current partitions A1 and C1 + bli_dma_wait( &event_a1_dma , bli_thrinfo_sub_node( thread ) ); + bli_dma_wait( &event_c1_dma[ic], bli_thrinfo_sub_node( thread ) ); + + // Perform gemm subproblem. + bli_gemm_int + ( + &BLIS_ONE, + &a1_dma, + b, + &BLIS_ONE, + &c1_dma[ic], + cntx, + rntm, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + // DMA: put C to global memory + bli_dma_put( &c1, &c1_dma[ic], &event_c1_dma[ic], bli_thrinfo_sub_node( thread ) ); + putting_c1_dma[ic] = TRUE; + } + + // ======================================================================== + // EPILOG DMA: Wait for put C + // ======================================================================== + for( dim_t ic = 0; ic < 3; ++ic ) + { + if ( putting_c1_dma[ic] ) + { + bli_dma_wait( &event_c1_dma[ic], bli_thrinfo_sub_node( thread ) ); + putting_c1_dma[ic] = FALSE; + } + } + + // ======================================================================== + // Release DMA buffer of A and C at the end + // ======================================================================== + if ( bli_thread_am_ochief( bli_thrinfo_sub_node( thread ) ) ) + { + // release A-DMA + if ( bli_mem_is_alloc( &mem_a1_dma ) ) + { + bli_pba_release( rntm, &mem_a1_dma ); + } + + // release C-DMA + for( dim_t ic = 0; ic < 3; ++ic 
) + { + if ( bli_mem_is_alloc( &mem_c1_dma[ic] ) ) + { + bli_pba_release( rntm, &mem_c1_dma[ic] ); + } + } + } +} + +#endif // BLIS_ENABLE_DMA diff --git a/frame/3/gemm/bli_gemm_blk_var3_dma.c b/frame/3/gemm/bli_gemm_blk_var3_dma.c new file mode 100644 index 0000000000..d93b5cc6f8 --- /dev/null +++ b/frame/3/gemm/bli_gemm_blk_var3_dma.c @@ -0,0 +1,192 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_DMA +// DMA flavor of gemm blocked variant 3: partitions A and B along k, reading each B1 panel from a DMA buffer that is first filled in the prolog. +void bli_gemm_blk_var3_dma + ( + obj_t* a, + obj_t* b, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + obj_t a1, b1; + + // Determine the direction in which to partition (forwards or backwards). + dir_t direct = bli_l3_direct( a, b, c, cntl ); + + // Prune any zero region that exists along the partitioning dimension. + bli_l3_prune_unref_mparts_k( a, b, c, cntl ); + + // Query dimension in partitioning direction. + dim_t k_trans = bli_obj_width_after_trans( a ); + + // ======================================================================== + // DMA settings + // ======================================================================== + obj_t b1_dma; + dim_t b_alg, b_alg_next; + + // Event DMA + dma_event_t event_b1_dma; + + // Initialize mem_t for b1_dma + mem_t mem_b1_dma = BLIS_MEM_INITIALIZER; + bli_mem_set_buf_type( BLIS_BUFFER_FOR_B_PANEL, &mem_b1_dma ); + + // Setup next DMA prefetch to the packm subnode + bli_cntl_packm_params_set_a_dma( &b1, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_p_dma( &b1_dma, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_mem_p_dma( &mem_b1_dma, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_event_dma( &event_b1_dma, bli_cntl_sub_node( cntl ) ); + + // ======================================================================== + // PROLOG DMA: Get the first panel B + // ======================================================================== + // Acquire the first partition for B1. 
+ b_alg = 0; + b_alg_next = bli_l3_determine_kc( direct, 0 /*i*/, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx, cntl ); + + #ifdef BLIS_DMA_DEBUG + fprintf( stdout, "\n" ); + fprintf( stdout, "%s(): k_trans %d b_alg %d b_alg_next %d\n", + __FUNCTION__, (int)k_trans, (int)b_alg, (int)b_alg_next ); + #endif // BLIS_DMA_DEBUG + + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + 0 /*i*/, b_alg_next, b, &b1 ); + + // Get first B panel + bli_dma_get( &b1, &b1_dma, &mem_b1_dma, &event_b1_dma, + rntm, bli_thrinfo_sub_node( thread ) ); + + // ======================================================================== + // Loop: Partition along the k dimension. + // ======================================================================== + for ( dim_t i = 0; i < k_trans; i += b_alg ) + { + // Update current b_alg with b_alg_next of the previous iteration + b_alg = b_alg_next; + + // Determine the next algorithmic blocksize. + b_alg_next = bli_l3_determine_kc( direct, i+b_alg, k_trans, a, b, + bli_cntl_bszid( cntl ), cntx, cntl ); + + #ifdef BLIS_DMA_DEBUG + fprintf( stdout, "%s(): k_trans %d b_alg %d b_alg_next %d\n", + __FUNCTION__, (int)k_trans, (int)b_alg, (int)b_alg_next ); + #endif // BLIS_DMA_DEBUG + + // DMA: prepare to get next block (if any) + if ( b_alg_next > 0 ) + { + // Acquire next partitions for B1. + bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, + i+b_alg, b_alg_next, b, &b1 ); + } + else + { + // If no more block, stop DMA-prefetching after packm by setting + // obj_t* and mem_t* to NULL + bli_cntl_packm_params_set_a_dma( NULL, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_p_dma( NULL, bli_cntl_sub_node( cntl ) ); + bli_cntl_packm_params_set_mem_p_dma( NULL, bli_cntl_sub_node( cntl ) ); + } + + // Acquire current partitions for A1. + bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, + i, b_alg, a, &a1 ); + + // DMA: wait for arrival of current partitions B1 + bli_dma_wait( &event_b1_dma, bli_thrinfo_sub_node( thread ) ); + + // Perform gemm subproblem. 
+ bli_gemm_int + ( + &BLIS_ONE, + &a1, + &b1_dma, + &BLIS_ONE, + c, + cntx, + rntm, + bli_cntl_sub_node( cntl ), + bli_thrinfo_sub_node( thread ) + ); + + bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); + + // This variant executes multiple rank-k updates. Therefore, if the + // internal beta scalar on matrix C is non-zero, we must use it + // only for the first iteration (and then BLIS_ONE for all others). + // And since c is a locally aliased obj_t (see _int() function), we + // can simply overwrite the internal beta scalar with BLIS_ONE once + // it has been used in the first iteration. However... + + // Unlike variant 3 of gemm and herk, which reset the internal scalar + // on C at the end of the first iteration so that subsequent iterations + // do not erroneously apply beta more than once, it is important that + // this behavior not be applied to trmm. That is because the order of + // computation is always such that the beta that is passed into the + // macro-kernel must be zero, since the macro-kernel only applies that + // beta to (and thus overwrites) the row-panel of C that corresponds to + // the current block intersecting the diagonal. It turns out that this + // same pattern holds for trmm3 as well--except there, the beta scalar + // is potentially non-zero, but is still applied only to the current + // row-panel of C, and thus beta is applied to all of C exactly once. + // Thus, for neither trmm nor trmm3 should we reset the scalar on C + // after the first iteration. 
+ if ( bli_cntl_family( cntl ) != BLIS_TRMM ) + if ( i == 0 ) bli_obj_scalar_reset( c ); + } + + // ======================================================================== + // Release DMA buffer of B1 at the end + // ======================================================================== + if ( bli_thread_am_ochief( bli_thrinfo_sub_node( thread ) ) ) + { + if ( bli_mem_is_alloc( &mem_b1_dma ) ) { + bli_pba_release( rntm, &mem_b1_dma ); + } + } +} + +#endif // BLIS_ENABLE_DMA diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index d7cd0a92ce..8bfaa5e7a0 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -111,7 +111,11 @@ cntl_t* bli_gemmbp_cntl_create rntm, family, BLIS_MC, +#ifdef BLIS_ENABLE_DMA + bli_gemm_blk_var1_dma, +#else bli_gemm_blk_var1, +#endif // BLIS_ENABLE_DMA gemm_cntl_packa ); @@ -137,7 +141,11 @@ cntl_t* bli_gemmbp_cntl_create rntm, family, BLIS_KC, +#ifdef BLIS_ENABLE_DMA + bli_gemm_blk_var3_dma, +#else bli_gemm_blk_var3, +#endif // BLIS_ENABLE_DMA gemm_cntl_packb ); diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index b08271e9b9..4bb14bd10e 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -58,6 +58,11 @@ GENPROT( gemm_blk_var3 ) GENPROT( gemm_packa ) GENPROT( gemm_packb ) +#ifdef BLIS_ENABLE_DMA +GENPROT( gemm_blk_var1_dma ) +GENPROT( gemm_blk_var3_dma ) +#endif // BLIS_ENABLE_DMA + GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) diff --git a/frame/3/trsm/bli_trsm_blk_var1_dma.c b/frame/3/trsm/bli_trsm_blk_var1_dma.c new file mode 100644 index 0000000000..48d72e8dbb --- /dev/null +++ b/frame/3/trsm/bli_trsm_blk_var1_dma.c @@ -0,0 +1,439 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_DMA
+
+//#define PRINT
+
+// -----------------------------------------------------------------------------
+// bli_trsm_blk_var1_dma()
+//
+// DMA-enabled trsm block variant 1: partitions the problem along the m
+// dimension and stages every block of A and C through DMA buffers.
+//
+//   Step 1  iterates over the diagonal block A11 and its row panel C1 and
+//           runs the trsm subproblem with the *prenode* of cntl/thread.
+//   Step 2  iterates over the remaining part of the column panel of A
+//           (Ax1/Cx1), restricted to this thread's range, and runs the
+//           rank-k update with the *sub-node* of cntl/thread.
+//
+// In both steps a prolog fetches the first block of A and C. Inside the loop
+// the next block of C is fetched into the next of three C slots while the
+// current slot is computed on and then put back; the packm node is handed
+// pointers (a_dma, p_dma, mem_p_dma, event_dma) for the prefetch of the next
+// block of A. An epilog waits for every outstanding put.
+//
+// Parameters:
+//   a, b, c  operand objects; a and c are partitioned here, b is forwarded
+//            unchanged to bli_trsm_int().
+//   cntx     context, used for blocksize queries and forwarded downwards.
+//   rntm     runtime, forwarded to bli_dma_get(), bli_trsm_int() and
+//            bli_pba_release().
+//   cntl     control tree node; its prenode and sub-node drive steps 1 and 2.
+//   thread   thrinfo node; used for the barrier between the steps and,
+//            through its prenode/sub-node, for every DMA call.
+// -----------------------------------------------------------------------------
+void bli_trsm_blk_var1_dma
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+    dim_t i;
+    dim_t my_start, my_end;
+    dim_t b_alg;
+    const dim_t kc = bli_obj_width( a );
+
+    // Determine the direction in which to partition (forwards or backwards).
+    dir_t direct = bli_l3_direct( a, b, c, cntl );
+
+    // Prune any zero region that exists along the partitioning dimension.
+    bli_l3_prune_unref_mparts_m( a, b, c, cntl );
+
+    // ========================================================================
+    // DMA settings
+    // ========================================================================
+    dim_t b_alg_next = 0;
+
+    // A-DMA
+    obj_t a1_dma;
+    dma_event_t event_a1_dma;
+    mem_t mem_a1_dma = BLIS_MEM_INITIALIZER;
+
+    // Triple-buffering on C-DMA
+    // - one for computing
+    // - one for putting
+    // - one for getting
+    obj_t c1_dma [3];
+    dma_event_t event_c1_dma[3];
+    mem_t mem_c1_dma [3] = { BLIS_MEM_INITIALIZER };
+
+    // Initialize mem_t for A-DMA and C-DMA
+    bli_mem_set_buf_type( BLIS_BUFFER_FOR_A_BLOCK, &mem_a1_dma );
+
+    bli_mem_set_buf_type( BLIS_BUFFER_FOR_C_PANEL, &mem_c1_dma[0] );
+    bli_mem_set_buf_type( BLIS_BUFFER_FOR_C_PANEL, &mem_c1_dma[1] );
+    bli_mem_set_buf_type( BLIS_BUFFER_FOR_C_PANEL, &mem_c1_dma[2] );
+
+
+    // ========================================================================
+    // Step 1:
+    // Isolate the diagonal block A11 and its corresponding row panel C1.
+    // ========================================================================
+    {
+        obj_t a11, c1;
+        obj_t a11_1, c1_1;
+        obj_t c1_1_next;
+
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+                                0, kc, a, &a11 );
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+                                0, kc, c, &c1 );
+
+        // All threads iterate over the entire diagonal block A11.
+        my_start = 0; my_end = kc;
+
+        #ifdef PRINT
+        printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n",
+                (int)bli_obj_length( &a11 ), (int)bli_obj_width( &a11 ),
+                (int)bli_obj_row_off( &a11 ), (int)bli_obj_col_off( &a11 ) );
+        printf( "bli_trsm_blk_var1(): entering trsm subproblem loop.\n" );
+        #endif
+
+        // Track if a put is outstanding on any slot, to avoid calling wait twice
+        // on any slot
+        bool putting_c1_dma[3] = { FALSE };
+        dim_t c1_counter;
+
+        // Setup next A-DMA prefetch to the packm subnode
+        bli_cntl_packm_params_set_a_dma( &a11_1, bli_cntl_sub_prenode( cntl ) );
+        bli_cntl_packm_params_set_p_dma( &a1_dma, bli_cntl_sub_prenode( cntl ) );
+        bli_cntl_packm_params_set_mem_p_dma( &mem_a1_dma, bli_cntl_sub_prenode( cntl ) );
+        bli_cntl_packm_params_set_event_dma( &event_a1_dma, bli_cntl_sub_prenode( cntl ) );
+
+        // PROLOG DMA: Get the first panel A and block C
+        // The blocksize is queried on a11, the same object the loop below
+        // uses, so that the first block is sized like all the others.
+        i = my_start;
+        b_alg = 0;
+        b_alg_next = bli_determine_blocksize( direct, i, my_end, &a11,
+                                              bli_cntl_bszid( cntl ), cntx );
+
+        #ifdef BLIS_DMA_DEBUG
+        // dim_t is not guaranteed to be int: cast explicitly for "%d", as the
+        // PRINT statements of this function already do.
+        fprintf( stdout, "\n" );
+        fprintf( stdout, " %s(): b_alg %d b_alg_next %d\n", __FUNCTION__, (int)b_alg, (int)b_alg_next );
+        #endif // BLIS_DMA_DEBUG
+
+        // Acquire partitions for A1 and C1.
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg_next, &a11, &a11_1 );
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg_next, &c1, &c1_1_next );
+
+        // Only start the prolog transfers when there is a first block. With an
+        // empty range the loop below never runs, so nothing would ever wait on
+        // these events.
+        // NOTE(review): assumes bli_dma_get() has no side effect that is
+        // required for an empty range - confirm against bli_dma_oapi.
+        if ( b_alg_next > 0 )
+        {
+            bli_dma_get( &a11_1, &a1_dma, &mem_a1_dma, &event_a1_dma,
+                         rntm, bli_thrinfo_sub_prenode( thread ) );
+            bli_dma_get( &c1_1_next, &c1_dma[0], &mem_c1_dma[0], &event_c1_dma[0],
+                         rntm, bli_thrinfo_sub_prenode( thread ) );
+        }
+
+        // Partition along the m dimension for the trsm subproblem.
+        c1_counter = 0;
+        for ( i = my_start; i < my_end; i += b_alg )
+        {
+            #ifdef PRINT
+            printf( "bli_trsm_blk_var1(): a11_1 is %d x %d at offsets (%3d, %3d)\n",
+                    (int)bli_obj_length( &a11_1 ), (int)bli_obj_width( &a11_1 ),
+                    (int)bli_obj_row_off( &a11_1 ), (int)bli_obj_col_off( &a11_1 ) );
+            #endif
+
+            // Update current b_alg with b_alg_next of the previous iteration
+            b_alg = b_alg_next;
+
+            // Update c1_1 with the c1_1_next of the previous iteration
+            c1_1 = c1_1_next;
+
+            // Determine current ic slot
+            dim_t ic = c1_counter % 3;
+
+            // Increment counter of number of C blocks
+            ++c1_counter;
+
+            // Determine next ic slot
+            dim_t ic_next = c1_counter % 3;
+
+            // Determine the next algorithmic blocksize.
+            b_alg_next = bli_determine_blocksize( direct, i+b_alg, my_end, &a11,
+                                                  bli_cntl_bszid( cntl ), cntx );
+
+            #ifdef BLIS_DMA_DEBUG
+            fprintf( stdout, " %s(): b_alg %d b_alg_next %d\n", __FUNCTION__, (int)b_alg, (int)b_alg_next );
+            #endif // BLIS_DMA_DEBUG
+
+            // DMA: get next block (if any)
+            if ( b_alg_next > 0 )
+            {
+                // Sanity: Before triggering get on slot c1[ic_next], we must
+                // wait for its previous put (if any) to finish. This prevents the
+                // DMA-get from overriding the "being put" data on the same slot.
+                // This wait is needed from the 3rd iteration (i.e c1_counter >= 3,
+                // or putting_c1_dma[ic_next] is TRUE).
+                if ( putting_c1_dma[ic_next] )
+                {
+                    bli_dma_wait( &event_c1_dma[ic_next], bli_thrinfo_sub_prenode( thread ) );
+                    putting_c1_dma[ic_next] = FALSE;
+                }
+
+                // Acquire next partitions for A1 and C1.
+                bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+                                        i+b_alg, b_alg_next, &a11, &a11_1 );
+                bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+                                        i+b_alg, b_alg_next, &c1, &c1_1_next );
+
+                // get next C
+                bli_dma_get( &c1_1_next, &c1_dma[ic_next], &mem_c1_dma[ic_next],
+                             &event_c1_dma[ic_next], rntm, bli_thrinfo_sub_prenode( thread ) );
+            }
+            else
+            {
+                // If no more block, stop DMA-prefetching of A after packm by setting
+                // obj_t* and mem_t* to NULL
+                bli_cntl_packm_params_set_a_dma( NULL, bli_cntl_sub_prenode( cntl ) );
+                bli_cntl_packm_params_set_p_dma( NULL, bli_cntl_sub_prenode( cntl ) );
+                bli_cntl_packm_params_set_mem_p_dma( NULL, bli_cntl_sub_prenode( cntl ) );
+            }
+
+            // DMA: wait for arrival of current partitions A1 and C1
+            bli_dma_wait( &event_a1_dma    , bli_thrinfo_sub_prenode( thread ) );
+            bli_dma_wait( &event_c1_dma[ic], bli_thrinfo_sub_prenode( thread ) );
+
+            // Perform trsm subproblem.
+            bli_trsm_int
+            (
+              &BLIS_ONE,
+              &a1_dma,
+              b,
+              &BLIS_ONE,
+              &c1_dma[ic],
+              cntx,
+              rntm,
+              bli_cntl_sub_prenode( cntl ),
+              bli_thrinfo_sub_prenode( thread )
+            );
+
+            // DMA: put C to global memory
+            bli_dma_put( &c1_1, &c1_dma[ic], &event_c1_dma[ic], bli_thrinfo_sub_prenode( thread ) );
+            putting_c1_dma[ic] = TRUE;
+        }
+
+        // EPILOG DMA: Wait for put C
+        for( dim_t ic = 0; ic < 3; ++ic )
+        {
+            if ( putting_c1_dma[ic] )
+            {
+                bli_dma_wait( &event_c1_dma[ic], bli_thrinfo_sub_prenode( thread ) );
+                putting_c1_dma[ic] = FALSE;
+            }
+        }
+
+        #ifdef PRINT
+        printf( "bli_trsm_blk_var1(): finishing trsm subproblem loop.\n" );
+        #endif
+    } // Step 1: Isolate the diagonal block A11 and its corresponding row panel C1.
+
+
+    // ========================================================================
+    // We must execute a barrier here because the upcoming rank-k update
+    // requires the packed matrix B to be fully updated by the trsm
+    // subproblem.
+    // ========================================================================
+    bli_thread_barrier( thread );
+
+
+    // ========================================================================
+    // Step 2:
+    // Isolate the remaining part of the column panel matrix A, which we do by
+    // acquiring the subpartition ahead of A11 (that is, A21 or A01, depending
+    // on whether we are moving forwards or backwards, respectively).
+    // ========================================================================
+    {
+        obj_t ax1, cx1;
+        obj_t a11, c1;
+        obj_t c1_next;
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A,
+                                0, kc, a, &ax1 );
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A,
+                                0, kc, c, &cx1 );
+
+        #ifdef PRINT
+        printf( "bli_trsm_blk_var1(): ax1 is %d x %d at offsets (%3d, %3d)\n",
+                (int)bli_obj_length( &ax1 ), (int)bli_obj_width( &ax1 ),
+                (int)bli_obj_row_off( &ax1 ), (int)bli_obj_col_off( &ax1 ) );
+        #endif
+
+        // Determine the current thread's subpartition range for the gemm
+        // subproblem over Ax1.
+        bli_thread_range_mdim
+        (
+          direct, thread, &ax1, b, &cx1, cntl, cntx,
+          &my_start, &my_end
+        );
+
+        #ifdef PRINT
+        printf( "bli_trsm_blk_var1(): entering gemm subproblem loop (%d->%d).\n", (int)my_start, (int)my_end );
+        #endif
+
+        // Track if a put is outstanding on any slot, to avoid calling wait twice
+        // on any slot
+        bool putting_c1_dma[3] = { FALSE };
+        dim_t c1_counter;
+
+        // Setup next A-DMA prefetch to the packm subnode
+        bli_cntl_packm_params_set_a_dma( &a11, bli_cntl_sub_node( cntl ) );
+        bli_cntl_packm_params_set_p_dma( &a1_dma, bli_cntl_sub_node( cntl ) );
+        bli_cntl_packm_params_set_mem_p_dma( &mem_a1_dma, bli_cntl_sub_node( cntl ) );
+        bli_cntl_packm_params_set_event_dma( &event_a1_dma, bli_cntl_sub_node( cntl ) );
+
+        // PROLOG DMA: Get the first panel A and block C
+        // The blocksize is queried on ax1, the same object the loop below
+        // uses, so that the first block is sized like all the others.
+        i = my_start;
+        b_alg = 0;
+        b_alg_next = bli_determine_blocksize( direct, i, my_end, &ax1,
+                                              bli_cntl_bszid( cntl ), cntx );
+
+        #ifdef BLIS_DMA_DEBUG
+        fprintf( stdout, "\n" );
+        fprintf( stdout, " %s(): b_alg %d b_alg_next %d\n", __FUNCTION__, (int)b_alg, (int)b_alg_next );
+        #endif // BLIS_DMA_DEBUG
+
+        // Acquire partitions for A1 and C1.
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg_next, &ax1, &a11 );
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg_next, &cx1, &c1_next );
+
+        // Only start the prolog transfers when this thread's range is not
+        // empty (my_start == my_end gives b_alg_next == 0). With an empty
+        // range the loop below never runs, so nothing would ever wait on
+        // these events.
+        // NOTE(review): assumes bli_dma_get() has no side effect that is
+        // required for an empty range - confirm against bli_dma_oapi.
+        if ( b_alg_next > 0 )
+        {
+            bli_dma_get( &a11, &a1_dma, &mem_a1_dma, &event_a1_dma,
+                         rntm, bli_thrinfo_sub_node( thread ) );
+            bli_dma_get( &c1_next, &c1_dma[0], &mem_c1_dma[0], &event_c1_dma[0],
+                         rntm, bli_thrinfo_sub_node( thread ) );
+        }
+
+        // Partition along the m dimension for the gemm subproblem.
+        c1_counter = 0;
+        for ( i = my_start; i < my_end; i += b_alg )
+        {
+            // Update current b_alg with b_alg_next of the previous iteration
+            b_alg = b_alg_next;
+
+            // Update c1 with the c1_next of the previous iteration
+            c1 = c1_next;
+
+            // Determine current ic slot
+            dim_t ic = c1_counter % 3;
+
+            // Increment counter of number of C blocks
+            ++c1_counter;
+
+            // Determine next ic slot
+            dim_t ic_next = c1_counter % 3;
+
+            // Determine the next algorithmic blocksize.
+            b_alg_next = bli_determine_blocksize( direct, i+b_alg, my_end, &ax1,
+                                                  bli_cntl_bszid( cntl ), cntx );
+
+            // DMA: get next block (if any)
+            if ( b_alg_next > 0 )
+            {
+                // Sanity: Before triggering get on slot c1[ic_next], we must
+                // wait for its previous put (if any) to finish. This prevents the
+                // DMA-get from overriding the "being put" data on the same slot.
+                // This wait is needed from the 3rd iteration (i.e c1_counter >= 3,
+                // or putting_c1_dma[ic_next] is TRUE).
+                if ( putting_c1_dma[ic_next] )
+                {
+                    bli_dma_wait( &event_c1_dma[ic_next], bli_thrinfo_sub_node( thread ) );
+                    putting_c1_dma[ic_next] = FALSE;
+                }
+
+                // Acquire next partitions for A1 and C1.
+                bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+                                        i+b_alg, b_alg_next, &ax1, &a11 );
+                bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+                                        i+b_alg, b_alg_next, &cx1, &c1_next );
+
+                // get next C
+                bli_dma_get( &c1_next, &c1_dma[ic_next], &mem_c1_dma[ic_next],
+                             &event_c1_dma[ic_next], rntm, bli_thrinfo_sub_node( thread ) );
+            }
+            else
+            {
+                // If no more block, stop DMA-prefetching of A after packm by setting
+                // obj_t* and mem_t* to NULL
+                bli_cntl_packm_params_set_a_dma( NULL, bli_cntl_sub_node( cntl ) );
+                bli_cntl_packm_params_set_p_dma( NULL, bli_cntl_sub_node( cntl ) );
+                bli_cntl_packm_params_set_mem_p_dma( NULL, bli_cntl_sub_node( cntl ) );
+            }
+
+            // DMA: wait for arrival of current partitions A1 and C1
+            bli_dma_wait( &event_a1_dma    , bli_thrinfo_sub_node( thread ) );
+            bli_dma_wait( &event_c1_dma[ic], bli_thrinfo_sub_node( thread ) );
+
+            // Perform gemm (rank-k update) subproblem, through the same
+            // bli_trsm_int() entry point that step 1 uses.
+            bli_trsm_int
+            (
+              &BLIS_ONE,
+              &a1_dma,
+              b,
+              &BLIS_ONE,
+              &c1_dma[ic],
+              cntx,
+              rntm,
+              bli_cntl_sub_node( cntl ),
+              bli_thrinfo_sub_node( thread )
+            );
+
+            // DMA: put C to global memory
+            bli_dma_put( &c1, &c1_dma[ic], &event_c1_dma[ic], bli_thrinfo_sub_node( thread ) );
+            putting_c1_dma[ic] = TRUE;
+        }
+
+        // EPILOG DMA: Wait for put C
+        for( dim_t ic = 0; ic < 3; ++ic )
+        {
+            if ( putting_c1_dma[ic] )
+            {
+                bli_dma_wait( &event_c1_dma[ic], bli_thrinfo_sub_node( thread ) );
+                putting_c1_dma[ic] = FALSE;
+            }
+        }
+
+        #ifdef PRINT
+        printf( "bli_trsm_blk_var1(): finishing gemm subproblem loop.\n" );
+        #endif
+    } // Step 2: Isolate the remaining part of the column panel matrix A
+
+
+    // ========================================================================
+    // Release DMA buffer of A and C at the end
+    // ========================================================================
+    if ( bli_thread_am_ochief( bli_thrinfo_sub_node( thread ) ) )
+    {
+        // release A-DMA
+        if ( bli_mem_is_alloc( &mem_a1_dma ) )
+        {
+            bli_pba_release( rntm, &mem_a1_dma );
+        }
+
+        // release C-DMA
+        for( dim_t ic = 0; ic < 3; ++ic )
+        {
+            if ( bli_mem_is_alloc( &mem_c1_dma[ic] ) )
+            {
+                bli_pba_release( rntm, &mem_c1_dma[ic] );
+            }
+        }
+    }
+}
+
+#endif // BLIS_ENABLE_DMA
diff --git a/frame/3/trsm/bli_trsm_blk_var3_dma.c b/frame/3/trsm/bli_trsm_blk_var3_dma.c
new file mode 100644
index 0000000000..886b84a24b
--- /dev/null
+++ b/frame/3/trsm/bli_trsm_blk_var3_dma.c
@@ -0,0 +1,134 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_DMA
+
+// -----------------------------------------------------------------------------
+// bli_trsm_blk_var3_dma()
+//
+// DMA-enabled trsm block variant 3: partitions the problem along the k
+// dimension. For every k-block, the matching partition B1 of b is copied into
+// a DMA buffer (b1_dma) with a blocking get (get immediately followed by
+// wait), and the subproblem is run on A1 and b1_dma through bli_trsm_int().
+//
+// Parameters:
+//   a, b, c  operand objects; a is partitioned along its n dimension and b
+//            along its m dimension, c is forwarded unchanged. The scalars
+//            attached to a, b and c are reset after the first iteration.
+//   cntx     context, used for the KC query and forwarded downwards.
+//   rntm     runtime, forwarded to bli_dma_get(), bli_trsm_int() and
+//            bli_pba_release().
+//   cntl     control tree node; its sub-node drives the subproblem.
+//   thread   thrinfo node; its sub-node is used for the DMA calls, the
+//            per-iteration barrier and the final buffer release.
+// -----------------------------------------------------------------------------
+void bli_trsm_blk_var3_dma
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       cntx_t* cntx,
+       rntm_t* rntm,
+       cntl_t* cntl,
+       thrinfo_t* thread
+     )
+{
+    obj_t a1, b1;
+    // b_alg is assigned at the top of every iteration, i.e. before the loop
+    // increment "i += b_alg" reads it.
+    dim_t b_alg;
+
+    // Determine the direction in which to partition (forwards or backwards).
+    dir_t direct = bli_l3_direct( a, b, c, cntl );
+
+    // Prune any zero region that exists along the partitioning dimension.
+    bli_l3_prune_unref_mparts_k( a, b, c, cntl );
+
+    // Query dimension in partitioning direction.
+    dim_t k_trans = bli_obj_width_after_trans( a );
+
+    // ========================================================================
+    // DMA settings
+    // ========================================================================
+    // Object handed to bli_trsm_int() in place of B1 (filled by bli_dma_get()).
+    obj_t b1_dma;
+
+    // Event DMA
+    dma_event_t event_b1_dma;
+
+    // Initialize mem_t for b1_dma
+    mem_t mem_b1_dma = BLIS_MEM_INITIALIZER;
+    bli_mem_set_buf_type( BLIS_BUFFER_FOR_B_PANEL, &mem_b1_dma );
+
+    // ========================================================================
+    // Loop: Partition along the k dimension.
+    // ========================================================================
+    for ( dim_t i = 0; i < k_trans; i += b_alg )
+    {
+        // Determine the current algorithmic blocksize.
+        b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b,
+                                       bli_cntl_bszid( cntl ), cntx );
+
+        // Acquire partitions for A1 and B1.
+        bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
+                                i, b_alg, a, &a1 );
+        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
+                                i, b_alg, b, &b1 );
+
+        // In trsm_l, we can not parallelize in the dimension M (i.e B panels are
+        // dependent on each other, so can not overlap them by DMA).
+        // We need to use blocking DMA transfer here (wait after get).
+        bli_dma_get( &b1, &b1_dma, &mem_b1_dma, &event_b1_dma,
+                     rntm, bli_thrinfo_sub_node( thread ) );
+        bli_dma_wait( &event_b1_dma, bli_thrinfo_sub_node( thread ) );
+
+        // Perform trsm subproblem.
+        bli_trsm_int
+        (
+          &BLIS_ONE,
+          &a1,
+          &b1_dma,
+          &BLIS_ONE,
+          c,
+          cntx,
+          rntm,
+          bli_cntl_sub_node( cntl ),
+          bli_thrinfo_sub_node( thread )
+        );
+
+        // Synchronize the sub-node's threads before the next iteration issues
+        // a new get with the same b1_dma / mem_b1_dma.
+        // NOTE(review): presumably this protects a buffer shared by those
+        // threads - confirm against bli_dma_get().
+        bli_thread_barrier( bli_thrinfo_sub_node( thread ) );
+
+        // This variant executes multiple rank-k updates. Therefore, if the
+        // internal alpha scalars on A/B and C are non-zero, we must ensure
+        // that they are only used in the first iteration.
+        if ( i == 0 )
+        {
+            bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b );
+            bli_obj_scalar_reset( c );
+        }
+    }
+
+    // ========================================================================
+    // Release DMA buffer of B1 at the end
+    // ========================================================================
+    // Only the chief thread of the sub-node releases, and only if a buffer
+    // was actually allocated (k_trans == 0 never calls bli_dma_get()).
+    if ( bli_thread_am_ochief( bli_thrinfo_sub_node( thread ) ) )
+    {
+        if ( bli_mem_is_alloc( &mem_b1_dma ) )
+        {
+            bli_pba_release( rntm, &mem_b1_dma );
+        }
+    }
+}
+
+#endif // BLIS_ENABLE_DMA
diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c
index 4a7a4de8fd..4bbdb67755 100644
--- a/frame/3/trsm/bli_trsm_cntl.c
+++ b/frame/3/trsm/bli_trsm_cntl.c
@@ -158,7 +158,11 @@ cntl_t* bli_trsm_l_cntl_create
 	  rntm,
 	  family,
 	  BLIS_MC,
+#ifdef BLIS_ENABLE_DMA
+	  bli_trsm_blk_var1_dma,
+#else
 	  bli_trsm_blk_var1,
+#endif // BLIS_ENABLE_DMA
 	  gemm_cntl_packa
 	);
 
@@ -189,7 +193,11 @@ cntl_t* bli_trsm_l_cntl_create
 	  rntm,
 	  family,
 	  BLIS_KC,
+#ifdef BLIS_ENABLE_DMA
+	  bli_trsm_blk_var3_dma,
+#else
 	  bli_trsm_blk_var3,
+#endif //
BLIS_ENABLE_DMA trsm_cntl_packb ); @@ -262,7 +270,11 @@ cntl_t* bli_trsm_r_cntl_create rntm, family, BLIS_MC, +#ifdef BLIS_ENABLE_DMA + bli_trsm_blk_var1_dma, +#else bli_trsm_blk_var1, +#endif // BLIS_ENABLE_DMA trsm_cntl_packa ); @@ -288,7 +300,11 @@ cntl_t* bli_trsm_r_cntl_create rntm, family, BLIS_KC, +#ifdef BLIS_ENABLE_DMA + bli_trsm_blk_var3_dma, +#else bli_trsm_blk_var3, +#endif // BLIS_ENABLE_DMA trsm_cntl_packb ); diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index de7c65936f..63a4905bd0 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -55,6 +55,12 @@ void PASTEMAC0(opname) \ GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) + +#ifdef BLIS_ENABLE_DMA +GENPROT( trsm_blk_var1_dma ) +GENPROT( trsm_blk_var3_dma ) +#endif // BLIS_ENABLE_DMA + GENPROT( trsm_packa ) GENPROT( trsm_packb ) diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 524653d743..63c69634d5 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -270,6 +270,38 @@ dim_t bli_determine_blocksize_f b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); +#ifdef BLIS_ENABLE_DMA + // When DMA is enabled on herm_or_symm, blocksize in MC and NC dimension + // must not exceed KC, because we are currently doing Lower-squarization. + // This is complementary to the further partitioning in KC dimension + // (bli_l3_blocksize.c), in which KC will be also aligned to MC/NC. + // This squarization of subpartitions is only needed on herm_or_symm, + // with DMA enabled. 
+ if ( bli_obj_root_is_herm_or_symm( obj ) ) + { + dim_t kc = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + + // Ensure b_alg to be smaller or equal to KC, and also multiple of MR/NR + dim_t mnr = 0; + if ( bszid == BLIS_MC ) + { + mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + } + else if ( bszid == BLIS_NC ) + { + mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); + } + else + { + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + } + b_alg = bli_min( b_alg, kc+1-mnr ); + b_max = bli_min( b_max, kc+1-mnr ); + b_alg = bli_align_dim_to_mult( b_alg, mnr ); + b_max = bli_align_dim_to_mult( b_max, mnr ); + } +#endif // BLIS_ENABLE_DMA + b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); return b_use; @@ -296,6 +328,38 @@ dim_t bli_determine_blocksize_b b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); +#ifdef BLIS_ENABLE_DMA + // When DMA is enabled on herm_or_symm, blocksize in MC and NC dimension + // must not exceed KC, because we are currently doing Lower-squarization. + // This is complementary to the further partitioning in KC dimension + // (bli_l3_blocksize.c), in which KC will be also aligned to MC/NC. + // This squarization of subpartitions is only needed on herm_or_symm, + // with DMA enabled. 
+ if ( bli_obj_root_is_herm_or_symm( obj ) ) + { + dim_t kc = bli_cntx_get_blksz_def_dt( dt, BLIS_KC, cntx ); + + // Ensure b_alg to be smaller or equal to KC, and also multiple of MR/NR + dim_t mnr = 0; + if ( bszid == BLIS_MC ) + { + mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); + } + else if ( bszid == BLIS_NC ) + { + mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); + } + else + { + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); + } + b_alg = bli_min( b_alg, kc+1-mnr ); + b_max = bli_min( b_max, kc+1-mnr ); + b_alg = bli_align_dim_to_mult( b_alg, mnr ); + b_max = bli_align_dim_to_mult( b_max, mnr ); + } +#endif // BLIS_ENABLE_DMA + b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); return b_use; diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index 37add3b674..fd90ecb555 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -112,6 +112,10 @@ static char *bli_error_string[-BLIS_ERROR_CODE_MAX] = [-BLIS_NC_MAX_NONMULTIPLE_OF_NR] = "Maximum NC is non-multiple of NR for one or more datatypes.", [-BLIS_KC_DEF_NONMULTIPLE_OF_KR] = "Default KC is non-multiple of KR for one or more datatypes.", [-BLIS_KC_MAX_NONMULTIPLE_OF_KR] = "Maximum KC is non-multiple of KR for one or more datatypes.", + + [-BLIS_DMA_GET_FAILURE] = "DMA-get failure.", + [-BLIS_DMA_PUT_FAILURE] = "DMA-put failure.", + [-BLIS_DMA_WAIT_FAILURE] = "DMA-wait failure.", }; // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index fa7901583f..08741ca95d 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -156,6 +156,14 @@ gint_t bli_info_get_enable_sandbox( void ) return 0; #endif } +gint_t bli_info_get_enable_dma( void ) +{ +#ifdef BLIS_ENABLE_DMA + return 1; +#else + return 0; +#endif +} diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index d900ca4f51..cef13eef3f 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -74,6 
+74,7 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_enable_dma( void ); // -- Kernel implementation-related -------------------------------------------- diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c index e616ac2d7b..b10ea56a11 100644 --- a/frame/base/bli_init.c +++ b/frame/base/bli_init.c @@ -84,6 +84,14 @@ void bli_finalize_once( void ) void bli_init_apis( void ) { +#ifdef BLIS_ENABLE_DMA + // Since a DMA system may want to change and initialize the default memory + // allocator (SMEM), we call bli_dma_backend_init() first, to be sure + // that the SMEM allocator is ready before any later internal or sba/pba + // allocation in other init steps. + bli_dma_backend_init(); +#endif // BLIS_ENABLE_DMA + // Initialize various sub-APIs. bli_gks_init(); bli_ind_init(); @@ -104,12 +112,17 @@ void bli_init_apis( void ) void bli_finalize_apis( void ) { // Finalize various sub-APIs. + bli_memsys_finalize(); bli_pack_finalize(); bli_thread_finalize(); bli_ind_finalize(); bli_gks_finalize(); +#ifdef BLIS_ENABLE_DMA + bli_dma_backend_finalize(); +#endif // BLIS_ENABLE_DMA + // Reset the control variable that will allow (re-)initialization. // NOTE: We must initialize a fresh pthread_once_t object and THEN copy the // contents to the static control variable because some implementations of diff --git a/frame/base/dma/bli_dma.h b/frame/base/dma/bli_dma.h new file mode 100644 index 0000000000..e1fe2b09fd --- /dev/null +++ b/frame/base/dma/bli_dma.h @@ -0,0 +1,63 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, Kalray Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_DMA_H +#define BLIS_DMA_H + +// #define BLIS_DMA_DEBUG 1 + +// -- DMA type and macro definitions ------------------------------------------- + +#include "bli_dma_type_defs.h" + +// -- Reference DMA backend API ------------------------------------------------ + +#include "bli_dma_backend_ref.h" + +// -- Vendor-specific DMA headers ---------------------------------------------- + +// This is the place where vendors define the `dma_event_t` type, based on +// their own DMA library. 
+#include "bli_dma_vendor_type_defs.h" + +// -- Default DMA-backend functions -------------------------------------------- + +#include "bli_dma_macro_defs.h" + +// -- Object-API DMA API ------------------------------------------------------- + +#include "bli_dma_oapi.h" + +#endif // BLIS_DMA_H diff --git a/frame/base/dma/bli_dma_backend_ref.c b/frame/base/dma/bli_dma_backend_ref.c new file mode 100644 index 0000000000..05cf3488ba --- /dev/null +++ b/frame/base/dma/bli_dma_backend_ref.c @@ -0,0 +1,211 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, Kalray Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#ifdef BLIS_ENABLE_DMA
+
+// =============================================================================
+// -- Reference implementation of DMA backend using memcpy and bli_pthread
+// =============================================================================
+
+// Backend init, called by bli_dma_sys_init()
+// The reference backend has no state to set up; always returns 0.
+int bli_dma_backend_init_ref( void )
+{
+    return 0;
+}
+
+// Backend finalize, called by bli_dma_sys_finalize()
+// The reference backend has no state to tear down; always returns 0.
+int bli_dma_backend_finalize_ref( void )
+{
+    return 0;
+}
+
+// Byte offset of element (xpos, ypos) inside a 2D buffer whose consecutive
+// rows are xdim elements apart.
+// Every operand is widened to size_t before multiplying, so the product is
+// not evaluated in the narrower type of the point2d_t fields.
+// NOTE(review): the field types live in bli_dma_type_defs.h (not visible
+// here); they are assumed to be non-negative integers - confirm.
+static size_t point2d_byte_offset( const point2d_t* point, size_t elem_size )
+{
+    return ( ( (size_t)point->ypos * (size_t)point->xdim )
+             + (size_t)point->xpos ) * elem_size;
+}
+
+// Copy `height` rows of `row_bytes` bytes each from src to dst, advancing the
+// two pointers by their own row stride (in bytes) after every row.
+static void copy2D_rows
+     (
+       char*       dst,
+       size_t      dst_stride_bytes,
+       const char* src,
+       size_t      src_stride_bytes,
+       size_t      row_bytes,
+       int32_t     height
+     )
+{
+    for( int32_t row = 0; row < height; ++row )
+    {
+        memcpy( dst, src, row_bytes );
+        dst += dst_stride_bytes;
+        src += src_stride_bytes;
+    }
+}
+
+// Thread routine of a "get": copies a width x height block of elements from
+// the global buffer into the local buffer, as described by the
+// bli_dma_thread_arg_t passed through arg_. Always returns NULL.
+static void* get2D_routine( void* arg_ )
+{
+    bli_dma_thread_arg_t* arg = (bli_dma_thread_arg_t*) arg_;
+    const size_t elem_size = arg->elem_size;
+
+    // Nothing to copy; also keeps a negative width from being turned into a
+    // huge size_t byte count below.
+    if ( arg->width <= 0 || arg->height <= 0 ) return NULL;
+
+    char* local_ptr = ((char*) arg->local)
+                      + point2d_byte_offset( &arg->local_point, elem_size );
+    const char* global_ptr = ((const char*) arg->global)
+                             + point2d_byte_offset( &arg->global_point, elem_size );
+
+    copy2D_rows( local_ptr,  (size_t)arg->local_point.xdim  * elem_size,
+                 global_ptr, (size_t)arg->global_point.xdim * elem_size,
+                 (size_t)arg->width * elem_size, arg->height );
+
+    return NULL;
+}
+
+// Thread routine of a "put": copies a width x height block of elements from
+// the local buffer into the global buffer, as described by the
+// bli_dma_thread_arg_t passed through arg_. Always returns NULL.
+static void* put2D_routine( void* arg_ )
+{
+    bli_dma_thread_arg_t* arg = (bli_dma_thread_arg_t*) arg_;
+    const size_t elem_size = arg->elem_size;
+
+    // Nothing to copy; also keeps a negative width from being turned into a
+    // huge size_t byte count below.
+    if ( arg->width <= 0 || arg->height <= 0 ) return NULL;
+
+    const char* local_ptr = ((const char*) arg->local)
+                            + point2d_byte_offset( &arg->local_point, elem_size );
+    char* global_ptr = ((char*) arg->global)
+                       + point2d_byte_offset( &arg->global_point, elem_size );
+
+    copy2D_rows( global_ptr, (size_t)arg->global_point.xdim * elem_size,
+                 local_ptr,  (size_t)arg->local_point.xdim  * elem_size,
+                 (size_t)arg->width * elem_size, arg->height );
+
+    return NULL;
+}
+
+// 2D copy from global memory to the local (scratchpad) buffer.
+//
+// With a non-NULL event the copy is run by a new thread whose handle is
+// stored in event->thread; bli_dma_backend_wait_ref() joins it. With a NULL
+// event the copy is performed by the calling thread before returning.
+//
+// Returns the value of bli_pthread_create() in the asynchronous case and 0 in
+// the blocking case.
+int bli_dma_backend_get2D_ref(
+    const void* global,
+    void* local,
+    size_t elem_size,
+    int32_t width,
+    int32_t height,
+    point2d_t* global_point,
+    point2d_t* local_point,
+    dma_event_ref_t* event
+)
+{
+    int ret = 0;
+
+    // The transfer description is stored in the event so that it outlives this
+    // call in the asynchronous case; the stack copy is only used when blocking.
+    dma_event_ref_t event_local;
+    dma_event_ref_t* event_used = event ? event : &event_local;
+
+    event_used->arg.global = (void* )global;
+    event_used->arg.local = (void* )local;
+    event_used->arg.elem_size = elem_size;
+    event_used->arg.width = width;
+    event_used->arg.height = height;
+    event_used->arg.global_point = *global_point;
+    event_used->arg.local_point = *local_point;
+
+    if (event) {
+        // copy asynchronously by another thread
+        ret = bli_pthread_create( &(event->thread), NULL,
+                                  &get2D_routine, &(event_used->arg) );
+        #ifdef BLIS_DMA_DEBUG
+        // "%p" requires a void* argument.
+        fprintf( stdout, " %s(): bli_pthread_create() returned %d event %p\n",
+                 __FUNCTION__, ret, (void*)event );
+        #endif // BLIS_DMA_DEBUG
+    } else {
+        // blocking: copy myself
+        get2D_routine( &(event_used->arg) );
+        #ifdef BLIS_DMA_DEBUG
+        fprintf( stdout, " %s(): get2D_routine() event %p\n", __FUNCTION__, (void*)event );
+        #endif // BLIS_DMA_DEBUG
+    }
+
+    return ret;
+}
+
+// 2D copy from the local (scratchpad) buffer to global memory.
+//
+// With a non-NULL event the copy is run by a new thread whose handle is
+// stored in event->thread; bli_dma_backend_wait_ref() joins it. With a NULL
+// event the copy is performed by the calling thread before returning.
+//
+// Returns the value of bli_pthread_create() in the asynchronous case and 0 in
+// the blocking case.
+int bli_dma_backend_put2D_ref(
+    void* global,
+    const void* local,
+    size_t elem_size,
+    int32_t width,
+    int32_t height,
+    point2d_t* global_point,
+    point2d_t* local_point,
+    dma_event_ref_t* event
+)
+{
+    int ret = 0;
+
+    // The transfer description is stored in the event so that it outlives this
+    // call in the asynchronous case; the stack copy is only used when blocking.
+    dma_event_ref_t event_local;
+    dma_event_ref_t* event_used = event ? event : &event_local;
+
+    event_used->arg.global = (void* )global;
+    event_used->arg.local = (void* )local;
+    event_used->arg.elem_size = elem_size;
+    event_used->arg.width = width;
+    event_used->arg.height = height;
+    event_used->arg.global_point = *global_point;
+    event_used->arg.local_point = *local_point;
+
+    if (event) {
+        // copy asynchronously by another thread
+        ret = bli_pthread_create( &(event->thread), NULL,
+                                  &put2D_routine, &(event_used->arg) );
+        #ifdef BLIS_DMA_DEBUG
+        // "%p" requires a void* argument.
+        fprintf( stdout, " %s(): bli_pthread_create() returned %d event %p\n",
+                 __FUNCTION__, ret, (void*)event );
+        #endif // BLIS_DMA_DEBUG
+    } else {
+        // blocking: copy myself
+        put2D_routine( &(event_used->arg) );
+        #ifdef BLIS_DMA_DEBUG
+        fprintf( stdout, " %s(): put2D_routine() event %p\n", __FUNCTION__, (void*)event );
+        #endif // BLIS_DMA_DEBUG
+    }
+
+    return ret;
+}
+
+// Wait for termination of a DMA transfer
+// Joins the thread stored in event->thread and returns the value of
+// bli_pthread_join(); a NULL event returns 0 immediately. The event must come
+// from an asynchronous get/put and must be waited on only once, because a
+// thread can be joined only once.
+int bli_dma_backend_wait_ref( dma_event_ref_t *event )
+{
+    #ifdef BLIS_DMA_DEBUG
+    fprintf( stdout, " %s(): event %p\n", __FUNCTION__, (void*)event );
+    #endif // BLIS_DMA_DEBUG
+    return ( event ? bli_pthread_join( event->thread, NULL ) : 0 );
+}
+
+
+
+#endif // BLIS_ENABLE_DMA
diff --git a/frame/base/dma/bli_dma_backend_ref.h b/frame/base/dma/bli_dma_backend_ref.h
new file mode 100644
index 0000000000..bda8f86fa2
--- /dev/null
+++ b/frame/base/dma/bli_dma_backend_ref.h
@@ -0,0 +1,136 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2021, Kalray Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_DMA_BACKEND_REF_H +#define BLIS_DMA_BACKEND_REF_H + +// -- Define a reference `dma_event_t` to work with bli_pthread ---------------- + +// Arguments for a DMA transfer. 
References: +// - https://github.com/kalray/opencl_optim_examples +// - https://hal.univ-grenoble-alpes.fr/hal-01652614/document +typedef struct bli_dma_thread_arg_s +{ + void* global; // begin address of global buffer + void* local; // begin address of local buffer + size_t elem_size; // size of an element in bytes + int32_t width; // block width in elements + int32_t height; // block height in elements + point2d_t global_point; // block offset (xpos, ypos) and buffer dims (xdim, ydim) in the global buffer + point2d_t local_point; // block offset (xpos, ypos) and buffer dims (xdim, ydim) in the local buffer +} bli_dma_thread_arg_t; + +typedef struct dma_event_s +{ + bli_dma_thread_arg_t arg; // transfer arguments handed to the copy routine + bli_pthread_t thread; // thread running an asynchronous transfer; joined by wait() +} dma_event_ref_t; + + +// -- Reference DMA backend API ------------------------------------------------ + +/** + * DMA-backend initialization + * @return 0 on success, non-zero otherwise + */ +int bli_dma_backend_init_ref(); + +/** + * DMA-backend finalize + * @return 0 on success, non-zero otherwise + */ +int bli_dma_backend_finalize_ref(); + +/** + * DMA-backend 2D copy from global memory to scratchpad + * @param global begin address of global buffer + * @param local begin address of local buffer + * @param elem_size size of an element in bytes + * @param width block width in elements + * @param height block height in elements + * @param global_point block offset and buffer dimensions in the global buffer + * @param local_point block offset and buffer dimensions in the local buffer + * @param event event. If non-NULL, the call configures an asynchronous + * transfer and returns immediately. One must later call + * wait() on this event. + * If NULL, the call is blocking until the transfer is done. 
+ * @return 0 on success, non-zero otherwise + */ +int bli_dma_backend_get2D_ref( + const void* global, + void* local, + size_t elem_size, + int32_t width, + int32_t height, + point2d_t* global_point, + point2d_t* local_point, + dma_event_ref_t* event +); + +/** + * DMA-backend 2D copy from scratchpad to global memory + * @param global begin address of global buffer + * @param local begin address of local buffer + * @param elem_size size of an element in bytes + * @param width block width in elements + * @param height block height in elements + * @param global_point block offset and buffer dimensions in the global buffer + * @param local_point block offset and buffer dimensions in the local buffer + * @param event event. If non-NULL, the call configures an asynchronous + * transfer and returns immediately. One must later call + * wait() on this event. + * If NULL, the call is blocking until the transfer is done. + * @return 0 on success, non-zero otherwise + */ +int bli_dma_backend_put2D_ref( + void* global, + const void* local, + size_t elem_size, + int32_t width, + int32_t height, + point2d_t* global_point, + point2d_t* local_point, + dma_event_ref_t* event +); + +/** + * DMA-backend wait for asynchronous transfer + * @param event event of the transfer to wait for; NULL is accepted and returns 0 + * @return 0 on success, non-zero otherwise + */ +int bli_dma_backend_wait_ref( dma_event_ref_t *event ); + + +#endif // BLIS_DMA_BACKEND_REF_H diff --git a/frame/base/dma/bli_dma_macro_defs.h b/frame/base/dma/bli_dma_macro_defs.h new file mode 100644 index 0000000000..721fb696a8 --- /dev/null +++ b/frame/base/dma/bli_dma_macro_defs.h @@ -0,0 +1,108 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, Kalray Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_DMA_MACRO_DEFS_H +#define BLIS_DMA_MACRO_DEFS_H + + +// -- Default DMA-backend functions -------------------------------------------- + +#ifndef BLIS_DMA_BACKEND_INIT // each hook keeps an earlier (vendor) definition, else falls back to the reference (_ref) backend +#define BLIS_DMA_BACKEND_INIT bli_dma_backend_init_ref +#endif + +#ifndef BLIS_DMA_BACKEND_FINALIZE +#define BLIS_DMA_BACKEND_FINALIZE bli_dma_backend_finalize_ref +#endif + +#ifndef BLIS_DMA_BACKEND_GET2D +#define BLIS_DMA_BACKEND_GET2D bli_dma_backend_get2D_ref +#endif + +#ifndef BLIS_DMA_BACKEND_PUT2D +#define BLIS_DMA_BACKEND_PUT2D bli_dma_backend_put2D_ref +#endif + +#ifndef BLIS_DMA_BACKEND_WAIT +#define BLIS_DMA_BACKEND_WAIT bli_dma_backend_wait_ref +#endif + +BLIS_INLINE int bli_dma_backend_init() +{ + return BLIS_DMA_BACKEND_INIT(); +} + +BLIS_INLINE int bli_dma_backend_finalize() +{ + return BLIS_DMA_BACKEND_FINALIZE(); +} + +BLIS_INLINE int bli_dma_backend_get2D( + const void* global, + void* local, + size_t elem_size, + int32_t width, + int32_t height, + point2d_t* global_point, + point2d_t* local_point, + dma_event_t* event +) +{ + return BLIS_DMA_BACKEND_GET2D(global, local, elem_size, width, height, + global_point, local_point, event); +} + +BLIS_INLINE int bli_dma_backend_put2D( + void* global, + const void* local, + size_t elem_size, + int32_t width, + int32_t height, + point2d_t* global_point, + point2d_t* local_point, + dma_event_t* event +) +{ + return BLIS_DMA_BACKEND_PUT2D(global, local, elem_size, width, height, + global_point, local_point, event); +} + +BLIS_INLINE int bli_dma_backend_wait( dma_event_t *event ) +{ + return BLIS_DMA_BACKEND_WAIT( event ); +} + + +#endif // BLIS_DMA_MACRO_DEFS_H diff --git a/frame/base/dma/bli_dma_oapi.c b/frame/base/dma/bli_dma_oapi.c new file mode 100644 index 0000000000..466082a030 --- /dev/null +++ b/frame/base/dma/bli_dma_oapi.c @@ -0,0 +1,326 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, Kalray Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef BLIS_ENABLE_DMA + +static void bli_dma_get_check + ( + obj_t* a, + obj_t* p + ) +{ + err_t e_val; + + // Check object datatypes. + e_val = bli_check_floating_object( a ); + bli_check_error_code( e_val ); + + e_val = bli_check_floating_object( p ); + bli_check_error_code( e_val ); + + // Check object dimensions. 
+ e_val = bli_check_conformal_dims( a, p ); + bli_check_error_code( e_val ); +} + +#undef GENFRONT +#define GENFRONT( opname, dma_func ) \ +\ +static int PASTEMAC0(opname) \ + ( \ + obj_t* a, \ + obj_t* p, \ + dma_event_t* event, \ + thrinfo_t* thread \ + ) \ +{ \ + int ret = 0; \ +\ + bli_dma_get_check( a, p ); \ +\ + /* only the chief thread triggers the DMA transfer; every other thread returns 0 */ \ + if ( bli_thread_am_ochief( thread ) ) \ + { \ + obj_t* global_root = bli_obj_root( a ); \ + dim_t m_root = bli_obj_length( global_root ); \ + dim_t n_root = bli_obj_width( global_root ); \ + dim_t m_a = bli_obj_length( a ); \ + dim_t n_a = bli_obj_width( a ); \ + dim_t rs_a = bli_obj_row_stride( a ); \ + dim_t cs_a = bli_obj_col_stride( a ); \ +\ + /* Query if object a is stored in row-major or col-major */ \ + bool is_row_major = bli_is_row_stored_f( m_a, n_a, rs_a, cs_a ); \ +\ + void* global = bli_obj_buffer( global_root ); \ + void* local = bli_obj_buffer( p ); \ + siz_t elem_size = bli_obj_elem_size( p ); \ +\ + point2d_t local_point = { 0 }; \ + point2d_t global_point = { 0 }; \ +\ + if ( is_row_major ) \ + { \ + local_point.xpos = 0; /* xpos = zero in local panel */ \ + local_point.ypos = 0; /* ypos = zero in local panel */ \ + local_point.xdim = bli_obj_width( p ); /* xdim = width of local panel */ \ + local_point.ydim = bli_obj_length( p ); /* ydim = length of local panel */ \ +\ + /* The "invisible" part of a: for any (alignment) reason, the real + allocated buffer of the root-matrix can be larger than its dimensions. + This is why we must use rs_a (or cs_a) instead of n_a (or m_a) for + global_point.xdim. 
*/ \ + global_point.xpos = bli_obj_col_off( a ); /* xpos = col-offset of a in root */ \ + global_point.ypos = bli_obj_row_off( a ); /* ypos = row-offset of a in root */ \ + global_point.xdim = rs_a; /* xdim = row-stride of a */ \ + global_point.ydim = m_root; /* ydim = length of root */ \ + } \ + else /* is_col_major */ \ + { \ + local_point.xpos = 0; /* xpos = zero in local panel */ \ + local_point.ypos = 0; /* ypos = zero in local panel */ \ + local_point.xdim = bli_obj_length( p ); /* xdim = length of local panel */ \ + local_point.ydim = bli_obj_width( p ); /* ydim = width of local panel */ \ +\ + global_point.xpos = bli_obj_row_off( a ); /* xpos = row-offset of a in root */ \ + global_point.ypos = bli_obj_col_off( a ); /* ypos = col-offset of a in root */ \ + global_point.xdim = cs_a; /* xdim = col-stride of a */ \ + global_point.ydim = n_root; /* ydim = width of root */ \ + } \ +\ + /* trigger DMA transfer of the whole local panel (xdim x ydim of local_point) */ \ + ret = dma_func(global, /* global */ \ + local, /* local */ \ + elem_size, /* elem_size */ \ + local_point.xdim, /* width */ \ + local_point.ydim, /* height */ \ + &global_point, /* global_point */ \ + &local_point, /* local_point */ \ + event /* event */ \ + ); \ + } \ +\ + return ret; \ +} + +GENFRONT( dma_get2D, bli_dma_backend_get2D ) +GENFRONT( dma_put2D, bli_dma_backend_put2D ) + + +static siz_t bli_dma_get_init ( obj_t* a, obj_t* p ) +{ + siz_t elem_size = bli_obj_elem_size( a ); + dim_t m_a = bli_obj_length( a ); + dim_t n_a = bli_obj_width( a ); + dim_t rs_a = bli_obj_row_stride( a ); + dim_t cs_a = bli_obj_col_stride( a ); + siz_t size_needed = 0; + + dim_t rs_p, cs_p; + dim_t offm_p, offn_p; + dim_t m_p, n_p; + + // We begin by copying the fields of A. 
+ bli_obj_alias_to( a, p ); + + // If the object is marked as being filled with zeros, we skip the DMA + // operation entirely and return zero; otherwise p is set up as a dense panel. + if ( !bli_obj_is_zeros( a ) ) + { + // Offsets of p default to zero; p takes the dimensions of a + offm_p = 0; + offn_p = 0; + m_p = m_a; + n_p = n_a; + + bool is_row_major = bli_is_row_stored_f( m_a, n_a, rs_a, cs_a ); + + if( is_row_major ) + { + rs_p = n_p; + cs_p = 1; + } + else // is_col_major + { + rs_p = 1; + cs_p = m_p; + } + + bli_obj_set_dims( m_p, n_p, p ); + bli_obj_set_offs( offm_p, offn_p, p ); + bli_obj_set_strides( rs_p, cs_p, p ); + + size_needed = m_p * n_p * elem_size; + } + + return size_needed; +} + +static void bli_dma_alloc + ( + siz_t size_needed, + mem_t* mem_p_dma, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + mem_t* local_mem_p; + mem_t local_mem_s; + + siz_t mem_size = 0; + + if ( !mem_p_dma ) + { + fprintf( stderr, "%s:%d: mem_p_dma must not be NULL\n", + __FILE__, __LINE__ ); + bli_check_error_code( BLIS_NULL_POINTER ); + } + + if ( bli_mem_is_alloc( mem_p_dma ) ) { + mem_size = bli_mem_size( mem_p_dma ); + } + + if ( mem_size < size_needed ) + { + if ( bli_thread_am_ochief( thread ) ) + { + packbuf_t dma_buf_type = bli_mem_buf_type( mem_p_dma ); + + // The chief thread releases the existing block associated with + // the mem_t entry in the control tree, and then re-acquires a + // new block, saving the associated mem_t entry to local_mem_s. + if ( bli_mem_is_alloc( mem_p_dma ) ) + { + bli_pba_release + ( + rntm, + mem_p_dma + ); + } + bli_pba_acquire_m + ( + rntm, + size_needed, + dma_buf_type, + &local_mem_s + ); + } + + // Broadcast the address of the chief thread's local mem_t entry to + // all threads. + local_mem_p = bli_thread_broadcast( thread, &local_mem_s ); + + // Save the chief thread's local mem_t entry to the mem_t field in + // this thread's control tree node. 
+ *mem_p_dma = *local_mem_p; + } + + bli_thread_barrier( thread ); +} + +void bli_dma_get + ( + obj_t* a, + obj_t* p, + mem_t* mem_p_dma, + dma_event_t* event, + rntm_t* rntm, + thrinfo_t* thread + ) +{ + bli_init_once(); + bli_thread_barrier( thread ); + + siz_t size_needed = bli_dma_get_init( a, p ); + if ( size_needed > 0 ) + { + // grow the local DMA buffer if it is smaller than size_needed + bli_dma_alloc( size_needed, mem_p_dma, rntm, thread ); + + // point p at the local DMA buffer + void* dma_buffer = bli_mem_buffer( mem_p_dma ); + bli_obj_set_buffer( dma_buffer, p ); + + // call get2D backend + int err = bli_dma_get2D( a, p, event, thread ); + if ( err ) { + bli_check_error_code( BLIS_DMA_GET_FAILURE ); + } + } + + bli_thread_barrier( thread ); +} + +void bli_dma_put + ( + obj_t* a, + obj_t* p, + dma_event_t* event, + thrinfo_t* thread + ) +{ + bli_init_once(); + bli_thread_barrier( thread ); + + // call put2D backend + int err = bli_dma_put2D( a, p, event, thread ); + if ( err ) { + bli_check_error_code( BLIS_DMA_PUT_FAILURE ); + } + + bli_thread_barrier( thread ); +} + +void bli_dma_wait( dma_event_t* event, thrinfo_t* thread ) +{ + bli_thread_barrier( thread ); + + // only the chief thread waits on the event + if ( event && bli_thread_am_ochief( thread ) ) + { + int err = bli_dma_backend_wait( event ); + if ( err ) { + bli_check_error_code( BLIS_DMA_WAIT_FAILURE ); + } + } + + // Barrier so that DMA is done before computation, or data sent after + // computation. + bli_thread_barrier( thread ); +} + +#endif // BLIS_ENABLE_DMA diff --git a/frame/base/dma/bli_dma_oapi.h b/frame/base/dma/bli_dma_oapi.h new file mode 100644 index 0000000000..3a3332351f --- /dev/null +++ b/frame/base/dma/bli_dma_oapi.h @@ -0,0 +1,59 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, Kalray Inc. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_DMA_OAPI_H +#define BLIS_DMA_OAPI_H + +void bli_dma_get /* global -> local: aliases a into p, sizes the local buffer, starts the get2D transfer */ + ( + obj_t* a, /* view of the global (source) matrix */ + obj_t* p, /* overwritten with an alias of a that points at the local DMA buffer */ + mem_t* mem_p_dma, /* local buffer entry; must not be NULL, re-acquired when too small */ + dma_event_t* event, /* forwarded to the backend; with the reference backend NULL means blocking */ + rntm_t* rntm, + thrinfo_t* thread /* only the chief thread issues the transfer */ + ); + +void bli_dma_put /* local -> global: starts the put2D transfer of p back into a */ + ( + obj_t* a, /* view of the global (destination) matrix */ + obj_t* p, /* local panel holding the data to send */ + dma_event_t* event, /* forwarded to the backend; with the reference backend NULL means blocking */ + thrinfo_t* thread /* only the chief thread issues the transfer */ + ); + +void bli_dma_wait( dma_event_t* event, thrinfo_t* thread ); /* barrier; chief waits on event if non-NULL; barrier */ + +#endif // BLIS_DMA_OAPI_H diff --git a/frame/base/dma/bli_dma_type_defs.h b/frame/base/dma/bli_dma_type_defs.h new file mode 100644 index 0000000000..0381398943 --- /dev/null +++ b/frame/base/dma/bli_dma_type_defs.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, Kalray Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_DMA_TYPE_DEFS_H +#define BLIS_DMA_TYPE_DEFS_H + + +// -- DMA type and macro definitions ------------------------------------------- + +// coordinates type for DMA 2D-copy +typedef struct point2d_s +{ + int32_t xpos; // x-position (or offset) in elements from the buffer pointer + int32_t ypos; // y-position (or offset) in elements from the buffer pointer + int32_t xdim; // x-dim of the allocated buffer in elements; the reference backend uses it as the pitch between rows + int32_t ydim; // y-dim of the allocated buffer in elements +} point2d_t; + + +#endif // BLIS_DMA_TYPE_DEFS_H diff --git a/frame/base/dma/bli_dma_vendor_type_defs.h b/frame/base/dma/bli_dma_vendor_type_defs.h new file mode 100644 index 0000000000..cae3b8db67 --- /dev/null +++ b/frame/base/dma/bli_dma_vendor_type_defs.h @@ -0,0 +1,70 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2021, Kalray Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_DMA_VENDOR_TYPE_DEFS_H +#define BLIS_DMA_VENDOR_TYPE_DEFS_H + + +// -- Vendor-specific DMA headers ---------------------------------------------- +// +// This is the place where vendors define the `dma_event_t` type, based on +// their own DMA library, and override the default DMA backend API: +// +// #define BLIS_DMA_BACKEND_INIT your_favorite_dma_func_init +// #define BLIS_DMA_BACKEND_FINALIZE your_favorite_dma_func_finalize +// #define BLIS_DMA_BACKEND_GET2D your_favorite_dma_func_get2d +// #define BLIS_DMA_BACKEND_PUT2D your_favorite_dma_func_put2d +// #define BLIS_DMA_BACKEND_WAIT your_favorite_dma_func_wait +// +// NOTE: +// - The current DMA support calls bli_pba_acquire_m() (bli_dma_oapi.c) to +// allocate a DMA buffer in an expected-to-be local/scratchpad memory (SMEM). +// Developers should accordingly map the PBA allocator onto SMEM. +// Generally, having packed buffers in such a near-core, fast scratchpad memory +// is worthwhile for performance. 
+ +#if defined(BLIS_OS_YOUR_FAVORITE_ARCH) + + // Define your vendor-specific dma_event_t here + // ... + +#else // Reference DMA + // No vendor-specific DMA library, define a reference `dma_event_t` to + // work with bli_pthread. + typedef dma_event_ref_t dma_event_t; +#endif + + +#endif // BLIS_DMA_VENDOR_TYPE_DEFS_H diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index fe030f193f..34bed0dbc9 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1662,7 +1662,12 @@ typedef enum BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), - BLIS_ERROR_CODE_MAX = (-170) + // DMA-related error + BLIS_DMA_GET_FAILURE = (-170), + BLIS_DMA_PUT_FAILURE = (-171), + BLIS_DMA_WAIT_FAILURE = (-172), + + BLIS_ERROR_CODE_MAX = (-180) } err_t; #endif diff --git a/frame/include/blis.h b/frame/include/blis.h index b374e85398..6c4897b2f5 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -91,6 +91,11 @@ extern "C" { #include "bli_extern_defs.h" +// -- DMA definitions -- +#ifdef BLIS_ENABLE_DMA +#include "bli_dma.h" +#endif // BLIS_ENABLE_DMA + // -- BLIS architecture/kernel definitions -- #include "bli_l1v_ker_prot.h" diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index f5bfd0f729..0e1020fc37 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -887,6 +887,9 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "gemm sandbox \n" ); libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_sandbox() ); libblis_test_fprintf_c( os, "\n" ); + libblis_test_fprintf_c( os, "DMA \n" ); + libblis_test_fprintf_c( os, " enabled? %d\n", ( int )bli_info_get_enable_dma() ); + libblis_test_fprintf_c( os, "\n" ); libblis_test_fprintf_c( os, "floating-point types s d c z \n" ); libblis_test_fprintf_c( os, " sizes (bytes) %7u %7u %7u %7u\n", sizeof(float), sizeof(double),