Skip to content

Commit bdd46f9

Browse files
committed
Rewrote reference kernels to use #pragma omp simd.
Details: - Rewrote level-1v, -1f, and -3 reference kernels in terms of simplified indexing annotated by the #pragma omp simd directive, which a compiler can use to vectorize certain constant-bounded loops. (The new kernels actually use _Pragma("omp simd") since the kernels are defined via templatizing macros.) Modest speedup was observed in most cases using gcc 5.4.0, which may improve with newer versions. Thanks to Devin Matthews for suggesting this via issue #286 and #259. - Updated default blocksizes defined in ref_kernels/bli_cntx_ref.c to be 4x16, 4x8, 4x8, and 4x4 for single, double, scomplex and dcomplex, respectively, with a default row preference for the gemm ukernel. Also updated axpyf, dotxf, and dotxaxpyf fusing factors to 8, 6, and 4, respectively, for all datatypes. - Modified configure to verify that -fopenmp-simd is a valid compiler option (via a new detect/omp_simd/omp_simd_detect.c file). - Added a new header in which prefetch macros are defined according to which compiler is detected (via macros such as __GNUC__). These prefetch macros are not yet employed anywhere, though. - Updated the year in copyrights of template license headers in build/templates and removed AMD as a default copyright holder.
1 parent 63de2b0 commit bdd46f9

36 files changed

+1766
-397
lines changed

build/config.mk.in

+3
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,9 @@ DEBUG_TYPE := @debug_type@
112112
# The requested threading model.
113113
THREADING_MODEL := @threading_model@
114114

115+
# Whether the compiler supports "#pragma omp simd" via the -fopenmp-simd option.
116+
PRAGMA_OMP_SIMD := @pragma_omp_simd@
117+
115118
# The install libdir, includedir, and shareddir values from configure tell
116119
# us where to install the libraries, header files, and public makefile
117120
# fragments, respectively. Notice that we support the use of DESTDIR so that
+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#include <stdio.h>
2+
#include <string.h>
3+
4+
#define ARRAY_LEN 4096
5+
6+
double x[ ARRAY_LEN ];
7+
double y[ ARRAY_LEN ];
8+
9+
int main( int argc, char **argv )
10+
{
11+
const double alpha = 2.1;
12+
13+
for ( int i = 0; i < ARRAY_LEN; ++i )
14+
{
15+
y[ i ] = 0.0;
16+
x[ i ] = 1.0;
17+
}
18+
19+
#pragma omp simd
20+
for ( int i = 0; i < ARRAY_LEN; ++i )
21+
{
22+
y[ i ] += alpha * x[ i ];
23+
}
24+
25+
#if 0
26+
_Pragma( "omp simd" )
27+
for ( int i = 0; i < ARRAY_LEN; ++i )
28+
{
29+
x[ i ] += alpha * y[ i ];
30+
}
31+
#endif
32+
33+
return 0;
34+
}
35+

build/templates/license.c

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
An object-based framework for developing high-performance BLAS-like
55
libraries.
66
7-
Copyright (C) 2014, The University of Texas at Austin
8-
Copyright (C) 2017, Advanced Micro Devices, Inc.
7+
Copyright (C) 2019, The University of Texas at Austin
98
109
Redistribution and use in source and binary forms, with or without
1110
modification, are permitted provided that the following conditions are

build/templates/license.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
An object-based framework for developing high-performance BLAS-like
55
libraries.
66
7-
Copyright (C) 2014, The University of Texas at Austin
8-
Copyright (C) 2017, Advanced Micro Devices, Inc.
7+
Copyright (C) 2019, The University of Texas at Austin
98
109
Redistribution and use in source and binary forms, with or without
1110
modification, are permitted provided that the following conditions are

build/templates/license.sh

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,7 @@
44
# An object-based framework for developing high-performance BLAS-like
55
# libraries.
66
#
7-
# Copyright (C) 2014, The University of Texas at Austin
8-
# Copyright (C) 2017, Advanced Micro Devices, Inc.
7+
# Copyright (C) 2019, The University of Texas at Austin
98
#
109
# Redistribution and use in source and binary forms, with or without
1110
# modification, are permitted provided that the following conditions are

common.mk

+11-2
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \
124124
get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \
125125
$(call load-var-for,CRVECFLAGS,$(1)) \
126126
$(call get-noopt-cflags-for,$(1)) \
127+
$(COMPSIMDFLAGS) \
127128
-DBLIS_CNAME=$(1) \
128129
$(BUILD_FLAGS) \
129130
)
@@ -635,7 +636,6 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c))
635636
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
636637
$(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c))))
637638

638-
639639
# --- Threading flags ---
640640

641641
ifeq ($(CC_VENDOR),gcc)
@@ -680,6 +680,14 @@ LDFLAGS += $(LIBPTHREAD)
680680
endif
681681
endif
682682

683+
# --- #pragma omp simd flags (used for reference kernels only) ---
684+
685+
ifeq ($(PRAGMA_OMP_SIMD),yes)
686+
COMPSIMDFLAGS := -fopenmp-simd
687+
else
688+
COMPSIMDFLAGS :=
689+
endif
690+
683691

684692

685693
#
@@ -948,7 +956,8 @@ BLIS_CONFIG_H := ./bli_config.h
948956
VERS_DEF := -DBLIS_VERSION_STRING=\"$(VERSION)\"
949957

950958
# Define a C preprocessor flag that is *only* defined when BLIS is being
951-
# compiled.
959+
# compiled. (In other words, an application that #includes blis.h will not
960+
# get this cpp macro.)
952961
BUILD_FLAGS := -DBLIS_IS_BUILDING_LIBRARY
953962

954963

configure

+39
Original file line numberDiff line numberDiff line change
@@ -1081,6 +1081,36 @@ has_libmemkind()
10811081
echo "${rval}"
10821082
}
10831083

1084+
has_pragma_omp_simd()
1085+
{
1086+
local main_c main_c_filepath binname rval
1087+
1088+
# Path to omp-simd detection source file.
1089+
main_c="omp_simd_detect.c"
1090+
main_c_filepath=$(find ${dist_path}/build -name "${main_c}")
1091+
1092+
# Binary executable filename.
1093+
binname="omp_simd-detect.x"
1094+
1095+
# Attempt to compile a simple main() program that contains a
1096+
# #pragma omp simd.
1097+
${found_cc} -std=c99 -O3 -march=native -fopenmp-simd \
1098+
-o ${binname} ${main_c_filepath} 2> /dev/null
1099+
1100+
# Depending on the return code from the compile step above, we set
1101+
# rval to 'yes' or 'no' accordingly.
1102+
if [ "$?" == 0 ]; then
1103+
rval='yes'
1104+
else
1105+
rval='no'
1106+
fi
1107+
1108+
# Remove the executable generated above.
1109+
rm -f ./${binname}
1110+
1111+
echo "${rval}"
1112+
}
1113+
10841114
echoerr()
10851115
{
10861116
printf "${script_name}: error: %s\n" "$*" #>&2;
@@ -2423,6 +2453,9 @@ main()
24232453
# --without-memkind.
24242454
has_memkind=$(has_libmemkind)
24252455

2456+
# Try to determine whether the chosen compiler supports #pragma omp simd.
2457+
pragma_omp_simd=$(has_pragma_omp_simd)
2458+
24262459

24272460
# -- Prepare variables for substitution into template files ----------------
24282461

@@ -2633,6 +2666,11 @@ main()
26332666
enable_memkind="no"
26342667
enable_memkind_01=0
26352668
fi
2669+
if [ "x${pragma_omp_simd}" = "xyes" ]; then
2670+
echo "${script_name}: compiler appears to support #pragma omp simd."
2671+
else
2672+
echo "${script_name}: compiler appears to not support #pragma omp simd."
2673+
fi
26362674
if [ "x${enable_blas}" = "xyes" ]; then
26372675
echo "${script_name}: the BLAS compatibility layer is enabled."
26382676
enable_blas_01=1
@@ -2842,6 +2880,7 @@ main()
28422880
| sed -e "s/@enable_blas@/${enable_blas}/g" \
28432881
| sed -e "s/@enable_cblas@/${enable_cblas}/g" \
28442882
| sed -e "s/@enable_memkind@/${enable_memkind}/g" \
2883+
| sed -e "s/@pragma_omp_simd@/${pragma_omp_simd}/g" \
28452884
| sed -e "s/@sandbox@/${sandbox}/g" \
28462885
> "${config_mk_out_path}"
28472886

frame/include/bli_arch_config_pre.h

+4
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@
4949
#define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME)
5050
#endif
5151

52+
// Combine the CNAME and _ref for convenience to the code that defines
53+
// reference kernels.
54+
//#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)
55+
5256
// -- Prototype-generating macro definitions -----------------------------------
5357

5458
// Prototype-generating macro for bli_cntx_init_<arch>*() functions.
+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
3+
BLIS
4+
An object-based framework for developing high-performance BLAS-like
5+
libraries.
6+
7+
Copyright (C) 2019, The University of Texas at Austin
8+
9+
Redistribution and use in source and binary forms, with or without
10+
modification, are permitted provided that the following conditions are
11+
met:
12+
- Redistributions of source code must retain the above copyright
13+
notice, this list of conditions and the following disclaimer.
14+
- Redistributions in binary form must reproduce the above copyright
15+
notice, this list of conditions and the following disclaimer in the
16+
documentation and/or other materials provided with the distribution.
17+
- Neither the name(s) of the copyright holder(s) nor the names of its
18+
contributors may be used to endorse or promote products derived
19+
from this software without specific prior written permission.
20+
21+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32+
33+
*/
34+
35+
#ifndef BLIS_BUILTIN_MACRO_DEFS_H
36+
#define BLIS_BUILTIN_MACRO_DEFS_H
37+
38+
#if defined(__ICC) || defined(__INTEL_COMPILER)
39+
40+
// icc
41+
42+
#define bli_prefetch( addr, rw, loc )
43+
44+
#elif defined(__clang__)
45+
46+
// clang
47+
48+
#define bli_prefetch( addr, rw, loc )
49+
50+
#elif defined(__GNUC__)
51+
52+
// gcc
53+
54+
// Forward a software prefetch hint to gcc's __builtin_prefetch(). The macro
// deliberately omits a trailing semicolon so that the caller supplies it;
// this keeps "if ( c ) bli_prefetch( ... ); else ..." well-formed and makes
// the expansion consistent with the empty icc and clang definitions above.
#define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc )
55+
56+
#endif
57+
58+
59+
#endif

frame/include/bli_macro_defs.h

+10
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,15 @@
128128
#define PASTEMAC3_(ch1,ch2,ch3,op) bli_ ## ch1 ## ch2 ## ch3 ## op
129129
#define PASTEMAC3(ch1,ch2,ch3,op) PASTEMAC3_(ch1,ch2,ch3,op)
130130

131+
#define PASTEMAC4_(ch1,ch2,ch3,ch4,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
132+
#define PASTEMAC4(ch1,ch2,ch3,ch4,op) PASTEMAC4_(ch1,ch2,ch3,ch4,op)
133+
134+
#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
135+
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op) PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)
136+
137+
#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
138+
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op) PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)
139+
131140
#define PASTEBLACHK_(op) bla_ ## op ## _check
132141
#define PASTEBLACHK(op) PASTEBLACHK_(op)
133142

@@ -163,6 +172,7 @@
163172
#include "bli_scalar_macro_defs.h"
164173
#include "bli_error_macro_defs.h"
165174
#include "bli_blas_macro_defs.h"
175+
#include "bli_builtin_macro_defs.h"
166176

167177
#include "bli_oapi_macro_defs.h"
168178
#include "bli_tapi_macro_defs.h"

frame/include/bli_system.h

+26-16
Original file line numberDiff line numberDiff line change
@@ -45,36 +45,46 @@
4545
#include <errno.h>
4646
#include <ctype.h>
4747

48+
// Determine the compiler (hopefully) and define conveniently named macros
49+
// accordingly.
50+
#if defined(__ICC) || defined(__INTEL_COMPILER)
51+
#define BLIS_ICC
52+
#elif defined(__clang__)
53+
#define BLIS_CLANG
54+
#elif defined(__GNUC__)
55+
#define BLIS_GCC
56+
#endif
57+
4858
// Determine if we are on a 64-bit or 32-bit architecture.
4959
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
5060
defined(_ARCH_PPC64)
51-
#define BLIS_ARCH_64
61+
#define BLIS_ARCH_64
5262
#else
53-
#define BLIS_ARCH_32
63+
#define BLIS_ARCH_32
5464
#endif
5565

5666
// Determine the target operating system.
5767
#if defined(_WIN32) || defined(__CYGWIN__)
58-
#define BLIS_OS_WINDOWS 1
68+
#define BLIS_OS_WINDOWS 1
5969
#elif defined(__gnu_hurd__)
60-
#define BLIS_OS_GNU 1
70+
#define BLIS_OS_GNU 1
6171
#elif defined(__APPLE__) || defined(__MACH__)
62-
#define BLIS_OS_OSX 1
72+
#define BLIS_OS_OSX 1
6373
#elif defined(__ANDROID__)
64-
#define BLIS_OS_ANDROID 1
74+
#define BLIS_OS_ANDROID 1
6575
#elif defined(__linux__)
66-
#define BLIS_OS_LINUX 1
76+
#define BLIS_OS_LINUX 1
6777
#elif defined(__bgq__)
68-
#define BLIS_OS_BGQ 1
78+
#define BLIS_OS_BGQ 1
6979
#elif defined(__bg__)
70-
#define BLIS_OS_BGP 1
80+
#define BLIS_OS_BGP 1
7181
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
7282
defined(__bsdi__) || defined(__DragonFly__)
73-
#define BLIS_OS_BSD 1
83+
#define BLIS_OS_BSD 1
7484
#elif defined(EMSCRIPTEN)
75-
#define BLIS_OS_EMSCRIPTEN
85+
#define BLIS_OS_EMSCRIPTEN
7686
#else
77-
#error "Cannot determine operating system"
87+
#error "Cannot determine operating system"
7888
#endif
7989

8090
// A few changes that may be necessary in Windows environments.
@@ -86,11 +96,11 @@
8696
#include <windows.h>
8797

8898
#if !defined(__clang__) && !defined(__GNUC__)
89-
// Undefine attribute specifiers in Windows.
90-
#define __attribute__(x)
99+
// Undefine attribute specifiers in Windows.
100+
#define __attribute__(x)
91101

92-
// Undefine restrict.
93-
#define restrict
102+
// Undefine restrict.
103+
#define restrict
94104
#endif
95105

96106
#endif

0 commit comments

Comments
 (0)