@@ -52,18 +52,20 @@ static void swap(int* m, int* n) { int tmp = *m; *m = *n; *n = tmp; }
52
52
53
53
int main (int argc , char * argv [])
54
54
{
55
- const int nrepeat = (1 < argc ? atoi (argv [1 ]) : 5 ), offset = 0 ;
55
+ const int nrepeat = (1 < argc ? atoi (argv [1 ]) : 5 );
56
56
const int nodd = (0 < nrepeat ? ((nrepeat & 1 /*odd*/ ) ? nrepeat : (nrepeat - 1 )) : 1 );
57
57
const int stack_size = (2 < argc ? atoi (argv [2 ]) : 30000 );
58
58
const int m = (3 < argc ? atoi (argv [3 ]) : 23 );
59
59
const int n = (4 < argc ? atoi (argv [4 ]) : m );
60
+ const int offset = (5 < argc ? atoi (argv [5 ]) : 0 );
61
+ const int offset_stack_size = offset + stack_size ;
60
62
#if defined(ALIGNMENT ) && (0 < ALIGNMENT )
61
63
const int mn = (int )ROUNDUP2 (sizeof (ELEM_TYPE ) * m , ALIGNMENT ) * n / sizeof (ELEM_TYPE );
62
64
#else
63
65
const int mn = m * n ;
64
66
#endif
65
67
#if defined(SHUFFLE )
66
- const size_t shuffle = libxsmm_shuffle ((unsigned int )stack_size );
68
+ const size_t shuffle = libxsmm_shuffle ((unsigned int )offset_stack_size );
67
69
#endif
68
70
#if defined(WARMUP ) && (0 < WARMUP ) && !defined(_DEBUG )
69
71
const int warmup = MAX (WARMUP , 2 ) / 2 * 2 ;
@@ -104,34 +106,34 @@ int main(int argc, char* argv[])
104
106
#else
105
107
CHECK (acc_stream_create (& stream , "stream" , -1 /*default priority*/ ), & result );
106
108
#endif
107
- CHECK (acc_host_mem_allocate ((void * * )& mat_hst , sizeof (ELEM_TYPE ) * mn * stack_size , stream ), & result );
108
- CHECK (acc_host_mem_allocate ((void * * )& stack_hst , sizeof (int ) * stack_size , stream ), & result );
109
+ CHECK (acc_host_mem_allocate ((void * * )& mat_hst , sizeof (ELEM_TYPE ) * mn * offset_stack_size , stream ), & result );
110
+ CHECK (acc_host_mem_allocate ((void * * )& stack_hst , sizeof (int ) * offset_stack_size , stream ), & result );
109
111
CHECK (acc_stream_sync (stream ), & result ); /* ensure host-data is allocated */
110
- for (i = 0 ; i < stack_size ; ++ i ) { /* initialize matrices */
112
+ for (i = 0 ; i < offset_stack_size ; ++ i ) { /* initialize matrices */
111
113
init (i /*seed*/ , & mat_hst [i * mn ], m , n );
112
114
}
113
- for (i = 0 ; i < stack_size ; ++ i ) { /* initialize indexes */
115
+ for (i = 0 ; i < offset_stack_size ; ++ i ) { /* initialize indexes */
114
116
#if defined(SHUFFLE )
115
- const int j = mn * (int )((shuffle * i ) % stack_size );
117
+ const int j = mn * (int )((shuffle * i ) % offset_stack_size );
116
118
#else
117
119
const int j = mn * i ;
118
120
#endif
119
121
stack_hst [i ] = j ;
120
122
}
121
- CHECK (acc_dev_mem_allocate ((void * * )& mat_dev , sizeof (ELEM_TYPE ) * mn * stack_size ), & result );
122
- CHECK (acc_dev_mem_allocate ((void * * )& stack_dev , sizeof (int ) * stack_size ), & result );
123
+ CHECK (acc_dev_mem_allocate ((void * * )& mat_dev , sizeof (ELEM_TYPE ) * mn * offset_stack_size ), & result );
124
+ CHECK (acc_dev_mem_allocate ((void * * )& stack_dev , sizeof (int ) * offset_stack_size ), & result );
123
125
#if defined(USE_LIBXSMM )
124
126
CHECK (acc_stream_sync (stream ), & result );
125
127
start = libxsmm_timer_tick ();
126
128
#endif
127
- CHECK (acc_memcpy_h2d (mat_hst , mat_dev , sizeof (ELEM_TYPE ) * mn * stack_size , stream ), & result );
128
- CHECK (acc_memcpy_h2d (stack_hst , stack_dev , sizeof (int ) * stack_size , stream ), & result );
129
+ CHECK (acc_memcpy_h2d (mat_hst , mat_dev , sizeof (ELEM_TYPE ) * mn * offset_stack_size , stream ), & result );
130
+ CHECK (acc_memcpy_h2d (stack_hst , stack_dev , sizeof (int ) * offset_stack_size , stream ), & result );
129
131
#if defined(USE_LIBXSMM )
130
132
CHECK (acc_stream_sync (stream ), & result );
131
133
duration = libxsmm_timer_duration (start , libxsmm_timer_tick ());
132
134
printf ("copy-in: %.1f ms %.1f GB/s\n" , 1000.0 * duration ,
133
135
(sizeof (ELEM_TYPE ) * mn + sizeof (int ))
134
- * stack_size / (duration * (1ULL << 30 )));
136
+ * offset_stack_size / (duration * (1ULL << 30 )));
135
137
#endif
136
138
/* warmup execution and prebuild JIT kernels */
137
139
for (r = 0 ; r < warmup / 2 ; ++ r ) {
@@ -156,25 +158,25 @@ int main(int argc, char* argv[])
156
158
assert (0 < nodd && (nodd & 1 /*odd*/ ));
157
159
printf ("device: %.1f ms %.1f GB/s\n" , 1000.0 * duration / nodd ,
158
160
(sizeof (ELEM_TYPE ) * mn + sizeof (int ))
159
- * stack_size / (duration * (1ULL << 30 ) / nodd ));
161
+ * offset_stack_size / (duration * (1ULL << 30 ) / nodd ));
160
162
mm = m ; nn = n ;
161
163
start = libxsmm_timer_tick ();
162
164
for (r = 0 ; r < nodd ; ++ r ) {
163
165
libxsmm_itrans_batch_omp (mat_hst , sizeof (ELEM_TYPE ), mm , nn , mm , nn ,
164
- 0 /*index_base*/ , sizeof (int )/*index_stride*/ , stack_hst , stack_size );
166
+ 0 /*index_base*/ , sizeof (int )/*index_stride*/ , stack_hst + offset , stack_size );
165
167
swap (& mm , & nn );
166
168
}
167
169
duration = libxsmm_timer_duration (start , libxsmm_timer_tick ());
168
170
printf ("host: %.1f ms %.1f GB/s\n" , 1000.0 * duration / nodd ,
169
171
(sizeof (ELEM_TYPE ) * mn + sizeof (int ))
170
- * stack_size / (duration * (1ULL << 30 ) / nodd ));
172
+ * offset_stack_size / (duration * (1ULL << 30 ) / nodd ));
171
173
/* transfer result from device to host for validation */
172
174
CHECK (acc_memcpy_d2h (mat_dev , mat_hst ,
173
- sizeof (ELEM_TYPE ) * mn * stack_size , stream ), & result );
175
+ sizeof (ELEM_TYPE ) * mn * offset_stack_size , stream ), & result );
174
176
CHECK (acc_stream_sync (stream ), & result );
175
177
if (EXIT_SUCCESS == result ) {
176
178
unsigned int nerrors = 0 ;
177
- for (i = 0 ; i < stack_size ; ++ i ) {
179
+ for (i = offset ; i < offset_stack_size ; ++ i ) {
178
180
ELEM_TYPE gold [MAX_KERNEL_DIM * MAX_KERNEL_DIM ];
179
181
const ELEM_TYPE * const test = mat_hst + mn * i ;
180
182
init (i /*seed*/ , gold , m , n );
0 commit comments