
Commit 9665be4

LU to do L before U; end-of-exec tweaks
1 parent 6366384 commit 9665be4

File tree: 3 files changed, +91 -58 lines

jsrc/cip.c

Lines changed: 56 additions & 34 deletions
@@ -1123,7 +1123,7 @@ DF1(jtludecompg){F1PREFIP;PROLOG(823);
 }
 
 
-// 128!:10 LU decomposition for square real arrays LU=A
+// 128!:10 LU decomposition for square real arrays LUP=A
 // returns permutation ; L+U-I (Doolittle form)
 // the ith element of the permutation is the original row of row i of LU
 DF1(jtludecomp){F1PREFIP;PROLOG(823);
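
For orientation, here is a minimal scalar sketch of the semantics the header comments describe: Doolittle LU with row exchange, returning the permutation alongside L+U-I. Everything below (the name `lu_doolittle`, plain partial pivoting rather than the kernel's dangerous-pivot rotation) is illustrative, not the blocked AVX implementation this commit changes.

```c
#include <math.h>

// Sketch only: n x n row-major a is overwritten with L+U-I (Doolittle form,
// unit lower triangle implied); perm[i] ends up as the original row of row i
// of LU, matching the result layout documented above. Returns 0 if singular.
static int lu_doolittle(double *a, int *perm, int n){
 for(int i=0;i<n;++i)perm[i]=i;
 for(int k=0;k<n;++k){
  int p=k;
  for(int i=k+1;i<n;++i)if(fabs(a[i*n+k])>fabs(a[p*n+k]))p=i; // largest pivot
  if(a[p*n+k]==0.0)return 0;
  if(p!=k){ // exchange rows k and p and record the exchange
   for(int j=0;j<n;++j){double t=a[k*n+j]; a[k*n+j]=a[p*n+j]; a[p*n+j]=t;}
   int t=perm[k]; perm[k]=perm[p]; perm[p]=t;
  }
  for(int i=k+1;i<n;++i){
   double l=a[i*n+k]/=a[k*n+k];                // column of L: scale by 1/pivot
   for(int j=k+1;j<n;++j)a[i*n+j]-=l*a[k*n+j]; // update trailing rows
  }
 }
 return 1;
}
```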
@@ -1132,8 +1132,8 @@ DF1(jtludecomp){F1PREFIP;PROLOG(823);
 // and the U blocks are ordered in transpose form to speed up the dot-product operations.
 // For each row of A, we subtract the matrix product of (preceding U) * (preceding L) to get the new U row and the first L entry L0.
 // Then we go down the column of A, multiplying by 1/L0, to get the new L column. Repeat for each gamma-shaped LU row/col.
-// Processing in 4x4 blocks means we have one block for the first 4x4 (the corner) and another for the rest of the row/column. The
-// code to multiply (preceding U) * (preceding L) is common between the two
+// Processing in 4x4 blocks means we have one code for the first 4x4 (the corner), another for L blocks, a third for U. The
+// code to multiply (preceding U) * (preceding L) is common between all three.
 // Allocate area for the cacheblocks of L and U
 #define BLKSZ 4 // size of cache block
 #define LGBLKSZ 2 // lg(BLKSZ)
@@ -1151,7 +1151,7 @@ DF1(jtludecomp){F1PREFIP;PROLOG(823);
 I nr=(wn+BLKSZ-1)>>LGBLKSZ; // nr=total # blocks on a side (including partial ones)
 #define CORNERBLOCK(rc) (cb+(rc)*(nr+1)) // address of corner cblock in row&col rc
 #define LBLOCK(r,c) (cb+(r)*nr+(c)) // address of L cblock for row r col c (0<=c<r)
-#define UBLOCK(r,c) (cb+(nr-(c))*nr - (c) + (r)) // address of U cblock for row r col c (0<=r<c) - transposed order
+#define UBLOCK(r,c) (cb+(nr-(c))*nr - (c) + (r)) // address of U cblock for row r col c (0<=r<c) - transposed order going left-to-right, starting after corner block
 #define LBIT(r,c) (lb + (((nr+63)>>6)+2)*(r) + ((c)>>6)) // address of 64-bit word containing L bitmask bit for (r,c) (0<=c<r) stride of ((nr+63)>>6)+2, advances east
 #define LBITX(r,c) ((c)&(BW-1)) // index of bit (r,c) in selected word
 #define UBIT(r,c) (ub - (((nr+63)>>6)+2)*(c) - ((r)>>6)) // address of 64-bit word containing U bitmask bit for (r,c) (0<=r<c) stride of -(((nr+63)>>6)+2), advances west
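
The UBLOCK comment is easier to see with concrete offsets. A hypothetical standalone demo (nr=4 is an assumption) that evaluates the same addressing formulas, in units of one cblock:

```c
#include <stdio.h>

// Same arithmetic as CORNERBLOCK/LBLOCK/UBLOCK above, minus the cb base.
// Corner and L blocks are row-major at r*nr+c; U blocks fill the remaining
// slots from the high end in transposed order, so stepping r at fixed c
// (down a U column) touches consecutive blocks. The nr*nr offsets printed
// are all distinct: the arena packs exactly.
int main(void){
 int nr=4;
 for(int r=0;r<nr;++r)for(int c=0;c<nr;++c){
  long off = c<=r ? (long)r*nr+c           // corner (r==c) and L (c<r)
                  : (long)(nr-c)*nr-c+r;   // U (r<c), transposed order
  printf("block(%d,%d) -> cblock offset %ld\n",r,c,off);
 }
 return 0;
}
```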
@@ -1163,13 +1163,13 @@ DF1(jtludecomp){F1PREFIP;PROLOG(823);
 
 __m256i endmask = _mm256_loadu_si256((__m256i*)(validitymask+((-wn)&(BLKSZ-1)))); // mask for storing last block in a row
 __m256d ones = _mm256_set1_pd(1.0); // numerator of reciprocals, or value for an identity matrix
-D (*scv0)[BLKSZ][BLKSZ]=LBLOCK(0,0), (*suv0)[BLKSZ][BLKSZ]=UBLOCK(0,1); // store pointers for blocks in the upcoming ring
+D (*scv0)[BLKSZ][BLKSZ]=LBLOCK(0,0), (*suv1)[BLKSZ][BLKSZ]=UBLOCK(0,1); // store pointers for blocks in the upcoming ring
 D *wclv=DAV(w); // pointer to corner block of the input
 I r; // size-1 of ring being processed. The ring has 2r-1 cblocks. The corner block is (nr-1-r,nr-1-r)
 for(r=nr-1;r>=0;--r){
 __m256d a00,a01,a02,a03,a10,a11,a12,a13,nexta0,nexta1,nexta2,nexta3,recips; // double accumulators for the 4x4 block; staging area for A data; reciprocals to use to propagate down the column of L
-// process one ring: the corner block, a row of U, a column of L
-D (*scv)[BLKSZ][BLKSZ]=suv0; // start pointer for storing cblocks: cl going south, u going northeast
+// process one ring: the corner block, a column of L, a row of U
+D (*scv)[BLKSZ][BLKSZ]=scv0; // start pointer for storing blocks: c/l going south, u going northwest
 D __attribute__((aligned(CACHELINESIZE))) linv[BLKSZ][BLKSZ], uinv[BLKSZ][BLKSZ]; // 'inverses' of the corner block, used to calculate L and U blocks
 // initialize A[nr-1-r,nr-1-r] (pointed to by wclv) into nexta0..3
 if(r>0){
@@ -1187,12 +1187,14 @@ DF1(jtludecomp){F1PREFIP;PROLOG(823);
 // zero blocks. Since each zero block zaps multiple dot-product blocks, it doesn't take many to be worthwhile
 lookfor0blocks=nzeroblocks*64>(nr-1-r)*(2*r+1); // # blocks so far is (nr-1-r) * (2nr-1-2*(nr-1-r)). If 1 in 64 is 0, look for it. But never first time
 nzeroblocks|=(lookfor0blocks|((nr-1-r)!=10))-1; // on the 10th ring, disable zero checks if we aren't using them
-D *wluv=wclv; I wlustride=BLKSZ; // pointer to next input values in A, and offset to next. We start going east
-D (*llv)[BLKSZ][BLKSZ]=LBLOCK(nr-1-r,0), (*luv)[BLKSZ][BLKSZ]=UBLOCK(0,nr-1-r), (*prechv)[BLKSZ][BLKSZ]=luv-(nr+1); // start point of dot-products, startpoint of next dot-product
+D *wluv=wclv; I wlustride=BLKSZ*wn; // pointer to next input values in A, and offset to next. We start going south
+D (*llv)[BLKSZ][BLKSZ]=LBLOCK(nr-1-r,0), (*luv)[BLKSZ][BLKSZ]=UBLOCK(0,nr-1-r), (*prechv)[BLKSZ][BLKSZ]=llv+nr; // start point of dot-products (both going L-to-R), startpoint of next dot-product (first L block)
 UI *lbv0=LBIT(nr-1-r,0), *ubv0=UBIT(0,nr-1-r); // point to the bit vectors for the corner position. These pointers are advanced after we finish each block, to handle the dot-product for the next block
 I r0; // index of corner-, L- or U-block being processed: -r for corner, -r+1..0 for U, 1..r for L
-D *nextfetchaddr=wclv; // the address of the block being fetched into nexta0..3
-for(r0=-r;r0<=r;++r0){
+D *nextfetchaddr=wclv; // the address of the block being fetched into nexta0..3. Init to the corner which we just prefetched
+for(r0=-r;r0<=r;++r0){ // corner then L then U
+__m256d perma[7]; // for perm calc, we save the good values from previous rows in the first up to 3 locations, and new candidates in the next 4 locs
+I ngoodperma; // number of good values in perma. Used only on corner blocks
 // move the next A block into the accumulators a00..a03
 a00=nexta0; a01=nexta1; a02=nexta2; a03=nexta3;
 D *currfetchaddr=nextfetchaddr; // the source address of the block being processed now. We will store back into the same relative position in the result
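
The commit title's "L to do L before U" is this ordering of r0 within a ring: corner first, then the L column going south, then the U row. A toy enumeration of the visit order the new loop comment describes (indices only, nr=3 assumed; none of the kernel's pointer arithmetic):

```c
#include <stdio.h>

// Visit order per ring after this change: corner, then L going south,
// then U going east. d is the ring's diagonal index nr-1-r.
int main(void){
 int nr=3;
 for(int r=nr-1;r>=0;--r){
  int d=nr-1-r;
  for(int r0=-r;r0<=r;++r0){
   if(r0==-r)     printf("corner (%d,%d)\n",d,d);
   else if(r0<=0) printf("L      (%d,%d)\n",d+r+r0,d);
   else           printf("U      (%d,%d)\n",d,d+r0);
  }
 }
 return 0;
}
```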
@@ -1203,14 +1205,14 @@ DF1(jtludecomp){F1PREFIP;PROLOG(823);
 wluv+=wlustride; // advance to beginning of next block
 nextfetchaddr=wluv; // remember addr being fetched
 // Since the corner block was handled first, the block we are fetching can go out on only one side. We fill with zeros
-if(unlikely(r0==-1)){ // fetching the LAST block in the U row
+if(unlikely(r0==r-1)){ // fetching the LAST block in the U row
 nexta0=_mm256_maskload_pd(wluv,endmask); nexta1=_mm256_maskload_pd(wluv+1*wn,endmask); nexta2=_mm256_maskload_pd(wluv+2*wn,endmask); nexta3=_mm256_maskload_pd(wluv+3*wn,endmask);
-wluv=wclv; wlustride=BLKSZ*wn; // reset to corner and change direction for the next prefetch, which will start on L
-}else if(unlikely(r0==r-1)){ // fetching the LAST block in the L column
+}else if(unlikely(r0==-1)){ // fetching the LAST block in the L column
 nexta0=_mm256_loadu_pd(wluv);
 nexta1=((wn-1)&(BLKSZ-1))>0?_mm256_loadu_pd(wluv+1*wn):_mm256_setzero_pd();
 nexta2=((wn-1)&(BLKSZ-1))>1?_mm256_loadu_pd(wluv+2*wn):_mm256_setzero_pd();
 nexta3=((wn-1)&(BLKSZ-1))>2?_mm256_loadu_pd(wluv+3*wn):_mm256_setzero_pd();
+wluv=wclv; wlustride=BLKSZ; // reset to corner and change direction for the next prefetch to east, which will start on U
 }else{
 // normal fetch of full block
 nexta0=_mm256_loadu_pd(wluv); nexta1=_mm256_loadu_pd(wluv+1*wn); nexta2=_mm256_loadu_pd(wluv+2*wn); nexta3=_mm256_loadu_pd(wluv+3*wn);
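
The masked fetch above keeps the last U block of a row from reading past column wn. A minimal sketch of the same idea; the real code takes its mask from the precomputed validitymask table, while this builds one inline (an assumption for self-containment):

```c
#include <immintrin.h>

// Load `valid` doubles (1..4) from p; lanes past the end are suppressed and
// read back as 0.0, so a 4-wide block load never touches memory past the row.
static __m256d load_row_tail(const double *p, int valid){
 __m256i m=_mm256_set_epi64x(valid>3?-1LL:0, valid>2?-1LL:0,
                             valid>1?-1LL:0, valid>0?-1LL:0);
 return _mm256_maskload_pd(p,m);
}
```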
@@ -1225,7 +1227,7 @@ DF1(jtludecomp){F1PREFIP;PROLOG(823);
 // Create & use sparse sections
 // we go through the sparse maps and convert the OR of the rows & columns into a sequence of (#non0,#0) pairs which we process later
 UI4 *runv=rleb; // pointer to next run
-if(lookfor0blocks){ // if we think it's worthwhile to skip over zero blocks...
+if(lookfor0blocks){ // if we think it's worthwhile to skip over zero blocks...
 UI *lbv=lbv0, *ubv=ubv0; // point to the start of the bit vectors
 UI polarity=~0; // 00.00 if we are looking for 0s, 11..11 if we are looking for 1s. We start looking for skipped blocks (1s)
 I bitsleft=nr-1-r, lensofar=0, bitsinstack; // number of bits left to process, length carried over from previous maskword
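
The run-length pass the comment describes, reduced to a bit-at-a-time sketch. The real loop consumes 64-bit maskwords with a polarity flip and carries run lengths across words; only the (#non0,#0) output format is shown here (`rle_runs` is a made-up name):

```c
#include <stdint.h>

// bits: a set bit marks an all-zero (skippable) cblock. Emits (#non0,#0)
// pairs into runs[] and returns the pair count, the shape the dot-product
// loop consumes.
static int rle_runs(const uint64_t *bits, int n, uint32_t (*runs)[2]){
 int nruns=0, i=0;
 while(i<n){
  uint32_t non0=0, zero=0;
  while(i<n && !((bits[i>>6]>>(i&63))&1)){++non0;++i;} // blocks to multiply
  while(i<n &&  ((bits[i>>6]>>(i&63))&1)){++zero;++i;} // blocks to skip
  runs[nruns][0]=non0; runs[nruns][1]=zero; ++nruns;
 }
 return nruns;
}
```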
@@ -1255,7 +1257,7 @@ finrle: ;
 // The full read bandwidth of L1 is taken with reading arguments, and the full write bandwidth is taken by the prefetch
 // *llv is the horizontal L-strip, *luv is the horizontal U-strip, *prechv is the first block to prefetch. We prefetch one block
 // for each block we process
-// To avoid lots of casting we use D* in this loop. Each loop handles BLKSZ^2 atoms stored in rwo-major order
+// To avoid lots of casting we use D* in this loop. Each loop handles BLKSZ^2 atoms stored in row-major order
 // establish pointers & offsets for the args. We increment only the L pointer & use indexes for the rest
 D *lvv=(D*)llv; I uofst=(D*)luv-lvv, pofst=(D*)prechv-lvv; // the 2 args+prefetch: one pointer, 2 offsets
 a10=_mm256_setzero_pd(); a11=_mm256_setzero_pd(); a12=_mm256_setzero_pd(); a13=_mm256_setzero_pd(); // clear the uninitialized accumulators
@@ -1285,7 +1287,7 @@ finrle: ;
 } // end of dot-product block, executed except first time
 // A[x,y]-L*U (product over nr-1-r blocks) is now in the register block on the 0 side
 
-// We are solving L(x,0..n-1)*U(0..n-1,y)=A(x,y) for L(x,nr-1-r) and/or U(nr-1-r,y) where xy>=nr-1-r. Because of the triangularity of L and U,
+// We are solving L(x,0..n-1)*U(0..n-1,y)=A(x,y) for L(x,nr-1-r) and/or U(nr-1-r,y) where x,y>=nr-1-r and one is equal. Because of the triangularity of L and U,
 // this reduces to
 // L(nr-1-r,0..nr-1-r-1)*U(0..nr-1-r-1,y) + L(nr-1-r,nr-1-r)*U(nr-1-r,y)=A(x,y) for the row of U
 // L(x,0..nr-1-r-1)*U(0..x-1,nr-1-r) + L(x,nr-1-r)*U(nr-1-r,nr-1-r)=A(x,y) for the column of L
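
In scalar terms (block size 1) the reduction above collapses to: the residual row is row k of U as-is, and the residual column scaled by 1/U(k,k) is column k of L. A sketch under that simplification:

```c
// One gamma step at pivot k on an n x n row-major a holding L+U-I in place.
// Assumes the dot-products of preceding L and U have already been subtracted
// from row k and column k (the residual the comment refers to).
static void gamma_step(double *a, int n, int k){
 double recip=1.0/a[k*n+k];              // 1/U(k,k); row k of U is the residual itself
 for(int i=k+1;i<n;++i)a[i*n+k]*=recip;  // column k of L: residual * 1/pivot
}
```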
@@ -1299,8 +1301,16 @@ finrle: ;
 // For x>nr-1-r we calculate L(x,r) = D * U-1(nr-1-r,nr-1-r), doing the matrix multiply
 //
 // When we calculate the corner block we also write out coefficients that will be needed to calculate the other blocks of L and U.
+D (*scvi)[BLKSZ][BLKSZ]=scv; // save output address before update - we will store to it
 if(r0==-r){
-// corner block. First row of U is set. Alternate creating columns of L and rows of U
+// corner block.
+
+// First row of U is set. Alternate creating columns of L and rows of U
+
+// We have to check for dangerous pivots and exchange rows if we find any. For each row of U, if the |pivot| is too small, rotate that row with the rows following
+// until we get a nondangerous pivot. This will usually find a permutation within the 4-row block. If not, save the good rows and loop back to dot-products on another
+// 4 rows to search
+
 __m256d tmp; // where we build inverse lines to write out
 recips=_mm256_div_pd(ones,a00); // 1/U00 x x x
 
@@ -1322,7 +1332,7 @@ finrle: ;
 a03=_mm256_blend_pd(a03,_mm256_fnmadd_pd(a02,_mm256_permute4x64_pd(a03,0b010101010),a03),0b1000); // a03 is A3x-L30*A0x-L31*U1x-L32*U3x = L30 L31 L32 U33
 recips=_mm256_blend_pd(recips,_mm256_div_pd(ones,a03),0b1000); // 1/U00 1/U11 1/U22 1/U33
 
-_mm256_storeu_pd(&scv0[0][0][0],a00); _mm256_storeu_pd(&scv0[0][1][0],a01); _mm256_storeu_pd(&scv0[0][2][0],a02); _mm256_storeu_pd(&scv0[0][3][0],a03); // Store the 4x4 in the corner
+// obsolete _mm256_storeu_pd(&scv[0][0][0],a00); _mm256_storeu_pd(&scv[0][1][0],a01); _mm256_storeu_pd(&scv[0][2][0],a02); _mm256_storeu_pd(&scv[0][3][0],a03); // Store the 4x4 in the corner
 
 // Now calculate uinv, the inverse of the remaining U matrix. Do this by backsubstitution up the line. Leave register a00-a03 holding the block result
 a13=_mm256_blend_pd(_mm256_setzero_pd(),recips,0b1000);
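
The dangerous-pivot plan in the new comments (rotate a too-small pivot row behind the following rows) might look like this in scalar form. `PIVOT_MIN` and `fix_pivot` are invented for the sketch; the kernel's actual threshold and the save-and-resume path through perma are not shown in this diff:

```c
#include <math.h>
#include <string.h>

#define PIVOT_MIN 1e-12 // made-up threshold; the real criterion isn't in this hunk
// Rotate row k behind rows k+1..n-1 until |a[k][k]| is usable, carrying the
// permutation along. Returns 0 if no row in range yields a usable pivot.
static int fix_pivot(double *a, int *perm, int n, int k){
 for(int tries=0;tries<n-k;++tries){
  if(fabs(a[k*n+k])>=PIVOT_MIN)return 1;
  double rowk[64]; int pk=perm[k];           // n<=64 assumed for the sketch
  memcpy(rowk,&a[k*n],n*sizeof(double));
  for(int i=k;i<n-1;++i){                    // shift the following rows up
   memcpy(&a[i*n],&a[(i+1)*n],n*sizeof(double));
   perm[i]=perm[i+1];
  }
  memcpy(&a[(n-1)*n],rowk,n*sizeof(double)); perm[n-1]=pk; // old row k to the bottom
 }
 return fabs(a[k*n+k])>=PIVOT_MIN;
}
```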
@@ -1337,14 +1347,16 @@ finrle: ;
 _mm256_storeu_pd(&uinv[0][0],a10); // row 0 is I00=1/U00 I01 I02 I03
 
 // block created; advance input pointers. Output pointer still set to initial value
-luv=prechv; prechv-=(nr+1); // repeat L row; advance U column
-ubv0-=((nr+63)>>6)+2; // back up to bitvector for next column
+// obsolete luv=prechv; prechv-=(nr+1); // repeat L row; advance U column
+// obsolete ubv0-=((nr+63)>>6)+2; // back up to bitvector for next column
+llv=prechv; prechv+=nr; // advance L row; repeat U column
+lbv0+=((nr+63)>>6)+2; // advance bitvector for next row
+scv+=nr; // move output south, to the first L block
 // the bitmask for a corner block is never used
 }else{ // U or L block, not corner
-D (*scvi)[BLKSZ][BLKSZ]=scv; // save output address before update
 UI *bma; I bmx; // address and bit# of the bitmap address to store all-0 status into
 // We build the result in place in a00..a03
-if(r0<=0){
+if(r0>0){
 // U block. Simulate L^-1 * D. Subtract multiples of each row from all the following rows, cumulatively
 a01=_mm256_fnmadd_pd(a00,_mm256_set1_pd(linv[0][0]),a01); // a1-=-a0*U10/U00
 a02=_mm256_fnmadd_pd(a00,_mm256_set1_pd(linv[1][0]),a02); // a2-=-a0*U20/U00
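
Scalar picture of the U-block computation that starts here: multiplying the residual block D by L^-1 of the corner is forward substitution, i.e. subtracting a multiple of each row from every later row, cumulatively. The multiplier layout of linv in the real kernel differs; `m` is a plain (i,k) table in this sketch:

```c
// d: 4x4 residual block, updated in place to the U block.
// m[i][k]: multiplier L(i,k) of the unit-lower corner block (sketch layout).
static void ublock_forward_subst(double d[4][4], const double m[4][4]){
 for(int k=0;k<4;++k)          // each row of the corner's L...
  for(int i=k+1;i<4;++i)       // ...is subtracted from every later row
   for(int j=0;j<4;++j)
    d[i][j]-=m[i][k]*d[k][j];  // same cumulative pattern as the fnmadd chain
}
```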
@@ -1354,8 +1366,9 @@ finrle: ;
 a03=_mm256_fnmadd_pd(a02,_mm256_set1_pd(linv[2][2]),a03); // a3-=-a2*U32/U22
 // block created; advance output and input pointers
 scv-=nr+1; // move output northwest, to the next U block
-luv=prechv; ubv0-=((nr+63)>>6)+2; if(r0<-1){prechv-=(nr+1);}else{prechv=llv+nr;} // repeat L row; advance U column and bitmap; advance prefetch but if next col of U is the last, move prefetch to L
-if(unlikely(r0==0)){scv=scv0+nr; llv+=nr; prechv+=nr; luv=UBLOCK(0,nr-1-r); ubv0=UBIT(0,nr-1-r); lbv0+=((nr+63)>>6)+2;} // last col of U: L store/load point to 2d row; U load point to first col; proceed down the L rows
+// obsolete luv=prechv; ubv0-=((nr+63)>>6)+2; if(r0<-1){prechv-=(nr+1);}else{prechv=llv+nr;} // repeat L row; advance U column and bitmap; advance prefetch but if next col of U is the last, move prefetch to L
+// obsolete if(unlikely(r0==0)){scv=scv0+nr; llv+=nr; prechv+=nr; luv=UBLOCK(0,nr-1-r); ubv0=UBIT(0,nr-1-r); lbv0+=((nr+63)>>6)+2;} // last col of U: L store/load point to 2d row; U load point to first col; proceed down the L rows
+luv=prechv; ubv0-=((nr+63)>>6)+2; if(r0!=r-1){prechv-=nr+1;} // (repeat L row); advance U northwest including bitmap; advance prefetch but if next col of U is the last, prefetch it again
 // get the address of the bitmask for this block, in the U bitmap
 bma=UBIT(nr-1-r,r0+nr-1); bmx=UBITX(nr-1-r,r0+nr-1); // point to the bit to store all-0 status to. col is (nr-1-r)+(r0-(-r))
 }else{
@@ -1371,31 +1384,40 @@ finrle: ;
 a00=_MM256_FMADD_PD(_mm256_set1_pd(lmem[0][2]),tmp,a00); a01=_MM256_FMADD_PD(_mm256_set1_pd(lmem[1][2]),tmp,a01); a02=_MM256_FMADD_PD(_mm256_set1_pd(lmem[2][2]),tmp,a02); a03=_MM256_FMADD_PD(_mm256_set1_pd(lmem[3][2]),tmp,a03);
 tmp=_mm256_loadu_pd(&uinv[3][0]); // row 3 of U^-1
 a00=_MM256_FMADD_PD(_mm256_set1_pd(lmem[0][3]),tmp,a00); a01=_MM256_FMADD_PD(_mm256_set1_pd(lmem[1][3]),tmp,a01); a02=_MM256_FMADD_PD(_mm256_set1_pd(lmem[2][3]),tmp,a02); a03=_MM256_FMADD_PD(_mm256_set1_pd(lmem[3][3]),tmp,a03);
-// block created; advance pointers
-llv=prechv; lbv0+=((nr+63)>>6)+2; if(r0!=r-1){prechv+=nr;} // repeat U col; advance L row including bitmap; advance prefetch but if next col of U is the last, prefetch it again
-scv+=nr; // move output south, to the next L block
+// block created; advance pointers
+if(r0!=0){ // if not last line of L...
+llv=prechv; lbv0+=((nr+63)>>6)+2; if(r0<-1){prechv+=nr;}else{prechv=luv-(nr+1);} // repeat U col; advance L row and bitmap; advance prefetch to next row of L but if next row of L is the last, move prefetch to U
+scv+=nr; // move output south, to the next L block
+}else{ // last line of L, we must switch to U (if r==1, precharge has not been switched and should be made to refetch U; otherwise prefetch should continue in U)
+// obsolete scv=suv1; luv=prechv; prechv-=nr+1; llv=UBLOCK(nr-1-r,0); lbv0=LBIT(nr-1-r,0); ubv0-=((nr+63)>>6)+2;} // last row of L: U store/load point to 1st ele; L load point to first col; its accordingly
+scv=suv1; luv-=nr+1; prechv=r==1?luv:luv-(nr+1); llv=LBLOCK(nr-1-r,0); lbv0=LBIT(nr-1-r,0); ubv0-=((nr+63)>>6)+2; // last row of L: U store/load point to col 1; L load point to first row; bits accordingly
+}
+// obsolete llv=prechv; lbv0+=((nr+63)>>6)+2; if(r0!=r-1){prechv+=nr;} // repeat U col; advance L row including bitmap; advance prefetch but if next row of L is the last, prefetch it again
 // get the address of the bitmask for this block, in the U bitmap
 bma=LBIT((nr-1-r)+r0,nr-1-r); bmx=LBITX((nr-1-r)+r0,nr-1-r); // point to the bit to store all-0 status to. row is (nr-1-r)+r0
 }
+
+// update sparse bitmap
 if(nzeroblocks>=0){ // if we haven't given up on sparse checking
 // check for all-zero block, and update the sparse bitmap
 a10=_mm256_or_pd(a01,a00); a11=_mm256_or_pd(a02,a03); a10=_mm256_or_pd(a11,a10); // OR of all values
 a10=_mm256_cmp_pd(a10,_mm256_setzero_pd(),_CMP_NEQ_OQ); I blkis0=_mm256_testz_pd(a10,a10)==1; // see if block is all 0
 *bma=((*bma)&~(1LL<<bmx))|(blkis0<<bmx); // set bit to (all values are not NE)
 nzeroblocks+=blkis0; // increment count of zero blocks
 }
-// write the block to the result address from before update
-_mm256_storeu_pd(&scvi[0][0][0],a00); _mm256_storeu_pd(&scvi[0][1][0],a01); _mm256_storeu_pd(&scvi[0][2][0],a02); _mm256_storeu_pd(&scvi[0][3][0],a03); // Store the 4x4 in the corner
-
 }
+
+// write the block to the result address from before update
+_mm256_storeu_pd(&scvi[0][0][0],a00); _mm256_storeu_pd(&scvi[0][1][0],a01); _mm256_storeu_pd(&scvi[0][2][0],a02); _mm256_storeu_pd(&scvi[0][3][0],a03); // Store the 4x4 in the corner
+
 // write the block result to the overall result area. We do this now because it's going to be a cache miss and we want to dribble out the data during the processing. Also, the data is in registers now so we don't have to read it
 // the output area has the same relative offset in the output as the read area in the input
 D *resultaddr=(D*)((C*)currfetchaddr+resultoffset); // place to store result
 if(r>0){
 // not the bottom-right corner.
-if(r0==0){ // last block of U - truncated on the right
+if(r0==r){ // last block of U - truncated on the right
 _mm256_maskstore_pd(resultaddr,endmask,a00); _mm256_maskstore_pd(resultaddr+wn,endmask,a01); _mm256_maskstore_pd(resultaddr+2*wn,endmask,a02); _mm256_maskstore_pd(resultaddr+3*wn,endmask,a03);
-}else if(r0==r){ // last block of L - truncated at bottom
+}else if(r0==0){ // last block of L - truncated at bottom
 _mm256_storeu_pd(resultaddr,a00); if(((wn-1)&(BLKSZ-1))>0){_mm256_storeu_pd(resultaddr+wn,a01); if(((wn-1)&(BLKSZ-1))>1){_mm256_storeu_pd(resultaddr+2*wn,a02); if(((wn-1)&(BLKSZ-1))>2){_mm256_storeu_pd(resultaddr+3*wn,a03); }}}
 }else{ // normal full block
 _mm256_storeu_pd(resultaddr,a00); _mm256_storeu_pd(resultaddr+wn,a01); _mm256_storeu_pd(resultaddr+2*wn,a02); _mm256_storeu_pd(resultaddr+3*wn,a03);
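
The all-zero test in this hunk, lifted into a self-contained helper for clarity. Same intrinsic sequence as the diff: OR the rows together, compare not-equal against zero, then VTESTPD collapses the lanes to one flag:

```c
#include <immintrin.h>

// Returns 1 iff no element of the 4x4 block compares not-equal to 0.0
// (so -0.0 still counts as zero, matching the comparison above).
static int block_is_zero(const double b[4][4]){
 __m256d o=_mm256_or_pd(_mm256_or_pd(_mm256_loadu_pd(b[0]),_mm256_loadu_pd(b[1])),
                        _mm256_or_pd(_mm256_loadu_pd(b[2]),_mm256_loadu_pd(b[3])));
 __m256d ne=_mm256_cmp_pd(o,_mm256_setzero_pd(),_CMP_NEQ_OQ); // lanes that are nonzero
 return _mm256_testz_pd(ne,ne)==1;                            // 1 iff all lanes zero
}
```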
@@ -1405,7 +1427,7 @@ finrle: ;
 }
 }
 wclv+=BLKSZ*(wn+1); // move input pointer to corner block of next ring
-scv0+=nr+1; suv0-=nr; // advance storage pointers to next ring.
+scv0+=nr+1; suv1-=nr; // advance storage pointers to next ring.
 }
 EPILOG(jlink(IX(wn),z));
 #endif
