I nr=(wn+BLKSZ-1)>>LGBLKSZ; // nr=total # blocks on a side (including partial ones)
#define CORNERBLOCK(rc) (cb+(rc)*(nr+1)) // address of corner cblock in row&col rc
#define LBLOCK(r,c) (cb+(r)*nr+(c)) // address of L cblock for row r col c (0<=c<r)
- #define UBLOCK(r,c) (cb+(nr-(c))*nr - (c) + (r)) // address of U cblock for row r col c (0<=r<c) - transposed order
+ #define UBLOCK(r,c) (cb+(nr-(c))*nr - (c) + (r)) // address of U cblock for row r col c (0<=r<c) - transposed order going left-to-right, starting after corner block
#define LBIT(r,c) (lb + (((nr+63)>>6)+2)*(r) + ((c)>>6)) // address of 64-bit word containing L bitmask bit for (r,c) (0<=c<r) stride of ((nr+63)>>6)+2, advances east
#define LBITX(r,c) ((c)&(BW-1)) // index of bit (r,c) in selected word
#define UBIT(r,c) (ub - (((nr+63)>>6)+2)*(c) - ((r)>>6)) // address of 64-bit word containing U bitmask bit for (r,c) (0<=r<c) stride of -(((nr+63)>>6)+2), advances west
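// Illustrative sketch, not part of the original source: the three block-addressing macros
// above, restated as one helper so the layout is easier to see. cb (base of the cblock area),
// nr, BLKSZ, D and I are the names used in this file; the helper itself is hypothetical.
static D (*cblockaddr_sketch(D (*cb)[BLKSZ][BLKSZ], I nr, I r, I c))[BLKSZ][BLKSZ]{
 if(r==c)return cb+r*(nr+1);   // diagonal (corner) blocks sit at a stride of nr+1, as in CORNERBLOCK
 if(c<r) return cb+r*nr+c;     // L blocks (below the diagonal) are stored row-major, as in LBLOCK
 return cb+(nr-c)*nr-c+r;      // U blocks (above the diagonal) are stored in transposed order, as in UBLOCK
}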
__m256i endmask=_mm256_loadu_si256((__m256i*)(validitymask+((-wn)&(BLKSZ-1)))); // mask for storing last block in a row
__m256d ones=_mm256_set1_pd(1.0); // numerator of reciprocals, or value for an identity matrix
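// Illustrative sketch, not part of the original source: how the endmask load above can work.
// The layout of validitymask is an assumption here (a run of all-ones 64-bit lanes followed by
// zero lanes); with that layout, indexing by (-wn)&(BLKSZ-1) yields a mask enabling only the
// columns that exist in the final, possibly partial, block of a row.
static const long long validitymask_sketch[8]={-1,-1,-1,-1,0,0,0,0};  // BLKSZ ones then BLKSZ zeros (assumed layout)
// e.g. wn=10, BLKSZ=4: (-10)&3==2, so the loaded lanes are {-1,-1,0,0} and a
// _mm256_maskstore_pd with this mask writes only the 2 valid trailing columns of the last block.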
- D (*scv0)[BLKSZ][BLKSZ]=LBLOCK(0,0), (*suv0)[BLKSZ][BLKSZ]=UBLOCK(0,1); // store pointers for blocks in the upcoming ring
+ D (*scv0)[BLKSZ][BLKSZ]=LBLOCK(0,0), (*suv1)[BLKSZ][BLKSZ]=UBLOCK(0,1); // store pointers for blocks in the upcoming ring
D *wclv=DAV(w); // pointer to corner block of the input
I r; // size-1 of ring being processed. The ring has 2r-1 cblocks. The corner block is (nr-1-r,nr-1-r)
for(r=nr-1;r>=0;--r){
__m256d a00,a01,a02,a03,a10,a11,a12,a13,nexta0,nexta1,nexta2,nexta3,recips; // double accumulators for the 4x4 block; staging area for A data; reciprocals to use to propagate down the column of L
- // process one ring: the corner block, a row of U, a column of L
- D (*scv)[BLKSZ][BLKSZ]=suv0; // start pointer for storing cblocks: cl going south, u going northeast
+ // process one ring: the corner block, a column of L, a row of U
+ D (*scv)[BLKSZ][BLKSZ]=scv0; // start pointer for storing blocks: c/l going south, u going northwest
D __attribute__((aligned(CACHELINESIZE))) linv[BLKSZ][BLKSZ], uinv[BLKSZ][BLKSZ]; // 'inverses' of the corner block, used to calculate L and U blocks
// initialize A[nr-1-r,nr-1-r] (pointed to by wclv) into nexta0..3
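// Illustrative sketch, not part of the original source: the register blocking used throughout
// this loop.  A 4x4 block of doubles is carried as four __m256d values, one per row; the helper
// below is hypothetical and just shows the load pattern (stride wn when reading a block of the
// input matrix A, stride BLKSZ when reading a stored cblock).
static void loadblock_sketch(const D *src, I stride, __m256d row[4]){
 row[0]=_mm256_loadu_pd(src);
 row[1]=_mm256_loadu_pd(src+stride);
 row[2]=_mm256_loadu_pd(src+2*stride);
 row[3]=_mm256_loadu_pd(src+3*stride);
}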
// zero blocks. Since each zero block zaps multiple dot-product blocks, it doesn't take many to be worthwhile
lookfor0blocks=nzeroblocks*64>(nr-1-r)*(2*r+1); // # blocks so far is (nr-1-r) * (2nr-1-2*(nr-1-r)). If 1 in 64 is 0, look for it. But never first time
nzeroblocks|=(lookfor0blocks|((nr-1-r)!=10))-1; // on the 10th ring, disable zero checks if we aren't using them
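// Illustrative sketch, not part of the original source: the zero-block heuristic above in scalar
// form.  After finishing ring k=nr-1-r, k*(2*r+1) cblocks have been produced (0 on the first ring);
// scanning for zero blocks is judged worthwhile only when at least 1 in 64 of them was all zero,
// and the line after the test ORs nzeroblocks with -1 (disabling the check for good) if the
// heuristic has not fired by the 10th ring.
static int worthlookingfor0blocks_sketch(I nzeroblocks, I nr, I r){
 I blockssofar=(nr-1-r)*(2*r+1);    // cblocks completed in earlier rings
 return nzeroblocks*64>blockssofar; // true iff zero blocks found so far are at least 1 in 64
}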
- D *wluv=wclv; I wlustride=BLKSZ; // pointer to next input values in A, and offset to next. We start going east
- D (*llv)[BLKSZ][BLKSZ]=LBLOCK(nr-1-r,0), (*luv)[BLKSZ][BLKSZ]=UBLOCK(0,nr-1-r), (*prechv)[BLKSZ][BLKSZ]=luv-(nr+1); // start point of dot-products, startpoint of next dot-product
+ D *wluv=wclv; I wlustride=BLKSZ*wn; // pointer to next input values in A, and offset to next. We start going south
+ D (*llv)[BLKSZ][BLKSZ]=LBLOCK(nr-1-r,0), (*luv)[BLKSZ][BLKSZ]=UBLOCK(0,nr-1-r), (*prechv)[BLKSZ][BLKSZ]=llv+nr; // start point of dot-products (both going L-to-R), startpoint of next dot-product (first L block)
UI *lbv0=LBIT(nr-1-r,0), *ubv0=UBIT(0,nr-1-r); // point to the bit vectors for the corner position. These pointers are advanced after we finish each block, to handle the dot-product for the next block
I r0; // index of corner-, L- or U-block being processed: -r for corner, -r+1..0 for U, 1..r for L
- D *nextfetchaddr=wclv; // the address of the block being fetched into nexta0..3
- for(r0=-r;r0<=r;++r0){
+ D *nextfetchaddr=wclv; // the address of the block being fetched into nexta0..3. Init to the corner which we just prefetched
+ for(r0=-r;r0<=r;++r0){ // corner then L then U
+ __m256d perma[7]; // for perm calc, we save the good values from previous rows in the first up to 3 locations, and new candidates in the next 4 locs
+ I ngoodperma; // number of good values in perma. Used only on corner blocks
// move the next A block into the accumulators a00..a03
a00=nexta0; a01=nexta1; a02=nexta2; a03=nexta3;
D *currfetchaddr=nextfetchaddr; // the source address of the block being processed now. We will store back into the same relative position in the result
// we go through the sparse maps and convert the OR of the rows & columns into a sequence of (#non0,#0) pairs which we process later
UI4 *runv=rleb; // pointer to next run
- if(lookfor0blocks){ // if we think it's worthwhile to skip over zero blocks...
+ if(lookfor0blocks){ // if we think it's worthwhile to skip over zero blocks...
UI *lbv=lbv0, *ubv=ubv0; // point to the start of the bit vectors
UI polarity=~0; // 00..00 if we are looking for 0s, 11..11 if we are looking for 1s. We start looking for skipped blocks (1s)
I bitsleft=nr-1-r, lensofar=0, bitsinstack; // number of bits left to process, length carried over from previous maskword
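// Illustrative sketch, not part of the original source: a scalar version of the run-length pass
// described above, ignoring the word-at-a-time/polarity machinery.  Bit i of the map (already the
// OR of the L-row and U-column sparse bits) is 1 when block i of the dot-product can be skipped;
// the output is the sequence of (#non0,#0) pairs the inner loop consumes.  run_sketch and the
// helper name are hypothetical.
typedef struct {UI4 nonzero, zero;} run_sketch;
static I rleruns_sketch(const UI *map, I nblocks, run_sketch *runs){
 I nruns=0, i=0;
 while(i<nblocks){
  I n1=0, n0=0;
  while(i<nblocks&&!((map[i>>6]>>(i&63))&1)){++n1;++i;}  // blocks that must be multiplied
  while(i<nblocks&& ((map[i>>6]>>(i&63))&1)){++n0;++i;}  // all-zero blocks to be skipped
  runs[nruns].nonzero=(UI4)n1; runs[nruns].zero=(UI4)n0; ++nruns;
 }
 return nruns;  // number of (#non0,#0) pairs written
}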
@@ -1255,7 +1257,7 @@ finrle: ;
// The full read bandwidth of L1 is taken with reading arguments, and the full write bandwidth is taken by the prefetch
// *llv is the horizontal L-strip, *luv is the horizontal U-strip, *prechv is the first block to prefetch. We prefetch one block
// for each block we process
- // To avoid lots of casting we use D* in this loop. Each loop handles BLKSZ^2 atoms stored in rwo-major order
+ // To avoid lots of casting we use D* in this loop. Each loop handles BLKSZ^2 atoms stored in row-major order
// establish pointers & offsets for the args. We increment only the L pointer & use indexes for the rest
D *lvv=(D*)llv; I uofst=(D*)luv-lvv, pofst=(D*)prechv-lvv; // the 2 args+prefetch: one pointer, 2 offsets
a10=_mm256_setzero_pd(); a11=_mm256_setzero_pd(); a12=_mm256_setzero_pd(); a13=_mm256_setzero_pd(); // clear the uninitialized accumulators
@@ -1285,7 +1287,7 @@ finrle: ;
} // end of dot-product block, executed except first time
// A[x,y]-L*U (product over nr-1-r blocks) is now in the register block on the 0 side

- // We are solving L(x,0..n-1)*U(0..n-1,y)=A(x,y) for L(x,nr-1-r) and/or U(nr-1-r,y) where xy>=nr-1-r. Because of the triangularity of L and U,
+ // We are solving L(x,0..n-1)*U(0..n-1,y)=A(x,y) for L(x,nr-1-r) and/or U(nr-1-r,y) where x,y>=nr-1-r and one is equal. Because of the triangularity of L and U,
// this reduces to
// L(nr-1-r,0..nr-1-r-1)*U(0..nr-1-r-1,y) + L(nr-1-r,nr-1-r)*U(nr-1-r,y)=A(x,y) for the row of U
// L(x,0..nr-1-r-1)*U(0..nr-1-r-1,nr-1-r) + L(x,nr-1-r)*U(nr-1-r,nr-1-r)=A(x,y) for the column of L
@@ -1299,8 +1301,16 @@ finrle: ;
// For x>nr-1-r we calculate L(x,r) = D * U-1(nr-1-r,nr-1-r), doing the matrix multiply
//
// When we calculate the corner block we also write out coefficients that will be needed to calculate the other blocks of L and U.
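// Illustrative sketch, not part of the original source: scalar versions of the two block solves
// just described, for one BLKSZ x BLKSZ block d that already holds A(x,y) minus the accumulated
// dot-product.  lc is the unit-lower-triangular and uc the upper-triangular part of the factored
// corner block; the AVX code below gets the same result by multiplying with the precomputed
// linv/uinv blocks instead of substituting.
static void solveUrow_sketch(const D lc[BLKSZ][BLKSZ], D d[BLKSZ][BLKSZ]){
 for(I i=0;i<BLKSZ;++i)for(I j=0;j<BLKSZ;++j){            // forward substitution: d = lc^-1 * d
  for(I k=0;k<i;++k)d[i][j]-=lc[i][k]*d[k][j];            // lc has an implied unit diagonal, so no divide
 }
}
static void solveLcol_sketch(const D uc[BLKSZ][BLKSZ], D d[BLKSZ][BLKSZ]){
 for(I i=0;i<BLKSZ;++i)for(I j=0;j<BLKSZ;++j){            // back substitution from the right: d = d * uc^-1
  for(I k=0;k<j;++k)d[i][j]-=d[i][k]*uc[k][j];
  d[i][j]/=uc[j][j];
 }
}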
+ D (*scvi)[BLKSZ][BLKSZ]=scv; // save output address before update - we will store to it
if(r0==-r){
- // corner block. First row of U is set. Alternate creating columns of L and rows of U
+ // corner block.
+
+ // First row of U is set. Alternate creating columns of L and rows of U
+
+ // We have to check for dangerous pivots and exchange rows if we find any. For each row of U, if the |pivot| is too small, rotate that row with the rows following
+ // until we get a nondangerous pivot. This will usually find a permutation within the 4-row block. If not, save the good rows and loop back to dot-products on another
+ // 4 rows to search
+
__m256d tmp; // where we build inverse lines to write out
recips=_mm256_div_pd(ones,a00); // 1/U00 x x x
@@ -1322,7 +1332,7 @@ finrle: ;
a03=_mm256_blend_pd(a03,_mm256_fnmadd_pd(a02,_mm256_permute4x64_pd(a03,0b010101010),a03),0b1000); // a03 is A3x-L30*A0x-L31*U1x-L32*U3x = L30 L31 L32 U33
+ _mm256_storeu_pd(&scv0[0][0][0],a00); _mm256_storeu_pd(&scv0[0][1][0],a01); _mm256_storeu_pd(&scv0[0][2][0],a02); _mm256_storeu_pd(&scv0[0][3][0],a03); // Store the 4x4 in the corner
+ // obsolete _mm256_storeu_pd(&scv[0][0][0],a00); _mm256_storeu_pd(&scv[0][1][0],a01); _mm256_storeu_pd(&scv[0][2][0],a02); _mm256_storeu_pd(&scv[0][3][0],a03); // Store the 4x4 in the corner

// Now calculate uinv, the inverse of the remaining U matrix. Do this by backsubstitution up the line. Leave register a00-a03 holding the block result
// block created; advance output and input pointers
scv-=nr+1; // move output northwest, to the next U block
- luv=prechv; ubv0-=((nr+63)>>6)+2; if(r0<-1){prechv-=(nr+1);}else{prechv=llv+nr;} // repeat L row; advance U column and bitmap; advance prefetch but if next col of U is the last, move prefetch to L
- if(unlikely(r0==0)){scv=scv0+nr; llv+=nr; prechv+=nr; luv=UBLOCK(0,nr-1-r); ubv0=UBIT(0,nr-1-r); lbv0+=((nr+63)>>6)+2;} // last col of U: L store/load point to 2d row; U load point to first col; proceed down the L rows
+ // obsolete luv=prechv; ubv0-=((nr+63)>>6)+2; if(r0<-1){prechv-=(nr+1);}else{prechv=llv+nr;} // repeat L row; advance U column and bitmap; advance prefetch but if next col of U is the last, move prefetch to L
+ // obsolete if(unlikely(r0==0)){scv=scv0+nr; llv+=nr; prechv+=nr; luv=UBLOCK(0,nr-1-r); ubv0=UBIT(0,nr-1-r); lbv0+=((nr+63)>>6)+2;} // last col of U: L store/load point to 2d row; U load point to first col; proceed down the L rows
+ luv=prechv; ubv0-=((nr+63)>>6)+2; if(r0!=r-1){prechv-=nr+1;} // (repeat L row); advance U northwest including bitmap; advance prefetch but if next col of U is the last, prefetch it again
// get the address of the bitmask for this block, in the U bitmap
bma=UBIT(nr-1-r,r0+nr-1); bmx=UBITX(nr-1-r,r0+nr-1); // point to the bit to store all-0 status to. col is (nr-1-r)+(r0-(-r))
- llv=prechv; lbv0+=((nr+63)>>6)+2; if(r0!=r-1){prechv+=nr;} // repeat U col; advance L row including bitmap; advance prefetch but if next col of U is the last, prefetch it again
- scv+=nr; // move output south, to the next L block
+ // block created; advance pointers
+ if(r0!=0){ // if not last line of L...
+ llv=prechv; lbv0+=((nr+63)>>6)+2; if(r0<-1){prechv+=nr;}else{prechv=luv-(nr+1);} // repeat U col; advance L row and bitmap; advance prefetch to next row of L but if next row of L is the last, move prefetch to U
+ scv+=nr; // move output south, to the next L block
+ }else{ // last line of L, we must switch to U (if r==1, precharge has not been switched and should be made to refetch U; otherwise prefetch should continue in U)
+ // obsolete scv=suv1; luv=prechv; prechv-=nr+1; llv=UBLOCK(nr-1-r,0); lbv0=LBIT(nr-1-r,0); ubv0-=((nr+63)>>6)+2;} // last row of L: U store/load point to 1st ele; L load point to first col; bits accordingly
+ scv=suv1; luv-=nr+1; prechv=r==1?luv:luv-(nr+1); llv=LBLOCK(nr-1-r,0); lbv0=LBIT(nr-1-r,0); ubv0-=((nr+63)>>6)+2; // last row of L: U store/load point to col 1; L load point to first row; bits accordingly
+ }
+ // obsolete llv=prechv; lbv0+=((nr+63)>>6)+2; if(r0!=r-1){prechv+=nr;} // repeat U col; advance L row including bitmap; advance prefetch but if next row of L is the last, prefetch it again
// get the address of the bitmask for this block, in the L bitmap
bma=LBIT((nr-1-r)+r0,nr-1-r); bmx=LBITX((nr-1-r)+r0,nr-1-r); // point to the bit to store all-0 status to. row is (nr-1-r)+r0
}
+
+ // update sparse bitmap
if(nzeroblocks>=0){ // if we haven't given up on sparse checking
// check for all-zero block, and update the sparse bitmap
a10=_mm256_or_pd(a01,a00); a11=_mm256_or_pd(a02,a03); a10=_mm256_or_pd(a11,a10); // OR of all values
a10=_mm256_cmp_pd(a10,_mm256_setzero_pd(),_CMP_NEQ_OQ); I blkis0=_mm256_testz_pd(a10,a10)==1; // see if block is all 0
*bma=((*bma)&~(1LL<<bmx))|(blkis0<<bmx); // set bit to (all values are not NE)
nzeroblocks+=blkis0; // increment count of zero blocks
}
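// Illustrative sketch, not part of the original source: the scalar meaning of the
// OR / _CMP_NEQ_OQ / _mm256_testz_pd sequence above.  NaN inputs aside, a block is flagged as
// zero exactly when every element stores 0.0 (a -0.0 also counts, since only sign bits survive
// the OR and they still compare equal to zero).
static I blockiszero_sketch(const D blk[BLKSZ][BLKSZ]){
 for(I i=0;i<BLKSZ;++i)for(I j=0;j<BLKSZ;++j)if(blk[i][j]!=0.0)return 0;
 return 1;  // corresponds to blkis0 above: 1 makes the sparse-bitmap bit get set
}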
- // write the block to the result address from before update
- _mm256_storeu_pd(&scvi[0][0][0],a00); _mm256_storeu_pd(&scvi[0][1][0],a01); _mm256_storeu_pd(&scvi[0][2][0],a02); _mm256_storeu_pd(&scvi[0][3][0],a03); // Store the 4x4 in the corner
-
}
+
+ // write the block to the result address from before update
+ _mm256_storeu_pd(&scvi[0][0][0],a00); _mm256_storeu_pd(&scvi[0][1][0],a01); _mm256_storeu_pd(&scvi[0][2][0],a02); _mm256_storeu_pd(&scvi[0][3][0],a03); // Store the 4x4 in the corner
+
// write the block result to the overall result area. We do this now because it's going to be a cache miss and we want to dribble out the data during the processing. Also, the data is in registers now so we don't have to read it
// the output area has the same relative offset in the output as the read area in the input
D *resultaddr=(D*)((C*)currfetchaddr+resultoffset); // place to store result
if(r>0){
// not the bottom-right corner.
- if(r0==0){ // last block of U - truncated on the right
+ if(r0==r){ // last block of U - truncated on the right