You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
// values in the stripe for each op, in format needed
3352
3346
3353
-
releaseinforinfo[RINGROWS]; // info on all rows that have been released. Must be cleared after each release, i. e. 0 when starting new 8block
3347
+
__m256d*releaseqktringbase[RINGROWS]; // base of Qkt region mapped to ring. Must be on RBLOCK boundary; low bits are corresponding ring row
3348
+
UI8releaserowmask[RINGROWS]; // mask of all the resultblocks that have been modified in this line Must be cleared after each release, i. e. 0 when starting new 8block
3354
3349
3355
3350
Iopno=ti; // op# to create the index for. First one is out thread#
3356
3351
do{
3357
3352
if(opno<nops){Ama; // if the first reservation is too high, we have more threads than ops. skip it then
3358
-
// calculate column index for the op - the offset into mask/cols when we process a given stripe
3353
+
// calculate column index for the op - the offset into mask/cols as we process each stripe
3359
3354
Incvals=AN(opcolvals[opno]); __m256i*cmask=(__m256i*)BAV(opcolmasks[opno]); // # non0s left in op column, pointer to mask
3360
3355
GATV0(ma,INT4,ncvals+1,1) // allocate space for index (incl 1 sentinel), on cacheline boundary
3361
3356
I4*mav=I4AV1(ma); (*colndxs)[opno]=mav; // Get address of index, publish the address to other threads. Lots of false sharing on this store!
bsum+=bsum>>32; bsum+=bsum>>16; bsum+=bsum>>8; (*opstripebsum)[i][opno]=bsumtodate+=(C)bsum;)// add; if last word incomplete, it must have exactly 32 bits
3374
3369
3375
3370
}
3376
3371
opno=__atomic_fetch_add(&ctx->colndxct,1,__ATOMIC_ACQ_REL); // reserve next row. Every thread will finish with one failing reservation
3377
3372
}while(opno<nops);
3378
3373
3379
3374
// initialize internal areas while we wait for indexes to settle.
3380
3375
mvc(sizeof(ring),ring,MEMSET00LEN,MEMSET00); // clear the ring to all 0.0
3381
-
mvc(sizeof(rinfo),rinfo,MEMSET00LEN,MEMSET00); // clear the release info to 0
3382
-
Ib8start=0; // start of ring area where next 8block is built AND end+1 pointer of data released to ring (could be US)
3376
+
mvc(sizeof(releaserowmask),releaserowmask,MEMSET00LEN,MEMSET00); // clear the release info to 0
3377
+
DO(RINGROWS, releaseqktringbase[i]=(__m256d*)i;)
3383
3378
3384
3379
DO(nops, opstat[i].acolvals=EAV(opcolvals[i]);) // get pointer to values in each column
Istripex=ti; // initial stripe reservation, from thread#
3392
3387
// state needed to release one row of ring (viz relstart).
3393
-
UIreleaseblockmask=0; Irelstart=0, releasenormct=8, releasedelayct=0, releasect; __m256d*releaseqkbase; // mask of blocks in row, index of row being released, normal burst length, amount of processing before next burst, actual burst length, Qkt addr of burst
3388
+
#defineRELEASEBLOCKCT 8 // number of RBLOCKS to handle at a time. This should be as big as we can make it without filling write buffers. 2 blocks=1 cacheline
3389
+
UIreleaseblockmask=0; Ireleasect; __m256d*releaseqktringbasecurr; // mask of blocks in row, index of row being released, normal burst length, amount of processing before next burst, actual burst length, Qkt addr of burst
3390
+
#defineCYCBETWEENRELEASE0 500 // estimated clocks to receive DRAM data. We try to burst only this often so that write buffers can drain
3391
+
#defineCYCPERINSERT 12 // number of cycles per insertion
3392
+
#defineRELEASEDELAYCT0 RELEASEBLOCKCT*(CYCBETWEENRELEASE0/CYCPERINSERT)*sizeof(US) // unbiased delay, measured in #offsets processed
3393
+
Ireleasedelayct0; // biased delay given ring status
3394
+
// portmanteau register holding ring status
3395
+
Iringdctrlb8=0; // delay count, relstart, b8start
3396
+
#defineRINGDCTX 48 // bit position of delayct, which counts USs and goes positive when it is OK to release RELEASEBLOCKCT blocks. Ends at sign bit
3397
+
#defineRINGDCTMASK 0xff000000000000
3398
+
#defineRINGRELSTARTX 8 // bit position of relstart, the next/current row of the ring to relesse
// when relstart==b8start, ring is empty. releaseblockmask is always 0 then. releasedelayct is set negative to delay the next batch; it increments as blocks are put into the ring. releasenormct gives the normal burst size, which increases
3395
-
// if the ring fills. releasect counts the actual burst kength, which is releasenormct unless the ring is full or processing is over, in which case it is set to high-value to flush the ring
3404
+
// if the ring fills. releasect counts the actual burst length, which is RELEASEBLOCKCT unless the ring is full or processing is over, in which case it is set to high-value to flush the ring
3396
3405
while(stripex<nstripes){ // ... for each reservation...
3397
3406
Istripe=stripegrade[stripex]; // get the actual stripe# to process
__m256im32=_mm256_loadu_si256(smask+j32); // read 32 bits. May overfetch
3417
3426
((C*)&opstat[io].rbmask)[j32]=(UI)(UI4)_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(m32,_mm256_setzero_si256()))); // create touched mask for each 4-value section (i. e. 1 resultblock), save in rbmask
3418
3427
Ibits=(UI)(UI4)_mm256_movemask_epi8(_mm256_cmpgt_epi8(m32,_mm256_setzero_si256())); // extract the 32 bits
3419
-
while(bits){lastoffset=nextstripeofst[nofst]=(j32*32+CTTZI(bits))*sizeof(E); bits&=-bits; ++nofst; if(nofst==ssize)goto finmask;} // turn each 1-bit into a byte offset; stop if we have hit # offsets
3428
+
while(bits){lastoffset=nextstripeofst[nofst]=(j32*32+CTTZI(bits))*sizeof(E); bits&=bits-1; ++nofst; if(nofst==ssize)goto finmask;} // turn each 1-bit into a byte offset; stop if we have hit # offsets
3420
3429
}
3421
3430
finmask:;
3422
3431
opstat[io].rbmask&=(UI)~0>>(BW-(((*stripestartend1)[stripe][1]-sstart)>>LGRESBLKE)); // mask off blocks past the valid region
3423
3432
// read the row values for this stripe and convert them to 0213 form
3424
3433
Ij4; C*sval; // byte offset of 4-E reads to date; pointer to start of values for this section
3425
3434
opstat[io].arow0213=nextstripe0213; // remember where the offsets start
__m256dh0l0h1l1=_mm256_loadu_pd((D*)(sval+j4)), h2l2h3l3=_mm256_loadu_pd((D*)(sval+j4+sizeof(E)*RESBLKE/2)); // read the next 4 values. This wipes out lots of D2$; perhaps should stream into temp area
3428
3437
__m256dh0h2h1h3=_mm256_shuffle_pd(h0l0h1l1,h2l2h3l3,0b0000), l0l2l1l3=_mm256_shuffle_pd(h0l0h1l1,h2l2h3l3,0b1111); // convert to 0213 order
3429
3438
_mm256_store_pd((D*)((C*)nextstripe0213+j4),h0h2h1h3); _mm256_store_pd((D*)((C*)nextstripe0213+j4+sizeof(E)*RESBLKE/2),l0l2l1l3); // store in 0213 order
3430
3439
}
3431
3440
// if last block of 4 values is not all valid, we must repeat the last valid offset to the end of the block, and put the last value into the last slot
3432
3441
lastoffset+=lastoffset<<(sizeof(US)*BB); lastoffset+=lastoffset<<(2*sizeof(US)*BB); *(I*)&nextstripeofst[nofst]=lastoffset; // append 4 copies of last offset; 0-3 are needed
3433
-
Ilastvalidlane=(0b01100011>>(ssize&(NPAR-1)))&(NPAR-1); // lane# holding last valid value: 0 1 2 3 -> 3 0 2 1 (since values are 0213)
3442
+
Ilastvalidlane=(0b01100011>>(2*(ssize&(NPAR-1))))&(NPAR-1); // lane# holding last valid value: 0 1 2 3 -> 3 0 2 1 (since values are 0213)
3434
3443
D (*lastvals)[2][4]=(D (*)[2][4])((C*)nextstripe0213+j4-RESBLKE*sizeof(E)); // address of resblk containing last valid pointer
3435
3444
(*lastvals)[0][3]=(*lastvals)[0][lastvalidlane]; (*lastvals)[1][3]=(*lastvals)[1][lastvalidlane]; // transfer value to last value in block, which is written last
3436
3445
@@ -3449,12 +3458,12 @@ finmask:;
3449
3458
I4nextrowinop; // next row to process in this op
3450
3459
while((nextrowinop=currop->colndxahead)-b8qktrow<B8ROWS){ // if next row is in 8block
3451
3460
// next row can be processed in this 8block. Load the column info and update the readahead
3452
-
Iringx=(b8start+(nextrowinop-b8qktrow))&(RINGROWS-1); // ring row to fill
3461
+
Iringx=(I)releaseqktringbase[(ringdctrlb8+(nextrowinop-b8qktrow))&RINGB8STARTMASK]; // ring row to fill
3453
3462
E*r0=ring[ringx]; // base the offsets will be applied against
3454
3463
US*aof=currop->arowoffsets; __m256d*ava=currop->arow0213; Ialen=currop->nofsts; // loop boundaries for processing the row of the stripe
3455
3464
__m256dcolh=_mm256_set1_pd(currop->colvalahead.hi), coll=_mm256_set1_pd(currop->colvalahead.lo); // copy column value into all lanes
3456
3465
currop->colndxahead=currop->acolndxs[++currop->rowindex]; currop->colvalahead=currop->acolvals[currop->rowindex]; // read ahead for next row
3457
-
rinfo[ringx].rowmask|=currop->rbmask; // make note of the resultblocks that will be modified by this row
3466
+
releaserowmask[ringx]|=currop->rbmask; // make note of the resultblocks that will be modified by this row
3458
3467
3459
3468
// Calculate one row of the op
3460
3469
Iandx=0; // counts in steps of RESBLKE*sizeof(one offset). With this stride we can use andx to point to offsets and andx*sizeof(E)/sizeof(one offset) (=8) to point to values
@@ -3489,11 +3498,13 @@ finmask:;
3489
3498
}while((andx+=(RESBLKE*sizeof(aof[0])))<alen); // end after last block
3490
3499
3491
3500
// send a few values to Qkt
3492
-
releasedelayct+=andx; // add the number of blocks we processed. When the total goes nonnegative we can release again
3493
-
if(releaseblockmask>(UI)REPSGN(releasedelayct)){ // if there are released values... (mask>0 and delayct nonneg)
3494
-
releasect=releasenormct; // set releasect to the number of blocks to take. We will stop when the row is empty in any case
3501
+
ringdctrlb8+=andx<<RINGDCTX; // add the number of blocks we processed. When the total goes nonnegative we can release again
3502
+
if(releaseblockmask>(UI)REPSGN(ringdctrlb8)){ // if there are released values... (mask>0 and delayct nonneg)
3503
+
releasect=RELEASEBLOCKCT; // set releasect to the number of blocks to take. We will stop when the row is empty in any case
3504
+
ringdctrlb8=(ringdctrlb8&~RINGDCTMASK)+releasedelayct0; // reset delay till next block release
3495
3505
releaserow:; // entered from below to drain the ring, either on buffer-full or at end-of-operation. releasect is set to a high value in that case
3496
-
__m256*releaseringbase=(__m256*)ring[relstart]; // get address in ring
3506
+
__m256d*releaseringbase=(__m256d*)ring[(I)releaseqktringbasecurr&(RINGROWS-1)]; // get base address in ring
3507
+
__m256d*releaseqkbase=(__m256d*)((I)releaseqktringbasecurr&-RINGROWS); // get base address in Qkt
3497
3508
do{
3498
3509
// calculate a result block
3499
3510
Iblockbyteofst=CTTZI(releaseblockmask)*sizeof(E)*RESBLKE; // get offset to next modified block in this row
@@ -3515,12 +3526,14 @@ releaserow:; // entered from below to drain the ring, either on buffer-full or
3515
3526
if((releaseblockmask&=(releaseblockmask-1))==0)goto rowfin; // advance to next block, exit if none
3516
3527
}while(--releasect); // could use PEXT & block mask to avoid need for releasect here
3517
3528
if(0){rowfin:; // come here when a row has been fully sent to Qkt
3518
-
rinfo[relstart].rowmask=0; // when we finish a row, we must leave it with an empty mask
3519
-
relstart=(relstart+1)&(RINGROWS-1); // advance to next row
3520
-
if(relstart!=b8start){ // if the release area is not empty after removing the finished row...
3521
-
releaseblockmask=rinfo[relstart].rowmask; releaseringbase=(__m256*)ring[relstart]; releaseqkbase=rinfo[relstart].qktbase; // move next row to the release variables. blockmask=0 means no work
releaserowmask[relstart]=0; // when we finish a row, we must leave it with an empty mask
3531
+
releaseqktringbase[relstart]=(__m256d*)((I)releaseqktringbase[relstart]&(RINGROWS-1)); // clear Qkt, leaving ring row#
3532
+
relstart=(relstart+1)&(RINGROWS-1); ringdctrlb8=(ringdctrlb8+(1<<RINGRELSTARTX))&RINGRELSTARTWRAP; // advance to next released row
3533
+
if(((ringdctrlb8-relstart)&(RINGROWS-1))!=0){ // if the release area is not empty after removing the finished row...
3534
+
releaseblockmask=releaserowmask[relstart]; releaseqktringbasecurr=releaseqktringbase[relstart]; // move next row to the release variables. blockmask=0 means no work
3522
3535
// Here we loop back to handle exception cases: (1) ring full; (2) operation finished. We set releasect to high-value
3523
-
if(unlikely(((b8start-relstart)&(RINGROWS-1)))>=(RINGROWS-B8ROWS))goto releaserow; // if ring is still full, wait for it to drain
3536
+
if(unlikely(((ringdctrlb8-relstart)&(RINGROWS-1))>=(RINGROWS-B8ROWS)))goto releaserow; // if ring is still full, wait for it to drain
3524
3537
if(unlikely(stripex>=nstripes))goto releaserow; // if the problem is over, flush the entire ring
3525
3538
}
3526
3539
if(unlikely(releasect>100))if(stripex>=nstripes)goto finis; else goto caughtup; // if we are coming out of a loopback, go to the right place: end, or just after we released into the full ring
@@ -3536,12 +3549,24 @@ releaserow:; // entered from below to drain the ring, either on buffer-full or
3536
3549
3537
3550
// 8block finished. close up the rows and release them to the output stage. For each nonempty row we write out the address of the Qkt data, and copy the
3538
3551
// mask. We also have to make sure the mask is cleared to 0 in all rows that go unreleased
3539
-
Isx=b8start; Iqktrcol=(I)&qktcol[b8qktrow*qktncols]; // store index, address of the modified row of Qkt corresponding to b8start
if(releaseblockmask==0){releaseblockmask=rinfo[relstart].rowmask; releaseqkbase=rinfo[relstart].qktbase;} // if release queue was empty, move first row to the release variables. blockmask=0 means no work
3543
-
elseif(unlikely(((b8start-relstart)&(RINGROWS-1)))>=(RINGROWS-B8ROWS)){releasect=(UI4)~0>>1; goto releaserow;} // if ring is full, wait for it to drain
3552
+
Iorigb8=ringdctrlb8&RINGB8STARTMASK, b8start=origb8, sx=b8start; Iqktrcol=(I)&qktcol[b8qktrow*qktncols]; // store index, address of the modified row of Qkt corresponding to b8start
3553
+
UI8skiprows=0; // mask of empty rows that must be added to the end
3554
+
DONOUNROLL(B8ROWS,
3555
+
skiprows=(skiprows&-RINGROWS)+(I)releaseqktringbase[b8start]; // remember new row in case we skip it
// b8start has been advanced over all the nonskipped rows, which releases them.
3563
+
ringdctrlb8=(ringdctrlb8&~RINGB8STARTMASK)+b8start; // install b8start in portmanteau
3564
+
if(releaseblockmask==0){releaseblockmask=releaserowmask[origb8]; releaseqktringbasecurr=releaseqktringbase[origb8]; ringdctrlb8&=~RINGDCTMASK;} // if release queue was empty, move first row to the release variables. blockmask=0 means no work.
3565
+
// delayct has been incrementing continuously, perhaps overflowing - clear it to start releasing
3566
+
elseif(unlikely(((ringdctrlb8-(ringdctrlb8>>RINGRELSTARTX))&(RINGROWS-1)))>=(RINGROWS-B8ROWS)){releasect=(UI4)~0>>1; goto releaserow;} // if ring is full, wait for it to drain
3544
3567
caughtup:; // here when we have removed the ring-full situation
3568
+
// throttle the release depending on ring-full status. We figure this only when we add rows because it's not important to keep it exactly right
3569
+
releasedelayct0=(-((((RINGROWS-((ringdctrlb8-(ringdctrlb8>>RINGRELSTARTX)-1)&(RINGROWS-1)))>>3)+1)*RELEASEDELAYCT0)>>3)<<RINGDCTX; // decrease delay (which is negative) with each eight-row section filled
3545
3570
} // end 'while aops'
3546
3571
stripex=__atomic_fetch_add(&ctx->resvx,1,__ATOMIC_ACQ_REL); // reserve next row. Every thread will finish with one failing reservation
0 commit comments