 #define PTISCAVNX 23 // this flag used in a register here
 #define PTISCAVN(pt) ((pt)&(1LL<<PTISCAVNX))
 #define PTISRPAR0(pt) ((pt)&0x7fff)
-#define PTISMARKBACKORRPAR(s) (((s).pt&0xffff0000)==(PTRPAR&0xffff0000)) // s.pt is ) or MARK
+// clang17 wastes an instruction #define PTISMARKBACKORRPAR(s) (((s).pt>>16)==(PTRPAR>>16)) // s.pt is ) or MARK.
+#define PTISMARKBACKORRPAR(s) ((((US*)&(s).pt)[1])==(PTRPAR>>16)) // s.pt is ) or MARK.
 _Static_assert((PTRPAR^PTMARKBACK&0xffff0000)==0,"MARKBACK must equal RPAR for end purposes");
 #define PTISMARKFRONT(pt) (((pt)&0xff000000)==(PTMARKFRONT&0xff000000)) // pt is MARKFRONT
 // obsolete #define PTOKEND(t2,t3) (((PTISCAVN(~(t2).pt))+((t3).pt^PTMARKBACK))==0) // t2 is CAVN and t3 is MARK
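For context, a minimal standalone sketch of the halfword-compare idea behind the new PTISMARKBACKORRPAR. The types and constants here are made-up stand-ins, not the jsource definitions, and the trick assumes a little-endian build, exactly as the US cast in the macro does.

#include <stdint.h>
#include <stdio.h>

typedef uint16_t US;                              // stand-in for a 16-bit unsigned type

// Mask-and-compare form: applies a 32-bit mask to both sides before comparing.
static int is_end_mask(uint32_t pt, uint32_t rpar){ return (pt&0xffff0000u)==(rpar&0xffff0000u); }

// Halfword-read form: on a little-endian target the high 16 bits of pt are the
// second US element, so one 16-bit load and compare does the job (same type
// punning as the macro).
static int is_end_halfword(uint32_t pt, uint32_t rpar){ return ((US*)&pt)[1]==(US)(rpar>>16); }

int main(void){
  uint32_t rpar    =0x00020000u;                  // made-up stand-in for PTRPAR
  uint32_t markback=0x00020001u;                  // made-up value sharing PTRPAR's high halfword
  printf("%d %d\n", is_end_mask(markback,rpar), is_end_halfword(markback,rpar)); // prints "1 1"
  return 0;
}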
@@ -667,15 +668,19 @@ endname: ;
 // and finally returning the new front-of-stack pointer
 // First, create the bitmask of parser lines that are eligible to execute
 // register pressure is severe where we do subroutine calls below
-I pmask=(I)((C*)&stack[1].pt)[1] & (I)((C*)&stack[2].pt)[2]; // stkpos 0-2 are enough to detect a match on line 0
-pmask&=GETSTACK0PT; // finish 1st 3 columns of parse, which are enough to decide bit 0
-PSTK *fsa=&stack[2-(pmask&1)]; // pointer to stack slot for the CAV to be executed, for lines 0-4
+I pmask=(I)((C*)&stack[1].pt)[1] & (I)((C*)&stack[2].pt)[2]; // stkpos 2 is enough to detect bit 0 if result is 0-4
+PSTK *fsa=(PSTK*)((I)stack+((2*sizeof(PSTK))>>((I)((C*)&stack[2].pt)[2]&1))); // pointer to stack slot for the CAV to be executed, for lines 0-4  1 2 2 (2 2)
+A fs=QCWORD(__atomic_load_n(&fsa->a,__ATOMIC_ACQUIRE)); // the action to be executed if lines 0-4. Must read early: dependency is pmask[0]->fsa->fs->fsflag to settle before we check assignments
+pmask&=GETSTACK0PT; // finish 1st 3 columns of parse
+// obsolete PSTK *fsa=&stack[2-(pmask&1)]; // pointer to stack slot for the CAV to be executed, for lines 0-4  1 2 2 (2 2)
+// obsolete not in J32 PSTK *fsa=(PSTK *)((I)&stack[2]+((pmask<<(BW-1))>>(BW-4))); // pointer to stack slot for the CAV to be executed, for lines 0-4  1 2 2 (2 2)
+// clang creates a branch! PSTK *fsa=&stack[2], *fsa1=&stack[1]; fsa=pmask&1?fsa1:fsa; // pointer to stack slot for the CAV to be executed, for lines 0-2  1 2 2
 pmask&=(I)((C*)&stack[3].pt)[3]; // finish 3d column of parse
-A fs1=QCWORD(stack[1].a); // in case of line 1 V0 V1 N2, we will need the flags from V1. path is fs1,fs->fs1flag to settle before the second assignment check
-A fs=QCWORD(fsa->a); // the action to be executed if lines 0-4. Must read early: dependency is pmask[0]->fsa->fs->fsflag to settle before we check assignments
+A fs1=QCWORD(__atomic_load_n(&stack[1].a,__ATOMIC_ACQUIRE)); // in case of line 1 V0 V1 N2, we will need the flags from V1. path is fs1,fs->fs1flag to settle before the second assignment check
 pt0ecam&=~(VJTFLGOK1+VJTFLGOK2+VASGSAFE+PTNOTLPAR+NOTFINALEXEC+(7LL<<PMASKSAVEX)); // clear all the flags we will use

-if(pmask){ // If all 0, nothing is dispatchable, go push next word after checking for (
+if(likely(pmask!=0)){ // If all 0, nothing is dispatchable, go push next word after checking for (
+// likely is an overstatement, but without it the calculation of fsa is deferred
 // We are going to execute an action routine. This will be an indirect branch, and it will mispredict. To reduce the cost of the misprediction,
 // we want to pile up as many instructions as we can before the branch, preferably getting out of the way as many loads as possible so that they can finish
 // during the pipeline restart. The perfect scenario would be that the branch restarts while the loads for the stack arguments are still loading.
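For reference, a toy sketch (assumed struct layout and names, not the real PSTK) showing that the new shift-based address computation picks the same slot as the old 2-(pmask&1) index, while depending only on the single byte already loaded from stack[2].pt. That is what lets the acquire load of fsa->a start earlier, in line with the comments above about piling up loads ahead of the mispredicted indirect branch.

#include <assert.h>
#include <stdint.h>

typedef struct { uint64_t pt; void *a; } PSTK;    // stand-in for a parser-stack entry

// Index form: pick &stack[2] when bit==0, &stack[1] when bit==1.
static PSTK *slot_by_index(PSTK *stack, uint64_t bit){ return &stack[2-(bit&1)]; }

// Shift form: the same address computed by shifting a fixed byte offset,
// so it needs only the one bit already loaded, not the finished pmask.
static PSTK *slot_by_shift(PSTK *stack, uint64_t bit){
  return (PSTK*)((uintptr_t)stack + ((2*sizeof(PSTK))>>(bit&1)));
}

int main(void){
  PSTK stack[4];
  assert(slot_by_shift(stack,0)==slot_by_index(stack,0));  // both &stack[2]
  assert(slot_by_shift(stack,1)==slot_by_index(stack,1));  // both &stack[1]
  return 0;
}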
@@ -728,9 +733,11 @@ endname: ;
 pmask=(pt0ecam>>PMASKSAVEX)&7; // restore after calls
 }
 AF actionfn=(AF)__atomic_load_n(&jt->fillv,__ATOMIC_RELAXED); // refetch the routine address early. This may chain 2 fetches, which finishes about when the indirect branch is executed
+PSTK *arga=fsa; arga=pmask&4?stack:arga; A arg1=arga[1].a; // 1st arg, monad or left dyad 2 3 1
+// this requires fsa to survive over the assignment, but it's faster than the alternative
+// obsolete A arg2=stack[(pmask>>=1)+1].a; // 2nd arg, fs or right dyad 1 2 3 (2 3) pmask shifted right 1
+arga=pmask&4?&stack[3]:arga; A arg2=arga[0].a; // 2nd arg, fs or right dyad 1 2 3 (2 3)
 // Create what we need to free arguments after the execution. We keep the information needed in two registers so they can persist over the call, as they are needed right away on return
 // (1) When the args return from the verb, we will check to see if any were inplaceable and unused. Those can be freed right away, returning them to the
 // pool and allowing their cache space to be reused. But there is a problem:
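As a side note, a hypothetical, self-contained sketch of the ?:-based argument selection a few lines above (PSTK, the pmask value, and the slot contents are stand-ins, not jsource definitions): each pick is written as a data-dependent assignment so the compiler can use conditional moves rather than adding more branches ahead of the mispredicted indirect call.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t pt; const char *a; } PSTK;   // stand-in stack entry

int main(void){
  PSTK stack[4]={{0,"s0"},{0,"s1"},{0,"s2"},{0,"s3"}};
  PSTK *fsa=&stack[2];                 // slot picked earlier for the executing CAV
  uint64_t pmask=4;                    // pretend the dyad line (bit 2) matched

  PSTK *arga=fsa; arga=pmask&4?stack:arga;   // 1st-arg base: stack[0] in the dyad case, else fsa
  const char *arg1=arga[1].a;                // 1st arg, monad or left dyad
  arga=pmask&4?&stack[3]:arga;               // 2nd-arg base: stack[3] in the dyad case
  const char *arg2=arga[0].a;                // 2nd arg, fs or right dyad
  printf("%s %s\n",arg1,arg2);               // prints "s1 s3" for this made-up case
  return 0;
}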
@@ -742,7 +749,7 @@ endname: ;
 // The calculation of tpopa/w will run to completion while the expected indirect-branch misprediction is being processed