Skip to content

Commit 7a3945d

Browse files
committed
Merge branch 'master' of jsoftware.com:jsource
2 parents f92242f + 2d26122 commit 7a3945d

File tree

22 files changed

+783
-266
lines changed

22 files changed

+783
-266
lines changed

jsrc/ar.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ REDUCCPFX(tymesinsO, D, I, TYMESO)
228228
acc1=prim(acc1,acc5); acc2=prim(acc2,acc6); acc3=prim(acc3,acc7); acc0=prim(acc0,acc4); \
229229
acc2=prim(acc2,acc3); acc0=prim(acc0,acc1); acc0=prim(acc0,acc2); /* combine accumulators vertically */ \
230230
acc0=prim(acc0,_mm256_permute4x64_pd(acc0,0b11111110)); acc0=prim(acc0,_mm256_permute_pd(acc0,0xf)); /* combine accumulators horizontally 01+=23, 0+=1 */ \
231-
*(I*)z=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); ++z; /* store the single result from 0 */ \
231+
*(I*)z=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); /* AVX2 *z=_mm256_cvtsd_f64(acc0); */ ++z; /* store the single result */ \
232232
)
233233

234234
// f/ on rank>1, going down columns to save bandwidth
@@ -429,8 +429,8 @@ DF1(jtcompsum){
429429
c0=_mm256_add_pd(c0,_mm256_permute_pd(c0,0xf)); acc1=_mm256_permute_pd(acc0,0xf); // combine c0+c1, acc1<-1
430430
TWOSUM(acc0,acc1,acc0,c1); c0=_mm256_add_pd(c0,c1); // combine 0123, combine all low parts
431431
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
432-
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); ++zv; // store the single result
433-
// _mm_storel_pd(zv++,_mm256_castpd256_pd128(acc0));
432+
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); /* AVX2 *zv=_mm256_cvtsd_f64(acc0);*/ ++zv; // store the single result
433+
// obsolete _mm_storel_pd(zv++,_mm256_castpd256_pd128(acc0));
434434
}
435435
}else{
436436
// rank>1, going down columns to save bandwidth and add accuracy

jsrc/cv.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,10 @@ F2(jtfit){F2PREFIP;A f;C c;I k,l,m,r;V*sv;
7474
case CNOT: case CXCO: case CSPARSE: case CEBAR:
7575
R fitct(a,w,cno);
7676
case CQQ: ;
77-
RE(wval=i0(w)); ASSERT(wval==0,EVDOMAIN); // only f"r!.0 is supported
78-
ASSERT(sv->valencefns[1]==jtsumattymes1,EVDOMAIN) // Must be +/@:*"1!:0
79-
R CDERIV(CFIT,0,jtsumattymes1,VIRS2, m,l,r); // supports IRS
77+
RE(wval=i0(w)); ASSERT(BETWEENC(wval,0,1),EVDOMAIN); // only f"r!.[01] is supported
78+
ASSERT(sv->valencefns[1]==jtsumattymes1,EVDOMAIN) // Must be +/@:*"1!:[01]
79+
RZ(f=CDERIV(CFIT,0,jtsumattymes1,VIRS2, m,l,r)); // supports IRS
80+
FAV(f)->localuse.lu1.fittype=wval; R f;
8081
case CSLASH: ;
8182
RE(wval=i0(w)); ASSERT(wval==0,EVDOMAIN); // only f/!.0 is supported
8283
ASSERT(FAV(sv->fgh[0])->id==CPLUS,EVDOMAIN) // Must be +/!:0

jsrc/d.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ void jtdebdisp(J jt,DC d){A*x,y;I e,t;
171171
switch(t){
172172
case DCPARSE: dhead(3,d); seeparse(d); if(NETX==jt->etxn)--jt->etxn; eputc(CLF); break;
173173
case DCCALL: dhead(0,d); seecall(d); eputc(CLF); break;
174-
case DCSCRIPT: dhead(0,d); efmt("[-"FMTI"] ", d->dcn-1);
174+
case DCSCRIPT: dhead(0,d); efmt("[-"FMTI"] ", d->dcn); // keep the line number as 1-origin since that's what editors do
175175
if(0<=d->dcm){READLOCK(JT(jt,startlock)) y=AAV(JT(jt,slist))[d->dcm]; ep(AN(y),CAV(y)); READUNLOCK(JT(jt,startlock))}
176176
eputc(CLF); break;
177177
}}

jsrc/j.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,18 @@ struct Bd2 {I hdr[AKXR(0)/SZI]; D v[2];};
1414
#define CREBLOCKATOMV2(name,t,v1,v2) struct Bd2 __attribute__((aligned(CACHELINESIZE))) B##name={{AKXR(0),(t)&TRAVERSIBLE,0,(t),ACPERMANENT,1,0},{v1,v2}};
1515
CREBLOCKATOMV2(a0j1,CMPX,0.0,1.0) // 0j1
1616
#if SY_64
17-
#define CBAIVAL(t,v) {7*SZI,(t)&TRAVERSIBLE,0,(t),ACPERMANENT,1,0,(v)}
17+
#define CBAIVALM(t,v,m) {7*SZI,(t)&TRAVERSIBLE,m,(t),ACPERMANENT,1,0,(v)}
1818
#else
19-
#define CBAIVAL(t,v) {8*SZI,(t)&TRAVERSIBLE,0,(t),ACPERMANENT,1,0,0,(v)}
19+
#define CBAIVALM(t,v,m) {8*SZI,(t)&TRAVERSIBLE,m,(t),ACPERMANENT,1,0,0,(v)}
2020
#endif
21+
#define CBAIVAL(t,v) CBAIVALM(t,v,0)
2122
#define CREBLOCKATOMI(name,t,v) I __attribute__((aligned(CACHELINESIZE))) B##name[9-SY_64]=CBAIVAL(t,v);
23+
#define CREBLOCKATOMGMP(name,t,v,m) I __attribute__((aligned(CACHELINESIZE))) B##name[9-SY_64]=CBAIVALM(t,v,m);
24+
CREBLOCKATOMGMP(X0,LIT,0,0) // X block representing GMP 0 - AN=1, AM=0, val=immaterial
25+
CREBLOCKATOMGMP(X1,LIT,1,1) // X block representing GMP 1 - AN=1, AM=1, val=1
26+
struct Bxnum0 {I hdr[AKXR(0)/SZI]; X v[1];};
27+
#define CREBLOCKATOMXNUM(name,v) struct Bxnum0 __attribute__((aligned(CACHELINESIZE))) B##name={{AKXR(0),XNUM&TRAVERSIBLE,0,XNUM,ACPERMANENT,1,0},{(X)B##v}};
28+
CREBLOCKATOMXNUM(xnum1,X1) // XNUM block for 1
2229
#define CREBLOCKVEC0(name,t) I __attribute__((aligned(CACHELINESIZE))) B##name[8]={8*SZI,(t)&TRAVERSIBLE,0,(t),ACPERMANENT,0,1,0}; // no padding at end - no atoms should be referenced
2330
CREBLOCKVEC0(aqq,LIT) // ''
2431
CREBLOCKVEC0(mtv,B01) // i.0 boolean

jsrc/j.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1237,7 +1237,7 @@ if(likely(!((I)jtinplace&JTWILLBEOPENED)))z=EPILOGNORET(z); RETF(z); \
12371237
#define GAE0(v,t,n,r,erraction) {HISTOCALL if(unlikely(!(v=jtga0(jt,(I)(t),(I)(r),(I)(n)))))erraction; AN(v)=(n);} // used when shape=0 and rank is never 1 or will always be filled in by user even if rank 1
12381238
#endif
12391239
#define GAE(v,t,n,r,s,erraction) {GAE0(v,t,n,r,erraction) MCISH(AS(v),(I*)(s),(r))} // error action
1240-
#define GA00(v,t,n,r) {GAE0(v,t,n,r,R 0)} // used when rank will always be filled in by user. Default error action is to exit
1240+
#define GA00(v,t,n,r) {GAE0(v,t,n,r,R 0)} // used when shape will always be filled in by user. Default error action is to exit
12411241
#define GA(v,t,n,r,s) {GA00(v,t,n,r) MCISH(AS(v),(I*)(s),(r))} // s points to shape
12421242
#define GA0(v,t,n,r) {GA00(v,t,n,r) *((r)==1?AS(v):jt->shapesink)=(n);} // used when shape=0 but rank may be 1 and must fill in with AN if so - never for sparse blocks
12431243
#define GA10(v,t,n) {GA00(v,t,n,1) AS(v)[0]=(n);} // used when rank is known to be 1
@@ -1906,12 +1906,18 @@ if(likely(type _i<3)){z=(I)&oneone; z=type _i>1?(I)_zzt:z; _zzt=type _i<1?(I*)z:
19061906
#define LGSZS 1 // lg (bytes in an S)
19071907

19081908
#if (C_AVX2&&SY_64) || EMU_AVX2
1909-
// create double-precision product of inputs
1909+
// create double-precision product of inputs. outhi must not be an input; outlo can
19101910
#define TWOPROD(in0,in1,outhi,outlo) outhi=_mm256_mul_pd(in0,in1); outlo=_mm256_fmsub_pd(in0,in1,outhi);
1911-
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. Needs t and signbit.
1911+
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. outlo must not be an input. Needs sgnbit.
19121912
#define TWOSUM(in0,in1,outhi,outlo) {__m256d t=_mm256_andnot_pd(sgnbit,in0); outlo=_mm256_andnot_pd(sgnbit,in1); t=_mm256_sub_pd(t,outlo); \
1913-
outlo=_mm256_blendv_pd(in0,in1,t); t=_mm256_blendv_pd(in1,in0,t); \
1914-
outhi=_mm256_add_pd(in0,in1); outlo=_mm256_sub_pd(outlo,outhi); outlo=_mm256_add_pd(outlo,t);} // 1 if in1 larger; select outlo=max t=min
1913+
outlo=_mm256_blendv_pd(in0,in1,t); t=_mm256_blendv_pd(in1,in0,t); /* outlo=val with larger abs t=val with smaller abs */ \
1914+
outhi=_mm256_add_pd(in0,in1); /* single-prec sum */ \
1915+
outlo=_mm256_sub_pd(outlo,outhi); /* big-(big+small): implied val of -small after rounding */ \
1916+
outlo=_mm256_add_pd(outlo,t);} // amt by which actual value exceeds implied: this is the lost low precision
1917+
// Same, but we know which argument is bigger. outhi cannot be an input; outlo can be the same as inbig
1918+
#define TWOSUMBS(inbig,insmall,outhi,outlo) {outhi=_mm256_add_pd(inbig,insmall); /* single-prec sum */ \
1919+
outlo=_mm256_sub_pd(inbig,outhi); /* big-(big+small): implied val of -small after rounding */ \
1920+
outlo=_mm256_add_pd(outlo,insmall);} // amt by which actual value exceeds implied: this is the lost low precision
19151921
#define DPADD(hi0,lo0,hi1,lo1,outhi,outlo) outhi=_mm256_add_pd(hi0,hi1); outlo=_mm256_add_pd(lo0,lo1);
19161922
#else
19171923
#define TWOSPLIT(a,x,y) y=(a)*134217730.0; x=y-(a); x=y-x; y=(a)-x; // must avoid compiler tuning

jsrc/ja.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,10 @@
428428
#define fplus(x,y) jtfplus(jt,(x),(y))
429429
#define fpoly(x,y) jtfpoly(jt,(x),(y))
430430
#define fpolyc(x) jtfpolyc(jt,(x))
431-
#define fr(x) {if(likely((x)!=0)){I Zs = AC(x); if(likely(!ACISPERM(Zs))){if(likely(--Zs<=0))mf(x);else AC(x)=Zs;}}} // use fr for known nonrecursives, and for locales
431+
#define gmpmfree(x) {I allocsize = AN(x)+AKXR(0); jt->bytes-=allocsize; jt->malloctotal-=allocsize; jt->mfreegenallo-=allocsize; free(x);}
432+
#define frcommon(x,f) {if(likely((x)!=0)){I Zs = AC(x); if(likely(!ACISPERM(Zs))){if(likely(--Zs<=0)){f(x);}else AC(x)=Zs;}}} // use fr for known nonrecursives, and for locales
433+
#define fr(x) frcommon(x,mf)
434+
#define frgmp(x) frcommon(x,gmpmfree) // to free GMP blocks
432435
#define fram(x0,x1,x2,x3,x4) jtfram(jt,(x0),(x1),(x2),(x3),(x4))
433436
#define from(x,y) jtfrom(jt,(x),(y))
434437
#define frombs(x,y) jtfrombs(jt,(x),(y))

jsrc/je.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ extern F2(jtdomainerr2);
448448
extern F2(jtdot);
449449
extern F2(jtdrop);
450450
extern F2(jtebar);
451+
extern F2(jtekupdate);
451452
extern F2(jteps);
452453
extern F2(jtetoiso8601);
453454
extern F2(jtiso8601toe);

jsrc/jtype.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,6 +1006,7 @@ typedef struct {
10061006
A cachedref; // for namerefs ('name'~), the cached value, or 0 if not cached
10071007
AF fork2hfn; // for dyad fork that is NOT a comparison combination or jtintersect, the function to call to process h (might be in h@][)
10081008
I forcetask; // for t., the flags extracted from n. Bits 0-7=thread pool; bit 8=worker thread only
1009+
I fittype; // for u!.t where t is a code, its value is stored here in the CFIT block
10091010
} lu1; // this is the high-use stuff in the second cacheline
10101011
};
10111012
} localuse; // always 16 bytes, 4 I4s

jsrc/m.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -976,7 +976,8 @@ void jtfamftrav(J jt,AD* RESTRICT wd,I t){I n=AN(wd);
976976
}else if(t&SYMB){wd=jtfreesymtab(jt,wd,AR(wd)); // SYMB is used as a flag; we test here AFTER NAME and ADV which are lower bits
977977
} else if(t&(RAT|XNUM|XD)) {A* RESTRICT v=AAV(wd);
978978
// single-level indirect forms. handle each block
979-
DQ(t&RAT?2*n:n, if(*v)fr(*v); ++v;);
979+
DQ(t&RAT?2*n:n, if(*v)if(AT(*v)&LIT){frgmp(*v);}else fr(*v); ++v;);
980+
// obsolete DQ(t&RAT?2*n:n, if(*v)fr(*v); ++v;);
980981
}else if(ISSPARSE(t)){P* RESTRICT v=PAV(wd);
981982
fana(SPA(v,a)); fana(SPA(v,e)); fana(SPA(v,i)); fana(SPA(v,x));
982983
// for sparse, decrement the usecount

0 commit comments

Comments
 (0)