Skip to content

Commit 104626d

Browse files
authored
Merge pull request #1624 from GillesDuvert/improvements_transpose
Improvements for transpose, and more.
2 parents 907e95f + 0757621 commit 104626d

8 files changed

+66
-74
lines changed

src/basegdl.cpp

+26-12
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "basegdl.hpp"
2121
#include "nullgdl.hpp"
22+
#include "objects.hpp"
2223

2324
using namespace std;
2425

@@ -843,16 +844,29 @@ void GDLDelete( BaseGDL* toDelete)
843844
}
844845
int GDL_NTHREADS=1;
845846

846-
int parallelize(SizeT n, int modifier) {
847-
//below, please modify if you find a way to persuade behaviour of those different cases to be better if they return different number of threads.
848-
switch(modifier)
849-
{
850-
case TP_DEFAULT: //the same as IDL, reserved for routines that use the thread pool, ideally check the special thread pool keywords.
851-
case TP_ARRAY_INITIALISATION: // used by GDL array initialisation (new, convert, gdlarray): probably needs som special tuning
852-
case TP_MEMORY_ACCESS: // concurrent memory access, probably needs to be capped to preserve bandwidth
853-
case TP_CPU_INTENSIVE: // benefit from max number of threads
854-
return (n >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS >= n))?CpuTPOOL_NTHREADS:1;
855-
default:
856-
return 1;
857-
}
847+
int parallelize(SizeT nEl, int modifier) {
848+
int nThreads = (nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS >= nEl)) ? CpuTPOOL_NTHREADS : 1;
849+
if (useSmartTpool) {
850+
//below, please modify if you find a way to persuade behaviour of those different cases to be better if they return different number of threads.
851+
switch (modifier) {
852+
case TP_DEFAULT: //the same as IDL, reserved for routines that use the thread pool, ideally check the special thread pool keywords.
853+
case TP_ARRAY_INITIALISATION: // used by GDL array initialisation (new, convert, gdlarray): need to concern only 1 thread/code which is not possible AFAIK.
854+
case TP_MEMORY_ACCESS: // concurrent memory access, probably needs to be capped to preserve bandwidth
855+
{
856+
if (nThreads == 1) return nThreads;
857+
// here we have more than 1 thread, so nEl operations will be divided between nThreads threads. It becomes inefficient if nThreads is large, as starting so many threads gives diminishing returns.
858+
// I propose to enable as many threads as necessary so that each thread will compute at least CpuTPOOL_MIN_ELTS:
859+
if (CpuTPOOL_MIN_ELTS < 1) return CpuTPOOL_NTHREADS; // the user did not understand IDL's doc about threadpools?
860+
int nchunk = nEl / CpuTPOOL_MIN_ELTS;
861+
nchunk++; //to be sure
862+
if (nThreads > nchunk) nThreads = nchunk;
863+
// std::cerr << nThreads;
864+
return nThreads;
865+
}
866+
case TP_CPU_INTENSIVE: // benefit from max number of threads if possible given MIN and MAX elts etc
867+
return nThreads;
868+
default:
869+
return 1;
870+
}
871+
} else return nThreads;
858872
}

src/basic_fun.cpp

-44
Original file line numberDiff line numberDiff line change
@@ -4091,50 +4091,6 @@ namespace lib {
40914091
}
40924092

40934093

4094-
// BaseGDL* matrix_multiply( EnvT* e)
4095-
// {
4096-
// SizeT nParam=e->NParam( 2);
4097-
//
4098-
// BaseGDL* a = e->GetNumericArrayParDefined( 0);
4099-
// BaseGDL* b = e->GetNumericArrayParDefined( 1);
4100-
//
4101-
// static int aTIx = e->KeywordIx("ATRANSPOSE");
4102-
// bool aT = e->KeywordPresent(aTIx);
4103-
// static int bTIx = e->KeywordIx("BTRANSPOSE");
4104-
// bool bT = e->KeywordPresent(bTIx);
4105-
//
4106-
// static int strassenIx = e->KeywordIx("STRASSEN_ALGORITHM");
4107-
// bool strassen = e->KeywordPresent(strassenIx);
4108-
//
4109-
//
4110-
// if( p1->N_Elements() != rank)
4111-
// e->Throw("Incorrect number of elements in permutation.");
4112-
//
4113-
// DUInt* perm = new DUInt[rank];
4114-
// Guard<DUInt> perm_guard( perm);
4115-
//
4116-
// DUIntGDL* p1L = static_cast<DUIntGDL*>
4117-
// (p1->Convert2( GDL_UINT, BaseGDL::COPY));
4118-
// for( SizeT i=0; i<rank; ++i) perm[i] = (*p1L)[ i];
4119-
// delete p1L;
4120-
//
4121-
// // check permutaion vector
4122-
// for( SizeT i=0; i<rank; ++i)
4123-
// {
4124-
// DUInt j;
4125-
// for( j=0; j<rank; ++j) if( perm[j] == i) break;
4126-
// if (j == rank)
4127-
// e->Throw( "Incorrect permutation vector.");
4128-
// }
4129-
// return p0->Transpose( perm);
4130-
// }
4131-
//
4132-
// return a->Transpose( NULL);
4133-
// }
4134-
4135-
// helper function for sort_fun, recursive
4136-
// optimized version
4137-
41384094
template< typename IndexT>
41394095
void MergeSortOpt(BaseGDL* p0, IndexT* hhS, IndexT* h1, IndexT* h2,
41404096
SizeT len) {

src/datatypes.cpp

+14-8
Original file line numberDiff line numberDiff line change
@@ -1320,13 +1320,16 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__
13201320
for (SizeT d = 0; d < rank; ++d) {
13211321
resDim[ d] = this->dim[ perm[ d]];
13221322
}
1323-
1323+
13241324
Data_* res = new Data_(dimension(resDim, rank), BaseGDL::NOZERO);
13251325

13261326
// src stride
13271327
SizeT srcStride[ MAXRANK + 1];
13281328
this->dim.Stride(srcStride, rank);
1329-
1329+
1330+
// GD: Tests show that we are way faster than eigen (below) with our 'parallel' method in ALL CASES on my intel I7.
1331+
// But this may not be true on other platforms, so keep the possibility via a -- switch.
1332+
if (useEigenForTransposeOps) {
13301333
#ifdef USE_EIGEN
13311334
//for some reason, this simple eigen::code dos not like dimensions == 1, so cannot be used if this is the case.
13321335
bool try_eigen=true;
@@ -1344,6 +1347,7 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__
13441347
return res;
13451348
}
13461349
#endif
1350+
13471351
#ifdef EIGEN_HAS_TENSOR
13481352
else if (try_eigen && rank == 3) // special case: eigen x 3
13491353
{
@@ -1391,11 +1395,13 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__
13911395

13921396
#endif
13931397

1398+
} //will have returned if eigen ops exist.
1399+
13941400
SizeT nElem = dd.size();
13951401
long chunksize = nElem;
13961402
long nchunk = 1;
13971403
bool do_parallel = false;
1398-
GDL_NTHREADS=parallelize( nElem, TP_MEMORY_ACCESS);
1404+
GDL_NTHREADS=parallelize( nElem, TP_CPU_INTENSIVE);
13991405
if (GDL_NTHREADS > 1) { //no use start parallel threading for small numbers.
14001406
chunksize = nElem / GDL_NTHREADS;
14011407
nchunk = nElem / chunksize;
@@ -1485,7 +1491,7 @@ void Data_<Sp>::Reverse(DLong dim) { TRACE_ROUTINE(__FUNCTION__,__FILE__,__LINE_
14851491
if (this->dim[dim]%2) halfDim++;
14861492
SizeT outerStride = this->dim.Stride(dim + 1);
14871493
SizeT span=outerStride - revStride;
1488-
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS))==1) { //most frequent
1494+
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE))==1) { //most frequent
14891495
for (SizeT o = 0; o < nEl; o += outerStride) {
14901496
for (SizeT i = o; i < o+revStride; ++i) {
14911497
for (SizeT s = i, opp=span+i; s < halfDim+i ; s += revStride, opp-=revStride) {
@@ -1523,7 +1529,7 @@ BaseGDL* Data_<Sp>::DupReverse(DLong dim) { TRACE_ROUTINE(__FUNCTION__,__FILE__,
15231529
if (this->dim[dim]%2) halfDim++;
15241530
SizeT outerStride = this->dim.Stride(dim + 1);
15251531
SizeT span=outerStride - revStride;
1526-
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS))==1) { //most frequent
1532+
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE))==1) { //most frequent
15271533
for (SizeT o = 0; o < nEl; o += outerStride) {
15281534
for (SizeT i = o; i < o+revStride; ++i) {
15291535
for (SizeT s = i, opp=span+i; s < halfDim+i ; s += revStride, opp-=revStride) {
@@ -1563,7 +1569,7 @@ BaseGDL* Data_<SpDPtr>::DupReverse(DLong dim) {
15631569
if (this->dim[dim] % 2) halfDim++;
15641570
SizeT outerStride = this->dim.Stride(dim + 1);
15651571
SizeT span = outerStride - revStride;
1566-
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS)) == 1) { //most frequent
1572+
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE)) == 1) { //most frequent
15671573
for (SizeT o = 0; o < nEl; o += outerStride) {
15681574
for (SizeT i = o; i < o + revStride; ++i) {
15691575
for (SizeT s = i, opp = span + i; s < halfDim + i; s += revStride, opp -= revStride) {
@@ -1605,7 +1611,7 @@ BaseGDL* Data_<SpDObj>::DupReverse(DLong dim) {
16051611
if (this->dim[dim] % 2) halfDim++;
16061612
SizeT outerStride = this->dim.Stride(dim + 1);
16071613
SizeT span = outerStride - revStride;
1608-
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS)) == 1) { //most frequent
1614+
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE)) == 1) { //most frequent
16091615
for (SizeT o = 0; o < nEl; o += outerStride) {
16101616
for (SizeT i = o; i < o + revStride; ++i) {
16111617
for (SizeT s = i, opp = span + i; s < halfDim + i; s += revStride, opp -= revStride) {
@@ -3817,7 +3823,7 @@ void Data_<Sp>::CatInsert (const Data_* srcArr, const SizeT atDim, SizeT& at)
38173823
SizeT gap = this->dim.Stride (atDim + 1); // dest array
38183824

38193825
//GD: speed up by using indexing that permit parallel and collapse.
3820-
if ((GDL_NTHREADS=parallelize( len*nCp, TP_MEMORY_ACCESS))==1) { //most frequent
3826+
if ((GDL_NTHREADS=parallelize( len*nCp, TP_CPU_INTENSIVE))==1) { //most frequent
38213827
for (OMPInt c = 0; c < nCp; ++c) {
38223828
for (SizeT destIx = 0; destIx < len; destIx++) (*this)[destIx + destStart + c * gap] = (*srcArr)[ destIx + c * len];
38233829
}

src/gdl.cpp

+13-3
Original file line numberDiff line numberDiff line change
@@ -368,14 +368,16 @@ int main(int argc, char *argv[])
368368
cerr << " --sloppy Sets the traditional (default) compiling option where \"()\" can be used both with functions and arrays." << endl;
369369
cerr << " Needed to counteract temporarily the effect of the enviromnment variable \"GDL_IS_FUSSY\"." << endl;
370370
cerr << " --MAC Graphic device will be called 'MAC' on MacOSX. (default: 'X')" << endl;
371-
cerr << " --no-use-wx Tells GDL not to use WxWidgets graphics." << endl;
371+
cerr << " [--no-use-wx | -X] Tells GDL not to use WxWidgets graphics and resort to X11 (if available)." << endl;
372372
cerr << " Also enabled by setting the environment variable GDL_DISABLE_WX_PLOTS to a non-null value." << endl;
373373
cerr << " --notebook Force SVG-only device, used only when GDL is a Python Notebook Kernel." << endl;
374374
cerr << " --widget-compat Tells GDL to use a default (rather ugly) fixed pitch font for compatiblity with IDL widgets." << endl;
375375
cerr << " Also enabled by setting the environment variable GDL_WIDGET_COMPAT to a non-null value." << endl;
376-
cerr << " Using this option may render some historical widgets unworkable (as they are based on fixed sizes)." << endl;
376+
cerr << " Using this option may render some historical widgets more readable (as they are based on fixed sizes)." << endl;
377377
cerr << " --no-dSFMT Tells GDL not to use double precision SIMD oriented Fast Mersenne Twister(dSFMT) for random doubles." << endl;
378378
cerr << " Also disable by setting the environment variable GDL_NO_DSFMT to a non-null value." << endl;
379+
cerr << " --with-eigen-transpose lets GDL use Eigen::transpose and related functions instead of our accelerated transpose function. Normally slower." <<endl;
380+
cerr << " --smart-tpool switch to a mode where the number of threads is adaptive (experimental). Should enable better perfs on many core machines." <<endl;
379381
#ifdef _WIN32
380382
cerr << " --posix (Windows only): paths will be posix paths (experimental)." << endl;
381383
#endif
@@ -483,10 +485,18 @@ int main(int argc, char *argv[])
483485
{
484486
usePlatformDeviceName = true;
485487
}
486-
else if (string(argv[a]) == "--no-use-wx")
488+
else if (string(argv[a]) == "--no-use-wx" | string(argv[a]) == "-X")
487489
{
488490
force_no_wxgraphics = true;
489491
}
492+
else if (string(argv[a]) == "--with-eigen-transpose")
493+
{
494+
useEigenForTransposeOps = true;
495+
}
496+
else if (string(argv[a]) == "--smart-tpool")
497+
{
498+
useSmartTpool = true;
499+
}
490500
else if (string(argv[a]) == "--notebook")
491501
{
492502
iAmANotebook = true;

src/math_fun_jmg.cpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,7 @@ namespace lib {
927927
}
928928

929929
/* Double loop on the output image */
930-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
930+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
931931
for (OMPInt j = 0; j < nRows; ++j) {
932932
for (OMPInt i = 0; i < nCols; ++i) {
933933
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
@@ -1027,7 +1027,7 @@ namespace lib {
10271027
}
10281028

10291029
/* Double loop on the output image */
1030-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1030+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
10311031
for (OMPInt j = 0; j < nRows; ++j) {
10321032
for (OMPInt i = 0; i < nCols; ++i) {
10331033
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
@@ -1225,7 +1225,7 @@ namespace lib {
12251225
}
12261226

12271227
/* Double loop on the output image */
1228-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1228+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
12291229
for (OMPInt j = 0; j < nRows; ++j) {
12301230
for (OMPInt i = 0; i < nCols; ++i) {
12311231
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
@@ -1373,7 +1373,7 @@ namespace lib {
13731373
}
13741374

13751375
/* Double loop on the output image */
1376-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1376+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
13771377
for (OMPInt j = 0; j < nRows; ++j) {
13781378
for (OMPInt i = 0; i < nCols; ++i) {
13791379
// Compute the original source for this pixel, note order of j and i.
@@ -1485,7 +1485,7 @@ namespace lib {
14851485
}
14861486

14871487
/* Double loop on the output image */
1488-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1488+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
14891489
for (OMPInt j = 0; j < nRows; ++j) {
14901490
for (OMPInt i = 0; i < nCols; ++i) {
14911491
// Compute the original source for this pixel, note order of j and i.
@@ -1691,7 +1691,7 @@ namespace lib {
16911691
}
16921692

16931693
/* Double loop on the output image */
1694-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1694+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
16951695
for (OMPInt j = 0; j < nRows; ++j) {
16961696
for (OMPInt i = 0; i < nCols; ++i) {
16971697
// Compute the original source for this pixel, note order of j and i.

src/minmax_include.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444

4545

4646
SizeT nElem = (stop - start) / step;
47-
GDL_NTHREADS=parallelize( nElem);
47+
GDL_NTHREADS=parallelize( nElem, TP_CPU_INTENSIVE);
4848
//trap existence of ABSFUNC and create something that withstands cppcheck usage (needed by continuous integration scripts!)
4949
#ifndef ABSFUNC
5050
#define FUNCABS

src/objects.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ volatile bool tryToMimicOriginalWidgets;
114114
volatile bool useLocalDrivers;
115115
//do we favor SIMD-accelerated random number generation?
116116
volatile bool useDSFMTAcceleration;
117+
//Transpose() operations are faster with our method, but setting this may test if this is still true for future Eigen:: versions or platforms.
118+
volatile bool useEigenForTransposeOps=false;
119+
//experimental TPOOL use adaptive number of threads.
120+
volatile bool useSmartTpool=false;
117121

118122
void ResetObjects()
119123
{

src/objects.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ extern volatile bool useDSFMTAcceleration;
8686
//do we use our own copy of (better?) drivers?
8787
extern volatile bool useLocalDrivers;
8888
extern volatile bool usePlatformDeviceName;
89+
extern volatile bool useEigenForTransposeOps;
90+
extern volatile bool useSmartTpool;
8991
extern int debugMode;
9092

9193
enum DebugCode {

0 commit comments

Comments
 (0)