Skip to content

Commit 104626d

Browse files
authored
Merge pull request #1624 from GillesDuvert/improvements_transpose
Improvements for transpose, and more.
2 parents 907e95f + 0757621 commit 104626d

8 files changed

+66
-74
lines changed

src/basegdl.cpp

+26-12
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "basegdl.hpp"
2121
#include "nullgdl.hpp"
22+
#include "objects.hpp"
2223

2324
using namespace std;
2425

@@ -843,16 +844,29 @@ void GDLDelete( BaseGDL* toDelete)
843844
}
844845
int GDL_NTHREADS=1;
845846

846-
int parallelize(SizeT n, int modifier) {
847-
//below, please modify if you find a way to persuade behaviour of those different cases to be better if they return different number of threads.
848-
switch(modifier)
849-
{
850-
case TP_DEFAULT: //the same as IDL, reserved for routines that use the thread pool, ideally check the special thread pool keywords.
851-
case TP_ARRAY_INITIALISATION: // used by GDL array initialisation (new, convert, gdlarray): probably needs som special tuning
852-
case TP_MEMORY_ACCESS: // concurrent memory access, probably needs to be capped to preserve bandwidth
853-
case TP_CPU_INTENSIVE: // benefit from max number of threads
854-
return (n >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS >= n))?CpuTPOOL_NTHREADS:1;
855-
default:
856-
return 1;
857-
}
847+
int parallelize(SizeT nEl, int modifier) {
848+
int nThreads = (nEl >= CpuTPOOL_MIN_ELTS && (CpuTPOOL_MAX_ELTS == 0 || CpuTPOOL_MAX_ELTS >= nEl)) ? CpuTPOOL_NTHREADS : 1;
849+
if (useSmartTpool) {
850+
//below, please modify if you find a way to persuade behaviour of those different cases to be better if they return different number of threads.
851+
switch (modifier) {
852+
case TP_DEFAULT: //the same as IDL, reserved for routines that use the thread pool, ideally check the special thread pool keywords.
853+
case TP_ARRAY_INITIALISATION: // used by GDL array initialisation (new, convert, gdlarray): need to concern only 1 thread/code which is not possible AFAIK.
854+
case TP_MEMORY_ACCESS: // concurrent memory access, probably needs to be capped to preserve bandwidth
855+
{
856+
if (nThreads == 1) return nThreads;
857+
// here we have more than 1 thread, so nEl operations will be divided between nThreads threads. It becomes inefficient if nThreads is large, as starting so many threads gives diminishing returns.
858+
// I propose to enable as many threads as necessary so that each thread will compute at least CpuTPOOL_MIN_ELTS:
859+
if (CpuTPOOL_MIN_ELTS < 1) return CpuTPOOL_NTHREADS; // the user did not understand IDL's doc about threadpools?
860+
int nchunk = nEl / CpuTPOOL_MIN_ELTS;
861+
nchunk++; //to be sure
862+
if (nThreads > nchunk) nThreads = nchunk;
863+
// std::cerr << nThreads;
864+
return nThreads;
865+
}
866+
case TP_CPU_INTENSIVE: // benefit from max number of threads if possible given MIN and MAX elts etc
867+
return nThreads;
868+
default:
869+
return 1;
870+
}
871+
} else return nThreads;
858872
}

src/basic_fun.cpp

-44
Original file line numberDiff line numberDiff line change
@@ -4091,50 +4091,6 @@ namespace lib {
40914091
}
40924092

40934093

4094-
// BaseGDL* matrix_multiply( EnvT* e)
4095-
// {
4096-
// SizeT nParam=e->NParam( 2);
4097-
//
4098-
// BaseGDL* a = e->GetNumericArrayParDefined( 0);
4099-
// BaseGDL* b = e->GetNumericArrayParDefined( 1);
4100-
//
4101-
// static int aTIx = e->KeywordIx("ATRANSPOSE");
4102-
// bool aT = e->KeywordPresent(aTIx);
4103-
// static int bTIx = e->KeywordIx("BTRANSPOSE");
4104-
// bool bT = e->KeywordPresent(bTIx);
4105-
//
4106-
// static int strassenIx = e->KeywordIx("STRASSEN_ALGORITHM");
4107-
// bool strassen = e->KeywordPresent(strassenIx);
4108-
//
4109-
//
4110-
// if( p1->N_Elements() != rank)
4111-
// e->Throw("Incorrect number of elements in permutation.");
4112-
//
4113-
// DUInt* perm = new DUInt[rank];
4114-
// Guard<DUInt> perm_guard( perm);
4115-
//
4116-
// DUIntGDL* p1L = static_cast<DUIntGDL*>
4117-
// (p1->Convert2( GDL_UINT, BaseGDL::COPY));
4118-
// for( SizeT i=0; i<rank; ++i) perm[i] = (*p1L)[ i];
4119-
// delete p1L;
4120-
//
4121-
// // check permutaion vector
4122-
// for( SizeT i=0; i<rank; ++i)
4123-
// {
4124-
// DUInt j;
4125-
// for( j=0; j<rank; ++j) if( perm[j] == i) break;
4126-
// if (j == rank)
4127-
// e->Throw( "Incorrect permutation vector.");
4128-
// }
4129-
// return p0->Transpose( perm);
4130-
// }
4131-
//
4132-
// return a->Transpose( NULL);
4133-
// }
4134-
4135-
// helper function for sort_fun, recursive
4136-
// optimized version
4137-
41384094
template< typename IndexT>
41394095
void MergeSortOpt(BaseGDL* p0, IndexT* hhS, IndexT* h1, IndexT* h2,
41404096
SizeT len) {

src/datatypes.cpp

+14-8
Original file line numberDiff line numberDiff line change
@@ -1320,13 +1320,16 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__
13201320
for (SizeT d = 0; d < rank; ++d) {
13211321
resDim[ d] = this->dim[ perm[ d]];
13221322
}
1323-
1323+
13241324
Data_* res = new Data_(dimension(resDim, rank), BaseGDL::NOZERO);
13251325

13261326
// src stride
13271327
SizeT srcStride[ MAXRANK + 1];
13281328
this->dim.Stride(srcStride, rank);
1329-
1329+
1330+
// GD: Tests show that we are way faster than eigen (below) with our 'parallel' method in ALL CASES on my intel I7.
1331+
// But this may not be true on other platforms, so keep the possibility via a -- switch.
1332+
if (useEigenForTransposeOps) {
13301333
#ifdef USE_EIGEN
13311334
//for some reason, this simple eigen::code dos not like dimensions == 1, so cannot be used if this is the case.
13321335
bool try_eigen=true;
@@ -1344,6 +1347,7 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__
13441347
return res;
13451348
}
13461349
#endif
1350+
13471351
#ifdef EIGEN_HAS_TENSOR
13481352
else if (try_eigen && rank == 3) // special case: eigen x 3
13491353
{
@@ -1391,11 +1395,13 @@ BaseGDL* Data_<Sp>::Transpose(DUInt* perm) { TRACE_ROUTINE(__FUNCTION__,__FILE__
13911395

13921396
#endif
13931397

1398+
} //will have returned if eigen ops exist.
1399+
13941400
SizeT nElem = dd.size();
13951401
long chunksize = nElem;
13961402
long nchunk = 1;
13971403
bool do_parallel = false;
1398-
GDL_NTHREADS=parallelize( nElem, TP_MEMORY_ACCESS);
1404+
GDL_NTHREADS=parallelize( nElem, TP_CPU_INTENSIVE);
13991405
if (GDL_NTHREADS > 1) { //no use start parallel threading for small numbers.
14001406
chunksize = nElem / GDL_NTHREADS;
14011407
nchunk = nElem / chunksize;
@@ -1485,7 +1491,7 @@ void Data_<Sp>::Reverse(DLong dim) { TRACE_ROUTINE(__FUNCTION__,__FILE__,__LINE_
14851491
if (this->dim[dim]%2) halfDim++;
14861492
SizeT outerStride = this->dim.Stride(dim + 1);
14871493
SizeT span=outerStride - revStride;
1488-
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS))==1) { //most frequent
1494+
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE))==1) { //most frequent
14891495
for (SizeT o = 0; o < nEl; o += outerStride) {
14901496
for (SizeT i = o; i < o+revStride; ++i) {
14911497
for (SizeT s = i, opp=span+i; s < halfDim+i ; s += revStride, opp-=revStride) {
@@ -1523,7 +1529,7 @@ BaseGDL* Data_<Sp>::DupReverse(DLong dim) { TRACE_ROUTINE(__FUNCTION__,__FILE__,
15231529
if (this->dim[dim]%2) halfDim++;
15241530
SizeT outerStride = this->dim.Stride(dim + 1);
15251531
SizeT span=outerStride - revStride;
1526-
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS))==1) { //most frequent
1532+
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE))==1) { //most frequent
15271533
for (SizeT o = 0; o < nEl; o += outerStride) {
15281534
for (SizeT i = o; i < o+revStride; ++i) {
15291535
for (SizeT s = i, opp=span+i; s < halfDim+i ; s += revStride, opp-=revStride) {
@@ -1563,7 +1569,7 @@ BaseGDL* Data_<SpDPtr>::DupReverse(DLong dim) {
15631569
if (this->dim[dim] % 2) halfDim++;
15641570
SizeT outerStride = this->dim.Stride(dim + 1);
15651571
SizeT span = outerStride - revStride;
1566-
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS)) == 1) { //most frequent
1572+
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE)) == 1) { //most frequent
15671573
for (SizeT o = 0; o < nEl; o += outerStride) {
15681574
for (SizeT i = o; i < o + revStride; ++i) {
15691575
for (SizeT s = i, opp = span + i; s < halfDim + i; s += revStride, opp -= revStride) {
@@ -1605,7 +1611,7 @@ BaseGDL* Data_<SpDObj>::DupReverse(DLong dim) {
16051611
if (this->dim[dim] % 2) halfDim++;
16061612
SizeT outerStride = this->dim.Stride(dim + 1);
16071613
SizeT span = outerStride - revStride;
1608-
if ((GDL_NTHREADS=parallelize(nEl, TP_MEMORY_ACCESS)) == 1) { //most frequent
1614+
if ((GDL_NTHREADS=parallelize(nEl, TP_CPU_INTENSIVE)) == 1) { //most frequent
16091615
for (SizeT o = 0; o < nEl; o += outerStride) {
16101616
for (SizeT i = o; i < o + revStride; ++i) {
16111617
for (SizeT s = i, opp = span + i; s < halfDim + i; s += revStride, opp -= revStride) {
@@ -3817,7 +3823,7 @@ void Data_<Sp>::CatInsert (const Data_* srcArr, const SizeT atDim, SizeT& at)
38173823
SizeT gap = this->dim.Stride (atDim + 1); // dest array
38183824

38193825
//GD: speed up by using indexing that permit parallel and collapse.
3820-
if ((GDL_NTHREADS=parallelize( len*nCp, TP_MEMORY_ACCESS))==1) { //most frequent
3826+
if ((GDL_NTHREADS=parallelize( len*nCp, TP_CPU_INTENSIVE))==1) { //most frequent
38213827
for (OMPInt c = 0; c < nCp; ++c) {
38223828
for (SizeT destIx = 0; destIx < len; destIx++) (*this)[destIx + destStart + c * gap] = (*srcArr)[ destIx + c * len];
38233829
}

src/gdl.cpp

+13-3
Original file line numberDiff line numberDiff line change
@@ -368,14 +368,16 @@ int main(int argc, char *argv[])
368368
cerr << " --sloppy Sets the traditional (default) compiling option where \"()\" can be used both with functions and arrays." << endl;
369369
cerr << " Needed to counteract temporarily the effect of the enviromnment variable \"GDL_IS_FUSSY\"." << endl;
370370
cerr << " --MAC Graphic device will be called 'MAC' on MacOSX. (default: 'X')" << endl;
371-
cerr << " --no-use-wx Tells GDL not to use WxWidgets graphics." << endl;
371+
cerr << " [--no-use-wx | -X] Tells GDL not to use WxWidgets graphics and resort to X11 (if available)." << endl;
372372
cerr << " Also enabled by setting the environment variable GDL_DISABLE_WX_PLOTS to a non-null value." << endl;
373373
cerr << " --notebook Force SVG-only device, used only when GDL is a Python Notebook Kernel." << endl;
374374
cerr << " --widget-compat Tells GDL to use a default (rather ugly) fixed pitch font for compatiblity with IDL widgets." << endl;
375375
cerr << " Also enabled by setting the environment variable GDL_WIDGET_COMPAT to a non-null value." << endl;
376-
cerr << " Using this option may render some historical widgets unworkable (as they are based on fixed sizes)." << endl;
376+
cerr << " Using this option may render some historical widgets more readable (as they are based on fixed sizes)." << endl;
377377
cerr << " --no-dSFMT Tells GDL not to use double precision SIMD oriented Fast Mersenne Twister(dSFMT) for random doubles." << endl;
378378
cerr << " Also disable by setting the environment variable GDL_NO_DSFMT to a non-null value." << endl;
379+
cerr << " --with-eigen-transpose lets GDL use Eigen::transpose and related functions instead of our accelerated transpose function. Normally slower." <<endl;
380+
cerr << " --smart-tpool switch to a mode where the number of threads is adaptive (experimental). Should enable better perfs on many core machines." <<endl;
379381
#ifdef _WIN32
380382
cerr << " --posix (Windows only): paths will be posix paths (experimental)." << endl;
381383
#endif
@@ -483,10 +485,18 @@ int main(int argc, char *argv[])
483485
{
484486
usePlatformDeviceName = true;
485487
}
486-
else if (string(argv[a]) == "--no-use-wx")
488+
else if (string(argv[a]) == "--no-use-wx" | string(argv[a]) == "-X")
487489
{
488490
force_no_wxgraphics = true;
489491
}
492+
else if (string(argv[a]) == "--with-eigen-transpose")
493+
{
494+
useEigenForTransposeOps = true;
495+
}
496+
else if (string(argv[a]) == "--smart-tpool")
497+
{
498+
useSmartTpool = true;
499+
}
490500
else if (string(argv[a]) == "--notebook")
491501
{
492502
iAmANotebook = true;

src/math_fun_jmg.cpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,7 @@ namespace lib {
927927
}
928928

929929
/* Double loop on the output image */
930-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
930+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
931931
for (OMPInt j = 0; j < nRows; ++j) {
932932
for (OMPInt i = 0; i < nCols; ++i) {
933933
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
@@ -1027,7 +1027,7 @@ namespace lib {
10271027
}
10281028

10291029
/* Double loop on the output image */
1030-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1030+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
10311031
for (OMPInt j = 0; j < nRows; ++j) {
10321032
for (OMPInt i = 0; i < nCols; ++i) {
10331033
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
@@ -1225,7 +1225,7 @@ namespace lib {
12251225
}
12261226

12271227
/* Double loop on the output image */
1228-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1228+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
12291229
for (OMPInt j = 0; j < nRows; ++j) {
12301230
for (OMPInt i = 0; i < nCols; ++i) {
12311231
// Compute the original source for this pixel, note order of j and i in P and Q definition of IDL doc.
@@ -1373,7 +1373,7 @@ namespace lib {
13731373
}
13741374

13751375
/* Double loop on the output image */
1376-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1376+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
13771377
for (OMPInt j = 0; j < nRows; ++j) {
13781378
for (OMPInt i = 0; i < nCols; ++i) {
13791379
// Compute the original source for this pixel, note order of j and i.
@@ -1485,7 +1485,7 @@ namespace lib {
14851485
}
14861486

14871487
/* Double loop on the output image */
1488-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1488+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
14891489
for (OMPInt j = 0; j < nRows; ++j) {
14901490
for (OMPInt i = 0; i < nCols; ++i) {
14911491
// Compute the original source for this pixel, note order of j and i.
@@ -1691,7 +1691,7 @@ namespace lib {
16911691
}
16921692

16931693
/* Double loop on the output image */
1694-
if ((GDL_NTHREADS=parallelize( nEl))==1) {
1694+
if ((GDL_NTHREADS=parallelize( nEl, TP_CPU_INTENSIVE))==1) {
16951695
for (OMPInt j = 0; j < nRows; ++j) {
16961696
for (OMPInt i = 0; i < nCols; ++i) {
16971697
// Compute the original source for this pixel, note order of j and i.

src/minmax_include.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444

4545

4646
SizeT nElem = (stop - start) / step;
47-
GDL_NTHREADS=parallelize( nElem);
47+
GDL_NTHREADS=parallelize( nElem, TP_CPU_INTENSIVE);
4848
//trap existence of ABSFUNC and create something that withstands cppcheck usage (needed by continuous integration scripts!)
4949
#ifndef ABSFUNC
5050
#define FUNCABS

src/objects.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ volatile bool tryToMimicOriginalWidgets;
114114
volatile bool useLocalDrivers;
115115
//do we favor SIMD-accelerated random number generation?
116116
volatile bool useDSFMTAcceleration;
117+
//Transpose() operations are faster with our method, but setting this may test if this is still true for future Eigen:: versions or platforms.
118+
volatile bool useEigenForTransposeOps=false;
119+
//experimental TPOOL use adaptive number of threads.
120+
volatile bool useSmartTpool=false;
117121

118122
void ResetObjects()
119123
{

src/objects.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ extern volatile bool useDSFMTAcceleration;
8686
//do we use our own copy of (better?) drivers?
8787
extern volatile bool useLocalDrivers;
8888
extern volatile bool usePlatformDeviceName;
89+
extern volatile bool useEigenForTransposeOps;
90+
extern volatile bool useSmartTpool;
8991
extern int debugMode;
9092

9193
enum DebugCode {

0 commit comments

Comments
 (0)