diff --git a/cpp/daal/src/algorithms/covariance/covariance_impl.i b/cpp/daal/src/algorithms/covariance/covariance_impl.i index 6813801297f..acdc51af770 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_impl.i +++ b/cpp/daal/src/algorithms/covariance/covariance_impl.i @@ -91,16 +91,6 @@ template class CovarianceReducer : public daal::Reducer { public: - enum ErrorCode - { - ok = 0, /// No error - memAllocationFailed = 1, /// Memory allocation failed - intOverflow = 2, /// Integer overflow - badCast = 3 /// Cannot cast base daal::Reducer to derived class - }; - /// Status of the computation. - ErrorCode errorCode; - /// Get pointer to the array of partial sums. inline algorithmFPType * sums() { return _sumsArray.get(); } /// Get pointer to the partial cross-product matrix. diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_hyperparameter_impl.h b/cpp/daal/src/algorithms/linear_model/linear_model_hyperparameter_impl.h index e2505c1b588..da26db3b623 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_hyperparameter_impl.h +++ b/cpp/daal/src/algorithms/linear_model/linear_model_hyperparameter_impl.h @@ -45,7 +45,8 @@ enum HyperparameterId denseUpdateMaxColsBatched = 1, denseSmallRowsThreshold = 2, denseSmallRowsMaxColsBatched = 3, - hyperparameterIdCount = 4 + denseGrainSize = 4, + hyperparameterIdCount = 5 }; enum DoubleHyperparameterId diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_kernel.h b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_kernel.h index 31951db9cdc..50e65dd7eb9 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_kernel.h +++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_kernel.h @@ -125,57 +125,71 @@ class FinalizeKernel static Status solveSystem(DAAL_INT p, algorithmFPType * a, DAAL_INT ny, algorithmFPType * b, const ErrorID & internalError); }; -/** - * Thread local storage used on the partial results update stage - */ template -class ThreadingTask +class LinearModelReducer : public daal::Reducer { - typedef ReadRows ReadRowsType; - public: - DAAL_NEW_DELETE(); + /// Construct and initialize the thread-local partial results + /// + /// @param[in] xTable Input data table that stores matrix X with feature vectors + /// @param[in] yTable Input data table that stores matrix Y with target values + /// @param[in] nBetasIntercept Number of trainable parameters + /// @param[in] numRowsInBlock Number of rows in the block of the input data table - a minimal number of rows to be processed by a thread + /// @param[in] numBlocks Number of blocks of rows in the input data table + LinearModelReducer(const NumericTable & xTable, const NumericTable & yTable, DAAL_INT nBetasIntercept, DAAL_INT numRowsInBlock, size_t numBlocks); - /** - * Creates thread local storage of the requested size - * \param[in] nBetasIntercept Number of colums in the partial result - * \param[in] nResponses Number of responses - * \return Pointer on the thread local storage object if the object was created successfully, NULL otherwise - */ - static ThreadingTask * create(size_t nBetasIntercept, size_t nResponses); - virtual ~ThreadingTask(); + /// New and delete operators are overloaded to use a scalable memory allocator that doesn't block threads + /// if memory allocations are executed concurrently.
+ void * operator new(size_t size); + void operator delete(void * p); - /** - * Updates local partial result with the new block of data - * \param[in] startRow Index of the starting row of the block - * \param[in] nRows Number of rows in the block of data - * \param[in] xTable Input data set of size N x P - * \param[in] yTable Input array of responses of size N x Ny - * \return Status of the computations - */ - Status update(DAAL_INT startRow, DAAL_INT nRows, const NumericTable & xTable, const NumericTable & yTable); + /// Get pointer to the partial X^tY matrix. + inline algorithmFPType * xty(); + /// Get pointer to the partial X^tX matrix. + inline algorithmFPType * xtx(); - /** - * Reduces thread local partial results into global partial result - * \param[out] xtx Global partial result of size P' x P' - * \param[out] xty Global partial result of size Ny x P' - */ - void reduce(algorithmFPType * xtx, algorithmFPType * xty); + /// Get constant pointer to the partial X^tY matrix. + inline const algorithmFPType * xty() const; + /// Get constant pointer to the partial X^tX matrix. + inline const algorithmFPType * xtx() const; -protected: - /** - * Construct thread local storage of the requested size - * \param[in] nBetasIntercept Number of colums in the partial result - * \param[in] nResponses Number of responses - * \param[out] st Status of the object construction - */ - ThreadingTask(size_t nBetasIntercept, size_t nResponses, Status & st); - algorithmFPType * _xtx; /*!< Partial result of size P' x P' */ - algorithmFPType * _xty; /*!< Partial result of size Ny x P' */ - ReadRowsType _xBlock; /*!< Object that manages memory block of the input data set */ - ReadRowsType _yBlock; /*!< Object that manages memory block of the input array of responses */ - DAAL_INT _nBetasIntercept; /*!< P' - number of columns in the partial result */ - DAAL_INT _nResponses; /*!< Ny - number of responses */ + /// Constructs a new thread-local partial result and initializes it with zeros. + /// Must be able to run concurrently with the `update` and `join` methods. + /// + /// @return Pointer to the partial result of the linear regression algorithm. + virtual ReducerUniquePtr create() const override; + + /// Updates the partial X^tX and X^tY matrices + /// from the blocks of the input data table in the sub-interval [begin, end). + /// + /// @param begin Index of the starting block of the input data table. + /// @param end Index of the block after the last one in the sub-range. + virtual void update(size_t begin, size_t end) override; + + /// Merges the partial result with the partial result from another thread. + /// + /// @param otherReducer Pointer to the other thread's partial result. + virtual void join(Reducer * otherReducer) override; + +private: + /// Reference to the input data table that stores matrix X with feature vectors. + const NumericTable & _xTable; + /// Reference to the input data table that stores matrix Y with target values. + const NumericTable & _yTable; + /// Number of features in the input data table. + DAAL_INT _nFeatures; + /// Number of targets to predict. + DAAL_INT _nResponses; + /// Number of trainable parameters (including intercept). + DAAL_INT _nBetasIntercept; + /// Number of rows in the block of the input data table - a minimal number of rows to be processed by a thread. + DAAL_INT _numRowsInBlock; + /// Number of blocks of rows in the input data table. + size_t _numBlocks; + /// Thread-local array of X^tX partial sums of size `_nBetasIntercept` * `_nBetasIntercept`.
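
For reference, the two thread-local buffers declared here hold the normal-equations partial products for the rows a thread has processed: X^tX of size nBetasIntercept x nBetasIntercept, and X^tY stored as nResponses rows of length nBetasIntercept. When an intercept is trained, X is implicitly augmented with a trailing column of ones, which is why nBetasIntercept can exceed the number of features and why the kernel keeps the extra gemm1X/gemm1Y branches further down. Below is a minimal standalone sketch of that per-block arithmetic; the names are mine and the types are plain C++, not oneDAL code, and the real kernel does the same work with SYRK/GEMM calls.

#include <cstddef>
#include <vector>

// Reference for what one data block contributes to the reducer's buffers:
//   xtx += [X_b 1]^t [X_b 1]    (nBetas x nBetas, symmetric)
//   xty += Y_b^t [X_b 1]        (stored as nResponses rows of length nBetas)
// where the trailing column of ones is present only when an intercept is trained.
void updateBlockReference(const std::vector<double> & xBlock, // nRows x nFeatures, row-major
                          const std::vector<double> & yBlock, // nRows x nResponses, row-major
                          std::size_t nRows, std::size_t nFeatures, std::size_t nResponses,
                          bool withIntercept, std::vector<double> & xtx, std::vector<double> & xty)
{
    const std::size_t nBetas = nFeatures + (withIntercept ? 1 : 0);
    for (std::size_t r = 0; r < nRows; ++r)
    {
        const double * x = &xBlock[r * nFeatures];
        const double * y = &yBlock[r * nResponses];
        for (std::size_t i = 0; i < nBetas; ++i)
        {
            const double xi = (i < nFeatures) ? x[i] : 1.0;
            for (std::size_t j = 0; j < nBetas; ++j)
            {
                const double xj = (j < nFeatures) ? x[j] : 1.0;
                xtx[i * nBetas + j] += xi * xj; // the real kernel fills only the triangle used by SYRK
            }
        }
        for (std::size_t k = 0; k < nResponses; ++k)
        {
            for (std::size_t j = 0; j < nBetas; ++j)
            {
                xty[k * nBetas + j] += ((j < nFeatures) ? x[j] : 1.0) * y[k];
            }
        }
    }
}
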
+ TArrayScalableCalloc _xtx; + /// Thread-local array of X^tY partial sums of size `_nBetasIntercept` * `_nResponses`. + TArrayScalableCalloc _xty; }; /** @@ -186,7 +200,7 @@ class UpdateKernel { typedef WriteRows WriteRowsType; typedef ReadRows ReadRowsType; - typedef ThreadingTask ThreadingTaskType; + // typedef ThreadingTask ThreadingTaskType; public: typedef linear_model::internal::Hyperparameter HyperparameterType; diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i index 132b3df9cc0..8b941e2e499 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i +++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i @@ -47,123 +47,234 @@ using namespace daal::internal; using namespace daal::services::internal; template -ThreadingTask::ThreadingTask(size_t nBetasIntercept, size_t nResponses, Status & st) - : _nBetasIntercept(nBetasIntercept), _nResponses(nResponses) +LinearModelReducer::LinearModelReducer(const NumericTable & xTable, const NumericTable & yTable, DAAL_INT nBetasIntercept, + DAAL_INT numRowsInBlock, size_t numBlocks) + : _xTable(xTable), + _yTable(yTable), + _nBetasIntercept(nBetasIntercept), + _numRowsInBlock(numRowsInBlock), + _numBlocks(numBlocks), + _nFeatures(xTable.getNumberOfColumns()), + _nResponses(yTable.getNumberOfColumns()) { - _xtx = service_scalable_calloc(nBetasIntercept * nBetasIntercept); - _xty = service_scalable_calloc(nBetasIntercept * nResponses); - if (!_xtx || !_xty) st.add(ErrorMemoryAllocationFailed); + bool isOverflow = false; + DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION_BOOL(size_t, _nBetasIntercept, _nBetasIntercept, isOverflow); + if (isOverflow) + { + errorCode = ErrorCode::intOverflow; + return; + } + if (!_xtx.reset(_nBetasIntercept * _nBetasIntercept)) + { + errorCode = ErrorCode::memAllocationFailed; + return; + } + + DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION_BOOL(size_t, _nBetasIntercept, _nResponses, isOverflow); + if (isOverflow) + { + errorCode = ErrorCode::intOverflow; + return; + } + if (!_xty.reset(_nBetasIntercept * _nResponses)) + { + errorCode = ErrorCode::memAllocationFailed; + return; + } + errorCode = ErrorCode::ok; } template -ThreadingTask * ThreadingTask::create(size_t nBetasIntercept, size_t nResponses) +void * LinearModelReducer::operator new(size_t size) { - Status st; - ThreadingTask * res = new ThreadingTask(nBetasIntercept, nResponses, st); - if (!st) - { - delete res; - return nullptr; - } - return res; + return service_scalable_malloc(size); } template -Status ThreadingTask::update(DAAL_INT startRow, DAAL_INT nRows, const NumericTable & xTable, const NumericTable & yTable) +void LinearModelReducer::operator delete(void * p) { - DAAL_INT nFeatures(xTable.getNumberOfColumns()); + service_scalable_free((unsigned char *)p); +} - /* SYRK and GEMM parameters */ - char up = 'U'; - char trans = 'T'; - char notrans = 'N'; - algorithmFPType alpha(1.0); +template +inline algorithmFPType * LinearModelReducer::xty() +{ + return _xty.get(); +} - _xBlock.set(const_cast(xTable), startRow, nRows); - DAAL_CHECK_BLOCK_STATUS(_xBlock); - const algorithmFPType * x = _xBlock.get(); +template +inline algorithmFPType * LinearModelReducer::xtx() +{ + return _xtx.get(); +} - _yBlock.set(const_cast(yTable), startRow, nRows); - DAAL_CHECK_BLOCK_STATUS(_yBlock); - const algorithmFPType * y = _yBlock.get(); +template +inline const algorithmFPType * LinearModelReducer::xty() const +{ +
return _xty.get(); +} +template +inline const algorithmFPType * LinearModelReducer::xtx() const +{ + return _xtx.get(); +} + +template +ReducerUniquePtr LinearModelReducer::create() const +{ + return daal::internal::makeUnique, DAAL_BASE_CPU>(_xTable, _yTable, _nBetasIntercept, _numRowsInBlock, + _numBlocks); +} + +template +void LinearModelReducer::update(size_t begin, size_t end) +{ + /* SYRK and GEMM parameters */ + char up = 'U'; + char trans = 'T'; + char notrans = 'N'; + algorithmFPType one = 1.0; + DAAL_PROFILER_THREADING_TASK(reducer.update); + if (errorCode != ErrorCode::ok) { - DAAL_PROFILER_THREADING_TASK(update.syrkX); - BlasInst::xxsyrk(&up, ¬rans, &nFeatures, &nRows, &alpha, const_cast(x), &nFeatures, &alpha, _xtx, - &_nBetasIntercept); + return; } + algorithmFPType * xtxPtr = xtx(); + algorithmFPType * xtyPtr = xty(); - if (nFeatures < _nBetasIntercept) + if (!xtxPtr || !xtyPtr) { - DAAL_PROFILER_THREADING_TASK(update.gemm1X); - algorithmFPType * xtxPtr = _xtx + nFeatures * _nBetasIntercept; - const algorithmFPType * xPtr = x; - - for (DAAL_INT i = 0; i < nRows; i++, xPtr += nFeatures) - { - PRAGMA_FORCE_SIMD - PRAGMA_VECTOR_ALWAYS - for (DAAL_INT j = 0; j < nFeatures; j++) - { - xtxPtr[j] += xPtr[j]; - } - } - - xtxPtr[nFeatures] += algorithmFPType(nRows); + errorCode = ErrorCode::memAllocationFailed; + return; } + bool isOverflow = false; + DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION_BOOL(size_t, _numRowsInBlock, _nFeatures, isOverflow); + if (isOverflow) { - DAAL_PROFILER_THREADING_TASK(update.gemmXY); - BlasInst::xxgemm(¬rans, &trans, &nFeatures, &_nResponses, &nRows, &alpha, x, &nFeatures, y, &_nResponses, &alpha, - _xty, &_nBetasIntercept); + errorCode = ErrorCode::intOverflow; + return; } + DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION_BOOL(size_t, _numRowsInBlock, _nResponses, isOverflow); + if (isOverflow) + { + errorCode = ErrorCode::intOverflow; + return; + } + const size_t numRowsInLastBlock = _numRowsInBlock + _xTable.getNumberOfRows() - _numBlocks * _numRowsInBlock; - if (nFeatures < _nBetasIntercept) + /// Process blocks of the input data table + for (size_t iBlock = begin; iBlock < end; ++iBlock) { - DAAL_PROFILER_THREADING_TASK(update.gemm1Y); - const algorithmFPType * yPtr = y; - for (DAAL_INT i = 0; i < nRows; i++, yPtr += _nResponses) + size_t nRows = ((iBlock + 1 < _numBlocks) ? 
_numRowsInBlock : numRowsInLastBlock); + size_t startRow = iBlock * _numRowsInBlock; + + ReadRows xTableBD(const_cast(_xTable), startRow, nRows); + ReadRows yTableBD(const_cast(_yTable), startRow, nRows); + if (!xTableBD.get() || !yTableBD.get()) + { + errorCode = ErrorCode::memAllocationFailed; + return; + } + algorithmFPType * xBlock = const_cast(xTableBD.get()); + algorithmFPType * yBlock = const_cast(yTableBD.get()); + + /// Update the cross-product matrix with the data from the block + { + DAAL_PROFILER_THREADING_TASK(reducer.update.syrkXX); + BlasInst::xsyrk(&up, ¬rans, &_nFeatures, reinterpret_cast(&nRows), &one, xBlock, &_nFeatures, &one, + xtxPtr, &_nBetasIntercept); + } + + if (_nFeatures < _nBetasIntercept) + { + // TODO: Substitute this part with a call to gemv or reduce by column/row primitive + DAAL_PROFILER_THREADING_TASK(reducer.update.gemm1X); + algorithmFPType * xtxLastRowPtr = xtxPtr + _nFeatures * _nBetasIntercept; + const algorithmFPType * xPtr = xBlock; + + for (DAAL_INT i = 0; i < nRows; i++, xPtr += _nFeatures) + { + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + for (DAAL_INT j = 0; j < _nFeatures; j++) + { + xtxLastRowPtr[j] += xPtr[j]; + } + } + xtxLastRowPtr[_nFeatures] += algorithmFPType(nRows); + } { - PRAGMA_FORCE_SIMD - PRAGMA_VECTOR_ALWAYS - for (DAAL_INT j = 0; j < _nResponses; j++) + DAAL_PROFILER_THREADING_TASK(reducer.update.gemmXY); + BlasInst::xgemm(¬rans, &trans, &_nFeatures, &_nResponses, reinterpret_cast(&nRows), &one, xBlock, + &_nFeatures, yBlock, &_nResponses, &one, xtyPtr, &_nBetasIntercept); + } + if (_nFeatures < _nBetasIntercept) + { + // TODO: Substitute this part with call to gemv or reduce by column/row primitive + DAAL_PROFILER_THREADING_TASK(reducer.update.gemm1Y); + const algorithmFPType * yPtr = yBlock; + for (DAAL_INT i = 0; i < nRows; i++, yPtr += _nResponses) { - _xty[j * _nBetasIntercept + nFeatures] += yPtr[j]; + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + for (DAAL_INT j = 0; j < _nResponses; j++) + { + xtyPtr[j * _nBetasIntercept + _nFeatures] += yPtr[j]; + } } } } - return Status(); } template -void ThreadingTask::reduce(algorithmFPType * xtx, algorithmFPType * xty) +void LinearModelReducer::join(daal::Reducer * otherReducer) { + if (errorCode != ErrorCode::ok) { - DAAL_PROFILER_THREADING_TASK(reduce.syrkX); - PRAGMA_FORCE_SIMD - PRAGMA_VECTOR_ALWAYS - for (size_t i = 0; i < (_nBetasIntercept * _nBetasIntercept); i++) - { - xtx[i] += _xtx[i]; - } + return; + } + DAAL_PROFILER_THREADING_TASK(reducer.join); + LinearModelReducer * other = dynamic_cast *>(otherReducer); + if (!other) + { + errorCode = ErrorCode::badCast; + return; + } + if (other->errorCode != ErrorCode::ok) + { + errorCode = other->errorCode; + return; } + const algorithmFPType * otherXTX = other->xtx(); + algorithmFPType * thisXTX = xtx(); + const algorithmFPType * otherXTY = other->xty(); + algorithmFPType * thisXTY = xty(); + + if (!otherXTX || !otherXTY || !thisXTX || !thisXTY) { - DAAL_PROFILER_THREADING_TASK(reduce.gemmXY); - PRAGMA_FORCE_SIMD - PRAGMA_VECTOR_ALWAYS - for (size_t i = 0; i < (_nBetasIntercept * _nResponses); i++) - { - xty[i] += _xty[i]; - } + errorCode = ErrorCode::memAllocationFailed; + return; + } + /// It is safe to use aligned loads and stores because the data in TArrayScalableCalloc data structures is aligned + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + PRAGMA_VECTOR_ALIGNED + for (size_t i = 0; i < (_nBetasIntercept * _nBetasIntercept); i++) + { + thisXTX[i] += otherXTX[i]; } -} -template -ThreadingTask::~ThreadingTask() -{ - 
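
The create/update/join trio implemented above follows the generic daal::Reducer contract that daal::static_threader_reduce drives: every worker thread obtains its own partial result through create(), accumulates a sub-range of data blocks in update(), and the partials are merged pairwise in join(), with failures reported through the base-class errorCode rather than exceptions. The snippet below is a minimal self-contained illustration of that contract with a toy sum reducer and a sequential driver; it only mirrors the shape of the protocol under names of my own and is not the oneDAL threading layer.

#include <cstddef>
#include <memory>
#include <vector>

// Minimal stand-in for the daal::Reducer interface (illustration only).
struct MiniReducer
{
    enum ErrorCode { ok = 0, memAllocationFailed, intOverflow, badCast };
    ErrorCode errorCode = ok;

    virtual std::unique_ptr<MiniReducer> create() const = 0;     // fresh thread-local partial
    virtual void update(std::size_t begin, std::size_t end) = 0; // accumulate blocks [begin, end)
    virtual void join(MiniReducer * other) = 0;                  // merge another partial into this one
    virtual ~MiniReducer() = default;
};

// Toy reducer: sums a vector over fixed-size blocks of elements.
struct SumReducer : MiniReducer
{
    const std::vector<double> & data;
    std::size_t blockSize;
    double sum = 0.0;

    SumReducer(const std::vector<double> & d, std::size_t bs) : data(d), blockSize(bs) {}

    std::unique_ptr<MiniReducer> create() const override { return std::make_unique<SumReducer>(data, blockSize); }

    void update(std::size_t begin, std::size_t end) override
    {
        for (std::size_t b = begin; b < end; ++b)
        {
            const std::size_t lo = b * blockSize;
            const std::size_t hi = (lo + blockSize < data.size()) ? lo + blockSize : data.size();
            for (std::size_t i = lo; i < hi; ++i) sum += data[i];
        }
    }

    void join(MiniReducer * other) override
    {
        SumReducer * o = dynamic_cast<SumReducer *>(other);
        if (!o) { errorCode = badCast; return; }                      // mirrors ErrorCode::badCast above
        if (o->errorCode != ok) { errorCode = o->errorCode; return; } // propagate the other thread's failure
        sum += o->sum;
    }
};

// Sequential stand-in for static_threader_reduce(nBlocks, grainSize, result):
// the real call hands chunks of at least grainSize blocks to worker threads
// and joins their thread-local copies back into `result`.
inline void reduceSequentially(std::size_t nBlocks, std::size_t grainSize, MiniReducer & result)
{
    for (std::size_t b = 0; b < nBlocks; b += grainSize)
    {
        std::unique_ptr<MiniReducer> partial = result.create();
        partial->update(b, (b + grainSize < nBlocks) ? b + grainSize : nBlocks);
        result.join(partial.get());
    }
}
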
service_scalable_free(_xtx); - service_scalable_free(_xty); + /// It is safe to use aligned loads and stores because the data in TArrayScalableCalloc data structures is aligned + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + PRAGMA_VECTOR_ALIGNED + for (size_t i = 0; i < (_nBetasIntercept * _nResponses); i++) + { + thisXTY[i] += otherXTY[i]; + } } template @@ -296,51 +407,67 @@ Status UpdateKernel::compute(const NumericTable & xTable, /* Split rows by blocks */ size_t nRowsInBlock = 128; + size_t grainSize = 1; // minimal number of data blocks to be processed by a thread if (hyperparameter != nullptr) { DAAL_INT64 nRowsInBlockInt64 = 0l; services::Status status = hyperparameter->find(denseUpdateStepBlockSize, nRowsInBlockInt64); DAAL_CHECK(0l < nRowsInBlockInt64, services::ErrorIncorrectDataRange); DAAL_CHECK_STATUS_VAR(status); - nRowsInBlock = static_cast(nRowsInBlockInt64); + + DAAL_INT64 grainSizeInt64 = 0l; + status = hyperparameter->find(denseGrainSize, grainSizeInt64); + DAAL_CHECK(0l < grainSizeInt64, services::ErrorIncorrectDataRange); + DAAL_CHECK_STATUS_VAR(status); + grainSize = static_cast(grainSizeInt64); } size_t nBlocks = nRows / nRowsInBlock; nBlocks += bool(nRows % nRowsInBlock); - /* Create TLS */ - daal::static_tls tls([=]() -> ThreadingTaskType * { return ThreadingTaskType::create(nBetasIntercept, nResponses); }); - - SafeStatus safeStat; - daal::static_threader_for(nBlocks, [=, &tls, &xTable, &yTable, &safeStat](int iBlock, size_t tid) { - ThreadingTaskType * tlsLocal = tls.local(tid); + LinearModelReducer result(xTable, yTable, nBetasIntercept, nRowsInBlock, nBlocks); + if (!result.xtx() || !result.xty()) + { + return services::Status(services::ErrorMemoryAllocationFailed); + } + /* Reduce input matrices X, Y into cross product X^tX and matrix-matrix product X^tY */ + daal::static_threader_reduce(nBlocks, grainSize, result); - if (!tlsLocal) + if (result.errorCode != LinearModelReducer::ok) + { + if (result.errorCode == LinearModelReducer::memAllocationFailed) { - safeStat.add(services::ErrorMemoryAllocationFailed); - return; + return services::Status(services::ErrorMemoryAllocationFailed); } - - size_t startRow = iBlock * nRowsInBlock; - size_t endRow = startRow + nRowsInBlock; - if (endRow > nRows) + if (result.errorCode == LinearModelReducer::intOverflow) { - endRow = nRows; + return services::Status(services::ErrorBufferSizeIntegerOverflow); } + if (result.errorCode == LinearModelReducer::badCast) + { + return services::Status(services::ErrorLinearRegressionInternal); + } + } - Status localSt = tlsLocal->update(startRow, endRow - startRow, xTable, yTable); - DAAL_CHECK_STATUS_THR(localSt); - }); + const algorithmFPType * resultXTX = result.xtx(); + const algorithmFPType * resultXTY = result.xty(); - Status st = safeStat.detach(); - tls.reduce([=, &st](ThreadingTaskType * tlsLocal) -> void { - if (!tlsLocal) return; - if (st) tlsLocal->reduce(xtx, xty); - delete tlsLocal; - }); + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + for (size_t i = 0; i < (nBetasIntercept * nBetasIntercept); i++) + { + xtx[i] += resultXTX[i]; + } + + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + for (size_t i = 0; i < (nBetasIntercept * nResponses); i++) + { + xty[i] += resultXTY[i]; + } - return st; + return services::Status(); } } // namespace internal diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter.cpp b/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter.cpp index 6ae7f1ad869..bcdc8bd6816 100644 --- 
a/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter.cpp +++ b/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter.cpp @@ -68,6 +68,7 @@ services::Status convert(const Hyperparameter * params, services::SharedPtrfind(HyperparameterId::denseSmallRowsMaxColsBatched, denseSmallRowsMaxColsBatched); DAAL_CHECK(st, services::ErrorHyperparameterNotFound); + st |= params->find(HyperparameterId::denseGrainSize, denseGrainSize); + DAAL_CHECK(st, services::ErrorHyperparameterNotFound); + /// Setters st |= result->set(LinearModelHyperparameterId::denseUpdateStepBlockSize, denseUpdateStepBlockSize); DAAL_CHECK(st, services::ErrorHyperparameterCanNotBeSet); @@ -98,6 +102,9 @@ services::Status convert(const Hyperparameter * params, services::SharedPtrset(LinearModelHyperparameterId::denseSmallRowsMaxColsBatched, denseSmallRowsMaxColsBatched); DAAL_CHECK(st, services::ErrorHyperparameterCanNotBeSet); + + st |= result->set(LinearModelHyperparameterId::denseGrainSize, denseGrainSize); + DAAL_CHECK(st, services::ErrorHyperparameterCanNotBeSet); } else { diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter_impl.h b/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter_impl.h index 51ef54cf80d..8409fdf52a9 100644 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter_impl.h +++ b/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter_impl.h @@ -49,7 +49,8 @@ enum HyperparameterId denseUpdateMaxColsBatched = 1, denseSmallRowsThreshold = 2, denseSmallRowsMaxColsBatched = 3, - hyperparameterIdCount = 4 + denseGrainSize = 4, + hyperparameterIdCount = 5 }; enum DoubleHyperparameterId diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h index f0e23ed8777..76ac87f78fe 100644 --- a/cpp/daal/src/threading/threading.h +++ b/cpp/daal/src/threading/threading.h @@ -78,6 +78,16 @@ class Reducer virtual void join(Reducer * other) = 0; virtual ~Reducer() {} + + enum ErrorCode + { + ok = 0, /// No error + memAllocationFailed = 1, /// Memory allocation failed + intOverflow = 2, /// Integer overflow + badCast = 3 /// Cannot cast base daal::Reducer to derived class + }; + /// Status of the computation. 
+ ErrorCode errorCode; }; typedef internal::UniquePtr ReducerUniquePtr; diff --git a/cpp/oneapi/dal/algo/linear_regression/backend/cpu/train_kernel_common.hpp b/cpp/oneapi/dal/algo/linear_regression/backend/cpu/train_kernel_common.hpp index 886cf68013f..5d0f6cd609f 100644 --- a/cpp/oneapi/dal/algo/linear_regression/backend/cpu/train_kernel_common.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/backend/cpu/train_kernel_common.hpp @@ -33,6 +33,7 @@ static daal_lr_hyperparameters_t convert_parameters(const detail::train_paramete const std::int64_t max_cols_batched = params.get_cpu_max_cols_batched(); const std::int64_t small_rows_threshold = params.get_cpu_small_rows_threshold(); const std::int64_t small_rows_max_cols_batched = params.get_cpu_small_rows_max_cols_batched(); + const std::int64_t grain_size = params.get_cpu_grain_size(); daal_lr_hyperparameters_t daal_hyperparameter; auto status = daal_hyperparameter.set(HyperparameterId::denseUpdateStepBlockSize, block); @@ -46,6 +47,9 @@ static daal_lr_hyperparameters_t convert_parameters(const detail::train_paramete small_rows_max_cols_batched); interop::status_to_exception(status); + status = daal_hyperparameter.set(HyperparameterId::denseGrainSize, grain_size); + interop::status_to_exception(status); + return daal_hyperparameter; } diff --git a/cpp/oneapi/dal/algo/linear_regression/parameters/cpu/train_parameters.cpp b/cpp/oneapi/dal/algo/linear_regression/parameters/cpu/train_parameters.cpp index 2bcf9d2d9d9..1d3c64f990c 100644 --- a/cpp/oneapi/dal/algo/linear_regression/parameters/cpu/train_parameters.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/parameters/cpu/train_parameters.cpp @@ -53,6 +53,11 @@ std::int64_t propose_small_rows_max_cols_batched(const std::int64_t f, const std return detail::train_parameters{}.get_cpu_small_rows_max_cols_batched(); } +template +std::int64_t propose_grain_size(const std::int64_t f, const std::int64_t r) { + return detail::train_parameters{}.get_cpu_grain_size(); +} + template struct train_parameters_cpu { using params_t = detail::train_parameters; @@ -72,12 +77,14 @@ struct train_parameters_cpu { propose_small_rows_threshold(f_count, r_count); const std::int64_t small_rows_max_cols_batched = propose_small_rows_max_cols_batched(f_count, r_count); + const std::int64_t grain_size = propose_grain_size(f_count, r_count); params_t out{}; out.set_cpu_macro_block(block); out.set_cpu_max_cols_batched(max_cols_batched); out.set_cpu_small_rows_threshold(small_rows_threshold); out.set_cpu_small_rows_max_cols_batched(small_rows_max_cols_batched); + out.set_cpu_grain_size(grain_size); return out; } params_t operator()(const context_cpu& ctx, @@ -96,12 +103,14 @@ struct train_parameters_cpu { propose_small_rows_threshold(f_count, r_count); const std::int64_t small_rows_max_cols_batched = propose_small_rows_max_cols_batched(f_count, r_count); + const std::int64_t grain_size = propose_grain_size(f_count, r_count); params_t out{}; out.set_cpu_macro_block(block); out.set_cpu_max_cols_batched(max_cols_batched); out.set_cpu_small_rows_threshold(small_rows_threshold); out.set_cpu_small_rows_max_cols_batched(small_rows_max_cols_batched); + out.set_cpu_grain_size(grain_size); return out; } params_t operator()(const context_cpu& ctx, @@ -120,12 +129,14 @@ struct train_parameters_cpu { propose_small_rows_threshold(f_count, r_count); const std::int64_t small_rows_max_cols_batched = propose_small_rows_max_cols_batched(f_count, r_count); + const std::int64_t grain_size = propose_grain_size(f_count, r_count); params_t out{}; 
out.set_cpu_macro_block(block); out.set_cpu_max_cols_batched(max_cols_batched); out.set_cpu_small_rows_threshold(small_rows_threshold); out.set_cpu_small_rows_max_cols_batched(small_rows_max_cols_batched); + out.set_cpu_grain_size(grain_size); return out; } }; diff --git a/cpp/oneapi/dal/algo/linear_regression/test/train_parameters.cpp b/cpp/oneapi/dal/algo/linear_regression/test/train_parameters.cpp index 634531f432e..2cf023a80f5 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/train_parameters.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/train_parameters.cpp @@ -58,6 +58,7 @@ class lr_train_params_test : public lr_testmax_cols_batched_ = GENERATE(50); this->small_rows_threshold_ = GENERATE(15, 70); this->small_rows_max_cols_batched_ = GENERATE(40); + this->grain_size_ = GENERATE(1, 10); this->pack_as_struct_ = GENERATE(0, 1); } @@ -67,6 +68,7 @@ class lr_train_params_test : public lr_testmax_cols_batched_); res.set_cpu_small_rows_threshold(this->small_rows_threshold_); res.set_cpu_small_rows_max_cols_batched(this->small_rows_max_cols_batched_); + res.set_cpu_grain_size(this->grain_size_); return res; } @@ -76,6 +78,7 @@ class lr_train_params_test : public lr_testmax_cols_batched_ > 0); REQUIRE(this->small_rows_threshold_ > 0); REQUIRE(this->small_rows_max_cols_batched_ > 0); + REQUIRE(this->grain_size_ > 0); const auto params = this->get_current_parameters(); if (this->pack_as_struct_) { return te::float_algo_fixture::train( @@ -95,6 +98,7 @@ class lr_train_params_test : public lr_test @@ -114,6 +115,16 @@ void train_parameters::set_cpu_small_rows_max_cols_batched_impl(std::int64 impl_->cpu_small_rows_max_cols_batched = val; } +template +std::int64_t train_parameters::get_cpu_grain_size() const { + return impl_->cpu_grain_size; +} + +template +void train_parameters::set_cpu_grain_size_impl(std::int64_t val) { + impl_->cpu_grain_size = val; +} + template class ONEDAL_EXPORT train_parameters; } // namespace detail::v1 diff --git a/cpp/oneapi/dal/algo/linear_regression/train_types.hpp b/cpp/oneapi/dal/algo/linear_regression/train_types.hpp index 423edc6095f..0677c1af5b8 100644 --- a/cpp/oneapi/dal/algo/linear_regression/train_types.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/train_types.hpp @@ -82,12 +82,19 @@ class train_parameters : public dal::detail::system_parameters { return *this; } + std::int64_t get_cpu_grain_size() const; + auto& set_cpu_grain_size(std::int64_t val) { + set_cpu_grain_size_impl(val); + return *this; + } + private: void set_cpu_macro_block_impl(std::int64_t val); void set_gpu_macro_block_impl(std::int64_t val); void set_cpu_max_cols_batched_impl(std::int64_t val); void set_cpu_small_rows_threshold_impl(std::int64_t val); void set_cpu_small_rows_max_cols_batched_impl(std::int64_t val); + void set_cpu_grain_size_impl(std::int64_t val); dal::detail::pimpl> impl_; };
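
Taken together, the oneAPI-side changes surface the new knob as cpu_grain_size on the linear regression train_parameters, next to the existing macro-block and small-rows hints, and the test above drives it through the same fixture as the other CPU parameters. A short sketch of how calling code would populate the parameters object follows; the params_t placeholder and the literal block size are illustrative, and only the setters themselves come from this patch.

#include <cstdint>

// `params_t` stands for the linear_regression train_parameters type exercised by the
// test fixture above; its exact template arguments are not spelled out in this patch.
template <typename params_t>
params_t make_train_parameters(std::int64_t grain_size)
{
    params_t res{};
    res.set_cpu_macro_block(1024);      // rows per data block (maps to denseUpdateStepBlockSize)
    res.set_cpu_grain_size(grain_size); // minimal number of blocks per threaded task (maps to denseGrainSize)
    return res;
}

// The resulting object is then passed to the parameterized train entry point together with
// the descriptor and the training data, exactly as get_current_parameters() is used above.
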