diff --git a/cpp/daal/src/algorithms/covariance/covariance_impl.i b/cpp/daal/src/algorithms/covariance/covariance_impl.i index 6813801297f..acdc51af770 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_impl.i +++ b/cpp/daal/src/algorithms/covariance/covariance_impl.i @@ -91,16 +91,6 @@ template class CovarianceReducer : public daal::Reducer { public: - enum ErrorCode - { - ok = 0, /// No error - memAllocationFailed = 1, /// Memory allocation failed - intOverflow = 2, /// Integer overflow - badCast = 3 /// Cannot cast base daal::Reducer to derived class - }; - /// Status of the computation. - ErrorCode errorCode; - /// Get pointer to the array of partial sums. inline algorithmFPType * sums() { return _sumsArray.get(); } /// Get pointer to the partial cross-product matrix. diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_hyperparameter_impl.h b/cpp/daal/src/algorithms/linear_model/linear_model_hyperparameter_impl.h index e2505c1b588..da26db3b623 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_hyperparameter_impl.h +++ b/cpp/daal/src/algorithms/linear_model/linear_model_hyperparameter_impl.h @@ -45,7 +45,8 @@ enum HyperparameterId denseUpdateMaxColsBatched = 1, denseSmallRowsThreshold = 2, denseSmallRowsMaxColsBatched = 3, - hyperparameterIdCount = 4 + denseGrainSize = 4, + hyperparameterIdCount = 5 }; enum DoubleHyperparameterId diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_kernel.h b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_kernel.h index 31951db9cdc..50e65dd7eb9 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_kernel.h +++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_kernel.h @@ -125,57 +125,71 @@ class FinalizeKernel static Status solveSystem(DAAL_INT p, algorithmFPType * a, DAAL_INT ny, algorithmFPType * b, const ErrorID & internalError); }; -/** - * Thread local storage used on the partial results update stage - */ template -class ThreadingTask +class LinearModelReducer : public daal::Reducer { - typedef ReadRows ReadRowsType; - public: - DAAL_NEW_DELETE(); + /// Construct and initialize the thread-local partial results + /// + /// @param[in] xTable Input data table that stores matrix X with feature vectors + /// @param[in] yTable Input data table that stores matrix Y with target values + /// @param[in] nBetasIntercept Number of trainable parameters + /// @param[in] numRowsInBlock Number of rows in the block of the input data table - a minimal number of rows to be processed by a thread + /// @param[in] numBlocks Number of blocks of rows in the input data table + LinearModelReducer(const NumericTable & xTable, const NumericTable & yTable, DAAL_INT nBetasIntercept, DAAL_INT numRowsInBlock, size_t numBlocks); - /** - * Creates thread local storage of the requested size - * \param[in] nBetasIntercept Number of colums in the partial result - * \param[in] nResponses Number of responses - * \return Pointer on the thread local storage object if the object was created successfully, NULL otherwise - */ - static ThreadingTask * create(size_t nBetasIntercept, size_t nResponses); - virtual ~ThreadingTask(); + /// New and delete operators are overloaded to use a scalable memory allocator that doesn't block threads + /// if memory allocations are executed concurrently.
+ void * operator new(size_t size); + void operator delete(void * p); - /** - * Updates local partial result with the new block of data - * \param[in] startRow Index of the starting row of the block - * \param[in] nRows Number of rows in the block of data - * \param[in] xTable Input data set of size N x P - * \param[in] yTable Input array of responses of size N x Ny - * \return Status of the computations - */ - Status update(DAAL_INT startRow, DAAL_INT nRows, const NumericTable & xTable, const NumericTable & yTable); + /// Get pointer to the partial X^tY matrix. + inline algorithmFPType * xty(); + /// Get pointer to the partial X^tX matrix. + inline algorithmFPType * xtx(); - /** - * Reduces thread local partial results into global partial result - * \param[out] xtx Global partial result of size P' x P' - * \param[out] xty Global partial result of size Ny x P' - */ - void reduce(algorithmFPType * xtx, algorithmFPType * xty); + /// Get constant pointer to the partial X^tY matrix. + inline const algorithmFPType * xty() const; + /// Get constant pointer to the partial X^tX matrix. + inline const algorithmFPType * xtx() const; -protected: - /** - * Construct thread local storage of the requested size - * \param[in] nBetasIntercept Number of colums in the partial result - * \param[in] nResponses Number of responses - * \param[out] st Status of the object construction - */ - ThreadingTask(size_t nBetasIntercept, size_t nResponses, Status & st); - algorithmFPType * _xtx; /*!< Partial result of size P' x P' */ - algorithmFPType * _xty; /*!< Partial result of size Ny x P' */ - ReadRowsType _xBlock; /*!< Object that manages memory block of the input data set */ - ReadRowsType _yBlock; /*!< Object that manages memory block of the input array of responses */ - DAAL_INT _nBetasIntercept; /*!< P' - number of columns in the partial result */ - DAAL_INT _nResponses; /*!< Ny - number of responses */ + /// Constructs a new thread-local partial result and initializes it with zeros. + /// Must be able to run concurrently with the `update` and `join` methods. + /// + /// @return Pointer to the partial result of the linear regression algorithm. + virtual ReducerUniquePtr create() const override; + + /// Updates the partial X^tX and X^tY matrices + /// from the blocks of the input data table in the sub-interval [begin, end). + /// + /// @param begin Index of the starting block of the input data table. + /// @param end Index of the block after the last one in the sub-range. + virtual void update(size_t begin, size_t end) override; + + /// Merges the partial result with the partial result from another thread. + /// + /// @param otherReducer Pointer to the other thread's partial result. + virtual void join(Reducer * otherReducer) override; + +private: + /// Reference to the input data table that stores matrix X with feature vectors. + const NumericTable & _xTable; + /// Reference to the input data table that stores matrix Y with target values. + const NumericTable & _yTable; + /// Number of features in the input data table. + DAAL_INT _nFeatures; + /// Number of targets to predict. + DAAL_INT _nResponses; + /// Number of trainable parameters (including intercept). + DAAL_INT _nBetasIntercept; + /// Number of rows in the block of the input data table - a minimal number of rows to be processed by a thread. + DAAL_INT _numRowsInBlock; + /// Number of blocks of rows in the input data table. + size_t _numBlocks; + /// Thread-local array of X^tX partial sums of size `_nBetasIntercept` * `_nBetasIntercept`.
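
For reference, the two thread-local buffers declared here hold the normal-equations partial products for the rows a thread has processed: X^tX of size nBetasIntercept x nBetasIntercept, and X^tY stored as nResponses rows of length nBetasIntercept. When an intercept is trained, X is implicitly augmented with a trailing column of ones, which is why nBetasIntercept can exceed the number of features and why the kernel keeps the extra gemm1X/gemm1Y branches further down. Below is a minimal standalone sketch of that per-block arithmetic; the names are mine and the types are plain C++, not oneDAL code, and the real kernel does the same work with SYRK/GEMM calls.

#include <cstddef>
#include <vector>

// Reference for what one data block contributes to the reducer's buffers:
//   xtx += [X_b 1]^t [X_b 1]    (nBetas x nBetas, symmetric)
//   xty += Y_b^t [X_b 1]        (stored as nResponses rows of length nBetas)
// where the trailing column of ones is present only when an intercept is trained.
void updateBlockReference(const std::vector<double> & xBlock, // nRows x nFeatures, row-major
                          const std::vector<double> & yBlock, // nRows x nResponses, row-major
                          std::size_t nRows, std::size_t nFeatures, std::size_t nResponses,
                          bool withIntercept, std::vector<double> & xtx, std::vector<double> & xty)
{
    const std::size_t nBetas = nFeatures + (withIntercept ? 1 : 0);
    for (std::size_t r = 0; r < nRows; ++r)
    {
        const double * x = &xBlock[r * nFeatures];
        const double * y = &yBlock[r * nResponses];
        for (std::size_t i = 0; i < nBetas; ++i)
        {
            const double xi = (i < nFeatures) ? x[i] : 1.0;
            for (std::size_t j = 0; j < nBetas; ++j)
            {
                const double xj = (j < nFeatures) ? x[j] : 1.0;
                xtx[i * nBetas + j] += xi * xj; // the real kernel fills only the triangle used by SYRK
            }
        }
        for (std::size_t k = 0; k < nResponses; ++k)
        {
            for (std::size_t j = 0; j < nBetas; ++j)
            {
                xty[k * nBetas + j] += ((j < nFeatures) ? x[j] : 1.0) * y[k];
            }
        }
    }
}
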
+ TArrayScalableCalloc _xtx; + /// Thread-local array of X^tY partial sums of size `_nBetasIntercept` * `_nResponses`. + TArrayScalableCalloc _xty; }; /** @@ -186,7 +200,7 @@ class UpdateKernel { typedef WriteRows WriteRowsType; typedef ReadRows ReadRowsType; - typedef ThreadingTask ThreadingTaskType; + // typedef ThreadingTask ThreadingTaskType; public: typedef linear_model::internal::Hyperparameter HyperparameterType; diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i index 132b3df9cc0..8b941e2e499 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i +++ b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_impl.i @@ -47,123 +47,234 @@ using namespace daal::internal; using namespace daal::services::internal; template -ThreadingTask::ThreadingTask(size_t nBetasIntercept, size_t nResponses, Status & st) - : _nBetasIntercept(nBetasIntercept), _nResponses(nResponses) +LinearModelReducer::LinearModelReducer(const NumericTable & xTable, const NumericTable & yTable, DAAL_INT nBetasIntercept, + DAAL_INT numRowsInBlock, size_t numBlocks) + : _xTable(xTable), + _yTable(yTable), + _nBetasIntercept(nBetasIntercept), + _numRowsInBlock(numRowsInBlock), + _numBlocks(numBlocks), + _nFeatures(xTable.getNumberOfColumns()), + _nResponses(yTable.getNumberOfColumns()) { - _xtx = service_scalable_calloc(nBetasIntercept * nBetasIntercept); - _xty = service_scalable_calloc(nBetasIntercept * nResponses); - if (!_xtx || !_xty) st.add(ErrorMemoryAllocationFailed); + bool isOverflow = false; + DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION_BOOL(size_t, _nBetasIntercept, _nBetasIntercept, isOverflow); + if (isOverflow) + { + errorCode = ErrorCode::intOverflow; + return; + } + if (!_xtx.reset(_nBetasIntercept * _nBetasIntercept)) + { + errorCode = ErrorCode::memAllocationFailed; + return; + } + + DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION_BOOL(size_t, _nBetasIntercept, _nResponses, isOverflow); + if (isOverflow) + { + errorCode = ErrorCode::intOverflow; + return; + } + if (!_xty.reset(_nBetasIntercept * _nResponses)) + { + errorCode = ErrorCode::memAllocationFailed; + return; + } + errorCode = ErrorCode::ok; } template -ThreadingTask * ThreadingTask::create(size_t nBetasIntercept, size_t nResponses) +void * LinearModelReducer::operator new(size_t size) { - Status st; - ThreadingTask * res = new ThreadingTask(nBetasIntercept, nResponses, st); - if (!st) - { - delete res; - return nullptr; - } - return res; + return service_scalable_malloc(size); } template -Status ThreadingTask::update(DAAL_INT startRow, DAAL_INT nRows, const NumericTable & xTable, const NumericTable & yTable) +void LinearModelReducer::operator delete(void * p) { - DAAL_INT nFeatures(xTable.getNumberOfColumns()); + service_scalable_free((unsigned char *)p); +} - /* SYRK and GEMM parameters */ - char up = 'U'; - char trans = 'T'; - char notrans = 'N'; - algorithmFPType alpha(1.0); +template +inline algorithmFPType * LinearModelReducer::xty() +{ + return _xty.get(); +} - _xBlock.set(const_cast(xTable), startRow, nRows); - DAAL_CHECK_BLOCK_STATUS(_xBlock); - const algorithmFPType * x = _xBlock.get(); +template +inline algorithmFPType * LinearModelReducer::xtx() +{ + return _xtx.get(); +} - _yBlock.set(const_cast(yTable), startRow, nRows); - DAAL_CHECK_BLOCK_STATUS(_yBlock); - const algorithmFPType * y = _yBlock.get(); +template +inline const algorithmFPType * LinearModelReducer::xty() const +{ +
return _xty.get(); +} +template +inline const algorithmFPType * LinearModelReducer::xtx() const +{ + return _xtx.get(); +} + +template +ReducerUniquePtr LinearModelReducer::create() const +{ + return daal::internal::makeUnique, DAAL_BASE_CPU>(_xTable, _yTable, _nBetasIntercept, _numRowsInBlock, + _numBlocks); +} + +template +void LinearModelReducer::update(size_t begin, size_t end) +{ + /* SYRK and GEMM parameters */ + char up = 'U'; + char trans = 'T'; + char notrans = 'N'; + algorithmFPType one = 1.0; + DAAL_PROFILER_THREADING_TASK(reducer.update); + if (errorCode != ErrorCode::ok) { - DAAL_PROFILER_THREADING_TASK(update.syrkX); - BlasInst::xxsyrk(&up, ¬rans, &nFeatures, &nRows, &alpha, const_cast(x), &nFeatures, &alpha, _xtx, - &_nBetasIntercept); + return; } + algorithmFPType * xtxPtr = xtx(); + algorithmFPType * xtyPtr = xty(); - if (nFeatures < _nBetasIntercept) + if (!xtxPtr || !xtyPtr) { - DAAL_PROFILER_THREADING_TASK(update.gemm1X); - algorithmFPType * xtxPtr = _xtx + nFeatures * _nBetasIntercept; - const algorithmFPType * xPtr = x; - - for (DAAL_INT i = 0; i < nRows; i++, xPtr += nFeatures) - { - PRAGMA_FORCE_SIMD - PRAGMA_VECTOR_ALWAYS - for (DAAL_INT j = 0; j < nFeatures; j++) - { - xtxPtr[j] += xPtr[j]; - } - } - - xtxPtr[nFeatures] += algorithmFPType(nRows); + errorCode = ErrorCode::memAllocationFailed; + return; } + bool isOverflow = false; + DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION_BOOL(size_t, _numRowsInBlock, _nFeatures, isOverflow); + if (isOverflow) { - DAAL_PROFILER_THREADING_TASK(update.gemmXY); - BlasInst::xxgemm(¬rans, &trans, &nFeatures, &_nResponses, &nRows, &alpha, x, &nFeatures, y, &_nResponses, &alpha, - _xty, &_nBetasIntercept); + errorCode = ErrorCode::intOverflow; + return; } + DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION_BOOL(size_t, _numRowsInBlock, _nResponses, isOverflow); + if (isOverflow) + { + errorCode = ErrorCode::intOverflow; + return; + } + const size_t numRowsInLastBlock = _numRowsInBlock + _xTable.getNumberOfRows() - _numBlocks * _numRowsInBlock; - if (nFeatures < _nBetasIntercept) + /// Process blocks of the input data table + for (size_t iBlock = begin; iBlock < end; ++iBlock) { - DAAL_PROFILER_THREADING_TASK(update.gemm1Y); - const algorithmFPType * yPtr = y; - for (DAAL_INT i = 0; i < nRows; i++, yPtr += _nResponses) + size_t nRows = ((iBlock + 1 < _numBlocks) ? 
_numRowsInBlock : numRowsInLastBlock); + size_t startRow = iBlock * _numRowsInBlock; + + ReadRows xTableBD(const_cast(_xTable), startRow, nRows); + ReadRows yTableBD(const_cast(_yTable), startRow, nRows); + if (!xTableBD.get() || !yTableBD.get()) + { + errorCode = ErrorCode::memAllocationFailed; + return; + } + algorithmFPType * xBlock = const_cast(xTableBD.get()); + algorithmFPType * yBlock = const_cast(yTableBD.get()); + + /// Update the cross-product matrix with the data from the block + { + DAAL_PROFILER_THREADING_TASK(reducer.update.syrkXX); + BlasInst::xsyrk(&up, ¬rans, &_nFeatures, reinterpret_cast(&nRows), &one, xBlock, &_nFeatures, &one, + xtxPtr, &_nBetasIntercept); + } + + if (_nFeatures < _nBetasIntercept) + { + // TODO: Substitute this part with a call to gemv or reduce by column/row primitive + DAAL_PROFILER_THREADING_TASK(reducer.update.gemm1X); + algorithmFPType * xtxLastRowPtr = xtxPtr + _nFeatures * _nBetasIntercept; + const algorithmFPType * xPtr = xBlock; + + for (DAAL_INT i = 0; i < nRows; i++, xPtr += _nFeatures) + { + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + for (DAAL_INT j = 0; j < _nFeatures; j++) + { + xtxLastRowPtr[j] += xPtr[j]; + } + } + xtxLastRowPtr[_nFeatures] += algorithmFPType(nRows); + } { - PRAGMA_FORCE_SIMD - PRAGMA_VECTOR_ALWAYS - for (DAAL_INT j = 0; j < _nResponses; j++) + DAAL_PROFILER_THREADING_TASK(reducer.update.gemmXY); + BlasInst::xgemm(¬rans, &trans, &_nFeatures, &_nResponses, reinterpret_cast(&nRows), &one, xBlock, + &_nFeatures, yBlock, &_nResponses, &one, xtyPtr, &_nBetasIntercept); + } + if (_nFeatures < _nBetasIntercept) + { + // TODO: Substitute this part with call to gemv or reduce by column/row primitive + DAAL_PROFILER_THREADING_TASK(reducer.update.gemm1Y); + const algorithmFPType * yPtr = yBlock; + for (DAAL_INT i = 0; i < nRows; i++, yPtr += _nResponses) { - _xty[j * _nBetasIntercept + nFeatures] += yPtr[j]; + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + for (DAAL_INT j = 0; j < _nResponses; j++) + { + xtyPtr[j * _nBetasIntercept + _nFeatures] += yPtr[j]; + } } } } - return Status(); } template -void ThreadingTask::reduce(algorithmFPType * xtx, algorithmFPType * xty) +void LinearModelReducer::join(daal::Reducer * otherReducer) { + if (errorCode != ErrorCode::ok) { - DAAL_PROFILER_THREADING_TASK(reduce.syrkX); - PRAGMA_FORCE_SIMD - PRAGMA_VECTOR_ALWAYS - for (size_t i = 0; i < (_nBetasIntercept * _nBetasIntercept); i++) - { - xtx[i] += _xtx[i]; - } + return; + } + DAAL_PROFILER_THREADING_TASK(reducer.join); + LinearModelReducer * other = dynamic_cast *>(otherReducer); + if (!other) + { + errorCode = ErrorCode::badCast; + return; + } + if (other->errorCode != ErrorCode::ok) + { + errorCode = other->errorCode; + return; } + const algorithmFPType * otherXTX = other->xtx(); + algorithmFPType * thisXTX = xtx(); + const algorithmFPType * otherXTY = other->xty(); + algorithmFPType * thisXTY = xty(); + + if (!otherXTX || !otherXTY || !thisXTX || !thisXTY) { - DAAL_PROFILER_THREADING_TASK(reduce.gemmXY); - PRAGMA_FORCE_SIMD - PRAGMA_VECTOR_ALWAYS - for (size_t i = 0; i < (_nBetasIntercept * _nResponses); i++) - { - xty[i] += _xty[i]; - } + errorCode = ErrorCode::memAllocationFailed; + return; + } + /// It is safe to use aligned loads and stores because the data in TArrayScalableCalloc data structures is aligned + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + PRAGMA_VECTOR_ALIGNED + for (size_t i = 0; i < (_nBetasIntercept * _nBetasIntercept); i++) + { + thisXTX[i] += otherXTX[i]; } -} -template -ThreadingTask::~ThreadingTask() -{ - 
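
The create/update/join trio implemented above follows the generic daal::Reducer contract that daal::static_threader_reduce drives: every worker thread obtains its own partial result through create(), accumulates a sub-range of data blocks in update(), and the partials are merged pairwise in join(), with failures reported through the base-class errorCode rather than exceptions. The snippet below is a minimal self-contained illustration of that contract with a toy sum reducer and a sequential driver; it only mirrors the shape of the protocol under names of my own and is not the oneDAL threading layer.

#include <cstddef>
#include <memory>
#include <vector>

// Minimal stand-in for the daal::Reducer interface (illustration only).
struct MiniReducer
{
    enum ErrorCode { ok = 0, memAllocationFailed, intOverflow, badCast };
    ErrorCode errorCode = ok;

    virtual std::unique_ptr<MiniReducer> create() const = 0;     // fresh thread-local partial
    virtual void update(std::size_t begin, std::size_t end) = 0; // accumulate blocks [begin, end)
    virtual void join(MiniReducer * other) = 0;                  // merge another partial into this one
    virtual ~MiniReducer() = default;
};

// Toy reducer: sums a vector over fixed-size blocks of elements.
struct SumReducer : MiniReducer
{
    const std::vector<double> & data;
    std::size_t blockSize;
    double sum = 0.0;

    SumReducer(const std::vector<double> & d, std::size_t bs) : data(d), blockSize(bs) {}

    std::unique_ptr<MiniReducer> create() const override { return std::make_unique<SumReducer>(data, blockSize); }

    void update(std::size_t begin, std::size_t end) override
    {
        for (std::size_t b = begin; b < end; ++b)
        {
            const std::size_t lo = b * blockSize;
            const std::size_t hi = (lo + blockSize < data.size()) ? lo + blockSize : data.size();
            for (std::size_t i = lo; i < hi; ++i) sum += data[i];
        }
    }

    void join(MiniReducer * other) override
    {
        SumReducer * o = dynamic_cast<SumReducer *>(other);
        if (!o) { errorCode = badCast; return; }                      // mirrors ErrorCode::badCast above
        if (o->errorCode != ok) { errorCode = o->errorCode; return; } // propagate the other thread's failure
        sum += o->sum;
    }
};

// Sequential stand-in for static_threader_reduce(nBlocks, grainSize, result):
// the real call hands chunks of at least grainSize blocks to worker threads
// and joins their thread-local copies back into `result`.
inline void reduceSequentially(std::size_t nBlocks, std::size_t grainSize, MiniReducer & result)
{
    for (std::size_t b = 0; b < nBlocks; b += grainSize)
    {
        std::unique_ptr<MiniReducer> partial = result.create();
        partial->update(b, (b + grainSize < nBlocks) ? b + grainSize : nBlocks);
        result.join(partial.get());
    }
}
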
service_scalable_free(_xtx); - service_scalable_free(_xty); + /// It is safe to use aligned loads and stores because the data in TArrayScalableCalloc data structures is aligned + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + PRAGMA_VECTOR_ALIGNED + for (size_t i = 0; i < (_nBetasIntercept * _nResponses); i++) + { + thisXTY[i] += otherXTY[i]; + } } template @@ -296,51 +407,67 @@ Status UpdateKernel::compute(const NumericTable & xTable, /* Split rows by blocks */ size_t nRowsInBlock = 128; + size_t grainSize = 1; // minimal number of data blocks to be processed by a thread if (hyperparameter != nullptr) { DAAL_INT64 nRowsInBlockInt64 = 0l; services::Status status = hyperparameter->find(denseUpdateStepBlockSize, nRowsInBlockInt64); DAAL_CHECK(0l < nRowsInBlockInt64, services::ErrorIncorrectDataRange); DAAL_CHECK_STATUS_VAR(status); - nRowsInBlock = static_cast(nRowsInBlockInt64); + + DAAL_INT64 grainSizeInt64 = 0l; + status = hyperparameter->find(denseGrainSize, grainSizeInt64); + DAAL_CHECK(0l < grainSizeInt64, services::ErrorIncorrectDataRange); + DAAL_CHECK_STATUS_VAR(status); + grainSize = static_cast(grainSizeInt64); } size_t nBlocks = nRows / nRowsInBlock; nBlocks += bool(nRows % nRowsInBlock); - /* Create TLS */ - daal::static_tls tls([=]() -> ThreadingTaskType * { return ThreadingTaskType::create(nBetasIntercept, nResponses); }); - - SafeStatus safeStat; - daal::static_threader_for(nBlocks, [=, &tls, &xTable, &yTable, &safeStat](int iBlock, size_t tid) { - ThreadingTaskType * tlsLocal = tls.local(tid); + LinearModelReducer result(xTable, yTable, nBetasIntercept, nRowsInBlock, nBlocks); + if (!result.xtx() || !result.xty()) + { + return services::Status(services::ErrorMemoryAllocationFailed); + } + /* Reduce input matrices X, Y into cross product X^tX and matrix-matrix product X^tY */ + daal::static_threader_reduce(nBlocks, grainSize, result); - if (!tlsLocal) + if (result.errorCode != LinearModelReducer::ok) + { + if (result.errorCode == LinearModelReducer::memAllocationFailed) { - safeStat.add(services::ErrorMemoryAllocationFailed); - return; + return services::Status(services::ErrorMemoryAllocationFailed); } - - size_t startRow = iBlock * nRowsInBlock; - size_t endRow = startRow + nRowsInBlock; - if (endRow > nRows) + if (result.errorCode == LinearModelReducer::intOverflow) { - endRow = nRows; + return services::Status(services::ErrorBufferSizeIntegerOverflow); } + if (result.errorCode == LinearModelReducer::badCast) + { + return services::Status(services::ErrorLinearRegressionInternal); + } + } - Status localSt = tlsLocal->update(startRow, endRow - startRow, xTable, yTable); - DAAL_CHECK_STATUS_THR(localSt); - }); + const algorithmFPType * resultXTX = result.xtx(); + const algorithmFPType * resultXTY = result.xty(); - Status st = safeStat.detach(); - tls.reduce([=, &st](ThreadingTaskType * tlsLocal) -> void { - if (!tlsLocal) return; - if (st) tlsLocal->reduce(xtx, xty); - delete tlsLocal; - }); + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + for (size_t i = 0; i < (nBetasIntercept * nBetasIntercept); i++) + { + xtx[i] += resultXTX[i]; + } + + PRAGMA_FORCE_SIMD + PRAGMA_VECTOR_ALWAYS + for (size_t i = 0; i < (nBetasIntercept * nResponses); i++) + { + xty[i] += resultXTY[i]; + } - return st; + return services::Status(); } } // namespace internal diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter.cpp b/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter.cpp index 6ae7f1ad869..bcdc8bd6816 100644 --- 
a/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter.cpp +++ b/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter.cpp @@ -68,6 +68,7 @@ services::Status convert(const Hyperparameter * params, services::SharedPtrfind(HyperparameterId::denseSmallRowsMaxColsBatched, denseSmallRowsMaxColsBatched); DAAL_CHECK(st, services::ErrorHyperparameterNotFound); + st |= params->find(HyperparameterId::denseGrainSize, denseGrainSize); + DAAL_CHECK(st, services::ErrorHyperparameterNotFound); + /// Setters st |= result->set(LinearModelHyperparameterId::denseUpdateStepBlockSize, denseUpdateStepBlockSize); DAAL_CHECK(st, services::ErrorHyperparameterCanNotBeSet); @@ -98,6 +102,9 @@ services::Status convert(const Hyperparameter * params, services::SharedPtrset(LinearModelHyperparameterId::denseSmallRowsMaxColsBatched, denseSmallRowsMaxColsBatched); DAAL_CHECK(st, services::ErrorHyperparameterCanNotBeSet); + + st |= result->set(LinearModelHyperparameterId::denseGrainSize, denseGrainSize); + DAAL_CHECK(st, services::ErrorHyperparameterCanNotBeSet); } else { diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter_impl.h b/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter_impl.h index 51ef54cf80d..8409fdf52a9 100644 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter_impl.h +++ b/cpp/daal/src/algorithms/linear_regression/linear_regression_hyperparameter_impl.h @@ -49,7 +49,8 @@ enum HyperparameterId denseUpdateMaxColsBatched = 1, denseSmallRowsThreshold = 2, denseSmallRowsMaxColsBatched = 3, - hyperparameterIdCount = 4 + denseGrainSize = 4, + hyperparameterIdCount = 5 }; enum DoubleHyperparameterId diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h index f0e23ed8777..76ac87f78fe 100644 --- a/cpp/daal/src/threading/threading.h +++ b/cpp/daal/src/threading/threading.h @@ -78,6 +78,16 @@ class Reducer virtual void join(Reducer * other) = 0; virtual ~Reducer() {} + + enum ErrorCode + { + ok = 0, /// No error + memAllocationFailed = 1, /// Memory allocation failed + intOverflow = 2, /// Integer overflow + badCast = 3 /// Cannot cast base daal::Reducer to derived class + }; + /// Status of the computation. 
+ ErrorCode errorCode; }; typedef internal::UniquePtr ReducerUniquePtr; diff --git a/cpp/oneapi/dal/algo/linear_regression/backend/cpu/train_kernel_common.hpp b/cpp/oneapi/dal/algo/linear_regression/backend/cpu/train_kernel_common.hpp index 886cf68013f..5d0f6cd609f 100644 --- a/cpp/oneapi/dal/algo/linear_regression/backend/cpu/train_kernel_common.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/backend/cpu/train_kernel_common.hpp @@ -33,6 +33,7 @@ static daal_lr_hyperparameters_t convert_parameters(const detail::train_paramete const std::int64_t max_cols_batched = params.get_cpu_max_cols_batched(); const std::int64_t small_rows_threshold = params.get_cpu_small_rows_threshold(); const std::int64_t small_rows_max_cols_batched = params.get_cpu_small_rows_max_cols_batched(); + const std::int64_t grain_size = params.get_cpu_grain_size(); daal_lr_hyperparameters_t daal_hyperparameter; auto status = daal_hyperparameter.set(HyperparameterId::denseUpdateStepBlockSize, block); @@ -46,6 +47,9 @@ static daal_lr_hyperparameters_t convert_parameters(const detail::train_paramete small_rows_max_cols_batched); interop::status_to_exception(status); + status = daal_hyperparameter.set(HyperparameterId::denseGrainSize, grain_size); + interop::status_to_exception(status); + return daal_hyperparameter; } diff --git a/cpp/oneapi/dal/algo/linear_regression/parameters/cpu/train_parameters.cpp b/cpp/oneapi/dal/algo/linear_regression/parameters/cpu/train_parameters.cpp index 2bcf9d2d9d9..1d3c64f990c 100644 --- a/cpp/oneapi/dal/algo/linear_regression/parameters/cpu/train_parameters.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/parameters/cpu/train_parameters.cpp @@ -53,6 +53,11 @@ std::int64_t propose_small_rows_max_cols_batched(const std::int64_t f, const std return detail::train_parameters{}.get_cpu_small_rows_max_cols_batched(); } +template +std::int64_t propose_grain_size(const std::int64_t f, const std::int64_t r) { + return detail::train_parameters{}.get_cpu_grain_size(); +} + template struct train_parameters_cpu { using params_t = detail::train_parameters; @@ -72,12 +77,14 @@ struct train_parameters_cpu { propose_small_rows_threshold(f_count, r_count); const std::int64_t small_rows_max_cols_batched = propose_small_rows_max_cols_batched(f_count, r_count); + const std::int64_t grain_size = propose_grain_size(f_count, r_count); params_t out{}; out.set_cpu_macro_block(block); out.set_cpu_max_cols_batched(max_cols_batched); out.set_cpu_small_rows_threshold(small_rows_threshold); out.set_cpu_small_rows_max_cols_batched(small_rows_max_cols_batched); + out.set_cpu_grain_size(grain_size); return out; } params_t operator()(const context_cpu& ctx, @@ -96,12 +103,14 @@ struct train_parameters_cpu { propose_small_rows_threshold(f_count, r_count); const std::int64_t small_rows_max_cols_batched = propose_small_rows_max_cols_batched(f_count, r_count); + const std::int64_t grain_size = propose_grain_size(f_count, r_count); params_t out{}; out.set_cpu_macro_block(block); out.set_cpu_max_cols_batched(max_cols_batched); out.set_cpu_small_rows_threshold(small_rows_threshold); out.set_cpu_small_rows_max_cols_batched(small_rows_max_cols_batched); + out.set_cpu_grain_size(grain_size); return out; } params_t operator()(const context_cpu& ctx, @@ -120,12 +129,14 @@ struct train_parameters_cpu { propose_small_rows_threshold(f_count, r_count); const std::int64_t small_rows_max_cols_batched = propose_small_rows_max_cols_batched(f_count, r_count); + const std::int64_t grain_size = propose_grain_size(f_count, r_count); params_t out{}; 
out.set_cpu_macro_block(block); out.set_cpu_max_cols_batched(max_cols_batched); out.set_cpu_small_rows_threshold(small_rows_threshold); out.set_cpu_small_rows_max_cols_batched(small_rows_max_cols_batched); + out.set_cpu_grain_size(grain_size); return out; } }; diff --git a/cpp/oneapi/dal/algo/linear_regression/test/train_parameters.cpp b/cpp/oneapi/dal/algo/linear_regression/test/train_parameters.cpp index 634531f432e..2cf023a80f5 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/train_parameters.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/train_parameters.cpp @@ -58,6 +58,7 @@ class lr_train_params_test : public lr_testmax_cols_batched_ = GENERATE(50); this->small_rows_threshold_ = GENERATE(15, 70); this->small_rows_max_cols_batched_ = GENERATE(40); + this->grain_size_ = GENERATE(1, 10); this->pack_as_struct_ = GENERATE(0, 1); } @@ -67,6 +68,7 @@ class lr_train_params_test : public lr_testmax_cols_batched_); res.set_cpu_small_rows_threshold(this->small_rows_threshold_); res.set_cpu_small_rows_max_cols_batched(this->small_rows_max_cols_batched_); + res.set_cpu_grain_size(this->grain_size_); return res; } @@ -76,6 +78,7 @@ class lr_train_params_test : public lr_testmax_cols_batched_ > 0); REQUIRE(this->small_rows_threshold_ > 0); REQUIRE(this->small_rows_max_cols_batched_ > 0); + REQUIRE(this->grain_size_ > 0); const auto params = this->get_current_parameters(); if (this->pack_as_struct_) { return te::float_algo_fixture::train( @@ -95,6 +98,7 @@ class lr_train_params_test : public lr_test @@ -114,6 +115,16 @@ void train_parameters::set_cpu_small_rows_max_cols_batched_impl(std::int64 impl_->cpu_small_rows_max_cols_batched = val; } +template +std::int64_t train_parameters::get_cpu_grain_size() const { + return impl_->cpu_grain_size; +} + +template +void train_parameters::set_cpu_grain_size_impl(std::int64_t val) { + impl_->cpu_grain_size = val; +} + template class ONEDAL_EXPORT train_parameters; } // namespace detail::v1 diff --git a/cpp/oneapi/dal/algo/linear_regression/train_types.hpp b/cpp/oneapi/dal/algo/linear_regression/train_types.hpp index 423edc6095f..0677c1af5b8 100644 --- a/cpp/oneapi/dal/algo/linear_regression/train_types.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/train_types.hpp @@ -82,12 +82,19 @@ class train_parameters : public dal::detail::system_parameters { return *this; } + std::int64_t get_cpu_grain_size() const; + auto& set_cpu_grain_size(std::int64_t val) { + set_cpu_grain_size_impl(val); + return *this; + } + private: void set_cpu_macro_block_impl(std::int64_t val); void set_gpu_macro_block_impl(std::int64_t val); void set_cpu_max_cols_batched_impl(std::int64_t val); void set_cpu_small_rows_threshold_impl(std::int64_t val); void set_cpu_small_rows_max_cols_batched_impl(std::int64_t val); + void set_cpu_grain_size_impl(std::int64_t val); dal::detail::pimpl> impl_; };
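
Taken together, the oneAPI-side changes surface the new knob as cpu_grain_size on the linear regression train_parameters, next to the existing macro-block and small-rows hints, and the test above drives it through the same fixture as the other CPU parameters. A short sketch of how calling code would populate the parameters object follows; the params_t placeholder and the literal block size are illustrative, and only the setters themselves come from this patch.

#include <cstdint>

// `params_t` stands for the linear_regression train_parameters type exercised by the
// test fixture above; its exact template arguments are not spelled out in this patch.
template <typename params_t>
params_t make_train_parameters(std::int64_t grain_size)
{
    params_t res{};
    res.set_cpu_macro_block(1024);      // rows per data block (maps to denseUpdateStepBlockSize)
    res.set_cpu_grain_size(grain_size); // minimal number of blocks per threaded task (maps to denseGrainSize)
    return res;
}

// The resulting object is then passed to the parameterized train entry point together with
// the descriptor and the training data, exactly as get_current_parameters() is used above.
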