
Commit 745c25c

Revert "FullyConnected Bias performance improvement on GPU (apache#16039)"
This reverts commit a5e698a.
Parent: ecda000 · Commit: 745c25c

3 files changed (+22 −285 lines)

src/common/cuda_utils.h

Lines changed: 8 additions & 68 deletions
@@ -57,8 +57,6 @@ extern __cuda_fake_struct blockIdx;
 #include <cublas_v2.h>
 #include <curand.h>
 
-#include <vector>
-
 #define STATIC_ASSERT_CUDA_VERSION_GE(min_version) \
   static_assert(CUDA_VERSION >= min_version, "Compiled-against CUDA version " \
       QUOTEVALUE(CUDA_VERSION) " is too old, please upgrade system to version " \
@@ -429,41 +427,16 @@ int get_rows_per_block(size_t row_size, int num_threads_per_block);
 } // namespace common
 } // namespace mxnet
 
-/*! \brief Maximum number of GPUs */
-constexpr size_t kMaxNumGpus = 64;
-
-// The implementations below assume that accesses of 32-bit ints are inherently atomic and
-// can be read/written by multiple threads without locks. The values held should be < 2^31.
-
-/*!
- * \brief Return an attribute GPU `device_id`.
- * \param device_id The device index of the cuda-capable gpu of interest.
- * \param cached_values An array of attributes for already-looked-up GPUs.
- * \param attr The attribute, by number.
- * \param attr_name A string representation of the attribute, for error messages.
- * \return the gpu's attribute value.
- */
-inline int cudaAttributeLookup(int device_id, std::vector<int32_t> *cached_values,
-                               cudaDeviceAttr attr, const char *attr_name) {
-  if (device_id < 0 || device_id >= static_cast<int>(cached_values->size())) {
-    LOG(FATAL) << attr_name << "(device_id) called with invalid id: " << device_id;
-  } else if ((*cached_values)[device_id] < 0) {
-    int temp = -1;
-    CUDA_CALL(cudaDeviceGetAttribute(&temp, attr, device_id));
-    (*cached_values)[device_id] = static_cast<int32_t>(temp);
-  }
-  return (*cached_values)[device_id];
-}
-
 /*!
  * \brief Determine major version number of the gpu's cuda compute architecture.
  * \param device_id The device index of the cuda-capable gpu of interest.
  * \return the major version number of the gpu's cuda compute architecture.
  */
 inline int ComputeCapabilityMajor(int device_id) {
-  static std::vector<int32_t> capability_major(kMaxNumGpus, -1);
-  return cudaAttributeLookup(device_id, &capability_major,
-                             cudaDevAttrComputeCapabilityMajor, "ComputeCapabilityMajor");
+  int major = 0;
+  CUDA_CALL(cudaDeviceGetAttribute(&major,
+                                   cudaDevAttrComputeCapabilityMajor, device_id));
+  return major;
 }
 
 /*!
@@ -472,9 +445,10 @@ inline int ComputeCapabilityMajor(int device_id) {
  * \return the minor version number of the gpu's cuda compute architecture.
  */
 inline int ComputeCapabilityMinor(int device_id) {
-  static std::vector<int32_t> capability_minor(kMaxNumGpus, -1);
-  return cudaAttributeLookup(device_id, &capability_minor,
-                             cudaDevAttrComputeCapabilityMinor, "ComputeCapabilityMinor");
+  int minor = 0;
+  CUDA_CALL(cudaDeviceGetAttribute(&minor,
+                                   cudaDevAttrComputeCapabilityMinor, device_id));
+  return minor;
 }
 
 /*!
@@ -488,40 +462,6 @@ inline int SMArch(int device_id) {
   return 10 * major + minor;
 }
 
-/*!
- * \brief Return the number of streaming multiprocessors of GPU `device_id`.
- * \param device_id The device index of the cuda-capable gpu of interest.
- * \return the gpu's count of streaming multiprocessors.
- */
-inline int MultiprocessorCount(int device_id) {
-  static std::vector<int32_t> sm_counts(kMaxNumGpus, -1);
-  return cudaAttributeLookup(device_id, &sm_counts,
-                             cudaDevAttrMultiProcessorCount, "MultiprocessorCount");
-}
-
-/*!
- * \brief Return the shared memory size in bytes of each of the GPU's streaming multiprocessors.
- * \param device_id The device index of the cuda-capable gpu of interest.
- * \return the shared memory size per streaming multiprocessor.
- */
-inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
-  static std::vector<int32_t> max_smem_per_mutiprocessor(kMaxNumGpus, -1);
-  return cudaAttributeLookup(device_id, &max_smem_per_mutiprocessor,
-                             cudaDevAttrMaxSharedMemoryPerMultiprocessor,
-                             "MaxSharedMemoryPerMultiprocessor");
-}
-
-/*!
- * \brief Return whether the GPU `device_id` supports cooperative-group kernel launching.
- * \param device_id The device index of the cuda-capable gpu of interest.
- * \return the gpu's ability to run cooperative-group kernels.
- */
-inline bool SupportsCooperativeLaunch(int device_id) {
-  static std::vector<int32_t> coop_launch(kMaxNumGpus, -1);
-  return cudaAttributeLookup(device_id, &coop_launch,
-                             cudaDevAttrCooperativeLaunch, "SupportsCooperativeLaunch");
-}
-
 /*!
  * \brief Determine whether a cuda-capable gpu's architecture supports float16 math.
  *        Assume not if device_id is negative.
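
Note: the revert swaps the cached per-device attribute lookup (a static std::vector indexed by device id, capped at kMaxNumGpus) for a direct cudaDeviceGetAttribute call on every query. A minimal standalone sketch of the restored pattern, using the plain CUDA runtime API instead of MXNet's CUDA_CALL macro (the function name here is hypothetical):

// Query a device attribute directly each time, as the reverted-to code does,
// rather than memoizing it in a static table as the removed cudaAttributeLookup did.
#include <cuda_runtime.h>
#include <cstdio>

int ComputeCapabilityMajorSketch(int device_id) {
  int major = 0;
  // cudaDeviceGetAttribute(int* value, cudaDeviceAttr attr, int device)
  cudaError_t err = cudaDeviceGetAttribute(
      &major, cudaDevAttrComputeCapabilityMajor, device_id);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaDeviceGetAttribute failed: %s\n", cudaGetErrorString(err));
    return -1;
  }
  return major;
}

int main() {
  std::printf("device 0 compute capability major: %d\n", ComputeCapabilityMajorSketch(0));
  return 0;
}

The trade-off is a small extra runtime query per call in exchange for dropping the fixed kMaxNumGpus limit and the shared mutable cache.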

src/operator/mxnet_op.h

Lines changed: 0 additions & 10 deletions
@@ -249,16 +249,6 @@ inline int get_num_threads<cpu>(const int N) {
     LOG(FATAL) << "Unknown type enum " << type; \
   }
 
-template <typename T>
-struct AccType {
-  using type = T;
-};
-
-template <>
-struct AccType<mshadow::half::half_t> {
-  using type = float;
-};
-
 #define MXNET_REAL_ACC_TYPE_SWITCH(type, DType, AType, ...)\
   switch (type) { \
   case mshadow::kFloat32: \
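
The removed AccType trait picks a wider accumulation type for reduced-precision inputs: float for mshadow's half_t, the input type otherwise. A minimal standalone sketch of the same idea, with a placeholder Half struct standing in for mshadow::half::half_t:

// Accumulation-type trait in the spirit of the removed AccType:
// accumulate in the input type by default, but promote 16-bit floats to float
// so long sums lose less precision.
#include <type_traits>

struct Half { unsigned short bits; };  // placeholder for a 16-bit float type

template <typename T>
struct AccTypeSketch {
  using type = T;
};

template <>
struct AccTypeSketch<Half> {
  using type = float;
};

static_assert(std::is_same<AccTypeSketch<double>::type, double>::value, "double stays double");
static_assert(std::is_same<AccTypeSketch<Half>::type, float>::value, "half accumulates in float");

int main() { return 0; }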

src/operator/nn/fully_connected-inl.h

Lines changed: 14 additions & 207 deletions
@@ -32,7 +32,6 @@
 #include <vector>
 #include <string>
 #include <utility>
-#include <algorithm>
 #include "../operator_common.h"
 #include "../elemwise_op_common.h"
 #include "../linalg.h"
@@ -60,7 +59,6 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
   int num_hidden;
   bool no_bias;
   bool flatten;
-
   DMLC_DECLARE_PARAMETER(FullyConnectedParam) {
     // TODO(bing) add support for boolean
     DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1)
@@ -77,66 +75,6 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
   }
 };
 
-template<typename DType>
-void AddBias(Tensor<cpu, 1, DType> bias, Tensor<cpu, 2, DType> data,
-             Tensor<cpu, 2, DType> out, Stream<cpu>*) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  out += repmat(bias, data.size(0));
-}
-
-#if defined(__CUDACC__)
-
-namespace {
-  constexpr int nthreads_addbias = 256;
-  constexpr int nthreads_addbiasgrad_phase1 = 512;
-  constexpr int nthreads_addbiasgrad_phase2 = 128;
-  constexpr int threads_per_warp = 32;
-
-  inline int ceil_div(int x, int y) {
-    return (x + y - 1) / y;
-  }
-} // namespace
-
-template <typename DType, typename LType>
-__global__ void add_bias_kernel(DType* mat, DType* bias, size_t lead_dim, size_t bias_length) {
-  __shared__ LType scratch[nthreads_addbias * 2];
-  const index_t N = bias_length * sizeof(DType)/sizeof(LType);
-  const index_t base = blockIdx.x * N;
-  LType* const mat_aligned = reinterpret_cast<LType*>(mat) + base;
-  const LType* const bias_aligned = reinterpret_cast<LType*>(bias);
-  LType* const scratch_bias_load = scratch + threadIdx.x;
-  DType* const scratch_bias = reinterpret_cast<DType*>(scratch_bias_load);
-  LType* const scratch_mat_load = scratch_bias_load + nthreads_addbias;
-  DType* const scratch_mat = reinterpret_cast<DType*>(scratch_mat_load);
-  for (index_t i = threadIdx.x; i < N; i += blockDim.x) {
-    *scratch_bias_load = bias_aligned[i];
-    *scratch_mat_load = mat_aligned[i];
-#pragma unroll
-    for (int j = 0; j < sizeof(LType)/sizeof(DType); ++j) {
-      scratch_mat[j] += scratch_bias[j];
-    }
-    mat_aligned[i] = *scratch_mat_load;
-  }
-}
-
-template<typename DType>
-void AddBias(Tensor<gpu, 1, DType> bias, Tensor<gpu, 2, DType> data,
-             Tensor<gpu, 2, DType> out, Stream<gpu>* s) {
-  int ltype = mxnet::common::cuda::get_load_type(bias.shape_[0] * sizeof(DType));
-  MXNET_LOAD_TYPE_SWITCH(ltype, LType, {
-    add_bias_kernel<DType, LType><<<data.size(0),
-                                    nthreads_addbias,
-                                    0,
-                                    Stream<gpu>::GetStream(s)>>>(out.dptr_,
-                                                                 bias.dptr_,
-                                                                 data.size(0),
-                                                                 bias.shape_[0]);
-  });
-}
-
-#endif  // __CUDACC__
-
 template<typename xpu, typename DType>
 void FCForward(const OpContext &ctx, const FullyConnectedParam &param,
                const std::vector<TBlob> &in_data, const std::vector<OpReqType> &req,
@@ -184,153 +122,10 @@ void FCForward(const OpContext &ctx, const FullyConnectedParam &param,
         << "Incomplete bias tensor detected: bias.data().shape[1] != weight.data().shape[0]."
            " This is not supported by FCForward. If bias is in row_sparse format, please"
            " make sure all row ids are present.";
-    AddBias(bias, data, out, s);
+    out += repmat(bias, data.size(0));
   }
 }
 
-#if defined (__CUDACC__)
-
-template<typename LType, typename DType, typename AType>
-__global__ void AddBiasGradKernelPhase1(AType * temp_space, const DType* grad,
-                                        const size_t lead_dim, const size_t other_dim) {
-  constexpr int num_warps = nthreads_addbiasgrad_phase1 / threads_per_warp;
-  const int values_per_read = sizeof(LType) >= sizeof(DType) ? sizeof(LType) / sizeof(DType) : 1;
-  const size_t stride = lead_dim / values_per_read;
-  __shared__ AType scratch[threads_per_warp * num_warps * values_per_read];
-  LType * my_scratch_load = &(reinterpret_cast<LType *>(scratch)[threadIdx.x]);
-  DType * my_values_load = reinterpret_cast<DType *>(my_scratch_load);
-  AType * my_values_acc = &(scratch[threadIdx.x * values_per_read]);
-  AType acc[values_per_read]; // NOLINT(*)
-#pragma unroll
-  for (int i = 0; i < values_per_read; ++i) {
-    acc[i] = 0;
-  }
-  const size_t offset = blockIdx.x * threads_per_warp;
-  const int my_warp = threadIdx.x / threads_per_warp;
-  const int my_id = threadIdx.x % threads_per_warp;
-  const LType* aligned_grad = reinterpret_cast<const LType*>(grad);
-  const int rows_per_block = (other_dim + gridDim.y - 1) / gridDim.y;
-  const size_t start_row = my_warp + rows_per_block * blockIdx.y;
-  const size_t end_row = min(other_dim, static_cast<size_t>(rows_per_block * (blockIdx.y + 1)));
-  if (offset + my_id < stride) {
-    for (size_t i = start_row; i < end_row; i += num_warps) {
-      *my_scratch_load = aligned_grad[i * stride + offset + my_id];
-#pragma unroll
-      for (int j = 0; j < values_per_read; ++j) {
-        acc[j] += static_cast<AType>(my_values_load[j]);
-      }
-    }
-  }
-  __syncthreads();
-#pragma unroll
-  for (int i = 0; i < values_per_read; ++i) {
-    my_values_acc[i] = acc[i];
-  }
-
-  __syncthreads();
-
-  for (int i = num_warps / 2; i > 0; i /= 2) {
-    if (my_warp < i) {
-      const int shared_offset = values_per_read * i * threads_per_warp;
-#pragma unroll
-      for (int j = 0; j < values_per_read; ++j) {
-        my_values_acc[j] += my_values_acc[j + shared_offset];
-      }
-    }
-    __syncthreads();
-  }
-
-  if (threadIdx.x < min(threads_per_warp * values_per_read,
-                        static_cast<int>(lead_dim - values_per_read * offset))) {
-    const size_t offset_out = values_per_read * offset +
-                              blockIdx.y * lead_dim;
-    temp_space[offset_out + threadIdx.x] = scratch[threadIdx.x];
-  }
-}
-
-template <typename DType, typename AType>
-__global__ void AddBiasGradKernelPhase2(const AType * temp_space, DType * out,
-                                        int lead_dim, int n_blocks, OpReqType req) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < lead_dim) {
-    AType acc = 0;
-    for (int i = tid; i < lead_dim * n_blocks; i += lead_dim) {
-      acc += temp_space[i];
-    }
-    KERNEL_ASSIGN(out[tid], req, static_cast<DType>(acc));
-  }
-}
-
-template<typename DType>
-void AddBiasGrad(const TBlob& in_grad,
-                 Tensor<gpu, 2, DType> grad,
-                 OpReqType req,
-                 int num_hidden,
-                 const OpContext& ctx) {
-  if (req == kNullOp) return;
-  using AType = typename mxnet_op::AccType<DType>::type;
-  mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
-  Tensor<gpu, 1, DType> gbias = in_grad.get<gpu, 1, DType>(s);
-  TBlob grad_blob = TBlob(grad);
-  TBlob gbias_blob = TBlob(gbias);
-  mxnet::TShape x(1, 0);
-  mxnet::TShape small;
-  if (shape_assign(&gbias_blob.shape_, Shape2(num_hidden, 1))) {
-    small = gbias_blob.shape_;
-  } else {
-    small = ReduceAxesShapeImpl(grad_blob.shape_, dmlc::optional<mxnet::TShape>(x), true, false);
-  }
-  const int N = small.Size();
-  int ltype = mxnet::common::cuda::get_load_type(N * sizeof(DType));
-  const int M = grad_blob.shape_.Size() / N;
-  MXNET_LOAD_TYPE_SWITCH(ltype, LType, {
-    const unsigned int blocks_x = ceil_div(N * sizeof(DType),
-                                           threads_per_warp * sizeof(LType));
-    const unsigned int preferred_number_of_blocks = 2 *
-                                                    MultiprocessorCount(ctx.run_ctx.ctx.dev_id);
-    const unsigned int blocks_y = std::max(preferred_number_of_blocks / blocks_x, 1u);
-    const dim3 n_blocks = {blocks_x, blocks_y, 1};
-    auto scratch_space = ctx.requested[fullc::kTempSpace]
-                            .get_space_typed<gpu, 1, AType>(mshadow::Shape1(N * blocks_y), s);
-    auto stream = mshadow::Stream<gpu>::GetStream(s);
-    AddBiasGradKernelPhase1<LType><<<n_blocks,
-                                     nthreads_addbiasgrad_phase1,
-                                     0,
-                                     stream>>>(scratch_space.dptr_,
-                                               grad.dptr_, N, M);
-    const int nblocks_phase2 = ceil_div(N, nthreads_addbiasgrad_phase2);
-    AddBiasGradKernelPhase2<<<nblocks_phase2,
-                              nthreads_addbiasgrad_phase2,
-                              0,
-                              stream>>>(scratch_space.dptr_,
-                                        gbias.dptr_, N,
-                                        blocks_y, req);
-  });
-}
-#endif
-
-template<typename DType>
-void AddBiasGrad(const TBlob& in_grad,
-                 Tensor<cpu, 2, DType> grad,
-                 OpReqType req,
-                 int num_hidden,
-                 const OpContext& ctx) {
-  mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
-  Tensor<cpu, 1, DType> gbias = in_grad.get<cpu, 1, DType>(s);
-  TBlob grad_blob = TBlob(grad);
-  TBlob gbias_blob = TBlob(gbias);
-  mxnet::TShape x(1, 0);
-  mxnet::TShape small;
-  if (shape_assign(&gbias_blob.shape_, Shape2(num_hidden, 1))) {
-    small = gbias_blob.shape_;
-  } else {
-    small = ReduceAxesShapeImpl(grad_blob.shape_, dmlc::optional<mxnet::TShape>(x), true, false);
-  }
-  ReduceAxesComputeImpl<cpu, mshadow::red::sum, false, false,
-                        mshadow_op::identity>(ctx, {grad_blob}, {req},
-                                              {in_grad}, small);
-}
-
 template<typename xpu, typename DType>
 void FCBackward(const OpContext &ctx, const FullyConnectedParam &param,
                 const std::vector<TBlob> &out_grad, const std::vector<TBlob> &in_data,
@@ -374,7 +169,19 @@ void FCBackward(const OpContext &ctx, const FullyConnectedParam &param,
   linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
   // gradient of bias
   if (!param.no_bias) {
-    AddBiasGrad(in_grad[fullc::kBias], grad, req[fullc::kBias], param.num_hidden, ctx);
+    Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
+    TBlob grad_blob = TBlob(grad);
+    TBlob gbias_blob = TBlob(gbias);
+    mxnet::TShape x(1, 0);
+    mxnet::TShape small;
+    if (shape_assign(&gbias_blob.shape_, Shape2(param.num_hidden, 1))) {
+      small = gbias_blob.shape_;
+    } else {
+      small = ReduceAxesShapeImpl(grad_blob.shape_, dmlc::optional<mxnet::TShape>(x), true, false);
+    }
+    ReduceAxesComputeImpl<xpu, mshadow::red::sum, false, false,
+                          mshadow_op::identity>(ctx, {grad_blob}, {req[fullc::kBias]},
+                                                {in_grad[fullc::kBias]}, small);
   }
   // gradient of data
   // Legacy approach shown here for comparison:
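
Net effect of the revert in this file: the forward bias add goes back to mshadow's repmat broadcast and the bias gradient goes back to a generic sum-over-rows reduction (ReduceAxesComputeImpl), replacing the hand-written CUDA kernels. A standalone CPU reference sketch of those two semantics (the helper names here are illustrative, not MXNet APIs):

// Reference semantics restored by the revert, written as plain loops:
//   forward:  out[i][j] += bias[j]            (broadcast the bias over rows)
//   backward: gbias[j]   = sum_i grad[i][j]   (column-wise sum of the gradient)
#include <cstdio>
#include <vector>

void AddBiasRef(std::vector<float>* out, const std::vector<float>& bias,
                int rows, int cols) {
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      (*out)[i * cols + j] += bias[j];
}

std::vector<float> BiasGradRef(const std::vector<float>& grad, int rows, int cols) {
  std::vector<float> gbias(cols, 0.0f);
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      gbias[j] += grad[i * cols + j];
  return gbias;
}

int main() {
  const int rows = 2, cols = 3;
  std::vector<float> out(rows * cols, 1.0f);           // stand-in for data * weight^T
  const std::vector<float> bias = {0.5f, 1.0f, 1.5f};
  AddBiasRef(&out, bias, rows, cols);
  const std::vector<float> gbias = BiasGradRef(out, rows, cols);  // reuse out as a fake gradient
  for (int j = 0; j < cols; ++j) std::printf("gbias[%d] = %g\n", j, gbias[j]);
  return 0;
}

The removed kernels targeted the same math with vectorized loads and a two-phase reduction; the revert returns to the simpler generic code paths sketched above.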
