
Commit 2d58ff5

[Bug Fixed] Fix batch norm when grad_req is add (#18500)
* fix batch norm when fix_gamma is True
* support gradient accumulation for batch norm
* mkldnn batchnorm support grad add
* unittest for bn
* fix bn arg
* fix lint
* fix mkldnn
* fix mkldnn bn
* fix grad when fixing gamma
* fix naive gpu bn
* fix lint
* fix cudnn bn
* fix flag
* fix lint
* fix testcase
* fix
* use @pytest.mark.parametrize
* combination
* remove redundant test in batchnorm
* npx.batch_norm test
* try to fix test
* reduce the number of tests for batchnorm
* fix
1 parent 992ed3c commit 2d58ff5
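
The commit makes the batch norm backward kernels honour the gradient request (grad_req) instead of always overwriting the input gradient, so gradient accumulation with grad_req="add" now produces correct results. Below is a minimal sketch of the write-versus-accumulate dispatch the fixed kernels follow, assuming MXNet's OpReqType values (kNullOp, kWriteTo, kWriteInplace, kAddTo); the helper name is purely illustrative.

#include <cassert>
#include <vector>

// Stand-in for MXNet's OpReqType (assumed to match the real enum's meaning).
enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Illustrative helper: store a gradient value the way the fixed kernels do.
// kWriteTo/kWriteInplace overwrite, kAddTo accumulates, kNullOp stores nothing.
inline void AssignOrAdd(float *dst, OpReqType req, float val) {
  switch (req) {
    case kNullOp: break;
    case kWriteTo:
    case kWriteInplace: *dst = val; break;
    case kAddTo: *dst += val; break;
  }
}

int main() {
  std::vector<float> grad(4, 1.0f);  // pretend this already holds an accumulated gradient
  for (float &g : grad) AssignOrAdd(&g, kAddTo, 0.5f);  // a second backward pass adds into it
  assert(grad[0] == 1.5f);
  return 0;
}

Previously the data gradient was either overwritten or skipped when grad_req was not a plain write, so accumulated values were lost; the diffs below fix this separately for the CPU, CUDA, cuDNN, and MKLDNN paths.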

File tree

7 files changed: +378 additions, -83 deletions

src/operator/nn/batch_norm-inl.h

Lines changed: 1 addition & 0 deletions
@@ -259,6 +259,7 @@ void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
                        const std::vector<TBlob> &outputs) {
   CHECK_EQ(inputs.size(), 8U);
   CHECK_EQ(outputs.size(), 3U);
+
   std::vector<TBlob> out_grad(1);
   std::vector<TBlob> out_data(3);
   std::vector<TBlob> in_data(3);

src/operator/nn/batch_norm.cc

Lines changed: 62 additions & 21 deletions
@@ -85,6 +85,31 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
   }
 }
 
+template<typename DType1, typename DType2, typename DType3, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType1> &in_data,
+                               const BNTensor3<DType2> &in_data2,
+                               const BNTensor3<DType3> &out_data,
+                               const size_t channel,
+                               OnData onData) {
+  const size_t num = in_data.OuterSize();
+  const size_t matrixSize = in_data.InnerSize();
+  const size_t skipLength = in_data.SkipLengthToNextSameChannelData();
+  const size_t startOffset = in_data.StartOffset(channel);
+
+  DType1 *data = in_data.dptr_ + startOffset;
+  DType2 *data2 = in_data2.dptr_ + startOffset;
+  DType3 *odata = out_data.dptr_ + startOffset;
+
+  for (size_t outer = 0; outer < num; ++outer) {
+    for (size_t i = 0; i < matrixSize; ++i) {
+      onData(data++, data2++, odata++);
+    }
+    data += skipLength;
+    data2 += skipLength;
+    odata += skipLength;
+  }
+}
+
 }  // namespace batchnorm
 
 /*! \brief Forward CPU */
@@ -263,7 +288,7 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
                   dotp += (*thisInputData - mean) * (*gradOut_data);
                 });
 
-    if (!gradIn.IsEmpty() && IsBNWriting(req[batchnorm::kData])) {  // if there's a grad input
+    if (!gradIn.IsEmpty() && req[batchnorm::kData] != kNullOp) {  // if there's a grad input
       if (is_train_and_not_global_stats) {
         // when in training mode
         // Q(X) = X - E[x] ; i.e. input centered to zero mean
@@ -272,44 +297,60 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
 
         // projection of gradOutput on to output scaled by std
         const AccReal k = dotp * invstd * invstd / itemCount;
-        ForEachFast(inputData, gradIn, static_cast<size_t>(channel),
-                    [&mean, &k](const DType *inputDataPtr, DType *gradIn_data) {
-                      *gradIn_data = (*inputDataPtr - mean) * k;
-                    });
-
         const AccReal iw = invstd * w;
         const AccReal gradMean = sumGradOut / itemCount;
-        ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
-                    [iw, gradMean](const DType *gradOut_data, DType *gradIn_data) {
-                      *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * iw;
-                    });
+        if (req[batchnorm::kData] != kAddTo) {
+          ForEachFast(inputData, gradIn, static_cast<size_t>(channel),
+                      [&mean, &k](const DType *inputDataPtr, DType *gradIn_data) {
+                        *gradIn_data = (*inputDataPtr - mean) * k;
+                      });
+
+          ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
+                      [iw, gradMean](const DType *gradOut_data, DType *gradIn_data) {
+                        *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * iw;
+                      });
+        } else {
+          ForEachFast(inputData, gradOut, gradIn, static_cast<size_t>(channel),
+                      [&mean, &k, iw, gradMean](const DType *inputDataPtr,
+                                                const DType *gradOut_data,
+                                                DType *gradIn_data) {
+                        DType normal_val = (*inputDataPtr - mean) * k;
+                        *gradIn_data += (*gradOut_data - gradMean -
+                                         normal_val) * iw;
+                      });
+        }
       } else {
         // when in evaluation mode
         // Q(X) = X - running_mean ; i.e. input centered to zero mean
         // Y = Q(X) / running_std ; i.e. BN output before weight and bias
         // dL/dX = w / running_std
         const AccReal iw = invstd * w;
-        ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
-                    [iw](const DType *gradOut_data, DType *gradIn_data) {
-                      *gradIn_data = *gradOut_data * iw;
-                    });
+        if (req[batchnorm::kData] != kAddTo) {
+          ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
+                      [iw](const DType *gradOut_data, DType *gradIn_data) {
+                        *gradIn_data = *gradOut_data * iw;
+                      });
+        } else {
+          ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
+                      [iw](const DType *gradOut_data, DType *gradIn_data) {
+                        *gradIn_data += *gradOut_data * iw;
+                      });
+        }
       }
     }
 
     // May want to make this a param eventually
     const AccReal scale = 1.0f;
 
-    if (IsBNWriting(req[batchnorm::kGamma])) {
-      if (!param_.fix_gamma) {
-        gradWeightData[channel] = scale * dotp * invstd;
-      } else {
+    if (!param_.fix_gamma) {
+      KERNEL_ASSIGN(gradWeightData[channel], req[batchnorm::kGamma], scale * dotp * invstd);
+    } else {
+      if (IsBNWriting(req[batchnorm::kGamma])) {
         gradWeightData[channel] = AccReal(0);
       }
     }
 
-    if (IsBNWriting(req[batchnorm::kBeta])) {
-      gradBiasData[channel] = scale * sumGradOut;
-    }
+    KERNEL_ASSIGN(gradBiasData[channel], req[batchnorm::kBeta], scale * sumGradOut);
   }
 }

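The new three-tensor ForEachFast overload above exists because the original kWriteTo path uses gradIn as scratch space: it first writes the projection (x - mean) * k into gradIn and then overwrites it in a second pass, which would clobber a gradient that is supposed to be accumulated. A rough single-pass equivalent over flat arrays (names and tensor-layout handling are simplified for illustration):

#include <cstddef>

// Accumulate the batch-norm data gradient without using grad_in as scratch.
// mean, k, grad_mean and iw stand for the per-channel statistics computed
// earlier in BatchNormBackwardImpl.
void AccumulateDataGrad(const float *x, const float *grad_out, float *grad_in,
                        std::size_t n, float mean, float k,
                        float grad_mean, float iw) {
  for (std::size_t i = 0; i < n; ++i) {
    const float proj = (x[i] - mean) * k;                 // projection onto the normalized output
    grad_in[i] += (grad_out[i] - grad_mean - proj) * iw;  // add, never overwrite
  }
}

int main() {
  const float x[2] = {1.0f, 3.0f}, gout[2] = {0.5f, -0.5f};
  float gin[2] = {1.0f, 1.0f};  // gradient already accumulated by an earlier pass
  AccumulateDataGrad(x, gout, gin, 2, 2.0f, 0.1f, 0.0f, 1.0f);
  return 0;
}
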
src/operator/nn/batch_norm.cu

Lines changed: 52 additions & 16 deletions
@@ -34,6 +34,9 @@
 #define FIX_GAMMA_FLAG 8
 #define IS_TRAINING_FLAG 16
 #define USE_GLOBAL_STATS_FLAG 32
+#define ADDTO_DATA_FLAG (1 << 6)
+#define ADDTO_GAMMA_FLAG (1 << 7)
+#define ADDTO_BETA_FLAG (1 << 8)
 
 #if MXNET_USE_CUDNN == 1
 #include "./cudnn/cudnn_batch_norm-inl.h"
@@ -361,33 +364,60 @@ static __global__ void BatchNormalizationBackwardKernel(
         * momentum + localVariance * (AccReal(1) - momentum);
   }
 
-  if (gradInput.Size() > 0 && (flags & WRITE_DATA_FLAG) != 0) {
-    for (int batch = 0, nbatch = gradOutput.OuterSize(); batch < nbatch; ++batch) {
-      for (int x = threadIdx.x, nx = gradOutput.InnerSize(); x < nx; x += blockDim.x) {
-        const DType gradOut = gradOutput.get_ref(batch, plane, x);
-        if (is_train_and_not_global_stats) {
-          const DType inp = input.get_ref(batch, plane, x);
-          const AccReal proj = (inp - mean) * projScale;
-          gradInput.get_ref(batch, plane, x) =
-            ScalarConvert<AccReal, DType>::to((gradOut - proj - gradMean) * gradScale);
-        } else {
-          gradInput.get_ref(batch, plane, x) = ScalarConvert<AccReal, DType>::to(
-            gradOut * gradScale);
+  if (gradInput.Size() > 0 && (flags & (WRITE_DATA_FLAG | ADDTO_DATA_FLAG)) != 0) {
+    const bool grad_write = flags & WRITE_DATA_FLAG;
+    if (grad_write) {
+      for (int batch = 0, nbatch = gradOutput.OuterSize(); batch < nbatch; ++batch) {
+        for (int x = threadIdx.x, nx = gradOutput.InnerSize(); x < nx; x += blockDim.x) {
+          const DType gradOut = gradOutput.get_ref(batch, plane, x);
+          if (is_train_and_not_global_stats) {
+            const DType inp = input.get_ref(batch, plane, x);
+            const AccReal proj = (inp - mean) * projScale;
+            gradInput.get_ref(batch, plane, x) =
+              ScalarConvert<AccReal, DType>::to((gradOut - proj - gradMean) * gradScale);
+          } else {
+            gradInput.get_ref(batch, plane, x) = ScalarConvert<AccReal, DType>::to(
+              gradOut * gradScale);
+          }
+        }
+      }
+    } else {
+      // grad addto
+      for (int batch = 0, nbatch = gradOutput.OuterSize(); batch < nbatch; ++batch) {
+        for (int x = threadIdx.x, nx = gradOutput.InnerSize(); x < nx; x += blockDim.x) {
+          const DType gradOut = gradOutput.get_ref(batch, plane, x);
+          if (is_train_and_not_global_stats) {
+            const DType inp = input.get_ref(batch, plane, x);
+            const AccReal proj = (inp - mean) * projScale;
+            gradInput.get_ref(batch, plane, x) +=
+              ScalarConvert<AccReal, DType>::to((gradOut - proj - gradMean) * gradScale);
+          } else {
+            gradInput.get_ref(batch, plane, x) += ScalarConvert<AccReal, DType>::to(
+              gradOut * gradScale);
+          }
         }
       }
     }
   }
 
-  if (tensors.gradWeight.numElements() > 0 && threadIdx.x == 0 && (flags & WRITE_GAMMA_FLAG) != 0) {
+  if (tensors.gradWeight.numElements() > 0 && threadIdx.x == 0 &&
+      (flags & (WRITE_GAMMA_FLAG | ADDTO_GAMMA_FLAG)) != 0) {
     if ((flags & FIX_GAMMA_FLAG) == 0) {
-      tensors.gradWeight[plane] = ScalarConvert<AccReal, DType>::to(dotP * invstd);
+      if (flags & WRITE_GAMMA_FLAG)
+        tensors.gradWeight[plane] = ScalarConvert<AccReal, DType>::to(dotP * invstd);
+      else
+        tensors.gradWeight[plane] += ScalarConvert<AccReal, DType>::to(dotP * invstd);
     } else {
       tensors.gradWeight[plane] = DType(0);
     }
   }
 
-  if (tensors.gradBias.numElements() > 0 && threadIdx.x == 0 && (flags & WRITE_BETA_FLAG) != 0) {
-    tensors.gradBias[plane] = ScalarConvert<AccReal, DType>::to(gradOutputSum);
+  if (tensors.gradBias.numElements() > 0 && threadIdx.x == 0 &&
+      (flags & (WRITE_BETA_FLAG | ADDTO_BETA_FLAG)) != 0) {
+    if (flags & WRITE_BETA_FLAG)
+      tensors.gradBias[plane] = ScalarConvert<AccReal, DType>::to(gradOutputSum);
+    else
+      tensors.gradBias[plane] += ScalarConvert<AccReal, DType>::to(gradOutputSum);
   }
 }
@@ -579,12 +609,18 @@ static inline uint32_t SetupFlags(const OpContext &ctx,
   flags |= params.use_global_stats ? USE_GLOBAL_STATS_FLAG : 0;
   if (IsBNWriting(req[batchnorm::kData])) {
     flags |= WRITE_DATA_FLAG;
+  } else if (req[batchnorm::kData] == kAddTo) {
+    flags |= ADDTO_DATA_FLAG;
   }
   if (IsBNWriting(req[batchnorm::kGamma])) {
     flags |= WRITE_GAMMA_FLAG;
+  } else if (req[batchnorm::kGamma] == kAddTo) {
+    flags |= ADDTO_GAMMA_FLAG;
   }
   if (IsBNWriting(req[batchnorm::kBeta])) {
     flags |= WRITE_BETA_FLAG;
+  } else if (req[batchnorm::kBeta] == kAddTo) {
+    flags |= ADDTO_BETA_FLAG;
   }
   return flags;
 }

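Because the CUDA kernel receives a single flags word rather than the req vector, SetupFlags encodes each output's request as a bit: a WRITE_* bit for overwrite, the new ADDTO_* bit for accumulation, and neither for kNullOp. A small sketch of that encode/decode round trip; the WRITE_DATA_FLAG value of 1 is an assumption based on the existing 8/16/32 sequence, since the diff does not show the earlier defines.

#include <cstdint>
#include <cstdio>

// Flag values mirroring batch_norm.cu; WRITE_DATA_FLAG = 1 is assumed.
constexpr uint32_t WRITE_DATA_FLAG = 1;
constexpr uint32_t ADDTO_DATA_FLAG = 1 << 6;

enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Mirror of the SetupFlags logic for one output: write and add-to are
// mutually exclusive bits, and kNullOp sets neither.
uint32_t EncodeDataReq(OpReqType req) {
  uint32_t flags = 0;
  if (req == kWriteTo || req == kWriteInplace) flags |= WRITE_DATA_FLAG;
  else if (req == kAddTo)                      flags |= ADDTO_DATA_FLAG;
  return flags;
}

int main() {
  const uint32_t flags = EncodeDataReq(kAddTo);
  // Inside the kernel: enter the gradient-input block for either request,
  // then pick overwrite vs. accumulate from the WRITE bit.
  if (flags & (WRITE_DATA_FLAG | ADDTO_DATA_FLAG)) {
    std::printf(flags & WRITE_DATA_FLAG ? "overwrite\n" : "accumulate\n");
  }
  return 0;
}
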
src/operator/nn/cudnn/cudnn_batch_norm-inl.h

Lines changed: 13 additions & 2 deletions
@@ -222,13 +222,24 @@ class CuDNNBatchNormOp {
 
     if (param_.fix_gamma) gamma = 1.f;
 
+    bool grad_add_gamma_beta = (req[cudnnbatchnorm::kGamma] == kAddTo) ||
+                               (req[cudnnbatchnorm::kBeta] == kAddTo);
+    if (grad_add_gamma_beta) {
+      if (IsBNWriting(req[cudnnbatchnorm::kGamma])) {
+        dgamma = 0.f;
+      }
+      if (IsBNWriting(req[cudnnbatchnorm::kBeta])) {
+        dbeta = 0.f;
+      }
+    }
+
     CUDNN_CALL(cudnnBatchNormalizationBackward(
       s->dnn_handle_,
       mode,
       &a,
-      &b,
+      req[cudnnbatchnorm::kData] == kAddTo ? &b_add : &b,
       &a,
-      req[cudnnbatchnorm::kGamma] == kWriteTo ? &b: &b_add,
+      grad_add_gamma_beta ? &b_add : &b,  // gamma and beta
       io_desc_,
       x.dptr_,
       io_desc_,

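In the cuDNN call, a and b are the alpha/beta scaling factors that cudnnBatchNormalizationBackward applies as dst = alpha * result + beta * dst, so passing b_add (presumably beta = 1) instead of b (presumably beta = 0) is how this path accumulates into an existing gradient. Since cuDNN applies a single beta to both parameter gradients, the zeroing of dgamma/dbeta above converts accumulation into a plain write for whichever of the two is requested with kWriteTo. A toy restatement of the blending rule:

#include <cassert>

// cuDNN-style output blending: dst = alpha * result + beta * dst.
// beta = 0 overwrites the destination (kWriteTo); beta = 1 adds the new
// gradient to what is already there (kAddTo).
inline void BlendInto(float *dst, float result, float alpha, float beta) {
  *dst = alpha * result + beta * *dst;
}

int main() {
  float dgamma = 2.0f;                    // previously accumulated gradient
  BlendInto(&dgamma, 3.0f, 1.0f, 1.0f);   // b_add-style call: accumulate
  assert(dgamma == 5.0f);
  BlendInto(&dgamma, 3.0f, 1.0f, 0.0f);   // b-style call: overwrite
  assert(dgamma == 3.0f);
  return 0;
}
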
src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h

Lines changed: 30 additions & 11 deletions
@@ -347,7 +347,8 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
   else if (diff.IsDefaultData())
     diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_desc());
   auto &bwd = GetBNBackward<DType>(param, ctx, data, *data_mem, diff, *diff_mem, flags);
-  auto gradi_mem = const_cast<NDArray &>(gradIn).CreateMKLDNNData(data_mem->get_desc());
+  auto gradi_mem = CreateMKLDNNMem(const_cast<NDArray &>(gradIn),
+                                   bwd.pd.diff_src_desc(), req[batchnorm::kData]);
 
   if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
     const NDArray &gamma = in_data[batchnorm::kGamma];
@@ -368,7 +369,7 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
     }
     mkldnn_args_map_t net_args;
     net_args[MKLDNN_ARG_SRC] = *data_mem;
-    net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem;
+    net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem.second;
     net_args[MKLDNN_ARG_SCALE_SHIFT] = bwd.GetWeight();
     net_args[MKLDNN_ARG_DIFF_SCALE_SHIFT] = bwd.GetGradw();
     net_args[MKLDNN_ARG_DIFF_DST] = *diff_mem;
@@ -401,28 +402,46 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
       }
       net_args[MKLDNN_ARG_MEAN] = *(out_mean.GetMKLDNNData());
       net_args[MKLDNN_ARG_VARIANCE] = var_mem;
-      MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args);
-      MKLDNNStream::Get()->Submit();
     } else {
       net_args[MKLDNN_ARG_MEAN] = *(moving_mean.GetMKLDNNData());
       net_args[MKLDNN_ARG_VARIANCE] = *(moving_var.GetMKLDNNData());
-      MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args);
-      MKLDNNStream::Get()->Submit();
     }
+    MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args);
+    CommitOutput(gradIn, gradi_mem);
+    MKLDNNStream::Get()->Submit();
 
     // copy data from gradw_mem to in_grad[1] and in_grad[2]
     DType *gw_buf = reinterpret_cast<DType *>(bwd.GetGradw().get_data_handle());
-    DType *w_grad_1 = in_grad[1].data().dptr<DType>();
-    DType *w_grad_2 = in_grad[2].data().dptr<DType>();
+    DType *w_grad_1 = in_grad[batchnorm::kGamma].data().dptr<DType>();
+    DType *w_grad_2 = in_grad[batchnorm::kBeta].data().dptr<DType>();
 
+    // the gradient of gamma
     if (!param.fix_gamma) {
-      memcpy(w_grad_1, gw_buf, copy_size);
-      memcpy(w_grad_2, &gw_buf[channels_], copy_size);
+      if (req[batchnorm::kGamma] != kNullOp) {
+        if (req[batchnorm::kGamma] != kAddTo) {
+          memcpy(w_grad_1, gw_buf, copy_size);
+        } else {
+          for (int i = 0; i < channels_; i++) {
+            w_grad_1[i] += gw_buf[i];
+          }
+        }
+      }
     } else {
       for (int i = 0; i < channels_; i++) {
        (in_grad[1].data().dptr<DType>())[i] = 0.0f;
      }
-      memcpy(w_grad_2, &gw_buf[channels_], copy_size);
+    }
+
+    // the gradient of beta
+    if (req[batchnorm::kBeta] != kNullOp) {
+      if (req[batchnorm::kBeta] != kAddTo) {
+        memcpy(w_grad_2, &gw_buf[channels_], copy_size);
+      } else {
+        DType *grad_beta = &gw_buf[channels_];
+        for (int i = 0; i < channels_; i++) {
+          w_grad_2[i] += grad_beta[i];
+        }
+      }
     }
   } else {
     LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ...";

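For the data gradient, the MKLDNN path switches from writing directly into gradIn to the CreateMKLDNNMem/CommitOutput pairing visible above: the usual role of this pairing is to give the primitive a temporary buffer when the request is kAddTo and then add that buffer into the real output after the primitive runs, while plain writes can target the output directly. A rough model of that deferred-commit idea (the real helpers work on NDArray/mkldnn::memory; everything below is simplified and the names are illustrative):

#include <cstddef>
#include <vector>

enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Simplified stand-ins for CreateMKLDNNMem / CommitOutput.
struct PendingOutput {
  OpReqType req;
  std::vector<float> *target;   // the real output buffer
  std::vector<float> scratch;   // primitive writes here when req == kAddTo
};

PendingOutput PrepareOutput(std::vector<float> *target, OpReqType req) {
  PendingOutput out{req, target, {}};
  if (req == kAddTo) out.scratch.assign(target->size(), 0.0f);
  return out;
}

// Buffer the primitive should write into.
std::vector<float> *WriteBuffer(PendingOutput *out) {
  return out->req == kAddTo ? &out->scratch : out->target;
}

// After the primitive has run: fold the scratch buffer into the real output.
void Commit(PendingOutput *out) {
  if (out->req != kAddTo) return;  // direct writes need no commit step
  for (std::size_t i = 0; i < out->target->size(); ++i)
    (*out->target)[i] += out->scratch[i];
}

int main() {
  std::vector<float> grad(3, 1.0f);
  PendingOutput out = PrepareOutput(&grad, kAddTo);
  (*WriteBuffer(&out))[0] = 2.0f;  // pretend the primitive produced this value
  Commit(&out);
  return grad[0] == 3.0f ? 0 : 1;
}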