
Commit e64fdc7

Add sum op for boolean ndarrays using tvm op module
1 parent bc868b7 commit e64fdc7
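
In user-facing terms, the commit enables np.sum on boolean ndarrays when MXNet is built with USE_TVM_OP=1. A rough usage sketch (illustrative, not taken from the commit; the default output dtype for boolean input follows the NumpySumType change below, which maps it to int64):

import mxnet as mx  # assumes a build with USE_TVM_OP=1 so the TVM-generated kernels are available

a = mx.np.array([[True, False, True],
                 [True, True, False]], dtype=bool)
print(mx.np.sum(a))                           # expected: 4, promoted to int64 by default
print(mx.np.sum(a, axis=0, dtype='float64'))  # expected: [2. 1. 1.]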

7 files changed: +148 -10 lines

contrib/tvmop/core/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -15,4 +15,4 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from . import umath
+from . import umath, fromnumeric

contrib/tvmop/core/fromnumeric.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+import tvm
+from .. import defop
+from ..utils import reduce_axes, assign_by_req
+
+
+def _compute_sum(itype, otype, ndim, reduce1st_dim, req):
+    axes = ([reduce1st_dim, 1 - reduce1st_dim] * ndim)[:ndim]
+    a = tvm.placeholder([tvm.var() for _ in range(ndim)], name='a', dtype=itype)
+    reduce_output = reduce_axes(a, axes, tvm.sum, otype)
+    output_placeholder, final_output = assign_by_req(reduce_output, req)
+    s = tvm.create_schedule(final_output.op)
+    return s, a, output_placeholder, final_output, [reduce_output, final_output]
+
+
+@defop(name='sum_cpu', target='cpu', itype=['bool'],
+       otype=['float32', 'float64', 'int32', 'int64'],
+       ndim=[5], req=['kWriteTo', 'kAddTo'], reduce1st_dim=[0, 1],
+       attrs=["reduce1st_dim", "req"])
+def _sum(itype, otype, ndim, reduce1st_dim, req):
+    s, a, output_placeholder, final_output, expr_list = _compute_sum(
+        itype, otype, ndim, reduce1st_dim, req)
+    for expr in expr_list:
+        axes = [axis for axis in expr.op.axis]
+        fused = s[expr].fuse(*axes)
+        s[expr].parallel(fused)
+    return s, [a, output_placeholder, final_output]
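
As a side note (not part of the diff), the axes mask built in _compute_sum simply alternates kept and reduced dimensions; a value of 1 at position i means reduce_axes sums over dimension i. A plain-Python illustration of the two patterns the ndim=5 kernels are compiled for:

ndim = 5
for reduce1st_dim in (0, 1):
    # axes[i] == 1 marks a dimension that will be reduced
    axes = ([reduce1st_dim, 1 - reduce1st_dim] * ndim)[:ndim]
    print(reduce1st_dim, axes)
# 0 [0, 1, 0, 1, 0]  -> reduce the 2nd and 4th (collapsed) dimensions
# 1 [1, 0, 1, 0, 1]  -> reduce the 1st, 3rd and 5th (collapsed) dimensions

Together with the dimension collapsing done on the C++ side (TVMOpReduce further down), these two patterns cover any reduction over an input with up to five effective dimensions.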

contrib/tvmop/opdef.py

Lines changed: 2 additions & 0 deletions
@@ -80,6 +80,8 @@ def invoke_all(self):
                     + ''.join(["{}_{}".format(key, each_kwargs[key]) for key in self.attrs]) \
                     + ''.join(["%s_%d" % (arg.dtype, len(arg.shape))
                                for arg in args if hasattr(arg, 'shape')])
+                if 'sum' in name:
+                    print(name)
                 yield sch, args, name
 
     def get_binds(self, args):

contrib/tvmop/utils.py

Lines changed: 10 additions & 6 deletions
@@ -21,16 +21,18 @@
 AllTypes = ["float32", "float64", "float16", "uint8", "int8", "int32", "int64"]
 RealTypes = ["float32", "float64", "float16"]
 
-def assign_by_req(a, req):
+
+def assign_by_req(a, req, otype=None):
     b = tvm.placeholder(a.shape, name='assign_by_req_b', dtype=a.dtype)
-    if (req == "kAddTo"):
-        c = tvm.compute(a.shape, lambda *idx: a[idx] + b[idx])
+    if req == "kAddTo":
+        c = tvm.compute(a.shape, lambda *idx: a[idx].astype(otype) + b[idx]
+                                              if otype else a[idx] + b[idx])
     else:
-        c = tvm.compute(a.shape, lambda *idx: a[idx])
+        c = tvm.compute(a.shape, lambda *idx: a[idx].astype(otype) if otype else a[idx])
     return b, c
 
 
-def reduce_axes(X, axes, reducer):
+def reduce_axes(X, axes, reducer, atype=None):
     def get_index(idx, ridx):
         j = 0
         k = 0
@@ -45,5 +47,7 @@ def get_index(idx, ridx):
     odim = (len(ishape) + 1 - axes[0]) // 2
     oshape = [tvm.var() for _ in range(odim)]
     ridx = [tvm.reduce_axis((0, ishape[i])) for (i, val) in enumerate(axes) if val == 1]
-    ret = tvm.compute(oshape, lambda *idx: reducer(X[get_index(idx, ridx)], axis=ridx), name='ret')
+    ret = tvm.compute(oshape, lambda *idx: reducer(X[get_index(idx, ridx)].astype(atype)
+                                                   if atype else X[get_index(idx, ridx)],
+                                                   axis=ridx), name='ret')
     return ret
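
Functionally, reduce_axes followed by assign_by_req corresponds to the eager NumPy sketch below (illustrative only; the *_ref helpers are hypothetical, and the real functions build TVM compute expressions rather than computing arrays):

import numpy as np

def reduce_axes_ref(x, axes, atype=None):
    # sum over every dimension whose mask value is 1, optionally casting the input first
    x = x.astype(atype) if atype else x
    return np.sum(x, axis=tuple(i for i, v in enumerate(axes) if v == 1))

def assign_by_req_ref(a, b, req, otype=None):
    # 'kAddTo' accumulates into the existing output b; 'kWriteTo' overwrites it
    a = a.astype(otype) if otype else a
    return b + a if req == "kAddTo" else a

x = np.array([[True, False, True], [True, True, False]])
out = np.zeros(2, dtype='int64')
out = assign_by_req_ref(reduce_axes_ref(x, [0, 1], atype='int64'), out, "kWriteTo")
print(out)  # [2 2]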

src/operator/numpy/np_broadcast_reduce_op.h

Lines changed: 19 additions & 1 deletion
@@ -67,7 +67,6 @@ struct NumpyReduceAxesParam : public dmlc::Parameter<NumpyReduceAxesParam> {
 inline TShape NumpyReduceAxesShapeImpl(const TShape& ishape,
                                        const dmlc::optional<mxnet::Tuple<int>>& axis,
                                        bool keepdims) {
-  // TODO(junwu): improve the logic
   // If input is a scalar, output should be a scalar too
   if (ishape.ndim() == 0) {
     if (axis.has_value()) {
@@ -158,6 +157,10 @@ inline bool NeedSafeAcc(int itype, int otype) {
   return safe_acc_hint && rule;
 }
 
+void TVMOpReduce(const OpContext& ctx, const TBlob& input,
+                 const dmlc::optional<mxnet::Tuple<int>>& axis,
+                 const TBlob& output, const OpReqType req, const std::string& reducer_name);
+
 template<typename xpu, typename reducer, bool safe_acc_hint = false, bool normalize = false,
          typename OP = op::mshadow_op::identity>
 void NumpyReduceAxesCompute(const nnvm::NodeAttrs& attrs,
@@ -169,6 +172,19 @@ void NumpyReduceAxesCompute(const nnvm::NodeAttrs& attrs,
   if (param.initial.has_value()) {
     LOG(FATAL) << "initial is not supported yet";
   }
+  if (req[0] == kNullOp) return;
+  CHECK_NE(req[0], kWriteInplace) << "Reduce does not support write in-place";
+  // If boolean ndarray, use the kernel generated by TVM
+  if (inputs[0].type_flag_ == mshadow::kBool) {
+    std::string reducer_name;
+    if (std::is_same<reducer, mshadow_op::sum>::value) {
+      reducer_name = "sum";
+    } else {
+      LOG(FATAL) << "Only reduce op: `sum` is supported for boolean ndarrays";
+    }
+    TVMOpReduce(ctx, inputs[0], param.axis, outputs[0], req[0], reducer_name);
+    return;
+  }
   if (param.axis.has_value() && param.axis.value().ndim() == 0) {
     UnaryOp::IdentityCompute<xpu>(attrs, ctx, inputs, req, outputs);
   }
@@ -194,6 +210,8 @@ inline void NumpyReduceAxesBackwardUseNone(const nnvm::NodeAttrs& attrs,
                                            const std::vector<TBlob>& outputs) {
   using namespace mshadow;
   using namespace mshadow::expr;
+  CHECK_NE(outputs[0].type_flag_, kBool) << "reduce operators do not support gradient calculation "
+                                            "for input tensors of boolean type.";
   const NumpyReduceAxesParam& param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed);
   TShape small;
   if (param.keepdims) {

src/operator/numpy/np_broadcast_reduce_op_value.cc

Lines changed: 70 additions & 0 deletions
@@ -23,6 +23,10 @@
  * \brief CPU Implementation of broadcast and reduce functions based on value.
  */
 
+#if MXNET_USE_TVM_OP
+#include "../tvmop/op_module.h"
+#endif  // MXNET_USE_TVM_OP
+
 #include "np_broadcast_reduce_op.h"
 
 namespace mxnet {
@@ -38,7 +42,15 @@ inline bool NumpySumType(const nnvm::NodeAttrs& attrs,
   const NumpyReduceAxesParam &param = nnvm::get<NumpyReduceAxesParam>(attrs.parsed);
 
   if (param.dtype.has_value()) {
+    if (in_attrs->at(0) == mshadow::kBool) {
+      CHECK(param.dtype.value() == mshadow::kInt64 || param.dtype.value() == mshadow::kFloat32
+            || param.dtype.value() == mshadow::kFloat64) << "Only support the following output "
+                                                            "dtypes when input dtype is bool: "
+                                                            "int32, int64, float32, float64.";
+    }
     TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value());
+  } else if (in_attrs->at(0) == mshadow::kBool) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt64);
   } else {
     TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
     TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
@@ -47,6 +59,64 @@ inline bool NumpySumType(const nnvm::NodeAttrs& attrs,
   return out_attrs->at(0) != -1 && in_attrs->at(0) != -1;
 }
 
+#if MXNET_USE_TVM_OP
+static constexpr int max_reduce_ndim = 5;
+TBlob PrependAxes(const TBlob& src, const int dst_ndim);
+#endif  // MXNET_USE_TVM_OP
+
+void TVMOpReduce(const OpContext& ctx,
+                 const TBlob& input,
+                 const dmlc::optional<mxnet::Tuple<int>>& axis,
+                 const TBlob& output,
+                 const OpReqType req,
+                 const std::string& reducer_name) {
+#if MXNET_USE_TVM_OP
+  CHECK_GE(input.ndim(), output.ndim());
+  CHECK_LE(input.ndim(), max_reduce_ndim) << "TVMOpReduce only supports ndim <= "
+                                          << max_reduce_ndim;
+
+  const TBlob expanded_output = (input.ndim() == output.ndim() ?
+      output : output.reshape(NumpyReduceAxesShapeImpl(input.shape_, axis, true)));
+  CHECK_EQ(input.ndim(), expanded_output.ndim());
+  int reduce1st_dim = 0;
+  if (input.ndim() > 0 && input.size(0) != expanded_output.size(0)) {
+    reduce1st_dim = 1;
+  }
+  // collapse consecutive dimensions where reduction are performed or not performed
+  std::vector<index_t> ishape_vec;
+  for (int i = 0; i < input.ndim(); ++i) {
+    if (i == 0 || ((input.size(i) != expanded_output.size(i))
+                   != (input.size(i-1) != expanded_output.size(i-1)))) {
+      ishape_vec.push_back(input.size(i));
+    } else {
+      ishape_vec.back() *= input.size(i);
+    }
+  }
+  // append axes after collapsed ishape to reach the max ndim allowed
+  for (int i = ishape_vec.size(); i < max_reduce_ndim; ++i) {
+    ishape_vec.push_back(1);
+  }
+  std::vector<index_t> oshape_vec;
+  for (size_t i = reduce1st_dim; i < ishape_vec.size(); i += 2) {
+    oshape_vec.push_back(ishape_vec[i]);
+  }
+  TShape ishape(ishape_vec.begin(), ishape_vec.end()), oshape(oshape_vec.begin(), oshape_vec.end());
+  TBlob input_tvm = input.reshape(ishape);
+  TBlob output_tvm = output.reshape(oshape);
+  const std::string ctx_name =
+      (ctx.run_ctx.ctx.dev_type == mxnet::Context::DeviceType::kCPU) ? "cpu" : "gpu";
+  std::ostringstream func_name;
+  func_name << reducer_name << "_"
+            << (ctx.run_ctx.ctx.dev_type == mxnet::Context::DeviceType::kCPU ? "cpu" : "gpu")
+            << "reduce1st_dim_" << reduce1st_dim
+            << "req_" << (req == kWriteTo ? "kWriteTo" : "kAddTo");
+  LOG(INFO) << "sum func name: " << func_name.str();
+  tvm::runtime::TVMOpModule::Get()->Call(func_name.str(), ctx, {input_tvm, output_tvm, output_tvm});
+#else
+  LOG(FATAL) << "Please add USE_TVM_OP=1 to enable kernels generated by TVM.";
+#endif  // MXNET_USE_TVM_OP
+}
+
 NNVM_REGISTER_OP(_np_sum)
 .describe(R"code()code" ADD_FILELINE)
 .set_num_inputs(1)
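
The dimension-collapsing logic in TVMOpReduce above is what lets the fixed ndim=5, alternating-axes kernels serve arbitrary reduction patterns. A pure-Python sketch of the shape bookkeeping (collapse_for_tvm is a hypothetical helper, not part of the commit):

MAX_REDUCE_NDIM = 5  # mirrors max_reduce_ndim above

def collapse_for_tvm(ishape, reduced_axes):
    # keepdims-style expanded output shape, as produced by NumpyReduceAxesShapeImpl
    oshape = [1 if i in reduced_axes else d for i, d in enumerate(ishape)]
    reduce1st_dim = 1 if ishape and ishape[0] != oshape[0] else 0
    # merge runs of consecutive dimensions that are all reduced or all kept
    collapsed = []
    for i, d in enumerate(ishape):
        if i == 0 or (ishape[i] != oshape[i]) != (ishape[i - 1] != oshape[i - 1]):
            collapsed.append(d)
        else:
            collapsed[-1] *= d
    # pad with trailing 1s so every call hits the ndim=5 kernel
    collapsed += [1] * (MAX_REDUCE_NDIM - len(collapsed))
    out = collapsed[reduce1st_dim::2]  # the dimensions that survive the reduction
    return reduce1st_dim, collapsed, out

print(collapse_for_tvm((2, 3, 4, 5), {1, 2}))
# -> (0, [2, 12, 5, 1, 1], [2, 5, 1])

The input is then reshaped to the collapsed shape, the output to the surviving dimensions, and the pre-compiled kernel selected by reduce1st_dim and the write request is looked up by name and invoked through TVMOpModule.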

src/operator/numpy/np_elemwise_broadcast_op.cc

Lines changed: 2 additions & 2 deletions
@@ -26,9 +26,9 @@
 #if MXNET_USE_TVM_OP
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/packed_func.h>
+#include "../tvmop/op_module.h"
 #endif  // MXNET_USE_TVM_OP
 
-#include "../tvmop/op_module.h"
 #include "../tensor/elemwise_binary_broadcast_op.h"
 #include "../tensor/elemwise_binary_scalar_op.h"
 
@@ -140,7 +140,7 @@ bool NumpyBinaryLogicOpType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
-static TBlob PrependAxes(const TBlob& src, const int dst_ndim) {
+TBlob PrependAxes(const TBlob& src, const int dst_ndim) {
   CHECK_LE(src.shape_.ndim(), dst_ndim);
   const int src_ndim = src.shape_.ndim();
   if (src_ndim == dst_ndim) return src;