
Commit 4b372cc

Add sum boolean gpu compute

1 parent e64fdc7

File tree: 5 files changed, +52 −17 lines changed

contrib/tvmop/core/fromnumeric.py
Lines changed: 25 additions & 6 deletions

@@ -34,11 +34,30 @@ def _compute_sum(itype, otype, ndim, reduce1st_dim, req):
        otype=['float32', 'float64', 'int32', 'int64'],
        ndim=[5], req=['kWriteTo', 'kAddTo'], reduce1st_dim=[0, 1],
        attrs=["reduce1st_dim", "req"])
-def _sum(itype, otype, ndim, reduce1st_dim, req):
-    s, a, output_placeholder, final_output, expr_list = _compute_sum(
+def _sum_cpu(itype, otype, ndim, reduce1st_dim, req):
+    s, a, output_placeholder, final_output, tensor_list = _compute_sum(
         itype, otype, ndim, reduce1st_dim, req)
-    for expr in expr_list:
-        axes = [axis for axis in expr.op.axis]
-        fused = s[expr].fuse(*axes)
-        s[expr].parallel(fused)
+    for t in tensor_list:
+        axes = [axis for axis in t.op.axis]
+        fused = s[t].fuse(*axes)
+        s[t].parallel(fused)
+    return s, [a, output_placeholder, final_output]
+
+
+@defop(name='sum_gpu', target='gpu', itype=['bool'],
+       otype=['float32', 'float64', 'int32', 'int64'],
+       ndim=[5], req=['kWriteTo', 'kAddTo'], reduce1st_dim=[0, 1],
+       attrs=["reduce1st_dim", "req"])
+def _sum_gpu(itype, otype, ndim, reduce1st_dim, req):
+    s, a, output_placeholder, final_output, tensor_list = _compute_sum(
+        itype, otype, ndim, reduce1st_dim, req)
+    num_threads = 64
+    for t in tensor_list:
+        block_x = tvm.thread_axis("blockIdx.x")
+        thread_x = tvm.thread_axis("threadIdx.x")
+        axes = [axis for axis in t.op.axis]
+        fused = s[t].fuse(*axes)
+        bx, tx = s[t].split(fused, factor=num_threads)
+        s[t].bind(bx, block_x)
+        s[t].bind(tx, thread_x)
     return s, [a, output_placeholder, final_output]
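
The GPU schedule added above follows the standard TVM pattern: fuse all output axes into one, split the fused axis by a fixed thread count, and bind the outer and inner pieces to blockIdx.x and threadIdx.x. Below is a minimal self-contained sketch of that pattern, assuming the pre-0.7 flat tvm.* API that contrib/tvmop targets; the 3-D reduction is illustrative, not the operator's actual _compute_sum.

    import tvm  # pre-0.7 TVM, as used by contrib/tvmop

    # Illustrative compute: sum a (64, 64, 1024) tensor over its last axis.
    n0, n1, m = 64, 64, 1024
    A = tvm.placeholder((n0, n1, m), name='A', dtype='float32')
    k = tvm.reduce_axis((0, m), name='k')
    B = tvm.compute((n0, n1), lambda i, j: tvm.sum(A[i, j, k], axis=k), name='B')

    s = tvm.create_schedule(B.op)
    num_threads = 64
    fused = s[B].fuse(*[axis for axis in B.op.axis])    # flatten the output axes
    bx, tx = s[B].split(fused, factor=num_threads)      # 64 threads per block
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))        # outer piece -> CUDA blocks
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))       # inner piece -> CUDA threads

    func = tvm.build(s, [A, B], target='cuda')          # requires a CUDA toolchain

Each thread then performs its own serial reduction over k, which is why no cross-thread reduction primitive is needed for this layout.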

contrib/tvmop/opdef.py
Lines changed: 0 additions & 2 deletions

@@ -80,8 +80,6 @@ def invoke_all(self):
                     + ''.join(["{}_{}".format(key, each_kwargs[key]) for key in self.attrs]) \
                     + ''.join(["%s_%d" % (arg.dtype, len(arg.shape))
                                for arg in args if hasattr(arg, 'shape')])
-                if 'sum' in name:
-                    print(name)
                 yield sch, args, name

     def get_binds(self, args):
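
The concatenation in this hunk is what generates the exported kernel names; the deleted lines were a leftover debug print of those names for sum ops. As a hypothetical illustration of the naming scheme (the values are made up, but the fields mirror the ones used above):

    op_name = 'sum_gpu'
    attrs = {'reduce1st_dim': 0, 'req': 'kWriteTo'}
    arg_sigs = [('bool', 5), ('float32', 5), ('float32', 5)]  # input + two outputs

    name = op_name \
        + ''.join("{}_{}".format(k, v) for k, v in attrs.items()) \
        + ''.join("%s_%d" % (dtype, ndim) for dtype, ndim in arg_sigs)
    print(name)  # sum_gpureduce1st_dim_0req_kWriteTobool_5float32_5float32_5

This is the same string that the C++ side reassembles in TVMOpReduce (below) when dispatching into the compiled module.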

src/operator/numpy/np_broadcast_reduce_op.h
Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@

 #include <algorithm>
 #include <vector>
+#include <string>
 #include "../tensor/broadcast_reduce_op.h"

 namespace mxnet {

src/operator/numpy/np_broadcast_reduce_op_value.cc
Lines changed: 3 additions & 2 deletions

@@ -43,7 +43,9 @@ inline bool NumpySumType(const nnvm::NodeAttrs& attrs,

   if (param.dtype.has_value()) {
     if (in_attrs->at(0) == mshadow::kBool) {
-      CHECK(param.dtype.value() == mshadow::kInt64 || param.dtype.value() == mshadow::kFloat32
+      CHECK(param.dtype.value() == mshadow::kInt32
+            || param.dtype.value() == mshadow::kInt64
+            || param.dtype.value() == mshadow::kFloat32
             || param.dtype.value() == mshadow::kFloat64) << "Only support the following output "
                                                             "dtypes when input dtype is bool: "
                                                             "int32, int64, float32, float64.";
@@ -110,7 +112,6 @@ void TVMOpReduce(const OpContext& ctx,
             << (ctx.run_ctx.ctx.dev_type == mxnet::Context::DeviceType::kCPU ? "cpu" : "gpu")
             << "reduce1st_dim_" << reduce1st_dim
             << "req_" << (req == kWriteTo ? "kWriteTo" : "kAddTo");
-  LOG(INFO) << "sum func name: " << func_name.str();
   tvm::runtime::TVMOpModule::Get()->Call(func_name.str(), ctx, {input_tvm, output_tvm, output_tvm});
 #else
   LOG(FATAL) << "Please add USE_TVM_OP=1 to enable kernels generated by TVM."

tests/python/unittest/test_numpy_op.py
Lines changed: 23 additions & 7 deletions

@@ -232,27 +232,43 @@ def is_int(dtype):
     in_data_dim = random.choice([2, 3, 4])
     shape = rand_shape_nd(in_data_dim, dim=3)
     acc_type = {'float16': 'float32', 'float32': 'float64', 'float64': 'float64',
-                'int8': 'int32', 'int32': 'int64', 'int64': 'int64'}
+                'int8': 'int32', 'int32': 'int64', 'int64': 'int64', 'bool': 'int64'}
     for hybridize in [False, True]:
         for keepdims in [True, False]:
             for axis in ([i for i in range(in_data_dim)] + [(), None]):
-                for itype in ['float16', 'float32', 'float64', 'int8', 'int32', 'int64']:
+                for itype in ['float16', 'float32', 'float64', 'int8', 'int32', 'int64', 'bool']:
                     for dtype in ['float16', 'float32', 'float64', 'int8', 'int32', 'int64']:
-                        if is_int(dtype) and not is_int(itype):
+                        print("==========================")
+                        print(shape)
+                        print(itype)
+                        print(axis)
+                        print(dtype)
+                        print(keepdims)
+                        print(hybridize)
+                        if (is_int(dtype) and not is_int(itype))\
+                                or (itype == 'bool' and dtype not in ('float32', 'float64', 'int32', 'int64')):
                             continue
                         # test gluon
                         test_sum = TestSum(axis=axis, dtype=dtype, keepdims=keepdims)
                         if hybridize:
                             test_sum.hybridize()
                         if is_int(itype):
                             x = _np.random.randint(-128, 128, shape, dtype=itype)
-                            x = mx.nd.array(x)
+                            x = np.array(x)
+                        elif itype == 'bool':
+                            x = _np.random.randint(0, 2, shape) < 1
+                            x = np.array(x, dtype='bool')
                         else:
-                            x = mx.nd.random.uniform(-1.0, 1.0, shape=shape, dtype=itype)
-                            x = x.as_np_ndarray()
-                        x.attach_grad()
+                            x = np.random.uniform(-1.0, 1.0, size=shape, dtype=itype)
                         expected_ret = _np.sum(x.asnumpy(), axis=axis, dtype=acc_type[itype], keepdims=keepdims)
                         expected_ret = expected_ret.astype(dtype)
+                        if itype == 'bool':  # special handling of boolean ndarray
+                            y = test_sum(x)
+                            assert y.dtype == expected_ret.dtype
+                            assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-4, atol=1e-5, use_broadcast=False)
+                            continue
+
+                        x.attach_grad()
                         with mx.autograd.record():
                             y = test_sum(x)
                         assert y.shape == expected_ret.shape