
[MXNET-978] Higher Order Gradient Support arctan, arctanh, radians. #15531

Merged
63 changes: 60 additions & 3 deletions src/operator/tensor/elemwise_unary_op_trig.cc
@@ -24,6 +24,7 @@
#include <mxnet/base.h>
#include "elemwise_unary_op.h"
#include "./elemwise_binary_op-inl.h"
#include "./util/node_op_util.h"

namespace mxnet {
namespace op {
@@ -201,7 +202,35 @@ The storage type of ``arctan`` output depends upon the input storage type:
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{ "_backward_arctan" });

MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_arctan,
unary_bwd<mshadow_op::arctan_grad>);
unary_bwd<mshadow_op::arctan_grad>)
.set_attr<nnvm::FGradient>("FGradient",
    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
      // ograds[0]: head_grad_grads (dL/dy_grad)
      // inputs[0]: dL/dy
      // inputs[1]: x (ElemwiseGradUseIn)
      // n: dL/dy * f'(x)
      // f(x) = arctan(x)
      // dy/dx = f'(x) = 1/(1+x^2)
      // f''(x) = f'(x) * f'(x) * -2 * x = (-2 * x) / (1 + x^2)^2
      // return:
      //   0: dL/dy_grad * dy/dx
      //   1: dL/dy_grad * dL/dy * f''(x)
      auto dldy = n->inputs[0];
      auto x = n->inputs[1];
      auto dldy_mul_dydx = nnvm::NodeEntry{n};
      auto Op = mxnet::util::NodeOp{n};

      auto x_grad = Op.div(dldy_mul_dydx, dldy);
      auto x_grad_square = Op.square(x_grad);
      auto x_grad_square_mul_x = Op.mul(x_grad_square, x);
      auto x_grad_square_mul_2_x = Op.mul(-2.0, x_grad_square_mul_x);
      auto grad_grad_x = Op.mul(dldy, x_grad_square_mul_2_x);

      std::vector<nnvm::NodeEntry> ret;
      ret.emplace_back(Op.mul(ograds[0], x_grad));
      ret.emplace_back(Op.mul(ograds[0], grad_grad_x));
      return ret;
});
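A quick sketch of where the second derivative in the comment above comes from (a reference note, not part of the diff):

$$f(x) = \arctan(x), \qquad f'(x) = \frac{1}{1+x^2}, \qquad f''(x) = \frac{d}{dx}\,(1+x^2)^{-1} = \frac{-2x}{(1+x^2)^2} = f'(x)\,f'(x)\,(-2x).$$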

// degrees
MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(degrees, cpu, mshadow_op::degrees)
@@ -239,7 +268,8 @@ The storage type of ``radians`` output depends upon the input storage type:
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{ "_backward_radians" });

MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_radians,
unary_bwd<mshadow_op::radians_grad>);
unary_bwd<mshadow_op::radians_grad>)
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
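A note (not part of the diff) on why `MakeZeroGradNodes` suffices here: radians is linear, so its first derivative is a constant and its second derivative vanishes:

$$f(x) = \frac{\pi}{180}\,x, \qquad f'(x) = \frac{\pi}{180}, \qquad f''(x) = 0.$$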

// sinh
MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(sinh, cpu, mshadow_op::sinh)
@@ -338,8 +368,35 @@ The storage type of ``arctanh`` output depends upon the input storage type:
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{ "_backward_arctanh" });

MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU_DR(_backward_arctanh,
unary_bwd<mshadow_op::arctanh_grad>);
unary_bwd<mshadow_op::arctanh_grad>)
.set_attr<nnvm::FGradient>("FGradient",
    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
      // ograds[0]: head_grad_grads (dL/dy_grad)
      // inputs[0]: dL/dy
      // inputs[1]: x (ElemwiseGradUseIn)
      // n: dL/dy * dy/dx
      // f(x) = arctanh(x)
      // dy/dx = f'(x) = 1/(1-x^2)
      // f''(x) = f'(x) * f'(x) * 2 * x = (2 * x) / (1 - x^2)^2
      // return:
      //   0: dL/dy_grad * dy/dx
      //   1: dL/dy_grad * dL/dy * f''(x)
      auto dldy = n->inputs[0];
      auto x = n->inputs[1];
      auto dldy_mul_dydx = nnvm::NodeEntry{n};
      auto Op = mxnet::util::NodeOp{n};

      auto x_grad = Op.div(dldy_mul_dydx, dldy);
      auto x_grad_square = Op.square(x_grad);
      auto x_grad_square_mul_x = Op.mul(x_grad_square, x);
      auto x_grad_square_mul_2_x = Op.mul(2.0, x_grad_square_mul_x);
      auto grad_grad_x = Op.mul(dldy, x_grad_square_mul_2_x);

      std::vector<nnvm::NodeEntry> ret;
      ret.emplace_back(Op.mul(ograds[0], x_grad));
      ret.emplace_back(Op.mul(ograds[0], grad_grad_x));
      return ret;
});
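The analogous derivation for arctanh (a reference note, not part of the diff):

$$f(x) = \operatorname{arctanh}(x), \qquad f'(x) = \frac{1}{1-x^2}, \qquad f''(x) = \frac{d}{dx}\,(1-x^2)^{-1} = \frac{2x}{(1-x^2)^2} = f'(x)\,f'(x)\,(2x).$$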

} // namespace op
} // namespace mxnet
76 changes: 76 additions & 0 deletions src/operator/tensor/util/node_op_util.h
@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2019 by Contributors
* \file node_op_util.h
* \brief Abstraction for common nnvm::Node operations.
*/
#ifndef MXNET_OPERATOR_TENSOR_UTIL_NODE_OP_UTIL_H_
#define MXNET_OPERATOR_TENSOR_UTIL_NODE_OP_UTIL_H_
#include <mxnet/base.h>
#include <string>
#include <unordered_map>
#include "../../elemwise_op_common.h"

namespace mxnet {
namespace util {

class NodeOp {
kshitij12345 (Contributor, Author) commented on Jul 15, 2019:
@larroy @apeforest @marcoabreu,
I have added this class as an abstraction over MakeNode. Using it we can avoid #15331, as a missing argument will be detected at compile time. It also reduces the noise of using MakeNode directly. There shouldn't be any performance hit, as the member functions should get inlined.

arctan and arctanh both compute roughly the same thing, so you can see the difference between this abstraction and our usual method side by side: arctan -> usual MakeNode :: arctanh -> NodeOp.

It might be that I have missed something.

Let me know what you think, and I will either move forward from there or reset the commit.
Thanks.

larroy (Contributor) commented on Jul 16, 2019:
Looks good to me, indeed reduces noise.

kshitij12345 (Contributor, Author) replied:
In that case, I'll also update arctan to use this.
Thanks.

kshitij12345 (Contributor, Author) commented:
@apeforest Please review this part. If it looks good, I'll update all the other PRs to use this machinery once it is in.

 private:
  const nnvm::NodePtr &dependent_node;

 public:
  explicit NodeOp(const nnvm::NodePtr &dependent_node) : dependent_node{dependent_node} {}

  nnvm::NodeEntry mul(const nnvm::NodeEntry &lhs, const nnvm::NodeEntry &rhs) {
    return nnvm::NodeEntry{mxnet::op::MakeNode("elemwise_mul",
                                               dependent_node->attrs.name + "_mul",
                                               {lhs, rhs}, nullptr, &dependent_node)};
  }

  nnvm::NodeEntry mul(const nnvm::NodeEntry &x, double scalar) {
    const std::unordered_map<std::string, std::string> scalar_dict =
        {{"scalar", std::to_string(scalar)}};
    return nnvm::NodeEntry{mxnet::op::MakeNode("_mul_scalar",
                                               dependent_node->attrs.name + "_mul_scalar",
                                               {x}, &scalar_dict, &dependent_node)};
  }

  nnvm::NodeEntry mul(double scalar, const nnvm::NodeEntry &x) {
    return NodeOp::mul(x, scalar);
  }

  nnvm::NodeEntry div(const nnvm::NodeEntry &lhs, const nnvm::NodeEntry &rhs) {
    return nnvm::NodeEntry{mxnet::op::MakeNode("elemwise_div",
                                               dependent_node->attrs.name + "_div",
                                               {lhs, rhs}, nullptr, &dependent_node)};
  }

  nnvm::NodeEntry square(const nnvm::NodeEntry &x) {
    return nnvm::NodeEntry{mxnet::op::MakeNode("square",
                                               dependent_node->attrs.name + "_square",
                                               {x}, nullptr, &dependent_node)};
  }
};

} // namespace util
} // namespace mxnet

#endif // MXNET_OPERATOR_TENSOR_UTIL_NODE_OP_UTIL_H_
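To see the noise reduction the author describes, here is a short side-by-side sketch (illustrative only, not part of the diff) of building a product node inside an FGradient lambda, first with a raw MakeNode call as used elsewhere in this file, then with NodeOp:

// The usual approach: every MakeNode argument spelled out by hand,
// with `n` being the backward node captured by the lambda.
auto prod_raw = nnvm::NodeEntry{mxnet::op::MakeNode(
    "elemwise_mul", n->attrs.name + "_mul",
    {n->inputs[0], n->inputs[1]}, nullptr, &n)};

// The NodeOp abstraction: same graph node, far less boilerplate,
// and a forgotten argument becomes a compile-time error.
auto Op = mxnet::util::NodeOp{n};
auto prod = Op.mul(n->inputs[0], n->inputs[1]);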
46 changes: 46 additions & 0 deletions tests/python/unittest/test_higher_order_grad.py
@@ -17,6 +17,7 @@


import math
import random
from mxnet import nd, autograd
from mxnet.test_utils import assert_almost_equal, random_arrays, rand_shape_nd
from common import with_seed
@@ -50,6 +51,51 @@ def grad_grad_op(x):
        check_second_order_unary(array, cos, grad_grad_op)


@with_seed()
def test_arctan():
    def arctan(x):
        return nd.arctan(x)

    def grad_grad_op(x):
        return (-2 * x) / ((1 + x**2)**2)

    for dim in range(1, 5):
        shape = rand_shape_nd(dim)
        array = random_arrays(shape)
        # The domain of arctan is all real numbers;
        # scale the inputs to exercise a wide range of values.
        array *= random.randint(500, 10000)
        check_second_order_unary(array, arctan, grad_grad_op)


@with_seed()
def test_arctanh():
    def arctanh(x):
        return nd.arctanh(x)

    def grad_grad_op(x):
        return (2 * x) / ((1 - x**2)**2)

    for dim in range(1, 5):
        shape = rand_shape_nd(dim)
        array = random_arrays(shape)
        check_second_order_unary(array, arctanh, grad_grad_op)


@with_seed()
def test_radians():
    def radians(x):
        return nd.radians(x)

    def grad_grad_op(x):
        return nd.zeros_like(x)

    for dim in range(1, 5):
        shape = rand_shape_nd(dim)
        array = random_arrays(shape)
        check_second_order_unary(array, radians, grad_grad_op)


@with_seed()
def test_relu():
    def relu(x):