diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
index 50637a8e7b42..3c03f8061d87 100644
--- a/src/operator/optimizer_op-inl.h
+++ b/src/operator/optimizer_op-inl.h
@@ -1293,15 +1293,37 @@ struct AdamParam : public dmlc::Parameter<AdamParam> {
   }
 };
 
+struct AdamUpdateKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data,
+    DType* mean_data, DType* var_data, const DType* weight_data, const DType* grad_data,
+    const DType clip_gradient, const DType rescale_grad,
+    const DType beta1, const DType beta2,
+    const DType lr, const DType wd,
+    const DType epsilon, const OpReqType req) {
+    using namespace mshadow_op;
+
+    DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[i] * wd;
+    if (clip_gradient >= 0.f) {
+      grad_rescaled = clip::Map(grad_rescaled, clip_gradient);
+    }
+
+    mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled;
+    var_data[i] = beta2 * var_data[i] +
+                  (1.f - beta2) * grad_rescaled * grad_rescaled;
+
+    KERNEL_ASSIGN(out_data[i], req, weight_data[i] - lr * mean_data[i] /
+                  (square_root::Map(var_data[i]) + epsilon));
+  }
+};
+
 template<typename xpu>
 inline void AdamUpdate(const nnvm::NodeAttrs& attrs,
                        const OpContext &ctx,
                        const std::vector<TBlob> &inputs,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &outputs) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  using namespace mshadow_op;
+  using namespace mxnet_op;
   const AdamParam& param = nnvm::get<AdamParam>(attrs.parsed);
   Stream<xpu>* s = ctx.get_stream<xpu>();
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
@@ -1311,22 +1333,12 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs,
     Tensor<xpu, 2, DType> var = inputs[3].FlatTo2D<xpu, DType>(s);
     Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
 
-    grad = scalar<DType>(param.rescale_grad) * grad +
-      scalar<DType>(param.wd) * weight;
-
-    if (param.clip_gradient >= 0.0f) {
-      mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) *
-        F<clip>(grad, DType(param.clip_gradient));
-      var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2)*F<square>(
-        F<clip>(grad, DType(param.clip_gradient)));
-    } else {
-      mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) * grad;
-      var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2) * F<square>(grad);
-    }
-    Assign(out, req[0],
-           weight -
-           scalar<DType>(param.lr) * mean /
-           (F<square_root>(var) + scalar<DType>(param.epsilon)));
+    Kernel<AdamUpdateKernel, xpu>::Launch(s, weight.shape_.Size(),
+          out.dptr_, mean.dptr_, var.dptr_, weight.dptr_, grad.dptr_,
+          static_cast<DType>(param.clip_gradient), static_cast<DType>(param.rescale_grad),
+          static_cast<DType>(param.beta1), static_cast<DType>(param.beta2),
+          static_cast<DType>(param.lr), static_cast<DType>(param.wd),
+          static_cast<DType>(param.epsilon), req[0]);
   });
 }
 
@@ -1596,57 +1608,64 @@ struct RMSPropAlexParam : public dmlc::Parameter<RMSPropAlexParam> {
   }
 };
 
+struct RMSPropAlexUpdateKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data,
+    DType* state_n_data, DType* state_g_data, DType* delta_data,
+    const DType* weight_data, const DType* grad_data,
+    const DType clip_gradient, const DType rescale_grad,
+    const DType gamma1, const DType gamma2,
+    const DType lr, const DType wd,
+    const DType clip_weights, const DType epsilon,
+    const OpReqType req) {
+    using namespace mshadow_op;
+
+    DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i];
+    if (clip_gradient >= 0.0f) {
+      grad_rescaled = clip::Map(grad_rescaled, clip_gradient);
+    }
+
+    state_n_data[i] = (1.f - gamma1) * grad_rescaled * grad_rescaled +
+                      gamma1 * state_n_data[i];
+    state_g_data[i] = (1.f - gamma1) * grad_rescaled +
+                      gamma1 * state_g_data[i];
+    delta_data[i] = gamma2 * delta_data[i] -
+                    (lr * (grad_rescaled) /
+                      (square_root::Map(state_n_data[i] -
+                                        state_g_data[i] * state_g_data[i] + epsilon)));
+
+    if (clip_weights >= 0.0f) {
+      const DType clipped_weight = clip::Map(weight_data[i] + delta_data[i], clip_weights);
+      KERNEL_ASSIGN(out_data[i], req, clipped_weight);
+    } else {
+      KERNEL_ASSIGN(out_data[i], req, weight_data[i] + delta_data[i]);
+    }
+  }
+};
+
 template<typename xpu>
 inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs,
                               const OpContext &ctx,
                               const std::vector<TBlob> &inputs,
                               const std::vector<OpReqType> &req,
                               const std::vector<TBlob> &outputs) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  using namespace mshadow_op;
+  using namespace mxnet_op;
   const RMSPropAlexParam &param = nnvm::get<RMSPropAlexParam>(attrs.parsed);
   Stream<xpu> *s = ctx.get_stream<xpu>();
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    Tensor<xpu, 2, DType> weight = inputs[0].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> grad = inputs[1].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> state_n = inputs[2].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> state_g = inputs[3].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> delta = inputs[4].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
-
-    grad = scalar<DType>(param.rescale_grad) * grad +
-           scalar<DType>(param.wd) * weight;
-
-    if (param.clip_gradient >= 0.0f) {
-      state_n = scalar<DType>(1.f - param.gamma1) *
-                    F<clip>(grad, DType(param.clip_gradient)) *
-                    F<clip>(grad, DType(param.clip_gradient)) +
-                scalar<DType>(param.gamma1) * state_n;
-      state_g = scalar<DType>(1.f - param.gamma1) *
-                    F<clip>(grad, DType(param.clip_gradient)) +
-                scalar<DType>(param.gamma1) * state_g;
-      delta = scalar<DType>(param.gamma2) * delta -
-              scalar<DType>(param.lr) *
-                  (F<clip>(grad, DType(param.clip_gradient)) /
-                   (F<square_root>(state_n - state_g * state_g +
-                                   scalar<DType>(param.epsilon))));
-    } else {
-      state_n = scalar<DType>(1.f - param.gamma1) * (grad * grad) +
-                scalar<DType>(param.gamma1) * state_n;
-      state_g = scalar<DType>(1.f - param.gamma1) * grad +
-                scalar<DType>(param.gamma1) * state_g;
-      delta = scalar<DType>(param.gamma2) * delta -
-              scalar<DType>(param.lr) *
-                  (grad / (F<square_root>(state_n - state_g * state_g +
-                                          scalar<DType>(param.epsilon))));
-    }
+    DType* weight_data = inputs[0].dptr<DType>();
+    DType* grad_data = inputs[1].dptr<DType>();
+    DType* state_n_data = inputs[2].dptr<DType>();
+    DType* state_g_data = inputs[3].dptr<DType>();
+    DType* delta_data = inputs[4].dptr<DType>();
+    DType* out_data = outputs[0].dptr<DType>();
 
-    if (param.clip_weights >= 0.0f) {
-      Assign(out, req[0], F<clip>(weight + delta, DType(param.clip_weights)));
-    } else {
-      Assign(out, req[0], weight + delta);
-    }
+    Kernel<RMSPropAlexUpdateKernel, xpu>::Launch(s, inputs[0].shape_.Size(),
+      out_data, state_n_data, state_g_data, delta_data, weight_data, grad_data,
+      static_cast<DType>(param.clip_gradient), static_cast<DType>(param.rescale_grad),
+      static_cast<DType>(param.gamma1), static_cast<DType>(param.gamma2),
+      static_cast<DType>(param.lr), static_cast<DType>(param.wd),
+      static_cast<DType>(param.clip_weights), static_cast<DType>(param.epsilon), req[0]);
   });
 }
 
@@ -1688,64 +1707,52 @@ struct RMSPropParam : public dmlc::Parameter<RMSPropParam> {
   }
 };
 
+struct RMSPropUpdateKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i,
+    DType* out_data, DType* state_n_data,
+    const DType* weight_data, const DType* grad_data,
+    const DType clip_gradient, const DType rescale_grad,
+    const DType gamma1, const DType lr, const DType wd,
+    const DType clip_weights, const DType epsilon,
+    const OpReqType req) {
+    using namespace mshadow_op;
+
+    DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i];
+    if (clip_gradient >= 0.0f) {
+      grad_rescaled = clip::Map(grad_rescaled, clip_gradient);
+    }
+
+    state_n_data[i] = (1.f - gamma1) * (grad_rescaled * grad_rescaled) + gamma1 * state_n_data[i];
+
+    DType weight = weight_data[i] -
+                   lr * (grad_rescaled / square_root::Map(state_n_data[i] + epsilon));
+    if (clip_weights >= 0.0f) {
+      weight = clip::Map(weight, clip_weights);
+    }
+    KERNEL_ASSIGN(out_data[i], req, weight);
+  }
+};
+
 template<typename xpu>
 inline void RMSPropUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
                           const std::vector<TBlob> &inputs,
                           const std::vector<OpReqType> &req,
                           const std::vector<TBlob> &outputs) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  using namespace mshadow_op;
+  using namespace mxnet_op;
   const RMSPropParam &param = nnvm::get<RMSPropParam>(attrs.parsed);
   Stream<xpu> *s = ctx.get_stream<xpu>();
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    Tensor<xpu, 2, DType> weight = inputs[0].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> grad = inputs[1].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> state_n = inputs[2].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
+    DType* weight_data = inputs[0].dptr<DType>();
+    DType* grad_data = inputs[1].dptr<DType>();
+    DType* state_n_data = inputs[2].dptr<DType>();
+    DType* out_data = outputs[0].dptr<DType>();
 
-    grad = scalar<DType>(param.rescale_grad) * grad +
-           scalar<DType>(param.wd) * weight;
-
-    if (param.clip_gradient >= 0.0f) {
-      state_n = scalar<DType>(1.f - param.gamma1) *
-                    F<clip>(grad, DType(param.clip_gradient)) *
-                    F<clip>(grad, DType(param.clip_gradient)) +
-                scalar<DType>(param.gamma1) * state_n;
-      if (param.clip_weights >= 0.0f) {
-        Assign(out, req[0],
-               F<clip>(weight -
-                       scalar<DType>(param.lr) *
-                           (F<clip>(grad, DType(param.clip_gradient)) /
-                            (F<square_root>(state_n +
-                                            scalar<DType>(param.epsilon)))),
-                       DType(param.clip_weights)));
-      } else {
-        Assign(out, req[0], weight -
-                            scalar<DType>(param.lr) *
-                                (F<clip>(grad, DType(param.clip_gradient)) /
-                                 (F<square_root>(state_n +
-                                                 scalar<DType>(param.epsilon)))));
-      }
-    } else {
-      state_n = scalar<DType>(1.f - param.gamma1) * (grad * grad) +
-                scalar<DType>(param.gamma1) * state_n;
-      if (param.clip_weights >= 0.0f) {
-        Assign(out, req[0],
-               F<clip>(weight -
-                       scalar<DType>(param.lr) *
-                           (grad /
-                            (F<square_root>(state_n +
-                                            scalar<DType>(param.epsilon)))),
-                       DType(param.clip_weights)));
-      } else {
-        Assign(out, req[0], weight -
-                            scalar<DType>(param.lr) *
-                                (grad /
-                                 (F<square_root>(state_n +
-                                                 scalar<DType>(param.epsilon)))));
-      }
-    }
+    Kernel<RMSPropUpdateKernel, xpu>::Launch(s, inputs[0].shape_.Size(),
+      out_data, state_n_data, weight_data, grad_data,
+      static_cast<DType>(param.clip_gradient), static_cast<DType>(param.rescale_grad),
+      static_cast<DType>(param.gamma1), static_cast<DType>(param.lr), static_cast<DType>(param.wd),
+      static_cast<DType>(param.clip_weights), static_cast<DType>(param.epsilon), req[0]);
   });
 }
 
@@ -1781,15 +1788,41 @@ struct FtrlParam : public dmlc::Parameter<FtrlParam> {
   }
 };
 
+struct FtrlUpdateKernel {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data,
+    DType* n_data, DType* z_data, const DType* weight_data, const DType* grad_data,
+    const DType clip_gradient, const DType rescale_grad,
+    const DType beta, const DType lamda1,
+    const DType lr, const DType wd,
+    const OpReqType req) {
+    using namespace mshadow_op;
+
+    DType grad_rescaled = grad_data[i] * rescale_grad;
+    if (clip_gradient >= 0.0f) {
+      grad_rescaled = clip::Map(grad_rescaled, clip_gradient);
+    }
+
+    z_data[i] += grad_rescaled - (square_root::Map(n_data[i] +
+                 square::Map(grad_rescaled)) - square_root::Map(n_data[i])) *
+                 weight_data[i] / lr;
+    n_data[i] += square::Map(grad_rescaled);
+
+    KERNEL_ASSIGN(out_data[i], req,
+                  (sign::Map(z_data[i]) * lamda1 - z_data[i]) /
+                  ((beta + square_root::Map(n_data[i])) / lr + wd) *
+                  gt::Map(abs::Map(z_data[i]), lamda1));
+  }
+};
+
 template<typename xpu>
 inline void FtrlUpdate(const nnvm::NodeAttrs& attrs,
                        const OpContext &ctx,
                        const std::vector<TBlob> &inputs,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &outputs) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  using namespace mshadow_op;
+  using namespace mxnet_op;
+
   const FtrlParam& param = nnvm::get<FtrlParam>(attrs.parsed);
   Stream<xpu>* s = ctx.get_stream<xpu>();
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
@@ -1799,23 +1832,11 @@ inline void FtrlUpdate(const nnvm::NodeAttrs& attrs,
     Tensor<xpu, 2, DType> n = inputs[3].FlatTo2D<xpu, DType>(s);
     Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
 
-    grad = scalar<DType>(param.rescale_grad) * grad;
-
-    if (param.clip_gradient >= 0.0f) {
-      z += F<clip>(grad, DType(param.clip_gradient)) - (F<square_root>(n +
-           F<square>(F<clip>(grad, DType(param.clip_gradient)))) - F<square_root>(n)) *
-           weight / scalar<DType>(param.lr);
-      n += F<square>(F<clip>(grad, DType(param.clip_gradient)));
-    } else {
-      z += grad - (F<square_root>(n + F<square>(grad)) - F<square_root>(n)) *
-           weight / scalar<DType>(param.lr);
-      n += F<square>(grad);
-    }
-    Assign(out, req[0],
-           (F<sign>(z) * scalar<DType>(param.lamda1) - z) /
-           ((scalar<DType>(param.beta) + F<square_root>(n)) /
-            scalar<DType>(param.lr) + scalar<DType>(param.wd)) *
-           F<gt>(F<abs>(z), scalar<DType>(param.lamda1)));
+    Kernel<FtrlUpdateKernel, xpu>::Launch(s, weight.shape_.Size(),
+      out.dptr_, n.dptr_, z.dptr_, weight.dptr_, grad.dptr_,
+      static_cast<DType>(param.clip_gradient), static_cast<DType>(param.rescale_grad),
+      static_cast<DType>(param.beta), static_cast<DType>(param.lamda1),
+      static_cast<DType>(param.lr), static_cast<DType>(param.wd), req[0]);
   });
 }
 
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 0f154bd67a1a..8aa43b4a553f 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -28,7 +28,7 @@
 from mxnet.test_utils import default_context
 from mxnet.test_utils import np_reduce
 from mxnet.test_utils import same
-from mxnet.test_utils import random_sample, rand_shape_nd
+from mxnet.test_utils import random_sample, rand_shape_nd, random_arrays
 from mxnet import runtime
 from numpy.testing import assert_allclose
 import mxnet.autograd
@@ -1799,6 +1799,71 @@ def check_save_load(save_is_np_shape, load_is_np_shape, shapes, save_throw_excep
     check_save_load(True, True, [(2, 0, 1), (0,), (), (), (0, 4), (), (3, 0, 0, 0), (2, 1), (0, 5, 0)], False, False)
 
 
+@with_seed()
+def test_update_ops_mutation():
+    def assert_mutate(x, y, op):
+        np.testing.assert_raises(
+            AssertionError, np.testing.assert_allclose, x, y)
+
+    def assert_unchanged(x, y, op):
+        np.testing.assert_allclose(x, y)
+
+    def test_op(op, num_inputs, mutated_inputs, **kwargs):
+        for dim in range(1, 7):
+            shape = rand_shape_nd(dim)
+            shapes = (shape,) * num_inputs
+
+            # Generate Arrays
+            arrays = tuple(map(mx.nd.array, random_arrays(*shapes)))
+
+            # Arrays before update
+            pre_arrays = tuple(map(
+                lambda x: x.asnumpy(), arrays))
+
+            # Operate
+            # weight -> arrays[0]
+            op(*arrays, out=arrays[0], **kwargs)
+
+            # Arrays post update
+            post_arrays = tuple(map(
+                lambda x: x.asnumpy(), arrays))
+
+            for idx, (pre_array, post_array) in \
+                    enumerate(zip(pre_arrays, post_arrays)):
+                if idx in mutated_inputs:
+                    assert_mutate(pre_array, post_array, op)
+                else:
+                    assert_unchanged(pre_array, post_array, op)
+
+    test_op(mx.nd.signsgd_update, 2, [0], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3,
+             'clip_gradient': 1e-3})
+    test_op(mx.nd.signum_update, 3, [0, 2], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3,
+             'momentum': 1e-3, 'clip_gradient': 1e-3,
+             'wd_lh': 1e-3})
+    test_op(mx.nd.sgd_update, 2, [0], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3,
+             'clip_gradient': 1e-3})
+    test_op(mx.nd.sgd_mom_update, 3, [0, 2], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3,
+             'momentum': 0.01, 'clip_gradient': 1e-3})
+    test_op(mx.nd.nag_mom_update, 3, [0, 2], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3,
+             'momentum': 0.01, 'clip_gradient': 1e-3})
+    test_op(mx.nd.ftml_update, 5, [0, 2, 3, 4], **
+            {'t': 3, 'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3,
+             'clip_grad': 1e-3})
+    test_op(mx.nd.ftrl_update, 4, [0, 2, 3], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3})
+    test_op(mx.nd.adam_update, 4, [0, 2, 3], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3})
+    test_op(mx.nd.rmspropalex_update, 5, [0, 2, 3, 4], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3})
+    test_op(mx.nd.rmsprop_update, 3, [0, 2], **
+            {'rescale_grad': 0.1, 'lr': 0.01, 'wd': 1e-3})
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
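
Note (illustrative sketch, not part of the patch): the in-place contract that test_update_ops_mutation exercises can be seen directly from the NDArray API. A minimal example, assuming the stock mx.nd.adam_update signature (weight, grad, mean, var, plus lr and the usual optional hyper-parameters):

    import mxnet as mx

    weight = mx.nd.random.uniform(shape=(3, 4))
    grad = mx.nd.random.uniform(shape=(3, 4))
    mean = mx.nd.zeros((3, 4))  # first-moment state, rewritten in place by the kernel
    var = mx.nd.zeros((3, 4))   # second-moment state, rewritten in place by the kernel

    # The updated weight is written to `out`; mean and var are mutated, while grad
    # stays untouched -- the behaviour the new test asserts for each *_update op.
    mx.nd.adam_update(weight, grad, mean, var,
                      lr=0.01, wd=1e-3, rescale_grad=0.1,
                      out=weight)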