kshitij12345 commented on a change in pull request #17754:
URL: https://github.com/apache/incubator-mxnet/pull/17754#discussion_r419040219



##########
File path: src/operator/tensor/broadcast_reduce_op_value.cc
##########
@@ -138,25 +190,25 @@ NNVM_REGISTER_OP(broadcast_like)
       return std::vector<std::string>{"lhs", "rhs"};
     })
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
-.set_attr<nnvm::FGradient>("FGradient",
-  [](const nnvm::ObjectPtr& n,
-    const std::vector<nnvm::NodeEntry>& ograds) {
+.set_attr<nnvm::FGradient>("FGradient", NonlossGradFGradient{
+  [](const nnvm::ObjectPtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
       if (CheckGradAllZero(ograds))
         return MakeZeroGradNodes(n, ograds);

Review comment:
       Probably redundant, since `NonlossGradFGradient` already verifies the same thing.

##########
File path: src/operator/tensor/elemwise_binary_broadcast_op_extended.cc
##########
@@ -44,22 +44,42 @@ Example::
 
 )code" ADD_FILELINE)
 .set_attr<FCompute>("FCompute<cpu>", BinaryBroadcastCompute<cpu, mshadow_op::power>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_broadcast_power"});
-
-NNVM_REGISTER_OP(_backward_broadcast_power)
-.set_num_inputs(3)
-.set_num_outputs(2)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<nnvm::FInplaceOption>("FInplaceOption",
-  [](const NodeAttrs& attrs){
-    return std::vector<std::pair<int, int> >{{0, 1}};
-  })
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-.set_attr<FCompute>("FCompute<cpu>", BinaryBroadcastBackwardUseIn<cpu, mshadow_op::power_grad,
-                                                                  mshadow_op::power_rgrad>);
+.set_attr<nnvm::FGradient>("FGradient", NonlossGradFGradient{
+  [](const nnvm::ObjectPtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+  auto head_grad_z = ograds[0];
+  auto x = nnvm::NodeEntry{mxnet::op::MakeNode("broadcast_like",
+      n->attrs.name + "_broadcast_like", {n->inputs[0], head_grad_z}, nullptr, &n)};
+  auto y =  nnvm::NodeEntry{mxnet::op::MakeNode("broadcast_like",
+      n->attrs.name + "_broadcast_like", {n->inputs[1], head_grad_z}, nullptr, &n)};
+
+  auto one_like  = nnvm::NodeEntry{mxnet::op::MakeNode("ones_like",
+      n->attrs.name + "_ones_like", {y}, nullptr, &n)};
+  auto y_sub_1 = nnvm::NodeEntry{MakeNode("elemwise_sub",
+      n->attrs.name + "_rhs_sub_1",  {y, one_like}, nullptr, &n)};
+  auto x_power_y_sub_1 = nnvm::NodeEntry{MakeNode("broadcast_power",
+      n->attrs.name + "_lhs_power_rhs_sub_1",  {x, y_sub_1}, nullptr, &n)};
+  auto dzdx = nnvm::NodeEntry{MakeNode("elemwise_mul",
+     n->attrs.name + "dpower/dlhs",  {y, x_power_y_sub_1}, nullptr, &n)};
+
+  auto lnx = nnvm::NodeEntry{MakeNode("log",
+      n->attrs.name + "_ln_lhs",  {x}, nullptr, &n)};
+  auto x_power_y = nnvm::NodeEntry{MakeNode("elemwise_mul",

Review comment:
       We can probably use `n` instead of recomputing `x^y`. It might save one extra operation.
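   
   A rough sketch of what I mean (untested, just the shape of it; it assumes reusing output 0 of the forward node is acceptable, even though it keeps the forward output alive for the backward pass):
   
   ```cpp
   // Hypothetical alternative: the forward node `n` already computes x^y, so its
   // first output could be reused instead of recomputing it with extra nodes.
   auto x_power_y = nnvm::NodeEntry{n, 0, 0};  // output 0 of the forward broadcast_power node
   ```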

##########
File path: tests/python/unittest/test_higher_order_grad.py
##########
@@ -570,6 +571,290 @@ def check_nth_order_unary(x, op, grad_ops, orders, rtol=None, atol=None):
         assert_almost_equal(
             expected_grad, computed_grad.asnumpy(), rtol=rtol, atol=atol)
 
+@with_seed()
+def test_elemwise_sub():
+    def sub(inputs):
+        return nd.elemwise_sub(inputs[0], inputs[1])
+    def grad_op(inputs):
+        return  [nd.ones_like(inputs[0]), nd.negative(nd.ones_like(inputs[1]))]
+    def grad_grad_op(inputs):
+        return  [nd.zeros_like(inputs[0]),  nd.zeros_like(inputs[1])]
+
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x, y = random_arrays(shape, shape)
+        check_nth_order_binary([x, y], sub, [grad_op, grad_grad_op], [1,  2])
+
+@with_seed()
+def test_elemwise_mul():
+    def mul(inputs):
+        return nd.elemwise_mul(inputs[0], inputs[1])
+    def grad_op(inputs):
+        return  [inputs[1], inputs[0]]
+    def grad_grad_op(inputs):
+        return [nd.zeros_like(inputs[0]) ,nd.zeros_like(inputs[1])]
+
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x, y = random_arrays(shape, shape)
+        check_nth_order_binary([x, y], mul, [grad_op, grad_grad_op], [1,  2])
+
+@with_seed()
+def test_power():
+    def power(inputs):
+        return nd.power(inputs[0], inputs[1])
+
+    def grad_op(inputs):
+        x, y = inputs
+        return  [y * nd.power(x, y - 1), nd.power(x, y) * nd.log(x)]
+
+    def grad_grad_op(inputs):
+        x, y = inputs
+        return   [y * (y - 1) * nd.power(x, y - 2), nd.power(x, y) * (nd.log(x) ** 2)]
+
+    def grad_grad_grad_op(inputs):
+        x, y = inputs
+        return   [y * (y - 1) * (y - 2) * nd.power(x, y - 3), nd.power(x, y) * (nd.log(x) ** 3)]
+
+    low = 1.0
+    high = 3.0
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x = nd.random.uniform(low, high, shape)
+        y = nd.random.uniform(low, high, shape)
+        check_nth_order_binary([x, y], power, [grad_op, grad_grad_op, grad_grad_grad_op], [1, 2, 3])
+
+#  based on gen_broadcast_data in test_operation.py
+def gen_broadcast_shape(idx):
+    # Manually set test cases
+    binary_op_data_shape = nd.array(
+        [[[2, 5, 1, 30, 7], [1, 5, 448, 30, 1]],
+         [[10, 49, 1, 77, 17], [10, 1, 2, 1, 17]],
+         [[13, 2, 65, 2,  1], [13, 1, 65, 1, 225]],
+         [[9, 434, 4, 2, 37], [9, 1, 4, 1, 37]],
+         [[2, 52, 1, 4, 1], [1, 52, 60, 1, 37]],
+         [[1, 23, 7, 122, 50], [2, 1, 7, 1, 50]],
+         [[1, 17, 1, 5, 1], [22, 1, 2, 1, 28]],
+         [[29, 1, 2, 1, 8], [29, 22, 1, 130, 1]],
+         [[2, 36, 1, 427, 3], [1, 36, 11, 427, 1]],
+         [[1, 2, 1, 100, 7], [1, 2, 448, 100, 1]],
+         [[1, 2, 495, 77, 7], [1, 2, 1, 1, 7]],
+         [[1, 43, 65, 2, 1], [1, 43, 65, 1, 225]],
+         [[1, 92, 434, 2, 2], [1, 92, 1, 2, 2]],
+         [[1, 92, 1, 4, 1], [1, 92, 134, 1, 17]],
+         [[1, 53, 2, 122, 143], [1, 1, 2, 1, 143]],
+         [[1, 179, 1, 87, 17], [1, 179, 1, 1, 17]],
+         [[1, 1, 17, 5, 1], [1, 22, 1, 1, 28]],
+         [[1, 2, 1, 1, 8], [1, 2, 52, 430, 1]],
+         [[1, 163, 1, 22, 3], [1, 163, 116, 22, 1]],
+         [[1, 1, 44, 30, 7], [1, 1, 44, 30, 1]],
+         [[1, 1, 1, 1, 28], [1, 127, 1, 5, 28]],
+         [[1, 2, 394, 38, 1], [1, 2, 394, 38, 16]],
+         [[1, 10, 49, 77, 17], [1, 1, 1, 1, 17]],
+         [[1, 431, 6, 2, 225], [1, 1, 6, 2, 225]],
+         [[1, 15, 1, 28, 1], [1, 15, 1, 28, 463]], [[1, 129, 2, 48, 96], [1, 129, 2, 1, 1]],
+         [[1, 1, 403, 17, 2], [1, 44, 403, 17, 2]],
+         [[1, 1, 65, 2, 22], [1, 1, 65, 1, 1]],
+         [[1, 24, 103, 17, 18], [1, 24, 1, 1, 1]],
+         [[1, 1, 1, 1, 2], [1, 24, 194, 50, 1]],
+         [[1, 1, 107, 84, 9], [1, 1, 1, 1, 1]]])
+    if idx < binary_op_data_shape.shape[0]:
+        l_shape = binary_op_data_shape[idx][0]
+        r_shape = binary_op_data_shape[idx][1]
+    else:
+        # Generate random data that has ndim between 1-7 and all the shape dims between 1-5
+        ndim = nd.random.randint(1, 6)
+        shape = nd.random.randint(1, 6, size=(ndim,))
+        l_same_dim = nd.random.randint(0, 5)
+        r_same_dim = nd.random.randint(0, 5)
+        l_axis_flags = nd.random.randint(0, 2, size=ndim)
+        r_axis_flags = nd.random.randint(0, 2, size=ndim)
+        if l_same_dim == 4:
+            l_axis_flags = nd.ones(ndim)
+        if r_same_dim == 4:
+            r_axis_flags = nd.ones(ndim)
+        l_shape = shape.copy()
+        r_shape = shape.copy()
+        l_shape[nd.where(l_axis_flags == 0)] = 1
+        r_shape[nd.where(r_axis_flags == 0)] = 1
+    return tuple(l_shape.asnumpy().astype(int)), tuple(r_shape.asnumpy().astype(int))
+
+# from test_operation.py
+def reduce_op(shape, x):
+    if shape == x.shape:
+        return x
+    keepdims_shape = list(x.shape)
+    for i in range(len(shape)):
+        if x.shape[i] != shape[i]:
+            keepdims_shape[i] = 1
+            x = nd.sum(x, axis=i).reshape(keepdims_shape)
+    return x
+
+@with_seed()
+def test_broadcast_power():
+    def broadcast_power(inputs):
+        return nd.broadcast_power(inputs[0], inputs[1])
+
+    def unreduced_grad_op(inputs):
+        x, y = inputs
+        return [y * nd.broadcast_power(x, y - 1), nd.broadcast_power(x, y) * nd.log(x)]
+
+    def unreduced_grad_grad_op(inputs):
+        x, y = inputs
+        return   [y * (y - 1) * nd.broadcast_power(x, y - 2), nd.broadcast_power(x, y) * (nd.log(x) ** 2)]
+
+    def unreduced_grad_grad_grad_op(inputs):
+        x, y = inputs
+        return   [y * (y - 1) * (y - 2) * nd.broadcast_power(x, y - 3), nd.broadcast_power(x, y) * (nd.log(x) ** 3)]
+
+    low = 1.0
+    high = 3.0
+    for dim in range(1, 5):
+        x_shape, y_shape = gen_broadcast_shape(dim)
+        x = nd.random.uniform(low, high, x_shape)
+        y = nd.random.uniform(low, high, y_shape)
+
+        check_nth_order_binary([x, y], broadcast_power, [unreduced_grad_op, unreduced_grad_grad_op,
+            unreduced_grad_grad_grad_op], [1, 2, 3], True, rtol=1e-3, atol=1e-5)
+
+def autograd_grad_ex(heads, variables, head_grads=None, retain_graph=None, create_graph=False,
+            train_mode=True):
+    """ If some variables don't in the path of computing heads, we set the heads grad of them to zero
+    instead of throwing exceptions.
+
+    The autograd.grad requires user knows which variables involved to compute the heads grad of them.
+    That's fine for first order grad, but for higher order grad, the variables used to compute the heads,
+    may not used to computer their higher order grad. It's impossible to ask user to know

Review comment:
       nit : `may not used to compute their higher order grad`

##########
File path: tests/cpp/operator/runner/core_op_runner_test.cc
##########
@@ -46,7 +46,15 @@ static const std::vector<std::pair<std::string, std::string>> test_unary_operato
 
 static const std::vector<std::pair<std::string, std::string>> test_binary_operators = {
   { "elemwise_add", "_backward_add" },
-  { "elemwise_mul", "_backward_mul" }
+  // TODO(Deng, Wenqi): In https://github.com/apache/incubator-mxnet/pull/17754,
+  //  we have changed backward op to graph of ops for computing backward of elemwise_mul,
+  //  but the CoreOpExecutor in tests/cpp/include/test_core_op.h actually has issues
+  //  to support this way even it provides CoreOpExecutor::GetBackward for the case.
+  //  e.g: It actually assumes there is one backward for all kinds of op, but elemwise_mul has two.
+  //  It will get wrong dependency for the second backward in CoreOpExecutor::GetBackwardDependency
+  //  due to "return igrad_entries[0].node;" //  and failed to call CoreOpExecutor::bwd_inputs()
+  //  and CoreOpExecutor::bwd_outpuss() due to "CHECK_EQ(backward_.size(), 1U)";.
+//  { "elemwise_mul", "_backward_mul" }

Review comment:
       I don't know much about this, but I don't think removing an existing test would be a good idea.
   
   @apeforest @larroy @sxjscience would be better people to take the call.

##########
File path: tests/python/unittest/test_higher_order_grad.py
##########
@@ -570,6 +571,290 @@ def check_nth_order_unary(x, op, grad_ops, orders, rtol=None, atol=None):
         assert_almost_equal(
             expected_grad, computed_grad.asnumpy(), rtol=rtol, atol=atol)
 
+@with_seed()
+def test_elemwise_sub():
+    def sub(inputs):
+        return nd.elemwise_sub(inputs[0], inputs[1])
+    def grad_op(inputs):
+        return  [nd.ones_like(inputs[0]), nd.negative(nd.ones_like(inputs[1]))]
+    def grad_grad_op(inputs):
+        return  [nd.zeros_like(inputs[0]),  nd.zeros_like(inputs[1])]
+
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x, y = random_arrays(shape, shape)
+        check_nth_order_binary([x, y], sub, [grad_op, grad_grad_op], [1,  2])
+
+@with_seed()
+def test_elemwise_mul():
+    def mul(inputs):
+        return nd.elemwise_mul(inputs[0], inputs[1])
+    def grad_op(inputs):
+        return  [inputs[1], inputs[0]]
+    def grad_grad_op(inputs):
+        return [nd.zeros_like(inputs[0]) ,nd.zeros_like(inputs[1])]
+
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x, y = random_arrays(shape, shape)
+        check_nth_order_binary([x, y], mul, [grad_op, grad_grad_op], [1,  2])
+
+@with_seed()
+def test_power():
+    def power(inputs):
+        return nd.power(inputs[0], inputs[1])
+
+    def grad_op(inputs):
+        x, y = inputs
+        return  [y * nd.power(x, y - 1), nd.power(x, y) * nd.log(x)]
+
+    def grad_grad_op(inputs):
+        x, y = inputs
+        return   [y * (y - 1) * nd.power(x, y - 2), nd.power(x, y) * (nd.log(x) ** 2)]
+
+    def grad_grad_grad_op(inputs):
+        x, y = inputs
+        return   [y * (y - 1) * (y - 2) * nd.power(x, y - 3), nd.power(x, y) * (nd.log(x) ** 3)]
+
+    low = 1.0
+    high = 3.0
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x = nd.random.uniform(low, high, shape)
+        y = nd.random.uniform(low, high, shape)
+        check_nth_order_binary([x, y], power, [grad_op, grad_grad_op, grad_grad_grad_op], [1, 2, 3])
+
+#  based on gen_broadcast_data in test_operation.py
+def gen_broadcast_shape(idx):
+    # Manually set test cases
+    binary_op_data_shape = nd.array(
+        [[[2, 5, 1, 30, 7], [1, 5, 448, 30, 1]],
+         [[10, 49, 1, 77, 17], [10, 1, 2, 1, 17]],
+         [[13, 2, 65, 2,  1], [13, 1, 65, 1, 225]],
+         [[9, 434, 4, 2, 37], [9, 1, 4, 1, 37]],
+         [[2, 52, 1, 4, 1], [1, 52, 60, 1, 37]],
+         [[1, 23, 7, 122, 50], [2, 1, 7, 1, 50]],
+         [[1, 17, 1, 5, 1], [22, 1, 2, 1, 28]],
+         [[29, 1, 2, 1, 8], [29, 22, 1, 130, 1]],
+         [[2, 36, 1, 427, 3], [1, 36, 11, 427, 1]],
+         [[1, 2, 1, 100, 7], [1, 2, 448, 100, 1]],
+         [[1, 2, 495, 77, 7], [1, 2, 1, 1, 7]],
+         [[1, 43, 65, 2, 1], [1, 43, 65, 1, 225]],
+         [[1, 92, 434, 2, 2], [1, 92, 1, 2, 2]],
+         [[1, 92, 1, 4, 1], [1, 92, 134, 1, 17]],
+         [[1, 53, 2, 122, 143], [1, 1, 2, 1, 143]],
+         [[1, 179, 1, 87, 17], [1, 179, 1, 1, 17]],
+         [[1, 1, 17, 5, 1], [1, 22, 1, 1, 28]],
+         [[1, 2, 1, 1, 8], [1, 2, 52, 430, 1]],
+         [[1, 163, 1, 22, 3], [1, 163, 116, 22, 1]],
+         [[1, 1, 44, 30, 7], [1, 1, 44, 30, 1]],
+         [[1, 1, 1, 1, 28], [1, 127, 1, 5, 28]],
+         [[1, 2, 394, 38, 1], [1, 2, 394, 38, 16]],
+         [[1, 10, 49, 77, 17], [1, 1, 1, 1, 17]],
+         [[1, 431, 6, 2, 225], [1, 1, 6, 2, 225]],
+         [[1, 15, 1, 28, 1], [1, 15, 1, 28, 463]], [[1, 129, 2, 48, 96], [1, 129, 2, 1, 1]],
+         [[1, 1, 403, 17, 2], [1, 44, 403, 17, 2]],
+         [[1, 1, 65, 2, 22], [1, 1, 65, 1, 1]],
+         [[1, 24, 103, 17, 18], [1, 24, 1, 1, 1]],
+         [[1, 1, 1, 1, 2], [1, 24, 194, 50, 1]],
+         [[1, 1, 107, 84, 9], [1, 1, 1, 1, 1]]])
+    if idx < binary_op_data_shape.shape[0]:
+        l_shape = binary_op_data_shape[idx][0]
+        r_shape = binary_op_data_shape[idx][1]
+    else:
+        # Generate random data that has ndim between 1-7 and all the shape dims between 1-5
+        ndim = nd.random.randint(1, 6)
+        shape = nd.random.randint(1, 6, size=(ndim,))
+        l_same_dim = nd.random.randint(0, 5)
+        r_same_dim = nd.random.randint(0, 5)
+        l_axis_flags = nd.random.randint(0, 2, size=ndim)
+        r_axis_flags = nd.random.randint(0, 2, size=ndim)
+        if l_same_dim == 4:
+            l_axis_flags = nd.ones(ndim)
+        if r_same_dim == 4:
+            r_axis_flags = nd.ones(ndim)
+        l_shape = shape.copy()
+        r_shape = shape.copy()
+        l_shape[nd.where(l_axis_flags == 0)] = 1
+        r_shape[nd.where(r_axis_flags == 0)] = 1
+    return tuple(l_shape.asnumpy().astype(int)), tuple(r_shape.asnumpy().astype(int))
+
+# from test_operation.py
+def reduce_op(shape, x):
+    if shape == x.shape:
+        return x
+    keepdims_shape = list(x.shape)
+    for i in range(len(shape)):
+        if x.shape[i] != shape[i]:
+            keepdims_shape[i] = 1
+            x = nd.sum(x, axis=i).reshape(keepdims_shape)
+    return x
+
+@with_seed()
+def test_broadcast_power():
+    def broadcast_power(inputs):
+        return nd.broadcast_power(inputs[0], inputs[1])
+
+    def unreduced_grad_op(inputs):
+        x, y = inputs
+        return [y * nd.broadcast_power(x, y - 1), nd.broadcast_power(x, y) * nd.log(x)]
+
+    def unreduced_grad_grad_op(inputs):
+        x, y = inputs
+        return   [y * (y - 1) * nd.broadcast_power(x, y - 2), nd.broadcast_power(x, y) * (nd.log(x) ** 2)]
+
+    def unreduced_grad_grad_grad_op(inputs):
+        x, y = inputs
+        return   [y * (y - 1) * (y - 2) * nd.broadcast_power(x, y - 3), nd.broadcast_power(x, y) * (nd.log(x) ** 3)]
+
+    low = 1.0
+    high = 3.0
+    for dim in range(1, 5):
+        x_shape, y_shape = gen_broadcast_shape(dim)
+        x = nd.random.uniform(low, high, x_shape)
+        y = nd.random.uniform(low, high, y_shape)
+
+        check_nth_order_binary([x, y], broadcast_power, [unreduced_grad_op, unreduced_grad_grad_op,
+            unreduced_grad_grad_grad_op], [1, 2, 3], True, rtol=1e-3, atol=1e-5)
+
+def autograd_grad_ex(heads, variables, head_grads=None, retain_graph=None, create_graph=False,
+            train_mode=True):
+    """ If some variables don't in the path of computing heads, we set the heads grad of them to zero
+    instead of throwing exceptions.
+
+    The autograd.grad requires user knows which variables involved to compute the heads grad of them.
+    That's fine for first order grad, but for higher order grad, the variables used to compute the heads,
+    may not used to computer their higher order grad. It's impossible to ask user to know
+    the formulas of every order grad.
+
+    E.g. we use such code to compute 2-nd order gradient:
+      with autograd.record():
+          z = op(x, y)
+          head_grad = nd.ones_like(z)
+          dz_dx, _  = autograd.grad(heads=z, variables=[x, y], head_grads=nd.ones_like(z),
+                              create_graph=True, retain_graph=True)
+          d2z_d2x, _  = autograd.grad(heads=dz_dx, variables=[x, y], head_grads=nd.ones_like(dz_dx),
+                              create_graph=True, retain_graph=True)
+    If z = x * y, because d2z_d2x = 0, MXNET will report the input is unreachable from the output.
+    But it seems in that case MXNET returns zeros is more reasonable.
+    """
+    # xxx: only consider one head currently
+    argument_names =  autograd.get_symbol(heads).list_arguments()
+
+    # XXX: in some cases, a variable may has more than one outputs, we need a other way ot get  the name of various.
+    # But in the unittest, it is fine
+    variable_names = [autograd.get_symbol(variable).list_outputs()[0] for variable in variables]
+    involved_variable_indexes = []
+    involved_variables = []
+    for i in range(0, len(variables)):
+        if variable_names[i] in argument_names:
+            involved_variables.append(variables[i])
+            involved_variable_indexes.append(i)
+
+    if involved_variables:
+        partial_grads = autograd.grad(heads, involved_variables, head_grads, retain_graph, create_graph, train_mode)
+    else:
+        partial_grads = []
+
+    grads = []
+    partial_grads_index = 0
+    for i in range(0, len(variables)):
+       if i in involved_variable_indexes:
+           grads.append(partial_grads[partial_grads_index])
+           partial_grads_index += 1
+       else:
+           grads.append(nd.zeros_like(variables[i]))
+    return grads
+
+
+def check_nth_order_binary(inputs, op, grad_ops, orders, broadcast_op = False, rtol=None, atol=None):
+    """Assert n-th order autograd gradient against expected gradient.
+
+    Multiple order of gradients can be checked by passing list of
+    function computing the particular order gradient and passing the corresponding list of order.
+    Note
+    ----
+    1. Orders should always be monotonically increasing.
+    2. Elements of grads_ops should correspond to elements of orders
+    i.e. grads_op = [grad_op, grad_grad_grad_op] should be passed with
+         orders = [1, 3]
+
+    Parameters
+    ----------
+    inputs : tuple of mxnet.NDArray (x, y)
+        Input Array.
+    op : Callable (x,y) -> z
+        Operation to perform on Input Array.
+    grad_ops : Callable or List of Callable
+        Function (x,y) -> (n_grad_x, n_grad_y) to compute and assert gradient of given order.
+    orders : int or List of int
+        Order/s to assert expected and computed gradients.
+
+    Returns
+    -------
+    None
+
+    """
+    if isinstance(orders, int):
+        orders = [orders]
+        grad_ops = [grad_ops]
+
+    assert all(i < j for i, j in zip(orders[0:-1], orders[1:])), \
+        "orders should be monotonically increasing"
+    assert len(set(orders)) == len(orders), \
+        "orders should have unique elements"
+    highest_order = max(orders)
+
+    inputs = [nd.array(input) for input in inputs]
+    for input in inputs:
+        input.attach_grad()
+
+    expected_grads = [grad_op(inputs) for grad_op in grad_ops]
+    computed_grads = []
+    head_grads = [[]]
+
+    # Perform compute.
+    with autograd.record():
+        z = op(inputs)
+        heads = [z for _ in inputs]
+        for current_order in range(1, highest_order+1):
+            grads = []
+            new_head_grads = []
+            new_heads = []
+            for i in range(0, len(heads)):
+                head = heads[i]
+                head_grad = nd.random.normal(shape=head.shape)
+                new_head_grads.append(head_grad)
+                grads.append(autograd_grad_ex(heads=head, variables=inputs, head_grads=head_grad,
+                                              create_graph=True, retain_graph=True)[i])
+                # If we only use once auto grad with head_grads = head_grad in every iteration,
+                # in the i-th iteration, we use head = derivative_(i-1) * head_grad_(i-1)
+                # but in the expected computed, we use  head = derivative_(i-1)
+                # why it is woks in the check_nth_order_unary?
+                # Because most of them defined gradient of the first gradient (derivative_(1) * head_grad_(1))
+                # of the function, and in the gradient function, they manually defined derivative_(i-1)
+                # and use it to computed derivative_(1) * head_grad_(1).
+                # It maybe a wrong approach, because the gradient of the first gradient should compute the grad of
+                # derivative_(1) * head_grad_(1) instead of gradient of derivative_(1).

Review comment:
        I think that is taken care of. (Though I'll think some more to wrap my head around it.)
   
    Like in the example below, `grad_grad_mid` considers the fact that `head_grad` was used in computing the first gradient:
   
https://github.com/apache/incubator-mxnet/blob/5fd90249cafd14910633d25d3bc370a522736ac7/src/operator/tensor/elemwise_unary_op_trig.cc#L94-L121
   
   
   cc @apeforest @larroy @sxjscience 
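   
   For reference, the way I read the bookkeeping (my own derivation, not something from the PR): for a unary `z = f(x)`, the first backward already returns `head_grad * f'(x)`, so the backward-of-backward has to differentiate that whole product and treat `head_grad` as one of its inputs:
   
   ```latex
   % Sketch of the terms a backward-of-backward must produce, writing \bar{z}
   % for the incoming head gradient and v for the gradient flowing into the
   % first backward's output (own notation, not from the PR):
   g_x = \bar{z}\, f'(x)
   \quad\Longrightarrow\quad
   \frac{\partial (v\, g_x)}{\partial x} = v\, \bar{z}\, f''(x),
   \qquad
   \frac{\partial (v\, g_x)}{\partial \bar{z}} = v\, f'(x)
   ```
   
   So `head_grad` shows up explicitly in the second-order term, which is what `grad_grad_mid` accounts for in the trig example linked above.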




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

