tobecontinued commented on a change in pull request #17754:
URL: https://github.com/apache/incubator-mxnet/pull/17754#discussion_r419449575
########## File path: tests/python/unittest/test_higher_order_grad.py ##########
```diff
@@ -570,6 +571,290 @@ def check_nth_order_unary(x, op, grad_ops, orders, rtol=None, atol=None):
         assert_almost_equal(
             expected_grad, computed_grad.asnumpy(), rtol=rtol, atol=atol)
+@with_seed()
+def test_elemwise_sub():
+    def sub(inputs):
+        return nd.elemwise_sub(inputs[0], inputs[1])
+    def grad_op(inputs):
+        return [nd.ones_like(inputs[0]), nd.negative(nd.ones_like(inputs[1]))]
+    def grad_grad_op(inputs):
+        return [nd.zeros_like(inputs[0]), nd.zeros_like(inputs[1])]
+
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x, y = random_arrays(shape, shape)
+        check_nth_order_binary([x, y], sub, [grad_op, grad_grad_op], [1, 2])
+
+@with_seed()
+def test_elemwise_mul():
+    def mul(inputs):
+        return nd.elemwise_mul(inputs[0], inputs[1])
+    def grad_op(inputs):
+        return [inputs[1], inputs[0]]
+    def grad_grad_op(inputs):
+        return [nd.zeros_like(inputs[0]), nd.zeros_like(inputs[1])]
+
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x, y = random_arrays(shape, shape)
+        check_nth_order_binary([x, y], mul, [grad_op, grad_grad_op], [1, 2])
+
+@with_seed()
+def test_power():
+    def power(inputs):
+        return nd.power(inputs[0], inputs[1])
+
+    def grad_op(inputs):
+        x, y = inputs
+        return [y * nd.power(x, y - 1), nd.power(x, y) * nd.log(x)]
+
+    def grad_grad_op(inputs):
+        x, y = inputs
+        return [y * (y - 1) * nd.power(x, y - 2), nd.power(x, y) * (nd.log(x) ** 2)]
+
+    def grad_grad_grad_op(inputs):
+        x, y = inputs
+        return [y * (y - 1) * (y - 2) * nd.power(x, y - 3), nd.power(x, y) * (nd.log(x) ** 3)]
+
+    low = 1.0
+    high = 3.0
+    for dim in range(1, 5):
+        shape = rand_shape_nd(dim)
+        x = nd.random.uniform(low, high, shape)
+        y = nd.random.uniform(low, high, shape)
+        check_nth_order_binary([x, y], power, [grad_op, grad_grad_op, grad_grad_grad_op], [1, 2, 3])
+
+# based on gen_broadcast_data in test_operation.py
+def gen_broadcast_shape(idx):
+    # Manually set test cases
+    binary_op_data_shape = nd.array(
+        [[[2, 5, 1, 30, 7], [1, 5, 448, 30, 1]],
+         [[10, 49, 1, 77, 17], [10, 1, 2, 1, 17]],
+         [[13, 2, 65, 2, 1], [13, 1, 65, 1, 225]],
+         [[9, 434, 4, 2, 37], [9, 1, 4, 1, 37]],
+         [[2, 52, 1, 4, 1], [1, 52, 60, 1, 37]],
+         [[1, 23, 7, 122, 50], [2, 1, 7, 1, 50]],
+         [[1, 17, 1, 5, 1], [22, 1, 2, 1, 28]],
+         [[29, 1, 2, 1, 8], [29, 22, 1, 130, 1]],
+         [[2, 36, 1, 427, 3], [1, 36, 11, 427, 1]],
+         [[1, 2, 1, 100, 7], [1, 2, 448, 100, 1]],
+         [[1, 2, 495, 77, 7], [1, 2, 1, 1, 7]],
+         [[1, 43, 65, 2, 1], [1, 43, 65, 1, 225]],
+         [[1, 92, 434, 2, 2], [1, 92, 1, 2, 2]],
+         [[1, 92, 1, 4, 1], [1, 92, 134, 1, 17]],
+         [[1, 53, 2, 122, 143], [1, 1, 2, 1, 143]],
+         [[1, 179, 1, 87, 17], [1, 179, 1, 1, 17]],
+         [[1, 1, 17, 5, 1], [1, 22, 1, 1, 28]],
+         [[1, 2, 1, 1, 8], [1, 2, 52, 430, 1]],
+         [[1, 163, 1, 22, 3], [1, 163, 116, 22, 1]],
+         [[1, 1, 44, 30, 7], [1, 1, 44, 30, 1]],
+         [[1, 1, 1, 1, 28], [1, 127, 1, 5, 28]],
+         [[1, 2, 394, 38, 1], [1, 2, 394, 38, 16]],
+         [[1, 10, 49, 77, 17], [1, 1, 1, 1, 17]],
+         [[1, 431, 6, 2, 225], [1, 1, 6, 2, 225]],
+         [[1, 15, 1, 28, 1], [1, 15, 1, 28, 463]],
+         [[1, 129, 2, 48, 96], [1, 129, 2, 1, 1]],
+         [[1, 1, 403, 17, 2], [1, 44, 403, 17, 2]],
+         [[1, 1, 65, 2, 22], [1, 1, 65, 1, 1]],
+         [[1, 24, 103, 17, 18], [1, 24, 1, 1, 1]],
+         [[1, 1, 1, 1, 2], [1, 24, 194, 50, 1]],
+         [[1, 1, 107, 84, 9], [1, 1, 1, 1, 1]]])
+    if idx < binary_op_data_shape.shape[0]:
+        l_shape = binary_op_data_shape[idx][0]
+        r_shape = binary_op_data_shape[idx][1]
+    else:
+        # Generate random data that has ndim between 1-7 and all the shape dims between 1-5
+        ndim = nd.random.randint(1, 6)
+        shape = nd.random.randint(1, 6, size=(ndim,))
+        l_same_dim = nd.random.randint(0, 5)
+        r_same_dim = nd.random.randint(0, 5)
+        l_axis_flags = nd.random.randint(0, 2, size=ndim)
+        r_axis_flags = nd.random.randint(0, 2, size=ndim)
+        if l_same_dim == 4:
+            l_axis_flags = nd.ones(ndim)
+        if r_same_dim == 4:
+            r_axis_flags = nd.ones(ndim)
+        l_shape = shape.copy()
+        r_shape = shape.copy()
+        l_shape[nd.where(l_axis_flags == 0)] = 1
+        r_shape[nd.where(r_axis_flags == 0)] = 1
+    return tuple(l_shape.asnumpy().astype(int)), tuple(r_shape.asnumpy().astype(int))
+
+# from test_operation.py
+def reduce_op(shape, x):
+    if shape == x.shape:
+        return x
+    keepdims_shape = list(x.shape)
+    for i in range(len(shape)):
+        if x.shape[i] != shape[i]:
+            keepdims_shape[i] = 1
+            x = nd.sum(x, axis=i).reshape(keepdims_shape)
+    return x
+
+@with_seed()
+def test_broadcast_power():
+    def broadcast_power(inputs):
+        return nd.broadcast_power(inputs[0], inputs[1])
+
+    def unreduced_grad_op(inputs):
+        x, y = inputs
+        return [y * nd.broadcast_power(x, y - 1), nd.broadcast_power(x, y) * nd.log(x)]
+
+    def unreduced_grad_grad_op(inputs):
+        x, y = inputs
+        return [y * (y - 1) * nd.broadcast_power(x, y - 2), nd.broadcast_power(x, y) * (nd.log(x) ** 2)]
+
+    def unreduced_grad_grad_grad_op(inputs):
+        x, y = inputs
+        return [y * (y - 1) * (y - 2) * nd.broadcast_power(x, y - 3), nd.broadcast_power(x, y) * (nd.log(x) ** 3)]
+
+    low = 1.0
+    high = 3.0
+    for dim in range(1, 5):
+        x_shape, y_shape = gen_broadcast_shape(dim)
+        x = nd.random.uniform(low, high, x_shape)
+        y = nd.random.uniform(low, high, y_shape)
+
+        check_nth_order_binary([x, y], broadcast_power, [unreduced_grad_op, unreduced_grad_grad_op,
+                               unreduced_grad_grad_grad_op], [1, 2, 3], True, rtol=1e-3, atol=1e-5)
+
+def autograd_grad_ex(heads, variables, head_grads=None, retain_graph=None, create_graph=False,
+                     train_mode=True):
+    """If some variables are not on the path that computes heads, set their gradients to zero
+    instead of raising an exception.
+
+    autograd.grad requires the user to know which variables are involved in computing the heads.
+    That is fine for the first-order gradient, but for higher-order gradients the variables used
+    to compute the heads may not be used to compute their higher-order gradients. It is impossible
+    to ask the user to know the formulas of every order of gradient.
+
+    E.g. we use such code to compute the 2nd-order gradient:
+        with autograd.record():
+            z = op(x, y)
+            head_grad = nd.ones_like(z)
+            dz_dx, _ = autograd.grad(heads=z, variables=[x, y], head_grads=nd.ones_like(z),
+                                     create_graph=True, retain_graph=True)
+            d2z_d2x, _ = autograd.grad(heads=dz_dx, variables=[x, y], head_grads=nd.ones_like(dz_dx),
+                                       create_graph=True, retain_graph=True)
+    If z = x * y, then because d2z_d2x = 0, MXNet reports that the input is unreachable from the
+    output. In that case it seems more reasonable for MXNet to return zeros.
+    """
+    # xxx: only consider one head currently
+    argument_names = autograd.get_symbol(heads).list_arguments()
+
+    # XXX: in some cases a variable may have more than one output, so we would need another way
+    # to get the variable's name. But in the unit tests it is fine.
+    variable_names = [autograd.get_symbol(variable).list_outputs()[0] for variable in variables]
+    involved_variable_indexes = []
+    involved_variables = []
+    for i in range(0, len(variables)):
+        if variable_names[i] in argument_names:
+            involved_variables.append(variables[i])
+            involved_variable_indexes.append(i)
+
+    if involved_variables:
+        partial_grads = autograd.grad(heads, involved_variables, head_grads, retain_graph, create_graph, train_mode)
+    else:
+        partial_grads = []
+
+    grads = []
+    partial_grads_index = 0
+    for i in range(0, len(variables)):
+        if i in involved_variable_indexes:
+            grads.append(partial_grads[partial_grads_index])
+            partial_grads_index += 1
+        else:
+            grads.append(nd.zeros_like(variables[i]))
+    return grads
+
+
+def check_nth_order_binary(inputs, op, grad_ops, orders, broadcast_op=False, rtol=None, atol=None):
+    """Assert n-th order autograd gradient against expected gradient.
+
+    Multiple orders of gradients can be checked by passing a list of
+    functions computing the particular order gradient and the corresponding list of orders.
+
+    Note
+    ----
+    1. Orders should always be monotonically increasing.
+    2. Elements of grad_ops should correspond to elements of orders,
+       i.e. grad_ops = [grad_op, grad_grad_grad_op] should be passed with
+       orders = [1, 3]
+
+    Parameters
+    ----------
+    inputs : tuple of mxnet.NDArray (x, y)
+        Input Array.
+    op : Callable (x,y) -> z
+        Operation to perform on Input Array.
+    grad_ops : Callable or List of Callable
+        Function (x,y) -> (n_grad_x, n_grad_y) to compute and assert gradient of given order.
+    orders : int or List of int
+        Order/s to assert expected and computed gradients.
+
+    Returns
+    -------
+    None
+
+    """
+    if isinstance(orders, int):
+        orders = [orders]
+        grad_ops = [grad_ops]
+
+    assert all(i < j for i, j in zip(orders[0:-1], orders[1:])), \
+        "orders should be monotonically increasing"
+    assert len(set(orders)) == len(orders), \
+        "orders should have unique elements"
+    highest_order = max(orders)
+
+    inputs = [nd.array(input) for input in inputs]
+    for input in inputs:
+        input.attach_grad()
+
+    expected_grads = [grad_op(inputs) for grad_op in grad_ops]
+    computed_grads = []
+    head_grads = [[]]
+
+    # Perform compute.
+    with autograd.record():
+        z = op(inputs)
+        heads = [z for _ in inputs]
+        for current_order in range(1, highest_order+1):
+            grads = []
+            new_head_grads = []
+            new_heads = []
+            for i in range(0, len(heads)):
+                head = heads[i]
+                head_grad = nd.random.normal(shape=head.shape)
+                new_head_grads.append(head_grad)
+                grads.append(autograd_grad_ex(heads=head, variables=inputs, head_grads=head_grad,
+                                              create_graph=True, retain_graph=True)[i])
+            # If we only used autograd.grad once with head_grads = head_grad in every iteration,
+            # then in the i-th iteration we would use head = derivative_(i-1) * head_grad_(i-1),
+            # but in the expected computation we use head = derivative_(i-1).
+            # Why does it work in check_nth_order_unary?
+            # Because most operators define the gradient of the first gradient
+            # (derivative_(1) * head_grad_(1)) of the function, and in that gradient function they
+            # manually define derivative_(i-1) and use it to compute derivative_(1) * head_grad_(1).
+            # It may be a wrong approach, because the gradient of the first gradient should compute
+            # the gradient of derivative_(1) * head_grad_(1) instead of the gradient of derivative_(1).
```
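For context, the failure the `autograd_grad_ex` docstring describes is easy to reproduce with its own `z = x * y` example. The snippet below is only an illustrative sketch, not part of the PR diff:

```python
# Sketch (not from the PR): reproduce the "unreachable input" case that
# autograd_grad_ex works around.
from mxnet import nd, autograd

x = nd.array([1.0, 2.0])
y = nd.array([3.0, 4.0])
x.attach_grad()
y.attach_grad()
with autograd.record():
    z = x * y
    dz_dx, dz_dy = autograd.grad(heads=z, variables=[x, y], head_grads=nd.ones_like(z),
                                 create_graph=True, retain_graph=True)
    # dz_dx is just y, so x no longer appears in the graph that produced it.
    # Asking for the second-order gradient therefore reports x as unreachable
    # instead of returning zeros, which is what autograd_grad_ex papers over:
    # autograd.grad(heads=dz_dx, variables=[x, y], head_grads=nd.ones_like(dz_dx),
    #               create_graph=True, retain_graph=True)
```

With `autograd_grad_ex`, the same request simply yields `nd.zeros_like(x)` for `x`, while `y` still gets its proper gradient.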
Review comment:
I rechecked the code. You are right, `check_nth_order_unary` is correct, because of
https://github.com/apache/incubator-mxnet/blob/8c76631caa5f349aef5d515ec95a64d2954dbcef/tests/python/unittest/test_higher_order_grad.py#L564-L566
Thx!
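As a side note for readers, the bookkeeping discussed in the comment block above can be seen on a tiny unary example. This is a hypothetical sketch of the relationship, not the code at the linked lines: when every `autograd.grad` call is fed a fresh random `head_grad`, the i-th result equals the analytic i-th derivative times the product of all head grads used so far, so the expected values have to be scaled by that product.

```python
# Sketch (hypothetical example): chaining autograd.grad with a random head_grad
# at every order scales the computed result by the product of those head grads,
# so the analytic derivative has to be scaled the same way before comparing.
from mxnet import nd, autograd

x = nd.random.normal(shape=(3,))
x.attach_grad()
with autograd.record():
    head = nd.sin(x)
    scale = nd.ones_like(x)
    # analytic 1st and 2nd derivatives of sin(x)
    for expected_derivative in [nd.cos(x), -nd.sin(x)]:
        head_grad = nd.random.normal(shape=x.shape)
        head = autograd.grad(heads=head, variables=[x], head_grads=head_grad,
                             create_graph=True, retain_graph=True)[0]
        scale = scale * head_grad
        assert nd.max(nd.abs(head - expected_derivative * scale)).asscalar() < 1e-4
```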