SINGA-387 Modified the design of the autograd backward engine and corrected some mistakes in it
another solution:
- let Dummy be involved in the dependency counting system
- modified the former backward function

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/2fea345c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/2fea345c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/2fea345c

Branch: refs/heads/master
Commit: 2fea345cf13df1e3a2511c5bb80732647abc4b45
Parents: 6c28abd
Author: xuewanqi <[email protected]>
Authored: Fri Aug 10 05:12:31 2018 +0000
Committer: xuewanqi <[email protected]>
Committed: Mon Aug 13 06:03:28 2018 +0000

----------------------------------------------------------------------
 python/singa/autograd.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/2fea345c/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 007af27..94214fc 100755
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -44,14 +44,15 @@ def infer_dependency(op):
     a Counter instance with the operation as the key,
     and the number of operations that are depending on it as the value
     '''
-    # dependency = {}
+    # not count the dependency of current op.
+    # if the current op is not a terminal op, then this function may just
+    # count dependency of a branch.
     dependency_count = Counter()
     queue = deque([op])
     while len(queue) > 0:
         cur_op = queue.pop()
         for src_op, _, _, _ in cur_op.src:
-            if src_op not in dependency_count and \
-                    (not isinstance(src_op, Dummy)):
+            if src_op not in dependency_count:
                 # dependency[src_op] = [Counter() for _ in src_op.y_id2idx]
                 dependency_count[src_op] = 0
                 queue.append(src_op)
@@ -64,10 +65,7 @@ def gradients(y, dy=None):
     grads = {}  # mapping: x->dx if x.stores_grad
     for p, dp in backward(y, dy):
-        if not grads.has_key(p):
-            grads[p] = dp
-        else:
-            grads[p] += dp
+        grads[p] = dp
     return grads
@@ -142,20 +140,21 @@ def backward(y, dy=None):
                     # add the gradient from another children operation that
                     # uses y_idx'th output of src_op as input arg
                     dxs[y_idx] += dx
-            if y_stores_grad:
-                # store the gradient for final return, e.g. if x is parameter
-
-                # g = not_ready[src_op][y_idx]
+
+            dependency[src_op] -= 1
-                g = dx  # connot confirm that the gradient of a parameter is calculated completely. May disobey some optimize algorithms as the engine transmit
-                # a gradient (partly) once it is calculated which may cause wrongly records of some optimizer parameters.
+            if y_stores_grad:
+                if dependency[src_op] == 0:
+                    # store the gradient for final return, e.g. if x is parameter
+                    # may cause a delay output, as only after src_op is ready then output, not the current outlet of src_op is ready then output.
+                    g = not_ready[src_op][y_idx]
+                    tg = Tensor(device=g.device(), data=g)
+                    yield (y, tg)
-                tg = Tensor(device=g.device(), data=g)
-                yield (y, tg)
-            dependency[src_op] -= 1
             if src_op.requires_grad is True:
                 if dependency[src_op] == 0:
                     if not isinstance(src_op, Dummy):
+                        #Dummy can be in not_ready list but cannot be in ready list.
                         ready.append((src_op, not_ready[src_op]))
                     del not_ready[src_op]
         del op  # delete the operation to free all tensors from this op
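
Note: the snippet below is a minimal, self-contained sketch of the dependency-counting idea described by the new comments in infer_dependency(). The Op class, the toy graph, and all names in it are hypothetical stand-ins rather than SINGA's real autograd.Operation/Dummy classes; only the Counter/deque traversal mirrors the patched function.

    # Minimal sketch (hypothetical Op class, not SINGA's autograd.Operation)
    # of how infer_dependency() counts, for each upstream operation, how many
    # downstream operations consume its outputs. As the new comment in the
    # patch notes, counting starts from the op that is passed in, so a
    # non-terminal op only yields the counts of its own branch.
    from collections import Counter, deque


    class Op:
        def __init__(self, name, src=()):
            self.name = name
            # mimic the (src_op, x_id, y, y_stores_grad) tuples in cur_op.src
            self.src = [(s, None, None, False) for s in src]

        def __repr__(self):
            return self.name


    def infer_dependency(op):
        dependency_count = Counter()
        queue = deque([op])
        while len(queue) > 0:
            cur_op = queue.pop()
            for src_op, _, _, _ in cur_op.src:
                if src_op not in dependency_count:
                    dependency_count[src_op] = 0
                    queue.append(src_op)
                # every edge into src_op is one more consumer to wait for
                dependency_count[src_op] += 1
        return dependency_count


    # w feeds two ops, so its gradient must not be emitted until both
    # contributions have been accumulated, i.e. until its count drops to 0.
    w = Op('w')
    a = Op('a', src=[w])
    b = Op('b', src=[w, a])
    loss = Op('loss', src=[b])
    print(infer_dependency(loss))  # e.g. Counter({w: 2, b: 1, a: 1})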

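Note: the change in gradients() from has_key()/accumulation to a plain grads[p] = dp depends on the reordering in backward(): a parameter's gradient is now yielded only once the dependency count of its src_op reaches zero, i.e. after every partial gradient has been added in (dict.has_key() was also removed in Python 3, so the old code only ran under Python 2). The toy generator below, with made-up names and data, illustrates that contract; it is not SINGA code.

    # Toy illustration of the contract the patched backward() now provides:
    # each parameter's gradient is yielded exactly once, after all partial
    # contributions have been accumulated, so gradients() can simply assign
    # instead of summing duplicate entries.
    from collections import defaultdict


    def fake_backward():
        partial = defaultdict(float)
        pending = {'w': 2, 'b': 1}             # dependency counts, as in the patch
        contributions = [('w', 1.0), ('w', 2.5), ('b', 0.5)]
        for name, dx in contributions:
            partial[name] += dx                # accumulate, like dxs[y_idx] += dx
            pending[name] -= 1                 # like dependency[src_op] -= 1
            if pending[name] == 0:             # emit only when fully accumulated
                yield name, partial[name]


    def gradients():
        grads = {}
        for p, dp in fake_backward():
            grads[p] = dp                      # no '+=' needed any more
        return grads


    print(gradients())  # {'w': 3.5, 'b': 0.5}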