eric-haibin-lin commented on a change in pull request #7903: Refactor AdaGrad optimizer to support sparse tensors URL: https://github.com/apache/incubator-mxnet/pull/7903#discussion_r139759839
########## File path: python/mxnet/optimizer.py ########## @@ -664,27 +667,45 @@ class AdaGrad(Optimizer): ---------- eps: float, optional Small value to avoid division by 0. + """ def __init__(self, eps=1e-7, **kwargs): super(AdaGrad, self).__init__(**kwargs) self.float_stable_eps = eps def create_state(self, index, weight): - return zeros(weight.shape, weight.context) # history + return zeros(weight.shape, weight.context, stype=weight.stype) # history def update(self, index, weight, grad, state): assert(isinstance(weight, NDArray)) assert(isinstance(grad, NDArray)) self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - + save_grad_stype = grad.stype grad = grad * self.rescale_grad if self.clip_gradient is not None: grad = clip(grad, -self.clip_gradient, self.clip_gradient) history = state - history[:] += (grad * grad) - weight[:] += -lr * (grad / sqrt(history + self.float_stable_eps) + wd * weight) + save_history_stype = history.stype + + is_sparse = True if weight.stype != 'default' or grad.stype != 'default' else False + + if is_sparse: + history[:] = op.elemwise_add(history, op.square(grad)) + assert history.stype == save_history_stype + srt = op.sqrt(_internal._scatter_plus_scalar(history, self.float_stable_eps)) Review comment: use `scatter_plus(sparse.retain(history, indices))` instead of `scatter_plus(history)`? otherwise the scatter_plus is expensive ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services