This is an automated email from the ASF dual-hosted git repository.

wangwei pushed a commit to branch dev
in repository https://gitbox.apache.org/repos/asf/singa.git


The following commit(s) were added to refs/heads/dev by this push:
     new fc077bf  add function comments for autograd
     new 1820075  Merge pull request #649 from joddiy/fix_autograd_doc
fc077bf is described below

commit fc077bfa5c492f0bd35dba2bcdeb49216fae7f04
Author: joddiy <[email protected]>
AuthorDate: Thu Apr 2 19:25:14 2020 +0800

    add function comments for autograd
---
 python/singa/autograd.py | 2211 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 1755 insertions(+), 456 deletions(-)

diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 17ce07e..4e7593f 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -362,6 +362,9 @@ class Dummy(Operation):
 
 
 class Mean(Operation):
+    """
+    Element-wise mean of each of the input CTensors.
+    """
 
     def __init__(self):
         super(Mean, self).__init__()
@@ -369,10 +372,9 @@ class Mean(Operation):
     def forward(self, *l):
         """
         Args:
-            l: a list of CTensor
-            element-wise mean operator
+            l (a list of CTensor): a list of CTensor for element-wise mean.
         Returns:
-            a new CTensor
+            a new CTensor.
         """
         if training:
             self.l = len(l)
@@ -386,18 +388,29 @@ class Mean(Operation):
     def backward(self, dy):
         """
         Args:
-            dy(CTensor): dL / dy
+            dy (CTensor): dL / dy.
         Returns:
-            a list of dx(CTensor)
+            a list of dx (CTensor).
         """
         return [singa.MultFloat(dy, 1 / self.l)] * self.l
 
 
 def mean(*l):
+    """
+    Element-wise mean of each of the input tensors.
+    Args:
+        l (a list of Tensor): element-wise mean operator.
+    Returns:
+        a new Tensor.
+    """
     return Mean()(*l)[0]
 
 
 class ReLU(Operation):
+    """
+    Relu means rectified linear function, i.e., y = max(0, x) is applied to the
+    CTensor elementwise.
+    """
 
     def __init__(self):
         super(ReLU, self).__init__()
@@ -405,9 +418,9 @@ class ReLU(Operation):
     def forward(self, x):
         """
         Args:
-            x(CTensor): input tensor
+            x (CTensor): input tensor.
         Returns:
-            a new CTensor whose element y = x if x >= 0; otherwise 0;
+            a new CTensor whose element y = x if x >= 0; otherwise 0.
         """
         if training:
             self.input = x
@@ -416,30 +429,37 @@ class ReLU(Operation):
     def backward(self, dy):
         """
         Args:
-            dy(CTensor): dL / dy
+            dy (CTensor): dL / dy.
         Returns:
-            dx(CTensor): dL / dx = dy if x >= 0; otherwise 0;
+            dx (CTensor): dL / dx = dy if x >= 0; otherwise 0.
         """
         return singa.ReLUBackward(dy, self.input)
 
 
 def relu(x):
+    """
+    Relu means rectified linear function, i.e., y = max(0, x) is applied to the
+    CTensors elementwise.
+    Args:
+        x (Tensor): input tensor.
+    Returns:
+        a new Tensor whose element y = x if x >= 0; otherwise 0.
+    """
     return ReLU()(x)[0]
 
 
 class Less(Operation):
+    """
+    Returns the tensor resulted from performing the less logical operation 
+    elementwise on the input CTensors x and y.
+    """
 
     def __init__(self):
         super(Less, self).__init__()
 
     def forward(self, x, y):
-        """Do forward propgation.
-        Store the [x<y] if requires gradient.
-        Args:
-            x (CTensor): matrix
-            y (CTensor): matrix
-        Returns:
-            a CTensor for the result
+        """
+        Return a<b, where a and b are CTensor.
         """
         cur = singa.LTFloat(singa.__sub__(x, y), 0)
         if training:
@@ -449,18 +469,32 @@ class Less(Operation):
     def backward(self, dy):
         """
         Args:
-            dy (CTensor): data for the dL / dy, L is the loss
+            dy (CTensor): data for the dL / dy, L is the loss.
+        Raises:
+            AssertionError: no backward function for this operator.
         """
         assert False, ('no backward function for less')
 
 
 def less(x, y):
+    """
+    Return a<b, where a and b are CTensor.
+    """
     return Less()(x, y)[0]
 
 
 class Clip(Operation):
+    """
+    Clip operator limits the given input within an interval. The interval 
+    is specified by the inputs 'min' and 'max'.
+    """
 
     def __init__(self, min, max):
+        """
+        Args:
+            min (float): min value, under which element is replaced by min.
+            max (float): max value, above which element is replaced by max.
+        """
         super(Clip, self).__init__()
         self.max = max
         self.min = min
@@ -468,9 +502,9 @@ class Clip(Operation):
     def forward(self, x):
         """
         Args:
-            x(CTensor): input tensor
+            x (CTensor): input tensor
         Returns:
-            np.clip(x,min,max)
+            a new CTensor with np.clip(x,min,max)
         """
         self.mask = singa.Tensor(list(x.shape()), x.device())
         self.mask.SetFloatValue(1.0)
@@ -492,49 +526,77 @@ class Clip(Operation):
         return x
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): dL / dy
+        Returns:
+            dx (CTensor): dL / dx
+        """
         return singa.__mul__(dy, self.mask)
 
 
 def clip(x, min=None, max=None):
+    """
+    Clip operator limits the given input within an interval. The interval 
+    is specified by the inputs 'min' and 'max'.
+    Args:
+        x (Tensor): input tensor
+        min (float): Minimum value, under which element is replaced by min.
+        max (float): Maximum value, above which element is replaced by max.
+    Returns:
+        a new Tensor with np.clip(x,min,max).
+    """
     return Clip(min, max)(x)[0]
 
 
 class Identity(Operation):
+    """
+    Init an identity operator
+    """
 
     def __init__(self):
         super(Identity, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): input tensor.
+        Returns:
+            the same CTensor x.
+        """
         return x
 
     def backward(self, dy):
         """
         Args:
-            dy(CTensor): dL / dy
+            dy (CTensor): dL / dy.
         Returns:
-            dx(CTensor): dL / dx = dy;
+            dx (CTensor): dL / dx.
         """
         return dy
 
 
 def identity(x):
+    """
+    Init an identity operator.
+    Args:
+        x (Tensor): input tensor.
+    Returns:
+        the same Tensor with x.
+    """
     return Identity()(x)[0]
 
-
 class Matmul(Operation):
-    """For matrix multiplication"""
+    """
+    Init matrix multiplication operator.
+    """
 
     def __init__(self):
         super(Matmul, self).__init__()
 
     def forward(self, x, w):
-        """Do forward propgation.
-        Store the x(or w) if w(or x) requires gradient.
-        Args:
-            x (CTensor): matrix
-            w (CTensor): matrix
-        Returns:
-            a CTensor for the result
+        """
+        Return np.matmul(x,w), where x and w are CTensor.
         """
         if training:
             self.input = (x, w)
@@ -543,9 +605,9 @@ class Matmul(Operation):
     def backward(self, dy):
         """
         Args:
-            dy (CTensor): data for the dL / dy, L is the loss
+            dy (CTensor): data for the dL / dy, L is the loss.
         Returns:
-            a tuple for (dx, dw)
+            a tuple for (dx, dw).
         """
         return (
             singa.Mult(dy, singa.DefaultTranspose(self.input[1])),
@@ -554,22 +616,24 @@ class Matmul(Operation):
 
 
 def matmul(x, w):
+    """
+    Return np.matmul(x,w), where x and w are Tensor.
+    """
     return Matmul()(x, w)[0]
 
 
 class Greater(Operation):
+    """
+    Returns the tensor resulted from performing the greater logical 
+    operation elementwise on the input tensors A and B.
+    """
 
     def __init__(self):
         super(Greater, self).__init__()
 
     def forward(self, x, y):
-        """Do forward propgation.
-        Store the [x>y] if requires gradient.
-        Args:
-            x (CTensor): matrix
-            y (CTensor): matrix
-        Returns:
-            a CTensor for the result
+        """
+        Return a>b, where a and b are CTensor.
         """
         cur = singa.GTFloat(singa.__sub__(x, y), 0)
         if training:
@@ -579,12 +643,17 @@ class Greater(Operation):
     def backward(self, dy):
         """
         Args:
-            dy (CTensor): data for the dL / dy, L is the loss
+            dy (CTensor): data for the dL / dy, L is the loss.
+        Raises:
+            AssertionError: no backward function for this operator.
         """
         assert False, ('no backward function for greater')
 
 
 def greater(x, y):
+    """
+    Return a>b, where a and b are Tensor.
+    """
     return Greater()(x, y)[0]
 
 
@@ -597,7 +666,7 @@ class AddBias(Operation):
         """
         To indicate the calculation axis, 0 for row, 1 for column.
         Args:
-            axis: 0 or 1, default is 0.
+            axis (int): 0 or 1, default is 0.
         """
         super(AddBias, self).__init__()
         self.axis = axis
@@ -605,8 +674,8 @@ class AddBias(Operation):
     def forward(self, x, b):
         """
         Args:
-            x: matrix.
-            b: bias to be added.
+            x (CTensor): matrix.
+            b (CTensor): bias to be added.
         Return:
             the result Tensor
         """
@@ -631,16 +700,42 @@ class AddBias(Operation):
 
 
 def add_bias(x, b, axis=0):
+    """
+    Add Bias to each row / column of the Tensor, depending on the axis arg.
+    Args:
+        x (Tensor): matrix.
+        b (Tensor): bias to be added.
+        axis (int): 0 or 1, default is 0.
+    Return:
+        the result Tensor
+    """
     return AddBias(axis)(x, b)[0]
 
-
 class Reshape(Operation):
+    """
+    Reshape the input tensor similar to np.reshape. 
+    """
 
     def __init__(self, shape):
+        """
+        Args:
+            shape (list of int): Specified shape for output. At most one
+                dimension of the new shape can be -1. In this case, the 
+                value is inferred from the size of the tensor and the 
+                remaining dimensions. A dimension could also be 0, 
+                in which case the actual dimension value is unchanged 
+                (i.e. taken from the input tensor).
+        """
         super(Reshape, self).__init__()
         self.shape = list(shape)
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): matrix.
+        Return:
+            the result CTensor
+        """
         self._shape = x.shape()
         shape = self.shape
         # handle the shape with 0
@@ -655,19 +750,48 @@ class Reshape(Operation):
         return singa.Reshape(x, self.cache)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): dL / dy
+        Returns:
+            dx (CTensor): dL / dx
+        """
         return singa.Reshape(dy, self._shape)
 
 
-def reshape(a, shape):
-    return Reshape(shape)(a)[0]
+def reshape(x, shape):
+    """
+    Reshape the input tensor similar to np.reshape.
+    Args:
+        x (Tensor): matrix.
+        shape (list of int): Specified shape for output. At most one
+            dimension of the new shape can be -1. In this case, the 
+            value is inferred from the size of the tensor and the 
+            remaining dimensions. A dimension could also be 0, 
+            in which case the actual dimension value is unchanged 
+            (i.e. taken from the input tensor).
+    Return:
+        the result Tensor
+    """
+    return Reshape(shape)(x)[0]
 
 
 class PRelu(Operation):
+    """
+    PRelu applies the function f(x) = slope * x for x < 0, 
+    f(x) = x for x >= 0 to the data tensor elementwise.
+    """
 
     def __init__(self):
         super(PRelu, self).__init__()
 
     def forward(self, x, slope):
+        """
+        Args:
+            x (CTensor): matrix.
+        Return:
+            the result CTensor
+        """
         mask0 = singa.LTFloat(x, 0.0)
         res = singa.__mul__(x, mask0)
         res = singa.__mul__(res, slope)
@@ -682,6 +806,12 @@ class PRelu(Operation):
         return res
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): dL / dy
+        Returns:
+            dx (CTensor): dL / dx
+        """
         dx1mask = singa.GEFloat(self.input, 0.0)
         dx2 = singa.__mul__(self.mask0, self.slope)
         dx = singa.__add__(dx1mask, dx2)
@@ -697,15 +827,29 @@ class PRelu(Operation):
 
 
 def prelu(x, slope):
+    """
+    PRelu applies the function f(x) = slope * x for x < 0, 
+    f(x) = x for x >= 0 to the data tensor elementwise.
+    Args:
+        x (Tensor): matrix.
+    Return:
+        the result Tensor
+    """
     return PRelu()(x, slope)[0]
 
 
 class Add(Operation):
+    """
+    Performs element-wise binary addition.
+    """
 
     def __init__(self):
         super(Add, self).__init__()
 
     def forward(self, a, b):
+        """
+        Return a+b, where a and b are CTensor.
+        """
         res = singa.__add__(a, b)
         if training:
             self.shape0 = list(a.shape())
@@ -714,6 +858,13 @@ class Add(Operation):
         return res
 
     def backward(self, dy):
+        """
+        Args:
+            dy(CTensor): dL / dy
+        Return:
+            a tuple for (dx0, dx1), dx0 is data for dL / da, dx1 is data
+            for dL / db.
+        """
         dx0, dx1 = dy, dy
         if (type(dy) == float) or self.shape0 == self.shape1:
             assert self.shape0 == self.shape1, ('should have same shape')
@@ -725,18 +876,27 @@ class Add(Operation):
 
 
 def add(a, b):
+    """
+    Return a+b, where a and b are Tensor.
+    """
     return Add()(a, b)[0]
 
 
 class Elu(Operation):
-
-    def __init__(self, alpha=1):
+    """
+    f(x) = alpha * (exp(x) - 1.) for x < 0, f(x) = x for x >= 0., is applied 
to 
+    the tensor elementwise.
+    """
+    def __init__(self, alpha=1.):
+        """
+        Args:
+            alpha (float): Coefficient of ELU, default is 1.0
+        """
         super(Elu, self).__init__()
         self.alpha = alpha
 
     def forward(self, x):
-        """Do forward propgation.
-        Store the x if requires gradient.
+        """
         Args:
             x (CTensor): matrix
         Returns:
@@ -755,9 +915,9 @@ class Elu(Operation):
     def backward(self, dy):
         """
         Args:
-            dy (CTensor): data for the dL / dy, L is the loss
+            dy (CTensor): dL / dy
         Returns:
-            a tuple for dx
+            dx (CTensor): dL / dx
         """
         dx1mask = singa.LTFloat(self.input, 0.0)
         dx = singa.MultFloat(singa.Exp(self.input), self.alpha)
@@ -771,22 +931,30 @@ class Elu(Operation):
 
 
 def elu(x, alpha=1):
+    """
+    f(x) = alpha * (exp(x) - 1.) for x < 0, f(x) = x for x >= 0., is applied 
to 
+    the tensor elementwise.
+    Args:
+        x (Tensor): matrix
+        alpha (float): Coefficient of ELU, default is 1.0
+    Returns:
+        a Tensor for the result
+    """
     return Elu(alpha)(x)[0]
 
 
 class Equal(Operation):
-
+    """
+    Returns the tensor resulted from performing the equal logical operation 
+    elementwise on the input tensors x and y.
+    """
     def __init__(self):
         super(Equal, self).__init__()
 
     def forward(self, x, y):
-        """Do forward propgation.
-       Store the x if requires gradient.
-       Args:
-           x (CTensor): matrix
-       Returns:
-           a CTensor for the result
-       """
+        """
+        Return a==b, where a and b are CTensor.
+        """
         m = singa.__sub__(x, y)
         cur = singa.__mul__(singa.GEFloat(m, 0), singa.LEFloat(m, 0))
         return cur
@@ -795,24 +963,37 @@ class Equal(Operation):
         """
         Args:
             dy (CTensor): data for the dL / dy, L is the loss
+        Raises:
+            AssertionError: no backward function for this operator
         """
         assert False, ('no backward function for equal')
 
 
 def equal(x, y):
+    """
+    Return a==b, where a and b are Tensor.
+    """
     return Equal()(x, y)[0]
 
 
 class SeLU(Operation):
+    """
+    y = gamma * (alpha * e^x - alpha) for x <= 0, y = gamma * x for x > 0 
+    is applied to the tensor elementwise.
+    """
 
     def __init__(self, alpha=1.67326, gamma=1.0507):
+        """
+        Args:
+            alpha (float): Coefficient of SELU default to 1.67326
+            gamma (float): Coefficient of SELU default to 1.0507
+        """
         super(SeLU, self).__init__()
         self.alpha = alpha
         self.gamma = gamma
 
     def forward(self, x):
-        """Do forward propgation.
-        Store the x if x requires gradient.
+        """
         Args:
             x (CTensor): matrix
         Returns:
@@ -833,9 +1014,9 @@ class SeLU(Operation):
     def backward(self, dy):
         """
         Args:
-            dy (CTensor): data for the dL / dy, L is the loss
+            dy (CTensor): dL / dy
         Returns:
-            dx
+            dx (CTensor): dL / dx
         """
         dx1mask = singa.LEFloat(self.input, 0.0)
         dx1 = singa.MultFloat(singa.Exp(self.input), self.gamma * self.alpha)
@@ -850,6 +1031,16 @@ class SeLU(Operation):
 
 
 def selu(x, alpha=1.67326, gamma=1.0507):
+    """
+    y = gamma * (alpha * e^x - alpha) for x <= 0, y = gamma * x for x > 0 
+    is applied to the tensor elementwise.
+    Args:
+        x (Tensor): matrix
+        alpha (float): Coefficient of SELU default to 1.67326
+        gamma (float): Coefficient of SELU default to 1.0507
+    Returns:
+        a Tensor for the result
+    """
     return SeLU(alpha, gamma)(x)[0]
 
 
@@ -860,15 +1051,19 @@ class SoftMax(Operation):
     """
 
     def __init__(self, axis=1):
+        """
+        Args:
+            axis (int): axis of softmax, default to 1
+        """
         super(SoftMax, self).__init__()
         self.axis = axis
 
     def forward(self, x):
         """
         Args:
-            x(data): the input 1d or 2d tensor
+            x (CTensor): the input 1d or 2d tensor
         Returns:
-            the result Tensor
+            the result CTensor
         """
         self.output = singa.SoftMax(x, self.axis)
         return self.output
@@ -876,24 +1071,41 @@ class SoftMax(Operation):
     def backward(self, dy):
         """
         Args:
-            dy (CTensor): data for the dL / dy, L is the loss
+            dy (CTensor): dL / dy
         Returns:
-            dx (Ctensor): data for the dL / dx, L is the loss,
-            x is the input of current Opertion
+            dx (CTensor): dL / dx
         """
         return singa.SoftMaxBackward(dy, self.axis, self.output)
 
 
 def softmax(x, axis=1):
+    """
+    Apply SoftMax for each row of the Tensor or each column of the Tensor
+    according to the parameter axis.
+    Args:
+        x (Tensor): the input 1d or 2d tensor
+        axis (int): axis of softmax, default to 1
+    Returns:
+        the result Tensor
+    """
     return SoftMax(axis)(x)[0]
 
 
 class Sum(Operation):
+    """
+    Element-wise sum of each of the input tensors
+    """
 
     def __init__(self):
         super(Sum, self).__init__()
 
     def forward(self, *l):
+        """
+        Args:
+            l (a list of CTensor): element-wise sum operator
+        Returns:
+            a CTensor for the result
+        """
         if training:
             self.l = len(l)
         assert (len(l) > 0)
@@ -904,10 +1116,23 @@ class Sum(Operation):
         return x
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): dL / dy
+        Returns:
+            dx (CTensor): dL / dx
+        """
         return [dy] * self.l
 
 
 def sum(*l):
+    """
+    Element-wise sum of each of the input tensors
+    Args:
+        l (a list of Tensor): element-wise sum operator
+    Returns:
+        a Tensor for the result
+    """
     return Sum()(*l)[0]
 
 
@@ -1021,19 +1246,41 @@ def ctensor2numpy(x):
 
 
 class Flatten(Operation):
+    """
+    Flattens the input tensor into a 2D matrix. If input tensor has shape 
+    (d_0, d_1, ... d_n) then the output will have shape (d_0 X d_1 ... 
+    d_(axis-1), d_axis X d_(axis+1) ... X dn).
+    """
 
-    def __init__(self, start_axis=1):
+    def __init__(self, axis=1):
+        """
+        Args:
+            axis (int): Indicate up to which input dimensions (exclusive) 
+                should be flattened to the outer dimension of the output. The 
+                value for axis must be in the range [-r, r], where r is the 
+                rank of the input tensor. Negative value means counting 
+                dimensions from the back. When axis = 0, the shape of the 
+                output tensor is (1, (d_0 X d_1 ... d_n), where the shape 
+                of the input tensor is (d_0, d_1, ... d_n).
+        Returns:
+            the result CTensor
+        """
         super(Flatten, self).__init__()
-        # flatten all axis after (inclusive) start_axis
-        self.start_axis = start_axis
+        self.axis = axis
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): the input tensor
+        Returns:
+            the result CTensor
+        """
         self.shape = list(x.shape())
-        shape, axis = self.shape, self.start_axis
-        # the start_axis must be within this range (0, r-1)
+        shape, axis = self.shape, self.axis
+        # the axis must be within this range (0, r-1)
         assert axis <= len(
             shape
-        ) - 1 or axis >= 0, "the start_axis must be within (0, %d-1)" % len(
+        ) - 1 or axis >= 0, "the axis must be within (0, %d-1)" % len(
             shape)
         # calculate the new shape
         new_shape = (1, int(np.prod(shape))) if axis == 0 else (
@@ -1043,12 +1290,34 @@ class Flatten(Operation):
         return y
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): data for the dL / dy, L is the loss
+        Returns:
+            dx (CTensor): data for the dL / dx, L is the loss,
+        """
         dx = singa.Reshape(dy, self.shape)
         return dx
 
 
-def flatten(x):
-    return Flatten()(x)[0]
+def flatten(x, axis=1):
+    """
+    Flattens the input tensor into a 2D matrix. If input tensor has shape 
+    (d_0, d_1, ... d_n) then the output will have shape (d_0 X d_1 ... 
+    d_(axis-1), d_axis X d_(axis+1) ... X dn).
+    Args:
+        x (Tensor): the input tensor
+        axis (int): Indicate up to which input dimensions (exclusive) 
+            should be flattened to the outer dimension of the output. The 
+            value for axis must be in the range [-r, r], where r is the 
+            rank of the input tensor. Negative value means counting 
+            dimensions from the back. When axis = 0, the shape of the 
+            output tensor is (1, (d_0 X d_1 ... d_n), where the shape 
+            of the input tensor is (d_0, d_1, ... d_n).
+    Returns:
+        the result Tensor
+    """
+    return Flatten(axis)(x)[0]
 
 
 class Layer(object):
@@ -1112,8 +1381,18 @@ class Layer(object):
 
 
 class Linear(Layer):
+    """
+    Generate a Linear operator
+    """
 
     def __init__(self, in_features, out_features, bias=True):
+        """
+        Args:
+            in_features (int): the number of input features
+            out_features (int): the number of output features, i.e. the
+                size of the output dimension
+            bias: bool
+        """
         w_shape = (in_features, out_features)
         b_shape = (out_features,)
         self.bias = bias
@@ -1156,12 +1435,31 @@ class Linear(Layer):
 
 
 class Concat(Operation):
+    """
+    Concatenate a list of tensors into a single tensor. All input tensors must 
+    have the same shape, except for the dimension size of the axis to 
+    concatenate on.
+    """
 
     def __init__(self, axis=0):
+        """
+        Args:
+            axis (int): Which axis to concat on. A negative value means 
+                counting dimensions from the back. Accepted range is [-r, r-1] 
+                where r = rank(inputs).
+        Returns:
+            the result CTensor
+        """
         super(Concat, self).__init__()
         self.axis = axis
 
     def forward(self, *xs):
+        """
+        Args:
+            xs (a list of CTensor): List of tensors for concatenation
+        Returns:
+            a CTensor for the result
+        """
         if training:
             offset = 0
             self.slice_point = []
@@ -1172,6 +1470,12 @@ class Concat(Operation):
         return singa.ConcatOn(x, self.axis)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): data for the dL / dy, L is the loss
+        Returns:
+            dxs (a tuple of CTensor): data for the dL / dxs, L is the loss,
+        """
         assert hasattr(
             self, "slice_point"), "Please set training as True before do BP. "
         assert self.slice_point[-1] == dy.shape()[self.axis], "Shape mismatch."
@@ -1184,20 +1488,34 @@ class Concat(Operation):
 
 
 def cat(xs, axis=0):
-    # xs is a tuple of multiple Tensors
+    """
+    Concatenate a list of tensors into a single tensor. All input tensors must 
+    have the same shape, except for the dimension size of the axis to 
+    concatenate on.
+    Args:
+        xs (a list of Tensor): List of tensors for concatenation
+        axis (int): Which axis to concat on. A negative value means 
+            counting dimensions from the back. Accepted range is [-r, r-1] 
+            where r = rank(inputs).
+    Returns:
+        a Tensor for the result
+    """
     return Concat(axis)(*xs)[0]
 
 
 class _Conv2d(Operation):
+    """
+    Init a conv 2d operator
+    """
 
     def __init__(self, handle, odd_padding=(0, 0, 0, 0)):
         """
-        Init a conv 2d operator
         Args:
-            handle: ConvHandle for cpu or CudnnConvHandle for gpu
-        Args:
-            odd_padding:tuple of four bins, the odd paddding is the value that 
cannot be handled by the tuple padding (w, h) mode
-            so we need to firstly handle the input, then use the nomal padding 
method.
+            handle (object): ConvHandle for cpu or CudnnConvHandle for gpu
+            odd_padding (tuple of four ints): the odd padding is the value
+                that cannot be handled by the tuple padding (w, h) mode so 
+                we need to first handle the input, then use the normal
padding 
+                method.
         """
         super(_Conv2d, self).__init__()
         self.handle = handle
@@ -1207,13 +1525,10 @@ class _Conv2d(Operation):
 
     def forward(self, x, W, b=None):
         """
-        Do forward of conv
-        Args:
-            x: CTensor, input
         Args:
-            W: CTensor, weight
-        Args:
-            b: CTensor, bias
+            x (CTensor): input
+            W (CTensor): weight
+            b (CTensor): bias
         Returns:
             CTensor 
         """
@@ -1243,11 +1558,10 @@ class _Conv2d(Operation):
 
     def backward(self, dy):
         """
-        Do backward of conv
         Args:
-            dy: CTensor, gradient
+            dy (CTensor): dL / dy
         Returns:
-            CTensor 
+            dx (CTensor): dL / dx
         """
         assert training is True and hasattr(
             self, "inputs"), "Please set training as True before do BP. "
@@ -1282,16 +1596,14 @@ def conv2d(handle, x, W, b=None, odd_padding=(0, 0, 0, 
0)):
     """
     Conv 2d operator
     Args:
-        handle: ConvHandle for cpu or CudnnConvHandle for gpu
-    Args:
-        x: CTensor, input
-    Args:
-        W: CTensor, weight
-    Args:
-        b: CTensor, bias
-    Args:
-        odd_padding:tuple of four bins, the odd paddding is the value that 
cannot be handled by the tuple padding (w, h) mode
-        so we need to firstly handle the input, then use the nomal padding 
method.
+        handle (object): ConvHandle for cpu or CudnnConvHandle for gpu
+        x (Tensor): input
+        W (Tensor): weight
+        b (Tensor): bias
+        odd_padding (tuple of four ints): the odd padding is the value
+            that cannot be handled by the tuple padding (w, h) mode so 
+            we need to first handle the input, then use the normal padding
+            method.
     """
     if b is None:
         return _Conv2d(handle, odd_padding)(x, W)[0]
@@ -1300,6 +1612,9 @@ def conv2d(handle, x, W, b=None, odd_padding=(0, 0, 0, 
0)):
 
 
 class Conv2d(Layer):
+    """
+    Generate a Conv 2d operator
+    """
 
     def __init__(self,
                  in_channels,
@@ -1313,29 +1628,26 @@ class Conv2d(Layer):
                  pad_mode="NOTSET",
                  **kwargs):
         """
-        Generate a Conv 2d operator
-        Args:
-            in_channels: int, the channel of input
-        Args:
-            out_channels: int, the channel of output, also is the number of 
filters
-        Args:
-            kernel_size: int or tuple, kernel size for two direction of each 
axis. For example, (2, 3), the first 2 means will add 2 at the beginning and 
also 2 at the end for its axis.
-            and if a int is accepted, the kernel size will be inited as (int, 
int)
         Args:
-            stride: int or tuple, stride, the logic is the same as kernel size.
-        Args:
-            padding: int, tuple, list or None, padding, the logic is the same 
as kernel size. However, if you set pad_mode as "SAME_UPPER" or "SAME_LOWER" 
mode, 
-            you can set padding as None, and the padding will be computed 
automatically.
-        Args:
-            dilation: int, only support 1
-        Args:
-            group: int
-        Args:
-            bias: bool
-        Args:
-            pad_mode: string, can be NOTSET, SAME_UPPER, or SAME_LOWER, where 
default value is NOTSET, which means explicit padding is used.
-            SAME_UPPER or SAME_LOWER mean pad the input so that the output 
spatial size match the input.
-            In case of odd number add the extra padding at the end for 
SAME_UPPER and at the beginning for SAME_LOWER.
+            in_channels (int): the channel of input
+            out_channels (int): the channel of output, also is the number of 
filters
+            kernel_size (int or tuple): kernel size for two direction of each 
+                axis. For example, (2, 3), the first 2 means will add 2 at the 
+                beginning and also 2 at the end for its axis.and if a int is 
+                accepted, the kernel size will be initiated as (int, int)
+            stride (int or tuple): stride, the logic is the same as kernel 
size.
+            padding (int): tuple, list or None, padding, the logic is the same 
+                as kernel size. However, if you set pad_mode as "SAME_UPPER" 
or 
+                "SAME_LOWER" mode, you can set padding as None, and the 
padding 
+                will be computed automatically.
+            dilation (int): only support 1
+            group (int): group
+            bias (bool): bias
+            pad_mode (string): can be NOTSET, SAME_UPPER, or SAME_LOWER, where 
+                default value is NOTSET, which means explicit padding is used. 
+                SAME_UPPER or SAME_LOWER mean pad the input so that the output 
+                spatial size match the input. In case of odd number add the 
extra 
+                padding at the end for SAME_UPPER and at the beginning for 
SAME_LOWER.
         """
         self.in_channels = in_channels
         self.out_channels = out_channels
@@ -1490,6 +1802,9 @@ class Conv2d(Layer):
 
 
 class SeparableConv2d(Layer):
+    """
+    Generate a Separable Conv 2d operator
+    """
 
     def __init__(
         self,
@@ -1500,6 +1815,21 @@ class SeparableConv2d(Layer):
         padding=0,
         bias=False,
     ):
+        """
+        Args:
+            in_channels (int): the channel of input
+            out_channels (int): the channel of output, also is the number of 
filters
+            kernel_size (int or tuple): kernel size for two direction of each 
+                axis. For example, (2, 3), the first 2 means will add 2 at the 
+                beginning and also 2 at the end for its axis, and if an int is 
+                accepted, the kernel size will be initialized as (int, int)
+            stride (int or tuple): stride, the logic is the same as kernel 
size.
+            padding (int, tuple, list or None): padding, the logic is the same 
+                as kernel size. However, if you set pad_mode as "SAME_UPPER" 
or 
+                "SAME_LOWER" mode, you can set padding as None, and the 
padding 
+                will be computed automatically.
+            bias (bool): bias
+        """
         self.depthwise_conv = Conv2d(
             in_channels,
             in_channels,
@@ -1519,8 +1849,17 @@ class SeparableConv2d(Layer):
 
 
 class BatchNorm2d(Layer):
+    """
+    Generate a BatchNorm 2d operator
+    """
 
     def __init__(self, num_features, momentum=0.9):
+        """
+        Args:
+            num_features (int): the channel of input
+            momentum (float): Factor used in computing the running mean and 
+                variance.
+        """
         self.channels = num_features
         self.momentum = momentum
 
@@ -1589,14 +1928,34 @@ class BatchNorm2d(Layer):
 
 
 class _BatchNorm2d(Operation):
+    """
+    Carries out batch normalization as described in the paper 
+    https://arxiv.org/abs/1502.03167. 
+    """
 
     def __init__(self, handle, running_mean, running_var, name=None):
+        """
+        Args:
+            handle (object): BatchNormHandle for cpu and CudnnBatchNormHandle 
+                for gpu
+            running_mean (float): the running_mean
+            running_var (float): the running_var
+            name (string): the name assigned to this operator
+        """
         super(_BatchNorm2d, self).__init__(name)
         self.handle = handle
         self.running_mean = running_mean.data
         self.running_var = running_var.data
 
     def forward(self, x, scale, bias):
+        """
+        Args:
+            x (CTensor): the input tensor
+            scale (CTensor): the scale tensor
+            bias (CTensor): the bias tensor
+        Returns:
+            the result CTensor
+        """
         if training:
             if (type(self.handle) == singa.BatchNormHandle):
                 y, mean, var = singa.CpuBatchNormForwardTraining(
@@ -1634,6 +1993,14 @@ class _BatchNorm2d(Operation):
         return y
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): data for the dL / dy, L is the loss
+        Returns:
+            dx (CTensor): data for the dL / dx, L is the loss
+            ds (CTensor): data for the dL / ds, L is the loss
+            db (CTensor): data for the dL / db, L is the loss
+        """
         assert training is True and hasattr(
             self, "cache"), "Please set training as True before do BP. "
 
@@ -1650,19 +2017,37 @@ class _BatchNorm2d(Operation):
 
 
 def batchnorm_2d(handle, x, scale, bias, running_mean, running_var):
+    """
+    Carries out batch normalization as described in the paper 
+    https://arxiv.org/abs/1502.03167. 
+    Args:
+        handle (object): BatchNormHandle for cpu and CudnnBatchNormHandle 
+            for gpu
+        x (Tensor): the input tensor
+        scale (Tensor): the scale tensor
+        bias (Tensor): the bias tensor
+        running_mean (float): the running_mean
+        running_var (float): the running_var
+    Returns:
+        the result Tensor
+    """
     return _BatchNorm2d(handle, running_mean, running_var)(x, scale, bias)[0]
 
 
 class _Pooling2d(Operation):
+    """
+    Init a pool 2d operator
+    """
 
     def __init__(self, handle, odd_padding=(0, 0, 0, 0)):
         """
-        Init a pool 2d operator
-        Args:
-            handle: PoolingHandle for cpu or CudnnPoolingHandle for gpu
         Args:
-            odd_padding:tuple of four bins, the odd paddding is the value that 
cannot be handled by the tuple padding (w, h) mode
-            so we need to firstly handle the input, then use the nomal padding 
method.
+            handle (object): PoolingHandle for cpu or CudnnPoolingHandle for 
+                gpu
+            odd_padding (tuple of four int): the odd padding is the value 
+                that cannot be handled by the tuple padding (w, h) mode so 
+                it needs to firstly handle the input, then use the normal 
+                padding method.
         """
         super(_Pooling2d, self).__init__()
         self.handle = handle
@@ -1671,6 +2056,12 @@ class _Pooling2d(Operation):
             self.re_new_handle = True
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): the input tensor
+        Returns:
+            the result CTensor
+        """
         assert x.nDim() == 4, "The dimensions of input should be 4D."
         if self.odd_padding != (0, 0, 0, 0):
             x = utils.handle_odd_pad_fwd(x, self.odd_padding)
@@ -1688,6 +2079,12 @@ class _Pooling2d(Operation):
         return y
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): data for the dL / dy, L is the loss
+        Returns:
+            dx (CTensor): data for the dL / dx, L is the loss,
+        """
         if (type(self.handle) != singa.PoolingHandle):
             dx = singa.GpuPoolingBackward(self.handle, dy, self.cache[0],
                                           self.cache[1])
@@ -1704,17 +2101,23 @@ def pooling_2d(handle, x, odd_padding=(0, 0, 0, 0)):
     """
     Pooling 2d operator
     Args:
-        handle: ConvHandle for cpu or CudnnConvHandle for gpu
-    Args:
-        x: CTensor, input
-    Args:
-        odd_padding:tuple of four bins, the odd paddding is the value that 
cannot be handled by the tuple padding (w, h) mode
-        so we need to firstly handle the input, then use the nomal padding 
method.
+        handle (object): PoolingHandle for cpu or CudnnPoolingHandle for 
+            gpu
+        x (Tensor): input
+        odd_padding (tuple of four int): the odd padding is the value 
+            that cannot be handled by the tuple padding (w, h) mode so 
+            it needs to firstly handle the input, then use the normal 
+            padding method.
+    Returns:
+        the result Tensor
     """
     return _Pooling2d(handle, odd_padding)(x)[0]
 
 
 class Pooling2d(Layer):
+    """
+    Generate a Pooling 2d operator
+    """
 
     def __init__(self,
                  kernel_size,
@@ -1723,21 +2126,22 @@ class Pooling2d(Layer):
                  is_max=True,
                  pad_mode="NOTSET"):
         """
-        Generate a Pooling 2d operator
         Args:
-            kernel_size: int or tuple, kernel size for two direction of each 
axis. For example, (2, 3), the first 2 means will add 2 at the beginning and 
also 2 at the end for its axis.
-            and if a int is accepted, the kernel size will be inited as (int, 
int)
-        Args:
-            stride: int or tuple, stride, the logic is the same as kernel size.
-        Args:
-            padding: int or tuple or None, padding, the logic is the same as 
kernel size. However, if you set pad_mode as "SAME_UPPER" or "SAME_LOWER" mode, 
-            you can set padding as None, and the padding will be computed 
automatically.
-        Args:
-            is_max: bool, is max pooling or avg pooling
-        Args:
-            pad_mode: string, can be NOTSET, SAME_UPPER, or SAME_LOWER, where 
default value is NOTSET, which means explicit padding is used.
-            SAME_UPPER or SAME_LOWER mean pad the input so that the output 
spatial size match the input.
-            In case of odd number add the extra padding at the end for 
SAME_UPPER and at the beginning for SAME_LOWER.
+            kernel_size (int or tuple): kernel size for two direction of each 
+                axis. For example, (2, 3), the first 2 means will add 2 at the 
+                beginning and also 2 at the end for its axis, and if an int is 
+                accepted, the kernel size will be initialized as (int, int)
+            stride (int or tuple): stride, the logic is the same as kernel 
size.
+            padding (int, tuple, list or None): padding, the logic is the same 
+                as kernel size. However, if you set pad_mode as "SAME_UPPER" 
or 
+                "SAME_LOWER" mode, you can set padding as None, and the 
padding 
+                will be computed automatically.
+            is_max (bool): is max pooling or avg pooling
+            pad_mode (string): can be NOTSET, SAME_UPPER, or SAME_LOWER, where 
+                default value is NOTSET, which means explicit padding is used. 
+                SAME_UPPER or SAME_LOWER mean pad the input so that the output 
+                spatial size match the input. In case of odd number add the 
extra 
+                padding at the end for SAME_UPPER and at the beginning for 
SAME_LOWER.
         """
         if isinstance(kernel_size, int):
             self.kernel_size = (kernel_size, kernel_size)
@@ -1837,6 +2241,9 @@ class Pooling2d(Layer):
 
 
 class MaxPool2d(Pooling2d):
+    """
+    Generate a Max Pooling 2d operator
+    """
 
     def __init__(self,
                  kernel_size,
@@ -1844,19 +2251,20 @@ class MaxPool2d(Pooling2d):
                  padding=0,
                  odd_padding=(0, 0, 0, 0)):
         """
-        Generate a Max Pooling 2d operator
         Args:
-            kernel_size: int or tuple, kernel size for two direction of each 
axis. For example, (2, 3), the first 2 means will add 2 at the beginning and 
also 2 at the end for its axis.
-            and if a int is accepted, the kernel size will be inited as (int, 
int)
-        Args:
-            stride: int or tuple, stride, the logic is the same as kernel size.
-        Args:
-            padding: int or tuple or None, padding, the logic is the same as 
kernel size. However, if you set pad_mode as "SAME_UPPER" or "SAME_LOWER" mode, 
-            you can set padding as None, and the padding will be computed 
automatically.
-        Args:
-            pad_mode: string, can be NOTSET, SAME_UPPER, or SAME_LOWER, where 
default value is NOTSET, which means explicit padding is used.
-            SAME_UPPER or SAME_LOWER mean pad the input so that the output 
spatial size match the input.
-            In case of odd number add the extra padding at the end for 
SAME_UPPER and at the beginning for SAME_LOWER.
+            kernel_size (int or tuple): kernel size for two direction of each 
+                axis. For example, (2, 3), the first 2 means will add 2 at the 
+                beginning and also 2 at the end for its axis, and if an int is 
+                accepted, the kernel size will be initialized as (int, int)
+            stride (int or tuple): stride, the logic is the same as kernel 
size.
+            padding (int, tuple, list or None): padding, the logic is the same 
+                as kernel size. However, if you set pad_mode as "SAME_UPPER" 
or 
+                "SAME_LOWER" mode, you can set padding as None, and the 
padding 
+                will be computed automatically.
+            odd_padding (tuple of four int): the odd padding is the value 
+                that cannot be handled by the tuple padding (w, h) mode so 
+                it needs to firstly handle the input, then use the normal 
+                padding method.
         """
         super(MaxPool2d, self).__init__(kernel_size, stride, padding, True,
                                         odd_padding)
@@ -1870,24 +2278,29 @@ class AvgPool2d(Pooling2d):
                  padding=0,
                  odd_padding=(0, 0, 0, 0)):
         """
-        Generate a Avg Pooling 2d operator
-        Args:
-            kernel_size: int or tuple, kernel size for two direction of each 
axis. For example, (2, 3), the first 2 means will add 2 at the beginning and 
also 2 at the end for its axis.
-            and if a int is accepted, the kernel size will be inited as (int, 
int)
-        Args:
-            stride: int or tuple, stride, the logic is the same as kernel size.
         Args:
-            padding: int or tuple or None, padding, the logic is the same as 
kernel size. However, if you set pad_mode as "SAME_UPPER" or "SAME_LOWER" mode, 
-            you can set padding as None, and the padding will be computed 
automatically.
-        Args:
-            odd_padding:tuple of four bins, the odd paddding is the value that 
cannot be handled by the tuple padding (w, h) mode
-            so we need to firstly handle the input, then use the nomal padding 
method.
+            kernel_size (int or tuple): kernel size for two direction of each 
+                axis. For example, (2, 3), the first 2 means will add 2 at the 
+                beginning and also 2 at the end for its axis, and if an int is 
+                accepted, the kernel size will be initialized as (int, int)
+            stride (int or tuple): stride, the logic is the same as kernel 
size.
+            padding (int, tuple, list or None): padding, the logic is the same 
+                as kernel size. However, if you set pad_mode as "SAME_UPPER" 
or 
+                "SAME_LOWER" mode, you can set padding as None, and the 
padding 
+                will be computed automatically.
+            odd_padding (tuple of four int): the odd padding is the value 
+                that cannot be handled by the tuple padding (w, h) mode so 
+                it needs to firstly handle the input, then use the normal 
+                padding method.
         """
         super(AvgPool2d, self).__init__(kernel_size, stride, padding, False,
                                         odd_padding)
 
 
 class MaxPool1d(Pooling2d):
+    """
+    Generate a Max Pooling 1d operator
+    """
 
     def __init__(self,
                  kernel_size,
@@ -1895,18 +2308,20 @@ class MaxPool1d(Pooling2d):
                  padding=0,
                  odd_padding=(0, 0, 0, 0)):
         """
-        Generate a Max Pooling 1d operator
-        Args:
-            kernel_size: int or tuple, kernel size for two direction of each 
axis. For example, (2, 3), the first 2 means will add 2 at the beginning and 
also 2 at the end for its axis.
-            and if a int is accepted, the kernel size will be inited as (int, 
int)
-        Args:
-            stride: int or tuple, stride, the logic is the same as kernel size.
-        Args:
-            padding: int or tuple or None, padding, the logic is the same as 
kernel size. However, if you set pad_mode as "SAME_UPPER" or "SAME_LOWER" mode, 
-            you can set padding as None, and the padding will be computed 
automatically.
         Args:
-            odd_padding:tuple of four bins, the odd paddding is the value that 
cannot be handled by the tuple padding (w, h) mode
-            so we need to firstly handle the input, then use the nomal padding 
method.
+            kernel_size (int or tuple): kernel size for two direction of each 
+                axis. For example, (2, 3), the first 2 means will add 2 at the 
+                beginning and also 2 at the end for its axis, and if an int is 
+                accepted, the kernel size will be initialized as (int, int)
+            stride (int or tuple): stride, the logic is the same as kernel 
size.
+            padding (int, tuple, list or None): padding, the logic is the same 
+                as kernel size. However, if you set pad_mode as "SAME_UPPER" 
or 
+                "SAME_LOWER" mode, you can set padding as None, and the 
padding 
+                will be computed automatically.
+            odd_padding (tuple of four int): the odd padding is the value 
+                that cannot be handled by the tuple padding (w, h) mode so 
+                it needs to firstly handle the input, then use the normal 
+                padding method.
         """
         if stride is None:
             stride = kernel_size
@@ -1915,6 +2330,9 @@ class MaxPool1d(Pooling2d):
 
 
 class AvgPool1d(Pooling2d):
+    """
+    Generate an Avg Pooling 1d operator
+    """
 
     def __init__(self,
                  kernel_size,
@@ -1922,18 +2340,20 @@ class AvgPool1d(Pooling2d):
                  padding=0,
                  odd_padding=(0, 0, 0, 0)):
         """
-        Generate a Avg Pooling 1d operator
-        Args:
-            kernel_size: int or tuple, kernel size for two direction of each 
axis. For example, (2, 3), the first 2 means will add 2 at the beginning and 
also 2 at the end for its axis.
-            and if a int is accepted, the kernel size will be inited as (int, 
int)
         Args:
-            stride: int or tuple, stride, the logic is the same as kernel size.
-        Args:
-            padding: int or tuple or None, padding, the logic is the same as 
kernel size. However, if you set pad_mode as "SAME_UPPER" or "SAME_LOWER" mode, 
-            you can set padding as None, and the padding will be computed 
automatically.
-        Args:
-            odd_padding:tuple of four bins, the odd paddding is the value that 
cannot be handled by the tuple padding (w, h) mode
-            so we need to firstly handle the input, then use the nomal padding 
method.
+            kernel_size (int or tuple): kernel size for two direction of each 
+                axis. For example, (2, 3), the first 2 means will add 2 at the 
+                beginning and also 2 at the end for its axis, and if an int is 
+                accepted, the kernel size will be initialized as (int, int)
+            stride (int or tuple): stride, the logic is the same as kernel 
size.
+            padding (int, tuple, list or None): padding, the logic is the same 
+                as kernel size. However, if you set pad_mode as "SAME_UPPER" 
or 
+                "SAME_LOWER" mode, you can set padding as None, and the 
padding 
+                will be computed automatically.
+            odd_padding (tuple of four int): the odd padding is the value 
+                that cannot be handled by the tuple padding (w, h) mode so 
+                it needs to firstly handle the input, then use the normal 
+                padding method.
         """
         if stride is None:
             stride = kernel_size
@@ -1942,17 +2362,32 @@ class AvgPool1d(Pooling2d):
 
 
 class Tanh(Operation):
+    """
+    Calculates the hyperbolic tangent of the given input tensor element-wise.
+    """
 
     def __init__(self):
         super(Tanh, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         out = singa.Tanh(x)
         if training:
             self.cache = (out,)
         return out
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.__mul__(self.cache[0], self.cache[0])
         dx = singa.MultFloat(dx, -1.0)
         dx = singa.AddFloat(dx, 1.0)
@@ -1961,20 +2396,42 @@ class Tanh(Operation):
 
 
 def tanh(x):
+    """
+    Calculates the hyperbolic tangent of the given input tensor element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Tanh()(x)[0]
 
 
 class Cos(Operation):
+    """
+    Calculates the cosine of the given input tensor, element-wise.
+    """
 
     def __init__(self):
         super(Cos, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Cos(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Sin(self.input)
         dx = singa.MultFloat(dx, -1.0)
         dx *= dy
@@ -1982,40 +2439,86 @@ class Cos(Operation):
 
 
 def cos(x):
+    """
+    Calculates the cosine of the given input tensor, element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """    
+
     return Cos()(x)[0]
 
 
 class Cosh(Operation):
+    """
+    Calculates the hyperbolic cosine of the given input tensor element-wise.
+    """
 
     def __init__(self):
         super(Cosh, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Cosh(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Sinh(self.input)
         dx *= dy
         return dx
 
 
 def cosh(x):
+    """
+    Calculates the hyperbolic cosine of the given input tensor element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Cosh()(x)[0]
 
 
 class Acos(Operation):
+    """
+    Calculates the arccosine (inverse of cosine) of the given input tensor, 
+    element-wise.
+    """
 
     def __init__(self):
         super(Acos, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Acos(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Square(self.input)
         dx = singa.MultFloat(dx, -1.0)
         dx = singa.AddFloat(dx, 1.0)
@@ -2026,20 +2529,43 @@ class Acos(Operation):
 
 
 def acos(x):
+    """
+    Calculates the arccosine (inverse of cosine) of the given input tensor, 
+    element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Acos()(x)[0]
 
 
 class Acosh(Operation):
+    """
+    Calculates the hyperbolic arccosine of the given input tensor element-wise.
+    """
 
     def __init__(self):
         super(Acosh, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Acosh(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.SubFloat(self.input, 1.0)
         dx = singa.Sqrt(dx)
         temp = singa.AddFloat(self.input, 1.0)
@@ -2051,60 +2577,126 @@ class Acosh(Operation):
 
 
 def acosh(x):
+    """
+    Calculates the hyperbolic arccosine of the given input tensor element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Acosh()(x)[0]
 
 
 class Sin(Operation):
+    """
+    Calculates the sine of the given input tensor, element-wise.
+    """
 
     def __init__(self):
         super(Sin, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Sin(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Cos(self.input)
         dx *= dy
         return dx
 
 
 def sin(x):
+    """
+    Calculates the sine of the given input tensor, element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Sin()(x)[0]
 
 
 class Sinh(Operation):
+    """
+    Calculates the hyperbolic sine of the given input tensor element-wise.
+    """
 
     def __init__(self):
         super(Sinh, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Sinh(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Cosh(self.input)
         dx *= dy
         return dx
 
 
 def sinh(x):
+    """
+    Calculates the hyperbolic sine of the given input tensor element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Sinh()(x)[0]
 
 
 class Asin(Operation):
+    """
+    Calculates the arcsine (inverse of sine) of the given input tensor, 
element-wise.
+    """
 
     def __init__(self):
         super(Asin, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Asin(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Square(self.input)
         dx = singa.MultFloat(dx, -1.0)
         dx = singa.AddFloat(dx, 1.0)
@@ -2114,20 +2706,43 @@ class Asin(Operation):
 
 
 def asin(x):
+    """
+    Calculates the arcsine (inverse of sine) of the given input tensor, 
element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """    
+
     return Asin()(x)[0]
 
 
 class Asinh(Operation):
+    """
+    Calculates the hyperbolic arcsine of the given input tensor element-wise.
+    """
 
     def __init__(self):
         super(Asinh, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Asinh(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """             
         dx = singa.Square(self.input)
         dx = singa.AddFloat(dx, 1.0)
         dx = singa.PowFloat(dx, -0.5)
@@ -2136,20 +2751,42 @@ class Asinh(Operation):
 
 
 def asinh(x):
+    """
+    Calculates the hyperbolic arcsine of the given input tensor element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Asinh()(x)[0]
 
 
 class Tan(Operation):
+    """
+    Calculates the tangent of the given input tensor, element-wise.
+    """
 
     def __init__(self):
         super(Tan, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Tan(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Cos(self.input)
         dx = singa.Square(dx)
         dx = singa.PowFloat(dx, -1.0)
@@ -2158,20 +2795,42 @@ class Tan(Operation):
 
 
 def tan(x):
+    """
+    Calculates the tangent of the given input tensor, element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Tan()(x)[0]
 
 
 class Atan(Operation):
+    """
+    Calculates the arctangent (inverse of tangent) of the given input tensor, 
element-wise.
+    """
 
     def __init__(self):
         super(Atan, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Atan(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Square(self.input)
         dx = singa.AddFloat(dx, 1.0)
         dx = singa.PowFloat(dx, -1.0)
@@ -2180,20 +2839,42 @@ class Atan(Operation):
 
 
 def atan(x):
+    """
+    Calculates the arctangent (inverse of tangent) of the given input tensor, 
element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """    
     return Atan()(x)[0]
 
 
 class Atanh(Operation):
+    """
+    Calculates the hyperbolic arctangent of the given input tensor 
element-wise.
+    """
 
     def __init__(self):
         super(Atanh, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         return singa.Atanh(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Square(self.input)
         dx = singa.MultFloat(dx, -1.0)
         dx = singa.AddFloat(dx, 1.0)
@@ -2203,21 +2884,43 @@ class Atanh(Operation):
 
 
 def atanh(x):
+    """
+    Calculates the hyperbolic arctangent of the given input tensor 
element-wise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """   
     return Atanh()(x)[0]
 
 
 class Sigmoid(Operation):
+    """
+    y = 1 / (1 + exp(-x)), is applied to the tensor elementwise.
+    """
 
     def __init__(self):
         super(Sigmoid, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         out = singa.Sigmoid(x)
         if training:
             self.cache = (out,)
         return out
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.MultFloat(self.cache[0], -1.0)
         dx = singa.AddFloat(dx, 1.0)
         dx = singa.__mul__(self.cache[0], dx)
@@ -2226,15 +2929,29 @@ class Sigmoid(Operation):
 
 
 def sigmoid(x):
+    """
+    y = 1 / (1 + exp(-x)), is applied to the tensor elementwise.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """   
     return Sigmoid()(x)[0]
 
 
 class Mul(Operation):
+    """
+    Performs element-wise binary multiplication (with Numpy-style broadcasting 
+    support).        
+    """   
 
     def __init__(self):
         super(Mul, self).__init__()
 
     def forward(self, a, b):
+        """
+        Return np.multiply(a,b), where a and b are CTensor.
+        """
         # todo we cannot support mul op for int tensors
         _a, _b = a, b
         dtype0 = _a.data_type()
@@ -2254,6 +2971,13 @@ class Mul(Operation):
         return res
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            a tuple for (da, db), da is data for dL / da, db is data
+                for dL / db.
+        """
         dx0 = singa.__mul__(dy, self.input[1])
         dx1 = singa.__mul__(dy, self.input[0])
         if (type(dy) == float) or self.shape0 == self.shape1:
@@ -2265,9 +2989,23 @@ class Mul(Operation):
         return dx0, dx1
 
 
+def mul(x, y):
+    """
+    Return np.multiply(x,y), where x and y are Tensor.
+    """
+    return Mul()(x, y)[0]
+
+
 class Unsqueeze(Operation):
+    """
+    Insert single-dimensional entries to the shape of an input tensor (data). 
+    """
 
     def __init__(self, axis):
+        """
+        Args:
+            axis (list of int): the dimensions to be inserted.
+        """
         super(Unsqueeze, self).__init__()
         if (type(axis) is int):
             self.axis = list(axis)
@@ -2275,6 +3013,12 @@ class Unsqueeze(Operation):
             self.axis = axis
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         self.cache = x.shape()
         cur = list(self.cache)
         # todo, need optimize after we have scalar tensor
@@ -2285,28 +3029,57 @@ class Unsqueeze(Operation):
         return singa.Reshape(x, cur)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         return singa.Reshape(dy, self.cache)
 
 
 def unsqueeze(x, axis=-1):
+    """
+    Insert single-dimensional entries to the shape of an input tensor (data). 
+    Args:
+        x (Tensor): Input tensor
+        axis (list of int): the dimensions to be inserted.
+    Returns: 
+        Tensor, the output
+    """  
     return Unsqueeze(axis)(x)[0]
 
 
-def mul(x, y):
-    # do pointwise multiplication
-    return Mul()(x, y)[0]
-
-
 class Transpose(Operation):
+    """
+    Transpose the input tensor similar to numpy.transpose. 
+    """
 
     def __init__(self, perm):
+        """
+        Args:
+            perm (list of ints): A list of integers. By default, reverse the 
+                dimensions, otherwise permute the axes according to the values 
given.
+        """
         super(Transpose, self).__init__()
         self.perm = list(perm)
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         return singa.Transpose(x, self.perm)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         cur = []
         for i in range(len(self.perm)):
             cur += [self.perm.index(i)]
@@ -2314,6 +3087,15 @@ class Transpose(Operation):
 
 
 def transpose(x, shape):
+    """
+    Transpose the input tensor similar to numpy.transpose. 
+    Args:
+        x (Tensor): Input tensor
+        perm (list of ints): A list of integers. By default, reverse the 
+            dimensions, otherwise permute the axes according to the values 
given.
+    Returns: 
+        Tensor, the output
+    """
     return Transpose(shape)(x)[0]
 
 
@@ -2346,6 +3128,9 @@ class RNN_Base(Layer):
 
 
 class RNN(RNN_Base):
+    """
+    Generate a RNN operator
+    """
 
     def __init__(
         self,
@@ -2358,6 +3143,22 @@ class RNN(RNN_Base):
         dropout=0,
         bidirectional=False,
     ):
+        """
+        Args:
+            input_size (int):  The number of expected features in the input x
+            hidden_size (int): The number of features in the hidden state h
+            num_layers (int):  Number of recurrent layers. Default: 1
+            nonlinearity (string): The non-linearity to use. Default: 'tanh'
+            bias (bool):  If False, then the layer does not use bias weights. 
+                Default: True
+            batch_first (bool):  If True, then the input and output tensors 
+                are provided as (batch, seq, feature). Default: False
+            dropout (float): If non-zero, introduces a Dropout layer on the 
+                outputs of each RNN layer except the last layer, with dropout 
+                probability equal to dropout. Default: 0
+            bidirectional (bool): If True, becomes a bidirectional RNN. 
+                Default: False
+        """
         self.nonlinearity = nonlinearity
 
         Wx_shape = (input_size, hidden_size)
@@ -2407,6 +3208,9 @@ class RNN(RNN_Base):
 
 
 class LSTM(RNN_Base):
+    """
+    Generate a LSTM operator
+    """
 
     def __init__(
         self,
@@ -2419,6 +3223,22 @@ class LSTM(RNN_Base):
         dropout=0,
         bidirectional=False,
     ):
+        """
+        Args:
+            input_size (int):  The number of expected features in the input x
+            hidden_size (int): The number of features in the hidden state h
+            num_layers (int):  Number of recurrent layers. Default: 1
+            nonlinearity (string): The non-linearity to use. Default: 'tanh'
+            bias (bool):  If False, then the layer does not use bias weights. 
+                Default: True
+            batch_first (bool):  If True, then the input and output tensors 
+                are provided as (batch, seq, feature). Default: False
+            dropout (float): If non-zero, introduces a Dropout layer on the 
+                outputs of each RNN layer except the last layer, with dropout 
+                probability equal to dropout. Default: 0
+            bidirectional (bool): If True, becomes a bidirectional RNN. 
+                Default: False
+        """
         self.nonlinearity = nonlinearity
 
         Wx_shape = (input_size, hidden_size)
@@ -2511,46 +3331,89 @@ class LSTM(RNN_Base):
 
 
 class Abs(Operation):
+    """
+    y = abs(x), is applied to the tensor elementwise.
+    """
 
     def forward(self, a):
+        """
+        Return abs(a), where a is CTensor.
+        """
         if training:
             self.input = a
         return singa.Abs(a)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """     
         dx = singa.Sign(self.input)
         dx *= dy
         return dx
 
 
 def abs(a):
+    """
+    Return abs(a), where a is Tensor.
+    """
     return Abs()(a)[0]
 
 
 class Exp(Operation):
+    """
+    y = exp(x), is applied to the tensor elementwise.
+    """
 
     def forward(self, a):
+        """
+        Return exp(a), where a is CTensor.
+        """
         if training:
             self.input = a
         return singa.Exp(a)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         dx = singa.Exp(self.input)
         dx *= dy
         return dx
 
 
 def exp(a):
+    """
+    Return exp(a), where a is Tensor.
+    """
     return Exp()(a)[0]
 
 
 class LeakyRelu(Operation):
+    """
+    f(x) = alpha * x for x < 0, f(x) = x for x >= 0, is applied to the tensor 
elementwise.
+    """
 
     def __init__(self, a):
+        """
+        Args:
+            a (float): Coefficient of leakage.
+        """
         super(LeakyRelu, self).__init__()
         self.a = a
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = x
         x1 = singa.LTFloat(x, 0.0)
@@ -2561,6 +3424,12 @@ class LeakyRelu(Operation):
         return x1
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         # TODO(wangwei) check the correctness
         dx1 = singa.GTFloat(self.input, 0.0)
         dx2 = singa.LTFloat(self.input, 0.0)
@@ -2571,34 +3440,73 @@ class LeakyRelu(Operation):
 
 
 def leakyrelu(x, a=0.01):
+    """
+    f(x) = alpha * x for x < 0, f(x) = x for x >= 0 is applied to the tensor 
+    elementwise.
+    Args:
+        x (Tensor): Input tensor
+        a (float): Coefficient of leakage, default to 0.01.
+    Returns: 
+        Tensor, the output
+    """
     return LeakyRelu(a)(x)[0]
 
 
 class Sign(Operation):
+    """
+    Calculate the sign of the given input tensor element-wise. If input > 0, 
+    output 1. if input < 0, output -1. if input == 0, output 0.
+    """
 
     def __init__(self):
         super(Sign, self).__init__()
 
     def forward(self, a):
+        """
+        Args:
+            a (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         if training:
             self.input = a
         return singa.Sign(a)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         dx = singa.MultFloat(dy, 0.0)
         return dx
 
 
 def sign(a):
+    """
+    Calculate the sign of the given input tensor element-wise. If input > 0, 
+    output 1. if input < 0, output -1. if input == 0, output 0.
+    Args:
+        a (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Sign()(a)[0]
 
 
 class Pow(Operation):
+    """
+    f(x) = a^b, is applied to the tensor elementwise.
+    """
 
     def __init__(self):
         super(Pow, self).__init__()
 
     def forward(self, a, b):
+        """
+        Return a^b, where a and b are CTensor.
+        """
         res = singa.Pow(a, b)
         if training:
             self.input = (a, b)
@@ -2608,6 +3516,13 @@ class Pow(Operation):
         return res
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            a tuple for (da, db), da is data for dL / da, db is data
+                for dL / db.
+        """
         da1 = singa.__mul__(
             self.input[1],
             singa.Pow(self.input[0], singa.SubFloat(self.input[1], 1.0)))
@@ -2625,15 +3540,24 @@ class Pow(Operation):
 
 
 def pow(a, b):
+    """
+    Return a^b, where a and b are Tensor.
+    """
     return Pow()(a, b)[0]
 
 
 class SoftSign(Operation):
+    """
+    Calculates the softsign (x/(1+|x|)) of the given input tensor element-wise.
+    """
 
     def __init__(self):
         super(SoftSign, self).__init__()
 
     def forward(self, x):
+        """
+        Return (x/(1+|x|)), where x is CTensor.
+        """
         # y = x / (1 + np.abs(x))
         if training:
             self.input = x
@@ -2643,6 +3567,12 @@ class SoftSign(Operation):
         return y
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         dx = singa.AddFloat(singa.Abs(self.input), 1.0)
         dx = singa.PowFloat(singa.Square(dx), -1.0)
         dx = singa.__mul__(dy, dx)
@@ -2650,20 +3580,35 @@ class SoftSign(Operation):
 
 
 def softsign(x):
+    """
+    Return (x/(1+|x|)), where x is Tensor.
+    """
     return SoftSign()(x)[0]
 
 
 class Sqrt(Operation):
+    """
+    y = x^0.5, is applied to the tensor elementwise.
+    """
 
     def __init__(self):
         super(Sqrt, self).__init__()
 
     def forward(self, x):
+        """
+        Return x^0.5, where x is CTensor.
+        """
         if training:
             self.input = x
         return singa.Sqrt(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         dx = singa.PowFloat(self.input, -0.5)
         dx = singa.MultFloat(dx, 0.5)
         dx = singa.__mul__(dy, dx)
@@ -2671,15 +3616,24 @@ class Sqrt(Operation):
 
 
 def sqrt(x):
+    """
+    Return x^0.5, where x is Tensor.
+    """
     return Sqrt()(x)[0]
 
 
 class SoftPlus(Operation):
+    """
+    y = ln(exp(x) + 1) is applied to the tensor elementwise.
+    """
 
     def __init__(self):
         super(SoftPlus, self).__init__()
 
     def forward(self, x):
+        """
+        Return ln(exp(x) + 1), where x is CTensor.
+        """
         #f(x) = ln(exp(x) + 1)
         if training:
             self.input = x
@@ -2688,6 +3642,12 @@ class SoftPlus(Operation):
         return y
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         dx = singa.Exp(singa.MultFloat(self.input, -1.0))
         dx = singa.PowFloat(singa.AddFloat(dx, 1.0), -1.0)
         dx = singa.__mul__(dy, dx)
@@ -2695,15 +3655,25 @@ class SoftPlus(Operation):
 
 
 def softplus(x):
+    """
+    Return ln(exp(x) + 1), where x is Tensor.
+    """
     return SoftPlus()(x)[0]
 
 
 class Sub(Operation):
+    """
+    Performs element-wise binary subtraction (with Numpy-style broadcasting 
+    support).
+    """
 
     def __init__(self):
         super(Sub, self).__init__()
 
     def forward(self, a, b):
+        """
+        Return a-b, where a and b are CTensor.
+        """
         res = singa.__sub__(a, b)
         if training:
             self.shape0 = list(a.shape())
@@ -2712,6 +3682,13 @@ class Sub(Operation):
         return res
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            a tuple for (da, db), da is data for dL / da, db is data
+                for dL / db.
+        """
         dx0 = dy
         dx1 = singa.MultFloat(dy, -1.0)
         if (type(dy) == float) or self.shape0 == self.shape1:
@@ -2724,17 +3701,32 @@ class Sub(Operation):
 
 
 def sub(a, b):
+    """
+    Return a-b, where a and b are Tensor.
+    """
     return Sub()(a, b)[0]
 
 
 # optimize min to support multi inputs
 class Min(Operation):
+    """
+    Element-wise min of each of the input tensors (with Numpy-style 
+    broadcasting support).
+    """
 
     def __init__(self):
         super(Min, self).__init__()
         self.masks = []
 
     def _min(self, a, b):
+        """
+        Args:
+            a (CTensor): First operand
+            b (CTensor): Second operand
+        Returns: 
+            CTensor, the output
+            tuple of CTensor, mask tensor
+        """
         m = singa.__sub__(a, b)
         mask0 = singa.LEFloat(m, 0)
         mask1 = singa.GTFloat(m, 0)
@@ -2742,6 +3734,12 @@ class Min(Operation):
         return res, (mask0, mask1)
 
     def forward(self, *x):
+        """
+        Args:
+            *x (a list of CTensor): List of tensors for min.
+        Returns: 
+            CTensor, the output
+        """    
         assert (len(x) > 0)
         self.l = len(x)
         if len(x) == 1:
@@ -2756,6 +3754,12 @@ class Min(Operation):
         return res
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            a tuple for (*dx), dx is data for dL / dx.
+        """
         if self.l == 1:
             return self.masks[0][0]
         else:
@@ -2773,39 +3777,69 @@ class Min(Operation):
 
 
 def min(*l):
+    """
+    Element-wise min of each of the input tensors (with Numpy-style 
+    broadcasting support).
+    Args:
+        *x (a list of Tensor): List of tensors for min.
+    Returns: 
+        Tensor, the output
+    """
     return Min()(*l)[0]
 
 
 class Log(Operation):
+    """
+    y = log(x), is applied to the tensor elementwise.
+    """
 
     def __init__(self):
         super(Log, self).__init__()
 
     def forward(self, x):
+        """
+        Return log(x), where x is CTensor.
+        """
         if training:
             self.input = x
         return singa.Log(x)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         dx = singa.PowFloat(self.input, -1)
         dx = singa.__mul__(dy, dx)
         return dx
 
 
 def log(x):
+    """
+    Return log(x), where x is Tensor.
+    """
     return Log()(x)[0]
 
 
 class HardSigmoid(Operation):
+    """
+    y = max(0, min(1, alpha * x + beta)), is applied to the tensor elementwise.
+    """
 
     def __init__(self, alpha=0.2, gamma=0.5):
+        """
+        Args:
+            alpha (float): Value of alpha.
+            gamma (float): Value of beta.
+        """
         super(HardSigmoid, self).__init__()
         self.alpha = alpha
         self.gamma = gamma
 
     def forward(self, x):
-        """Do forward propgation.
-        #y = max(0, min(1, alpha * x + gamma))
+        """
         Args:
             x (CTensor): matrix
         Returns:
@@ -2823,6 +3857,12 @@ class HardSigmoid(Operation):
         return singa.ReLU(ans)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         mask0 = singa.GTFloat(self.cache, 0.0)
         mask1 = singa.LTFloat(self.cache, 1.0)
         mask = singa.__mul__(mask0, mask1)
@@ -2830,16 +3870,43 @@ class HardSigmoid(Operation):
 
 
 def hardsigmoid(x, alpha=0.2, gamma=0.5):
+    """
+    y = max(0, min(1, alpha * x + beta)), is applied to the tensor elementwise.
+    Args:
+        x (Tensor): matrix
+        alpha (float): Value of alpha.
+        gamma (float): Value of beta.        
+    Returns:
+        a Tensor for the result
+    """
     return HardSigmoid(alpha, gamma)(x)[0]
 
 
 class Squeeze(Operation):
+    """
+    Remove single-dimensional entries from the shape of a tensor. Takes a 
+    parameter axes with a list of axes to squeeze. If axes is not provided, 
+    all the single dimensions will be removed from the shape. If an axis is 
+    selected with shape entry not equal to one, an error is raised.
+    """
 
     def __init__(self, axis=[]):
+        """
+        Args:
+            axis (list of ints): List of integers indicating the dimensions 
+                to squeeze. Negative value means counting dimensions from 
+                the back. Accepted range is [-r, r-1] where r = rank(data).
+        """
         super(Squeeze, self).__init__()
         self.axis = axis
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         self.cache = x.shape()
         newshape = []
         if (self.axis == []):
@@ -2860,19 +3927,44 @@ class Squeeze(Operation):
         return singa.Reshape(x, newshape)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         return singa.Reshape(dy, self.cache)
 
 
 def squeeze(x, axis=[]):
+    """
+    Remove single-dimensional entries from the shape of a tensor. Takes a 
+    parameter axes with a list of axes to squeeze. If axes is not provided, 
+    all the single dimensions will be removed from the shape. If an axis is 
+    selected with shape entry not equal to one, an error is raised.
+    Args:
+        x (Tensor): Input tensor
+        axis (list of ints): List of integers indicating the dimensions 
+            to squeeze. Negative value means counting dimensions from 
+            the back. Accepted range is [-r, r-1] where r = rank(data).
+    Returns: 
+        Tensor, the output
+    """
     return Squeeze(axis)(x)[0]
 
 
 class Div(Operation):
+    """
+    Performs element-wise binary division (with Numpy-style broadcasting 
support).
+    """
 
     def __init__(self):
         super(Div, self).__init__()
 
     def forward(self, a, b):
+        """
+        Return np.divide(a,b), where a and b are CTensor.
+        """
         res = singa.__mul__(a, singa.PowFloat(b, -1.0))
         # res = singa.__div__(a, b)
         if training:
@@ -2884,6 +3976,13 @@ class Div(Operation):
         return res
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            a CTensor tuple for (da, db), da is data for dL / da, db is data
+                for dL / db.
+        """
         #dy/dx_0 = b^(-1)
         #dy/dx_1 = (-a)*b^(-2)
         dx0 = singa.__mul__(dy, self.input[1])
@@ -2899,36 +3998,73 @@ class Div(Operation):
 
 
 def div(a, b):
+    """
+    Return np.div(a,b), where a and b are Tensor.
+    """
     return Div()(a, b)[0]
 
 
 class Shape(Operation):
+    """
+    Takes a tensor as input and outputs a tensor containing the shape of the 
+    input tensor.
+    """
 
     def __init__(self):
         super(Shape, self).__init__()
 
     def forward(self, x):
+        """
+        Args:
+            x (CTensor): Input tensor
+        Returns: 
+            CTensor, the output
+        """
         cur = list(x.shape())
         cur = tensor.from_numpy(np.array(cur))
         cur.to_device(x.device())
         return cur.data
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        """
         return list(dy.shape())
 
 
 def shape(x):
+    """
+    Takes a tensor as input and outputs a tensor containing the shape of the 
+    input tensor.
+    Args:
+        x (Tensor): Input tensor
+    Returns: 
+        Tensor, the output
+    """
     return Shape()(x)[0]
 
 
 # optimize max to support multi inputs
 class Max(Operation):
+    """
+    Element-wise max of each of the input tensors (with Numpy-style 
+    broadcasting support). 
+    """
 
     def __init__(self):
         super(Max, self).__init__()
         self.masks = []
 
     def _max(self, a, b):
+        """
+        Args:
+            a (CTensor): First operand
+            b (CTensor): Second operand
+        Returns: 
+            CTensor, the output
+            tuple of CTensor, mask tensor
+        """    
         m = singa.__sub__(a, b)
         mask0 = singa.GEFloat(m, 0)
         mask1 = singa.LTFloat(m, 0)
@@ -2936,6 +4072,12 @@ class Max(Operation):
         return res, (mask0, mask1)
 
     def forward(self, *x):
+        """
+        Args:
+            *x (a list of CTensor): List of tensors for max.
+        Returns: 
+            CTensor, the output
+        """    
         assert (len(x) > 0)
         self.l = len(x)
         if len(x) == 1:
@@ -2950,6 +4092,12 @@ class Max(Operation):
         return res
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            a tuple for (*dx), dx is data for dL / dx.
+        """
         if self.l == 1:
             return self.masks[0][0]
         else:
@@ -2967,34 +4115,62 @@ class Max(Operation):
 
 
 def max(*l):
+    """
+    Element-wise max of each of the input tensors (with Numpy-style 
broadcasting support). 
+    Args:
+        *x (a list of Tensor): List of tensors for max.
+    Returns: 
+        Tensor, the output
+    """
     return Max()(*l)[0]
 
 
 class And(Operation):
+    """
+    Returns the tensor resulted from performing the and logical operation 
elementwise on the input tensors A and B (with Numpy-style broadcasting 
support).
+    """
 
     def __init__(self):
         super(And, self).__init__()
 
     def forward(self, a, b):
+        """
+        Return np.logical_and(a,b), where a and b are CTensor.
+        """
         m = singa.__mul__(a, b)
         cur = singa.PowFloat(singa.Sign(m), 2)
 
         return cur
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Raises:
+            AssertionError: no backward function for this operator
+        """
         assert False, ('no gradient for backward function')
 
 
 def _and(a, b):
+    """
+    Return np.logical_and(a,b), where a and b are Tensor.
+    """
     return And()(a, b)[0]
 
 
 class Or(Operation):
+    """
+    Returns the tensor resulted from performing the or logical operation 
elementwise on the input tensors A and B (with Numpy-style broadcasting 
support).
+    """
 
     def __init__(self):
         super(Or, self).__init__()
 
     def forward(self, a, b):
+        """
+        Return np.logical_or(a,b), where a and b are CTensor.
+        """
         m = singa.__add__(singa.PowFloat(singa.Sign(a), 2.0),
                           singa.PowFloat(singa.Sign(b), 2.0))
         cur = singa.Sign(m)
@@ -3002,19 +4178,34 @@ class Or(Operation):
         return cur
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Raises:
+            AssertionError: no backward function for this operator
+        """
         assert False, ('no gradient for backward function')
 
 
 def _or(a, b):
+    """
+    Return np.logical_or(a,b), where a and b are Tensor.
+    """
     return Or()(a, b)[0]
 
 
 class Not(Operation):
+    """
+    Returns the negation of the input tensor element-wise.
+    """
 
     def __init__(self):
         super(Not, self).__init__()
 
     def forward(self, x):
+        """
+        Return np.logical_not(x), where x is CTensor.
+        """
         mask0 = singa.GEFloat(x, 0)
         mask1 = singa.LEFloat(x, 0)
         cur = singa.__mul__(mask0, mask1)
@@ -3022,19 +4213,34 @@ class Not(Operation):
         return cur
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Raises:
+            AssertionError: no backward function for this operator
+        """
         assert False, ('no gradient for backward function')
 
 
 def _not(x):
+    """
+    Return np.logical_not(x), where x is Tensor.
+    """
     return Not()(x)[0]
 
 
 class Xor(Operation):
+    """
+    Performing the xor logical operation elementwise on the input tensors A 
and B (with Numpy-style broadcasting support).
+    """
 
     def __init__(self):
         super(Xor, self).__init__()
 
     def forward(self, a, b):
+        """
+        Return np.logical_xor(a,b), where a and b are CTensor.
+        """
         m = singa.__sub__(singa.PowFloat(singa.Sign(a), 2.0),
                           singa.PowFloat(singa.Sign(b), 2.0))
         cur = singa.PowFloat(singa.Sign(m), 2.0)
@@ -3042,36 +4248,66 @@ class Xor(Operation):
         return cur
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Raises:
+            AssertionError: no backward function for this operator
+        """
         assert False, ('no gradient for backward function')
 
 
 def _xor(a, b):
+    """
+    Return np.logical_xor(a,b), where a and b are Tensor.
+    """
     return Xor()(a, b)[0]
 
 
 class Negative(Operation):
+    """
+    y = -x, is applied to the tensor elementwise.
+    """
 
     def __init__(self):
         super(Negative, self).__init__()
 
     def forward(self, x):
+        """
+        Return -x, where x is CTensor.
+        """
         #y=-x
         return singa.MultFloat(x, -1)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         return singa.MultFloat(dy, -1)
 
 
 def negative(x):
+    """
+    Return -x, where x is Tensor.
+    """
     return Negative()(x)[0]
 
 
 class Reciprocal(Operation):
+    """
+    y = 1/x, is applied to the tensor elementwise.
+    """
 
     def __init__(self):
         super(Reciprocal, self).__init__()
 
     def forward(self, x):
+        """
+        Return 1/x, where x is CTensor.
+        """
         #y=1/x elementwise
         if training:
             self.input = x
@@ -3079,33 +4315,44 @@ class Reciprocal(Operation):
         return singa.PowFloat(x, -1)
 
     def backward(self, dy):
+        """
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
+        Returns: 
+            CTensor, the gradient over input
+        """
         #dy/dx = -1/x**2
         dx = singa.MultFloat(singa.PowFloat(self.input, -2), -1)
         return singa.__mul__(dy, dx)
 
 
 def reciprocal(x):
+    """
+    Return 1/x, where x is Tensor.
+    """
     return Reciprocal()(x)[0]
 
 
 class Gemm(Operation):
+    """
+    Init a General Matrix multiplication(Gemm) operator. Compute Y = alpha * 
+    A' * B' + beta * C, where input tensor A has shape (M, K) or (K, M), input 
+    tensor B has shape (K, N) or (N, K), input tensor C is broadcastable to 
+    shape (M, N), and output tensor Y has shape (M, N).
+    A' = transpose(A) if transA else A
+    B' = transpose(B) if transB else B
+    """
 
     def __init__(self, alpha=1.0, beta=1.0, transA=0, transB=0):
         """
-        init a General Matrix multiplication(Gemm) operator
-        Compute Y = alpha * A' * B' + beta * C, where input tensor A has shape 
(M, K) or (K, M), input tensor B has shape (K, N) or (N, K), input tensor C is 
broadcastable to shape (M, N), and output tensor Y has shape (M, N).
-        A' = transpose(A) if transA else A
-        B' = transpose(B) if transB else B
-        Args:alpha: 
-            float, Scalar multiplier for the product of input tensors A * B.
-        Args:beta: 
-            float, Scalar multiplier for input tensor C.
-        Args:transA: 
-            int, Whether A should be transposed
-        Args:transB: 
-            int, Whether B should be transposed
+        Args:
+            alpha (float): Scalar multiplier for the product of input tensors 
+                A * B.
+            beta (float): Scalar multiplier for input tensor C.
+            transA (int): Whether A should be transposed
+            transB (int): Whether B should be transposed
         Returns: 
-            tensor, the output
+            CTensor, the output
         """
         super(Gemm, self).__init__()
         self.alpha = alpha
@@ -3116,12 +4363,14 @@ class Gemm(Operation):
     def forward(self, A, B, C=None):
         """
         forward propagation of Gemm
-        Args:A: 
-            tensor, The shape of A should be (M, K) if transA is 0, or (K, M) 
if transA is non-zero.
-        Args:B: 
-            tensor, The shape of B should be (K, N) if transB is 0, or (N, K) 
if transB is non-zero.
-        Args:C: 
-            tensor(optional), Optional input tensor C. If not specified, the 
computation is done as if C is a scalar 0. The shape of C should be 
unidirectional broadcastable to (M, N).
+        Args:
+            A (CTensor): The shape of A should be (M, K) if transA is 0, or 
+                (K, M) if transA is non-zero.
+            B (CTensor): The shape of B should be (K, N) if transB is 0, or 
+                (N, K) if transB is non-zero.
+            C (CTensor): (optional), Optional input tensor C. If not 
specified, 
+                the computation is done as if C is a scalar 0. The shape of C 
+                should be unidirectional broadcastable to (M, N).
         Returns: 
             tensor, the output
         """
@@ -3137,12 +4386,12 @@ class Gemm(Operation):
     def backward(self, dy):
         """
         backward propagation of Gemm
-        Args:dy: 
-            tensor, The shape of A should be (M, K) if transA is 0, or (K, M) 
if transA is non-zero.
+        Args:
+            dy (CTensor): The shape of A should be (M, K) if transA is 0, or 
(K, M) if transA is non-zero.
         Returns: 
-            tensor, the gradient over A
-            tensor, the gradient over B
-            tensor(optional), the gradient over C
+            CTensor, the gradient over A
+            CTensor, the gradient over B
+            CTensor(optional), the gradient over C
         """
         _A, _B, C = self.inputs
         # y = alpha * A  * B  => da = alpha * dy * BT
@@ -3172,39 +4421,42 @@ class Gemm(Operation):
 
 def gemm(A, B, C=None, alpha=1.0, beta=1.0, transA=0, transB=0):
     """
-    init a General Matrix multiplication(Gemm) operator
-    Compute Y = alpha * A' * B' + beta * C, where input tensor A has shape (M, 
K) or (K, M), input tensor B has shape (K, N) or (N, K), input tensor C is 
broadcastable to shape (M, N), and output tensor Y has shape (M, N).
+    Init a General Matrix multiplication(Gemm) operator. Compute Y = alpha * 
+    A' * B' + beta * C, where input tensor A has shape (M, K) or (K, M), input 
+    tensor B has shape (K, N) or (N, K), input tensor C is broadcastable to 
+    shape (M, N), and output tensor Y has shape (M, N).
     A' = transpose(A) if transA else A
     B' = transpose(B) if transB else B
-    Args:A: 
-        tensor, The shape of A should be (M, K) if transA is 0, or (K, M) if 
transA is non-zero.
-    Args:B: 
-        tensor, The shape of B should be (K, N) if transB is 0, or (N, K) if 
transB is non-zero.
-    Args:C: 
-        tensor(optional), Optional input tensor C. If not specified, the 
computation is done as if C is a scalar 0. The shape of C should be 
unidirectional broadcastable to (M, N).
-    Args:alpha: 
-        float, Scalar multiplier for the product of input tensors A * B.
-    Args:beta: 
-        float, Scalar multiplier for input tensor C.
-    Args:transA: 
-        int, Whether A should be transposed
-    Args:transB: 
-        int, Whether B should be transposed
+    Args:
+        A (Tensor): The shape of A should be (M, K) if transA is 0, or 
+            (K, M) if transA is non-zero.
+        B (Tensor): The shape of B should be (K, N) if transB is 0, or 
+            (N, K) if transB is non-zero.
+        C (Tensor): (optional), Optional input tensor C. If not specified, 
+            the computation is done as if C is a scalar 0. The shape of C 
+            should be unidirectional broadcastable to (M, N).
+        alpha (float): Scalar multiplier for the product of input tensors A * 
B.
+        beta (float): Scalar multiplier for input tensor C.
+        transA (int): Whether A should be transposed
+        transB (int): Whether B should be transposed
     Returns: 
-        tensor, the output
+        Tensor, the output
     """
     return Gemm(alpha, beta, transA, transB)(A, B, C)[0]
 
 
 class GlobalAveragePool(Operation):
+    """
+    Init a GlobalAveragePool operator
+    """
 
     def __init__(self, data_format='channels_first'):
         """
-        init a GlobalAveragePool operator
-        Args:data_format: 
-            A string, we support two formats: channels_last and 
channels_first, default is channels_first.
-            channels_first means the format of input is (N x C x H x W)
-            channels_last means the format of input is (N x H x W x C)
+        Args:
+            data_format (string): A string, we support two formats: 
+                channels_last and channels_first, default is channels_first.
+                channels_first means the format of input is (N x C x H x W)
+                channels_last means the format of input is (N x H x W x C)
         """
         super(GlobalAveragePool, self).__init__()
         self.data_format = data_format
@@ -3212,10 +4464,10 @@ class GlobalAveragePool(Operation):
     def forward(self, x):
         """
         forward propagation of GlobalAveragePool
-        Args:x: 
-            the input tensor
+        Args:
+            x (CTensor): the input tensor
         Returns: 
-            tensor, the output
+            CTensor, the output
         """
         if training:
             self.mask = singa.Tensor(x.shape(), x.device())
@@ -3244,10 +4496,10 @@ class GlobalAveragePool(Operation):
     def backward(self, dy):
         """
         backward propagation of GlobalAveragePool
-        Args:dy: 
-            the gradient tensor from upper operations
+        Args:
+            dy (CTensor): the gradient tensor from upper operations
         Returns: 
-            tensor, the gradient over input
+            CTensor, the gradient over input
         """
         self.mask.SetFloatValue(self.shape_divisor)
         return singa.__mul__(self.mask, dy)
@@ -3256,26 +4508,29 @@ class GlobalAveragePool(Operation):
 def globalaveragepool(x, data_format='channels_first'):
     """
     GlobalAveragePool operator
-    Args:x
-        the input tensor
-    Args:data_format: 
-        A string, we support two formats: channels_last and channels_first, 
default is channels_first.
-        channels_first means the format of input is (N x C x H x W)
-        channels_last means the format of input is (N x H x W x C)
+    Args:
+        x (Tensor): the input tensor
+        data_format (string): A string, we support two formats: 
+            channels_last and channels_first, default is channels_first.
+            channels_first means the format of input is (N x C x H x W)
+            channels_last means the format of input is (N x H x W x C)
     Returns: 
-        tensor, the output
+        Tensor, the output
     """
     return GlobalAveragePool(data_format)(x)[0]
 
 
 class ConstantOfShape(Operation):
+    """
+    Init a ConstantOfShape, generate a tensor with given value and shape.
+    """
 
-    def __init__(self, value=0):
+    def __init__(self, value=0.):
         """
-        Init a ConstantOfShape, generate a tensor with given value and shape.
         Args:
-            value: (Optional) The value of the output elements. Should be a 
one-element value. If not specified, 
-            it defaults to 0 and datatype float32
+            value (float): (Optional) The value of the output elements. Should 
+                be a one-element value. If not specified, it defaults to 0 and 
+                datatype float32
         """
         super(ConstantOfShape, self).__init__()
         self.value = value
@@ -3284,10 +4539,13 @@ class ConstantOfShape(Operation):
         """
         forward of ConstantOfShape
         Args:
-            x: CTensor, 1D tensor. The shape of the expected output tensor. 
All values must be >= 0.
+            x: CTensor, 1D tensor. The shape of the expected output tensor. 
+                All values must be >= 0.
         Returns:
-            the output CTensor. If attribute 'value' is specified, the value 
and datatype of the output tensor is taken from 'value'. 
-            If attribute 'value' is not specified, the value in the output 
defaults to 0, and the datatype defaults to float32.
+            the output CTensor. If attribute 'value' is specified, the value 
+                and datatype of the output tensor is taken from 'value'. If 
+                attribute 'value' is not specified, the value in the output 
+                defaults to 0, and the datatype defaults to float32.
         """
         x_shape = tensor.to_numpy(tensor.from_raw_tensor(x)).astype(
             np.int64).tolist()
@@ -3300,7 +4558,9 @@ class ConstantOfShape(Operation):
         """
         backward of ConstantOfShape
         Args:
-            dy: CTensor, gradient tensor.
+            dy (CTensor): gradient tensor.
+        Raises:
+            AssertionError: no backward function for this operator
         """
         assert False, ('no gradient for backward function')
 
@@ -3309,25 +4569,30 @@ def constant_of_shape(x, value=0):
     """
     Init a ConstantOfShape, generate a tensor with given value and shape.
     Args:
-        x: CTensor, 1D tensor. The shape of the expected output tensor. All 
values must be >= 0.
-    Args:
-        value: (Optional) The value of the output elements. Should be a 
one-element tensor. If not specified, 
-        it defaults to a tensor of value 0 and datatype float32
+        x: Tensor, 1D tensor. The shape of the expected output tensor. 
+            All values must be >= 0.
+        value (float): (Optional) The value of the output elements. Should 
+            be a one-element value. If not specified, it defaults to 0 and 
+            datatype float32
     Returns:
-            the output CTensor. If attribute 'value' is specified, the value 
and datatype of the output tensor is taken from 'value'. 
-            If attribute 'value' is not specified, the value in the output 
defaults to 0, and the datatype defaults to float32.
+        the output Tensor. If attribute 'value' is specified, the value 
+            and datatype of the output tensor is taken from 'value'. If 
+            attribute 'value' is not specified, the value in the output 
+            defaults to 0, and the datatype defaults to float32.
     """
     return ConstantOfShape(value)(x)[0]
 
 
 class Dropout(Operation):
+    """
+    Init a Dropout, which scales the masked input data by the following 
equation:
+    output = scale * data * mask, scale = 1. / (1. - ratio).
+    """
 
     def __init__(self, ratio=0.5):
         """
-        Init a Dropout, which scales the masked input data by the following 
equation:
-        output = scale * data * mask, scale = 1. / (1. - ratio).
         Args:
-            ratio: float, he ratio of random dropout, with value in [0, 1).
+            ratio (float): the ratio of random dropout, with value in [0, 1).
         """
         super(Dropout, self).__init__()
         self.ratio = ratio
@@ -3336,7 +4601,7 @@ class Dropout(Operation):
         """
         forward of Dropout
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3351,7 +4616,7 @@ class Dropout(Operation):
         """
         backward of Dropout
         Args:
-            dy: CTensor, gradient tensor.
+            dy (CTensor): gradient tensor.
         Returns:
             the gradient tensor over input tensor.
         """
@@ -3362,28 +4627,31 @@ class Dropout(Operation):
 
 def dropout(x, ratio=0.5):
     """
-    Init a Dropout, which scales the masked input data by the following 
equation:
-    output = scale * data * mask, scale = 1. / (1. - ratio).
-    Args:
-        x: CTensor, input tensor.
+    Init a Dropout, which scales the masked input data by the following 
+    equation: output = scale * data * mask, scale = 1. / (1. - ratio).
     Args:
-        ratio: float, he ratio of random dropout, with value in [0, 1).
+        x (Tensor): input tensor.
+        ratio (float): the ratio of random dropout, with value in [0, 1).
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return Dropout(ratio)(x)[0]
 
 
 class ReduceSum(Operation):
+    """
+    Init a ReduceSum, computes the sum of the input tensor's element along 
+    the provided axes.
+    """
 
     def __init__(self, axes=None, keepdims=1):
         """
-        Init a ReduceSum, computes the sum of the input tensor's element along 
the provided axes.
         Args:
-            axes: list of ints, A list of integers, along which to reduce. 
Accepted range is [-r, r-1] where r = rank(data).
-            The default is None, which reduces over all the dimensions of the 
input tensor.
-        Args:
-            keepdims: int, Keep the reduced dimension or not, default 1 mean 
keep reduced dimension.
+            axes (list of int): A list of integers, along which to reduce. 
+                Accepted range is [-r, r-1] where r = rank(data). The default 
+                is None, which reduces over all the dimensions of the input 
tensor.
+            keepdims (int): Keep the reduced dimension or not, default 1 
+                means keep the reduced dimension.
         """
         super(ReduceSum, self).__init__()
         self.axes = axes
@@ -3393,7 +4661,7 @@ class ReduceSum(Operation):
         """
         forward of ReduceSum
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3418,7 +4686,7 @@ class ReduceSum(Operation):
         """
         backward of ReduceSum
         Args:
-            dy: CTensor, gradient tensor.
+            dy (CTensor): gradient tensor.
         Returns:
             the gradient tensor over input tensor.
         """
@@ -3433,30 +4701,35 @@ class ReduceSum(Operation):
 
 def reduce_sum(x, axes=None, keepdims=1):
     """
-    Init a ReduceSum, computes the sum of the input tensor's element along the 
provided axes.
-    Args:
-        x: CTensor, input tensor.
+    Init a ReduceSum, computes the sum of the input tensor's element along 
+    the provided axes.
     Args:
-        axes: list of ints, A list of integers, along which to reduce. 
Accepted range is [-r, r-1] where r = rank(data).
-        The default is None, which reduces over all the dimensions of the 
input tensor.
-    Args:
-        keepdims: int, Keep the reduced dimension or not, default 1 mean keep 
reduced dimension.
+        x (Tensor): input tensor.
+        axes (list of int): A list of integers, along which to reduce. 
+            Accepted range is [-r, r-1] where r = rank(data). The default 
+            is None, which reduces over all the dimensions of the input tensor.
+        keepdims (int): Keep the reduced dimension or not, default 1 
+            means keep the reduced dimension.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return ReduceSum(axes, keepdims)(x)[0]
 
 
 class ReduceMean(Operation):
+    """
+    Init a ReduceMean, computes the mean of the input tensor's element along 
+    the provided axes.
+    """
 
     def __init__(self, axes=None, keepdims=1):
         """
-        Init a ReduceMean, computes the mean of the input tensor's element 
along the provided axes.
-        Args:
-            axes: list of ints, A list of integers, along which to reduce. 
Accepted range is [-r, r-1] where r = rank(data).
-            The default is None, which reduces over all the dimensions of the 
input tensor.
         Args:
-            keepdims: int, Keep the reduced dimension or not, default 1 mean 
keep reduced dimension.
+            axes (list of int): A list of integers, along which to reduce. 
+                Accepted range is [-r, r-1] where r = rank(data). The default 
+                is None, which reduces over all the dimensions of the input 
tensor.
+            keepdims (int): Keep the reduced dimension or not, default 1 
+                means keep the reduced dimension.
         """
         super(ReduceMean, self).__init__()
         self.axes = axes
@@ -3466,7 +4739,7 @@ class ReduceMean(Operation):
         """
         forward of ReduceMean
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3493,7 +4766,7 @@ class ReduceMean(Operation):
         """
         backward of ReduceMean
         Args:
-            dy: CTensor, gradient tensor.
+            dy (CTensor): gradient tensor.
         Returns:
             the gradient tensor over input tensor.
         """
@@ -3507,36 +4780,38 @@ class ReduceMean(Operation):
 
 def reduce_mean(x, axes=None, keepdims=1):
     """
-    Init a ReduceMean, computes the mean of the input tensor's element along 
the provided axes.
-    Args:
-        x: CTensor, input tensor.
-    Args:
-        axes: list of ints, A list of integers, along which to reduce. 
Accepted range is [-r, r-1] where r = rank(data).
-        The default is None, which reduces over all the dimensions of the 
input tensor.
+    Init a ReduceMean, computes the mean of the input tensor's element along 
+    the provided axes.
     Args:
-        keepdims: int, Keep the reduced dimension or not, default 1 mean keep 
reduced dimension.
+        x (Tensor): input tensor.
+        axes (list of int): A list of integers, along which to reduce. 
+            Accepted range is [-r, r-1] where r = rank(data). The default 
+            is None, which reduces over all the dimensions of the input tensor.
+        keepdims (int): Keep the reduced dimension or not, default 1 
+            means keep the reduced dimension.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return ReduceMean(axes, keepdims)(x)[0]
 
 
 class Slice(Operation):
+    """
+    Init a Slice, Produces a slice of the input tensor along multiple axes. 
+    Similar to numpy: 
https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
+    """
 
     def __init__(self, starts, ends, axes=None, steps=None):
         """
-        Init a Slice, Produces a slice of the input tensor along multiple 
axes. Similar to numpy: 
-        https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
-        Args:
-            starts: list of ints, starting indices of corresponding axis
-        Args:
-            ends: list of ints, ending indices of corresponding axis
         Args:
-            axes: list of ints, axes that `starts` and `ends` apply to. 
-            Negative value means counting dimensions from the back. Accepted 
range is [-r, r-1] where r = rank(data).
-        Args:
-            steps: list of ints, slice step of corresponding axis in `axes`. 
-            Negative value means slicing backward. 'steps' cannot be 0. 
Defaults to 1.
+            starts (list of int): starting indices of corresponding axis
+            ends (list of int): ending indices of corresponding axis
+            axes (list of int): axes that `starts` and `ends` apply to. 
+                Negative value means counting dimensions from the back. 
+                Accepted range is [-r, r-1] where r = rank(data).
+            steps (list of int): slice step of corresponding axis in `axes`. 
+                Negative value means slicing backward. 'steps' cannot be 0. 
+                Defaults to 1.
         """
         super(Slice, self).__init__()
         self.starts = starts
@@ -3548,7 +4823,7 @@ class Slice(Operation):
         """
         forward of Slice
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3580,7 +4855,7 @@ class Slice(Operation):
         """
         backward of Slice
         Args:
-            dy: CTensor, gradient tensor.
+            dy (CTensor): gradient tensor.
         Returns:
             the gradient tensor over input tensor.
         """
@@ -3605,39 +4880,38 @@ class Slice(Operation):
 
 def slice(x, starts, ends, axes=None, steps=None):
     """
-    Init a Slice, Produces a slice of the input tensor along multiple axes. 
Similar to numpy: 
-    https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
-    Args:
-        x: CTensor, input tensor.
-    Args:
-        starts: list of ints, starting indices of corresponding axis
-    Args:
-        ends: list of ints, ending indices of corresponding axis
+    Init a Slice, Produces a slice of the input tensor along multiple axes. 
+    Similar to numpy: 
https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
     Args:
-        axes: list of ints, axes that `starts` and `ends` apply to. 
-        Negative value means counting dimensions from the back. Accepted range 
is [-r, r-1] where r = rank(data).
-    Args:
-        steps: list of ints, slice step of corresponding axis in `axes`. 
-        Negative value means slicing backward. 'steps' cannot be 0. Defaults 
to 1.
+        x (Tensor): input tensor.
+        starts (list of int): starting indices of corresponding axis
+        ends (list of int): ending indices of corresponding axis
+        axes (list of int): axes that `starts` and `ends` apply to. 
+            Negative value means counting dimensions from the back. 
+            Accepted range is [-r, r-1] where r = rank(data).
+        steps (list of int): slice step of corresponding axis in `axes`. 
+            Negative value means slicing backward. 'steps' cannot be 0. 
+            Defaults to 1.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return Slice(starts, ends, axes, steps)(x)[0]
 
 
 class Ceil(Operation):
+    """
+    Ceil takes one input data (Tensor) and produces one output data (Tensor) 
+    where the ceil is, y = ceil(x), is applied to the tensor elementwise.
+    """
 
     def __init__(self):
-        """
-        Ceil takes one input data (Tensor) and produces one output data 
(Tensor) where the ceil is, y = ceil(x), is applied to the tensor elementwise.
-        """
         super(Ceil, self).__init__()
 
     def forward(self, x):
         """
         forward of Ceil
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3647,7 +4921,7 @@ class Ceil(Operation):
         """
         backward of Ceil
         Args:
-            dy: CTensor, gradient tensor.
+            dy (CTensor): gradient tensor.
         Returns:
             the gradient tensor over input tensor.
         """
@@ -3658,28 +4932,33 @@ class Ceil(Operation):
 
 def ceil(x):
     """
-    Ceil takes one input data (Tensor) and produces one output data (Tensor) 
where the ceil is, y = ceil(x), is applied to the tensor elementwise.
+    Ceil takes one input data (Tensor) and produces one output data (Tensor) 
+    where the ceil is, y = ceil(x), is applied to the tensor elementwise.
     Args:
-        x: CTensor, input tensor.
+        x (Tensor): input tensor.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return Ceil()(x)[0]
 
 
 class Split(Operation):
+    """
+    Init a Split, Split a tensor into a list of tensors, along the specified 
+    'axis'. 
+    """
 
     def __init__(self, axis, parts, num_output=None):
         """
-        Init a Split, Split a tensor into a list of tensors, along the 
specified 'axis'. 
-        Args:
-            axis: int, Which axis to split on. A negative value means counting 
dimensions from the back. 
-            Accepted range is [-rank, rank-1] where r = rank(input).
         Args:
-            parts: list of ints, length of each output, which can be specified 
using argument 'parts'. 
-            Otherwise, the tensor is parts to equal sized parts.
-        Args:
-            num_output: once parts is none, the tensor is split to equal sized 
parts for each output.
+            axis (int): which axis to split on. A negative value means 
+                counting dimensions from the back. Accepted range is 
+                [-rank, rank-1] where r = rank(input).
+            parts (list of int): length of each output, which can be specified 
+                using argument 'parts'. Otherwise, the tensor is split to 
+                equal sized parts.
+            num_output (int): once parts is none, the tensor is split to 
+                equal sized parts for each output.
         """
         super(Split, self).__init__()
         self.axis = axis
@@ -3692,7 +4971,7 @@ class Split(Operation):
         """
         forward of Split
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3722,35 +5001,38 @@ class Split(Operation):
 
 def split(x, axis, parts, num_output=None):
     """
-    Init a Split, Split a tensor into a list of tensors, along the specified 
'axis'. 
-    Args:
-        x: CTensor, input tensor.
-    Args:
-        axis: int, Which axis to split on. A negative value means counting 
dimensions from the back. 
-        Accepted range is [-rank, rank-1] where r = rank(input).
-    Args:
-        parts: list of ints, length of each output, which can be specified 
using argument 'parts'. 
-        Otherwise, the tensor is split to equal sized parts.
+    Init a Split, Split a tensor into a list of tensors, along the specified 
+    'axis'. 
     Args:
-        num_output: once parts is none, the tensor is split to equal sized 
parts for each output.
+        x (Tensor): input tensor.
+        axis (int): which axis to split on. A negative value means 
+            counting dimensions from the back. Accepted range is 
+            [-rank, rank-1] where r = rank(input).
+        parts (list of int): length of each output, which can be specified 
+            using argument 'parts'. Otherwise, the tensor is split to 
+            equal sized parts.
+        num_output (int): once parts is none, the tensor is split to equal 
+            sized parts for each output.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return Split(axis, parts, num_output)(x)
 
 
 class Gather(Operation):
+    """
+    Init a Gather, Given data tensor of rank r >= 1, and indices tensor of 
+    rank q, gather entries of the axis dimension of data (by default 
outer-most 
+    one as axis=0) indexed by indices, and concatenates them in an output 
tensor of rank q + (r - 1).
+    """
 
     def __init__(self, axis, indices):
         """
-        Init a Gather, Given data tensor of rank r >= 1, and indices tensor of 
rank q, gather entries of 
-        the axis dimension of data (by default outer-most one as axis=0) 
indexed by indices,
-        and concatenates them in an output tensor of rank q + (r - 1).
         Args:
-            axis: int, Which axis to slice on. A negative value means counting 
dimensions from the back. 
-            Accepted range is [-rank, rank-1] where r = rank(input).
-        Args:
-            indices: list of ints, entries of the axis dimension of data.
+            axis (int): which axis to slice on. A negative value means 
counting 
+                dimensions from the back. Accepted range is [-rank, rank-1] 
+                where r = rank(input).
+            indices (list of int): entries of the axis dimension of data.
         """
         super(Gather, self).__init__()
         self.axis = axis
@@ -3760,7 +5042,7 @@ class Gather(Operation):
         """
         forward of Gather
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3792,7 +5074,7 @@ class Gather(Operation):
         """
         backward of Gather
         Args:
-            dy: CTensor, gradient tensor.
+            dy (CTensor): gradient tensor.
         Returns:
             the gradient tensor over input tensor.
         """
@@ -3839,31 +5121,33 @@ class Gather(Operation):
 
 def gather(x, axis, indices):
     """
-    Init a Gather, Given data tensor of rank r >= 1, and indices tensor of 
rank q, gather entries of 
-    the axis dimension of data (by default outer-most one as axis=0) indexed 
by indices,
-    and concatenates them in an output tensor of rank q + (r - 1).
-    Args:
-        x: CTensor, input tensor.
+    Init a Gather, Given data tensor of rank r >= 1, and indices tensor of 
+    rank q, gather entries of the axis dimension of data (by default 
outer-most 
+    one as axis=0) indexed by indices, and concatenates them in an output 
tensor of rank q + (r - 1).
     Args:
-        axis: int, Which axis to slice on. A negative value means counting 
dimensions from the back. 
-        Accepted range is [-rank, rank-1] where r = rank(input).
-    Args:
-        indices: list of ints, entries of the axis dimension of data.
+        x (Tensor): input tensor.
+        axis (int): which axis to slice on. A negative value means counting 
+            dimensions from the back. Accepted range is [-rank, rank-1] 
+            where r = rank(input).
+        indices (list of int): entries of the axis dimension of data.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return Gather(axis, indices)(x)[0]
 
 
 class Tile(Operation):
+    """
+    Init a Tile, Constructs a tensor by tiling a given tensor. This is the 
same 
+    as function tile in Numpy: 
https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html
+    """
 
     def __init__(self, repeats):
         """
-        Init a Tile, Constructs a tensor by tiling a given tensor. This is the 
same as function tile in Numpy:
-        https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html
         Args:
-            repeats: 1D int64 matrix of the same length as input's dimension 
number,
-            includes numbers of repeated copies along input's dimensions.
+            repeats (list of int): 1D int matrix of the same length as input's 
+                dimension number, includes numbers of repeated copies along 
+                input's dimensions.
         """
         super(Tile, self).__init__()
         self.repeats = [repeats] if isinstance(repeats, int) else repeats
@@ -3872,7 +5156,7 @@ class Tile(Operation):
         """
         forward of Tile
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3896,7 +5180,7 @@ class Tile(Operation):
         """
         backward of Tile
         Args:
-            dy: CTensor, gradient tensor.
+            dy (CTensor): gradient tensor.
         Returns:
             the gradient tensor over input tensor.
         """
@@ -3922,33 +5206,33 @@ class Tile(Operation):
 
 def tile(x, repeats):
     """
-    Init a Tile, Constructs a tensor by tiling a given tensor. This is the 
same as function tile in Numpy:
-    https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html
-    Args:
-        x: CTensor, input tensor.
+    Init a Tile, Constructs a tensor by tiling a given tensor. This is the 
same 
+    as function tile in Numpy: 
https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html
     Args:
-        repeats: 1D int64 matrix of the same length as input's dimension 
number,
-        includes numbers of repeated copies along input's dimensions.
+        x (Tensor): input tensor.
+        repeats (list of int): 1D int matrix of the same length as input's 
+            dimension number, includes numbers of repeated copies along 
+            input's dimensions.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return Tile(repeats)(x)[0]
 
 
 class NonZero(Operation):
+    """
+    Init a NonZero, Constructs a tensor by tiling a given tensor. This is the 
same 
+    as function tile in Numpy: 
https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html
+    """
 
     def __init__(self):
-        """
-        Init a NonZero, Constructs a tensor by tiling a given tensor. This is 
the same as function tile in Numpy:
-        https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html
-        """
         super(NonZero, self).__init__()
 
     def forward(self, x):
         """
         forward of NonZero
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -3962,33 +5246,36 @@ class NonZero(Operation):
         """
         backward of NonZero
         Args:
-            dy: CTensor, gradient tensor.
-        Returns:
-            the gradient tensor over input tensor.
+            dy (CTensor): gradient tensor.
+        Raises:
+            AssertionError: no backward function for this operator
         """
         assert False, ('no gradient for backward function')
 
 
 def nonzero(x):
     """
-    Returns the indices of the elements that are non-zero (in row-major order 
- by dimension). 
-    NonZero behaves similar to numpy.nonzero: 
https://docs.scipy.org/doc/numpy/reference/generated/numpy.nonzero.html
+    Returns the indices of the elements that are non-zero (in row-major 
+    order - by dimension). NonZero behaves similar to numpy.nonzero: 
+    https://docs.scipy.org/doc/numpy/reference/generated/numpy.nonzero.html
     Args:
-        x: CTensor, input tensor.
+        x (Tensor): input tensor.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return NonZero()(x)[0]
 
 
 class Cast(Operation):
+    """
+    The operator casts the elements of a given input tensor to a data type 
+    specified by the 'to' argument and returns an output tensor of the same 
+    size in the converted type.
+    """
 
     def __init__(self, to):
         """
-        The operator casts the elements of a given input tensor to a data type 
specified by the 'to' argument 
-        and returns an output tensor of the same size in the converted type.
         Args:
-            to: data type, 
+            to (int): data type, float32 = 0; int = 2.
         """
         super(Cast, self).__init__()
         self.to = to
@@ -3997,7 +5284,7 @@ class Cast(Operation):
         """
         forward of Cast
         Args:
-            x: CTensor, input tensor.
+            x (CTensor): input tensor.
         Returns:
             the output CTensor.
         """
@@ -4010,39 +5297,48 @@ class Cast(Operation):
         """
         backward of Cast
         Args:f
-            dy: CTensor, gradient tensor.
-        Returns:
-            the gradient tensor over input tensor.
+            dy (CTensor): gradient tensor.
+        Raises:
+            AssertionError: no backward function for this operator
         """
         assert False, ('no gradient for backward function')
 
 
 def cast(x, to):
     """
-    The operator casts the elements of a given input tensor to a data type 
specified by the 'to' argument 
-    and returns an output tensor of the same size in the converted type.
-    Args:x: 
-        CTensor, input tensor.
+    The operator casts the elements of a given input tensor to a data type 
+    specified by the 'to' argument and returns an output tensor of the same 
+    size in the converted type.
     Args:
-        to: data type
+        x (Tensor): input tensor.
+        to (int): data type, float32 = 0; int = 2.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return Cast(to)(x)[0]
 
 
 class OneHot(Operation):
+    """
+    Produces a one-hot tensor based on inputs. 
+    """
 
     def __init__(self, axis, depth, values):
         """
-        Produces a one-hot tensor based on inputs. 
-        Args:
-            axis: Axis along which one-hot representation in added. Default: 
axis=-1. 
-            axis=-1 means that the additional dimension will be inserted as 
the innermost/last dimension in the output tensor.
         Args:
-            values: Rank 1 tensor containing exactly two elements, in the 
format [off_value, on_value], 
-            where 'on_value' is the value used for filling locations specified 
in 'indices' input tensor, 
-            and 'off_value' is the value used for filling locations other than 
those specified in 'indices' input tensor.
+            axis (int): Axis along which one-hot representation in added. 
+                Default: axis=-1. axis=-1 means that the additional dimension 
+                will be inserted as the innermost/last dimension in the output 
+                tensor.
+            depth (int): Scalar specifying the number of classes in one-hot 
+                tensor. This is also the size of the one-hot dimension 
+                (specified by 'axis' attribute) added on in the output tensor. 
+                The values in the 'indices' input tensor are expected to be in 
+                the range [-depth, depth-1].
+            values (float): Rank 1 tensor containing exactly two elements, in 
+                the format [off_value, on_value], where 'on_value' is the 
+                value used for filling locations specified in 'indices' input 
+                tensor, and 'off_value' is the value used for filling 
+                locations other than those specified in 'indices' input 
+                tensor.
         """
         super(OneHot, self).__init__()
         self.axis = axis
@@ -4051,13 +5347,11 @@ class OneHot(Operation):
 
     def forward(self, indices):
         """
-        forward of OneHot
-        ! borrow from onnx
+        forward of OneHot, we borrow this function from onnx
         Args:
-            indices: Scalar specifying the number of classes in one-hot 
tensor. 
-            This is also the size of the one-hot dimension (specified by 
'axis' attribute) added on in the output tensor. 
-            The values in the 'indices' input tensor are expected to be in the 
range [-depth, depth-1]. 
-            In case 'depth' is of non-integer type, it will be casted to int64 
before use.
+            indices (CTensor): Scalar specifying the number of classes in 
+                one-hot tensor. The values in the 'indices' input tensor are 
+                expected to be in the range [-depth, depth-1]. 
         Returns:
             the output CTensor.
         """
@@ -4080,10 +5374,10 @@ class OneHot(Operation):
     def backward(self, dy):
         """
         backward of OneHot
-        Args:f
-            dy: CTensor, gradient tensor.
-        Returns:
-            the gradient tensor over input tensor.
+        Args:
+            dy (CTensor): gradient tensor.
+        Raises:
+            AssertionError: no backward function for this operator
         """
         assert False, ('no gradient for backward function')
 
@@ -4092,18 +5386,23 @@ def onehot(axis, indices, depth, values):
     """
     Produces a one-hot tensor based on inputs. 
     Args:
-        axis: Axis along which one-hot representation in added. Default: 
axis=-1. 
-        axis=-1 means that the additional dimension will be inserted as the 
innermost/last dimension in the output tensor.
-    Args:
-        indices: Scalar specifying the number of classes in one-hot tensor. 
-        This is also the size of the one-hot dimension (specified by 'axis' 
attribute) added on in the output tensor. 
-        The values in the 'indices' input tensor are expected to be in the 
range [-depth, depth-1]. 
-        In case 'depth' is of non-integer type, it will be casted to int64 
before use.
-    Args:
-        values: Rank 1 tensor containing exactly two elements, in the format 
[off_value, on_value], 
-        where 'on_value' is the value used for filling locations specified in 
'indices' input tensor, 
-        and 'off_value' is the value used for filling locations other than 
those specified in 'indices' input tensor.
+        axis (int): Axis along which one-hot representation in added. 
+            Default: axis=-1. axis=-1 means that the additional dimension 
+            will be inserted as the innermost/last dimension in the output 
+            tensor.
+        indices (Tensor): Scalar specifying the number of classes in 
+            one-hot tensor. The values in the 'indices' input tensor are 
+            expected to be in the range [-depth, depth-1]. 
+        depth (int): Scalar specifying the number of classes in one-hot 
+            tensor. This is also the size of the one-hot dimension 
+            (specified by 'axis' attribute) added on in the output tensor. 
+            The values in the 'indices' input tensor are expected to be in 
+            the range [-depth, depth-1].
+        values (float): Rank 1 tensor containing exactly two elements, in 
+            the format [off_value, on_value], where 'on_value' is the 
+            value used for filling locations specified in 'indices' input 
+            tensor, and 'off_value' is the value used for filling 
+            locations other than those specified in 'indices' input 
+            tensor.
     Returns:
-        the output CTensor.
+        the output Tensor.
     """
     return OneHot(axis, depth, values)(indices)[0]

Reply via email to