SINGA-237 New documentation files for SINGA v1.0

Updated the comments in the Python files so that Sphinx autodoc can generate the Python API documentation.
Fixed a bug in optimizer which ignored the momentum value Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/8cd55300 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/8cd55300 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/8cd55300 Branch: refs/heads/dev Commit: 8cd55300ab30673414bbeeec7d68f1ddcd6393a2 Parents: 3299b0c Author: Wei Wang <[email protected]> Authored: Fri Aug 12 14:45:41 2016 +0800 Committer: wangwei <[email protected]> Committed: Sun Aug 14 13:47:04 2016 +0800 ---------------------------------------------------------------------- cmake/Dependencies.cmake | 5 +- doc/conf.py | 14 +- doc/docs/device.rst | 29 +- doc/docs/index.rst | 6 + doc/docs/initializer.rst | 12 + doc/docs/layer.rst | 14 + doc/docs/loss.rst | 7 + doc/docs/metric.rst | 8 + doc/docs/optimizer.rst | 11 + doc/docs/tensor.md | 7 - doc/docs/tensor.rst | 30 ++ doc/docs/utils.rst | 6 + doc/index.rst | 28 +- examples/index.rst | 6 + src/python/singa/device.py | 31 ++ src/python/singa/initializer.py | 86 ++++- src/python/singa/layer.py | 417 ++++++++++++++---------- src/python/singa/loss.py | 105 +++++- src/python/singa/metric.py | 49 ++- src/python/singa/optimizer.py | 284 ++++++++-------- src/python/singa/tensor.py | 608 ++++++++++++++++++++++++++++++----- 21 files changed, 1331 insertions(+), 432 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/cmake/Dependencies.cmake ---------------------------------------------------------------------- diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ceef429..e533ca8 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -54,12 +54,13 @@ IF(USE_OPENCV) MESSAGE(STATUS "Found OpenCV_${OpenCV_VERSION} at ${OpenCV_INCLUDE_DIRS}") INCLUDE_DIRECTORIES(SYSTEM ${OpenCV_INCLUDE_DIRS}) LIST(APPEND SINGA_LINKER_LIBS ${OpenCV_LIBRARIES}) -ENDIF() +ENDIF() #LIST(APPEND SINGA_LINKER_LIBS "/home/wangwei/local/lib/libopenblas.so") #MESSAGE(STATUS "link lib : " ${SINGA_LINKER_LIBS}) IF(USE_PYTHON) - FIND_PACKAGE(PythonLibs REQUIRED) + FIND_PACKAGE(PythonLibs 2.7 REQUIRED) + FIND_PACKAGE(PythonInterp 2.7 REQUIRED) FIND_PACKAGE(SWIG 3.0 REQUIRED) ENDIF() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/conf.py ---------------------------------------------------------------------- diff --git a/doc/conf.py b/doc/conf.py index 20ba51a..9f52d16 100755 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,7 +19,8 @@ import os import sys sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(1, '../src/python/singa/') +sys.path.insert(1, os.path.abspath('../build/python')) +#autodoc_mock_imports = ['singa.device', 'singa.tensor', 'singa.layer'] # -- General configuration ------------------------------------------------ from recommonmark.parser import CommonMarkParser @@ -35,9 +36,8 @@ source_parsers = { # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = [ -'sphinx.ext.autodoc' -] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon'] +napoleon_google_docstring = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -50,7 +50,7 @@ source_suffix = ['.rst', '.md'] # The encoding of source files. 
# -# source_encoding = 'utf-8-sig' +source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' @@ -150,7 +150,7 @@ html_theme = 'sphinx_rtd_theme' # The name of an image file (relative to this directory) to place at the top # of the sidebar. # -html_logo = '/singa.png' +html_logo = 'image/singa.png' # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 @@ -203,7 +203,7 @@ html_static_path = ['_static'] # If true, links to the reST sources are added to the pages. # -html_show_sourcelink = False +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. # http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/device.rst ---------------------------------------------------------------------- diff --git a/doc/docs/device.rst b/doc/docs/device.rst index aa5defb..53faf48 100644 --- a/doc/docs/device.rst +++ b/doc/docs/device.rst @@ -2,7 +2,10 @@ Device ======= -The Device abstract represent a hardware device with memory and compuation units. +The Device abstract represents any hardware device with memory and compuation units. +All [Tensor operations](tensor.html) are scheduled by the resident device for execution. +Tensor memory is also managed by the device's memory manager. Therefore, optimization +of memory and execution are implemented in the Device class. Specific devices ---------------- @@ -13,24 +16,14 @@ Currently, SINGA has three Device implmentations, 3. OpenclGPU for a GPU card which runs OpenCL code -Create devices ---------------- - Python API -~~~~~~~~~~ - -.. autofunction:: device.create_cuda_gpus - -.. autofunction:: device.create_cuda_gpus_on - -.. autofunction:: device.create_cuda_gpu_on - -.. autofunction:: device.get_default_device +---------- +.. automodule:: singa.device + :members: create_cuda_gpus, create_cuda_gpus_on, get_default_device -The following code shows how to create devices, -.. code:: python +The following code provides examples of creating devices:: from singa import device cuda = device.create_cuda_gpu_on(0) # use GPU card of ID 0 @@ -39,9 +32,5 @@ The following code shows how to create devices, ary2 = device.create_cuda_gpus([0,2]) # create 2 devices on ID 0 and 2 - CPP API -~~~~~~~ - - - +--------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/index.rst ---------------------------------------------------------------------- diff --git a/doc/docs/index.rst b/doc/docs/index.rst index 8a74976..2294054 100644 --- a/doc/docs/index.rst +++ b/doc/docs/index.rst @@ -6,4 +6,10 @@ English installation software_stack device + tensor + layer + initializer + loss + metric + optimizer examples http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/initializer.rst ---------------------------------------------------------------------- diff --git a/doc/docs/initializer.rst b/doc/docs/initializer.rst new file mode 100644 index 0000000..a190702 --- /dev/null +++ b/doc/docs/initializer.rst @@ -0,0 +1,12 @@ +Initializer +=========== + +Python API +---------- + +.. 
automodule:: singa.initializer + :members: + :member-order: bysource + +CPP API +-------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/layer.rst ---------------------------------------------------------------------- diff --git a/doc/docs/layer.rst b/doc/docs/layer.rst new file mode 100644 index 0000000..62ef3c3 --- /dev/null +++ b/doc/docs/layer.rst @@ -0,0 +1,14 @@ +Layer +====== + +Python API +----------- +.. automodule:: singa.layer + :members: + :member-order: bysource + :show-inheritance: + :undoc-members: + + +CPP API +-------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/loss.rst ---------------------------------------------------------------------- diff --git a/doc/docs/loss.rst b/doc/docs/loss.rst new file mode 100644 index 0000000..27872dd --- /dev/null +++ b/doc/docs/loss.rst @@ -0,0 +1,7 @@ +Loss +========= + + +.. automodule:: singa.loss + :members: + :show-inheritance: http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/metric.rst ---------------------------------------------------------------------- diff --git a/doc/docs/metric.rst b/doc/docs/metric.rst new file mode 100644 index 0000000..35fa24e --- /dev/null +++ b/doc/docs/metric.rst @@ -0,0 +1,8 @@ +Metric +========= + + +.. automodule:: singa.metric + :members: + :show-inheritance: + :member-order: bysource http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/optimizer.rst ---------------------------------------------------------------------- diff --git a/doc/docs/optimizer.rst b/doc/docs/optimizer.rst new file mode 100644 index 0000000..486c01e --- /dev/null +++ b/doc/docs/optimizer.rst @@ -0,0 +1,11 @@ +Optimizer +========= + + +.. automodule:: singa.optimizer + :members: + :member-order: bysource + :show-inheritance: + :undoc-members: + + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/tensor.md ---------------------------------------------------------------------- diff --git a/doc/docs/tensor.md b/doc/docs/tensor.md deleted file mode 100644 index eaf8362..0000000 --- a/doc/docs/tensor.md +++ /dev/null @@ -1,7 +0,0 @@ -# Tensor - - -## - - -## http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/tensor.rst ---------------------------------------------------------------------- diff --git a/doc/docs/tensor.rst b/doc/docs/tensor.rst new file mode 100644 index 0000000..ff6142e --- /dev/null +++ b/doc/docs/tensor.rst @@ -0,0 +1,30 @@ +Tensor +======== + +Each Tensor instance is a multi-dimensional array allocated on a specific +Device instance. Tensor instances store variables and provide +linear algebra operations over different types of hardware devices without user +awareness. Note that users need to make sure the tensor operands are +allocated on the same device except copy functions. + + +Tensor implementation +--------------------- + +SINGA has three different sets of implmentations of Tensor functions, one for each +type of Device. + +* 'tensor_math_cpp.h' implements operations using Cpp (with CBLAS) for CppGPU devices. +* 'tensor_math_cuda.h' implements operations using Cuda (with cuBLAS) for CudaGPU devices. +* 'tensor_math_opencl.h' implements operations using OpenCL for OpenclGPU devices. + +Python API +---------- + + +.. 
automodule:: singa.tensor + :members: + + +CPP API +--------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/docs/utils.rst ---------------------------------------------------------------------- diff --git a/doc/docs/utils.rst b/doc/docs/utils.rst new file mode 100644 index 0000000..5306719 --- /dev/null +++ b/doc/docs/utils.rst @@ -0,0 +1,6 @@ +Misc. +========= + + +.. automodule:: singa.utils + :members: http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/doc/index.rst ---------------------------------------------------------------------- diff --git a/doc/index.rst b/doc/index.rst index ec727b1..50c65d7 100755 --- a/doc/index.rst +++ b/doc/index.rst @@ -7,9 +7,9 @@ Welcome to Apache Singa ======================= Recent News -=========== +----------- -* The **third release** is now available, 20 April, 2016. `Download SINGA v0.3.0 <downloads.html>`_ +* The **third release** is now available, 20 April, 2016. `Download SINGA v0.3.0 <downloads.html>`_ * The **second release** is now available, 14 Jan, 2016. `Download SINGA v0.2.0 <downloads.html>`_. @@ -34,7 +34,7 @@ Recent News * SINGA has been accepted by `Apache Incubator <http://incubator.apache.org/>`_, 17 March, 2015. Getting Started -=============== +--------------- * The `Introduction <docs/overview.html>`_ page gives an overview of SINGA. * The `Installation <docs/installation.html>`_ guide describes details on downloading and installing SINGA. @@ -42,7 +42,7 @@ Getting Started * Please follow the `Quick Start <docs/quick-start.html>`_ guide to run simple applications on SINGA. Documentation -============= +------------- * Documentations are listed `here <docs.html>`_. @@ -51,8 +51,8 @@ Documentation * Research publication list is available `here <http://www.comp.nus.edu.sg/~dbsystem/singa/research/publication/>`_. How to contribute -================= - +---------------------- + * Please subscribe to our development mailing list [email protected]. * If you find any issues using SINGA, please report it to the `Issue Tracker <https://issues.apache.org/jira/browse/singa>`_. @@ -62,17 +62,17 @@ How to contribute More details on contributing to SINGA is described `here <develop/how-contribute.html>`_ . Citing SINGA -============ +------------ Please cite the following two papers if you use SINGA in your research: * B. C. Ooi, K.-L. Tan, S. Wang, W. Wang, Q. Cai, G. Chen, J. Gao, Z. Luo, A. K. H. Tung, Y. Wang, Z. Xie, M. Zhang, and K. Zheng. `SINGA: A distributed deep learning platform <http://www.comp.nus.edu.sg/~ooibc/singaopen-mm15.pdf>`_. ACM Multimedia (Open Source Software Competition) 2015 (`BibTex <http://www.comp.nus.edu.sg/~dbsystem/singa//assets/file/bib-oss.txt>`_). -* W. Wang, G. Chen, T. T. A. Dinh, B. C. Ooi, K.-L.Tan, J. Gao, and S. Wang. `SINGA: putting deep learning in the hands of multimedia users <http://www.comp.nus.edu.sg/~ooibc/singa-mm15.pdf>`_. ACM Multimedia 2015 (`BibTex <http://www.comp.nus.edu.sg/~dbsystem/singa//assets/file/bib-singa.txt>`_, `Slides <files/mm2015.ppt>`_). +* W. Wang, G. Chen, T. T. A. Dinh, B. C. Ooi, K.-L.Tan, J. Gao, and S. Wang. `SINGA: putting deep learning in the hands of multimedia users <http://www.comp.nus.edu.sg/~ooibc/singa-mm15.pdf>`_. ACM Multimedia 2015 (`BibTex <http://www.comp.nus.edu.sg/~dbsystem/singa//assets/file/bib-singa.txt>`_, `Slides <files/mm2015.ppt>`_). .. 
toctree:: :hidden: - + downloads docs @@ -85,25 +85,25 @@ Please cite the following two papers if you use SINGA in your research: develop/how-contribute develop/contribute-code develop/contribute-docs - + .. toctree:: :hidden: :maxdepth: 2 :caption: Community - + community/source-repository community/mail-lists community/issue-tracking community/team-list - + License -======= +---------- SINGA is released under `Apache License Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0>`_. Disclaimers -=========== +----------- Apache SINGA is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF. http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/examples/index.rst ---------------------------------------------------------------------- diff --git a/examples/index.rst b/examples/index.rst new file mode 100644 index 0000000..d6faf5d --- /dev/null +++ b/examples/index.rst @@ -0,0 +1,6 @@ +.. toctree:: + + char-rnn/README + imagenet/README + + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/src/python/singa/device.py ---------------------------------------------------------------------- diff --git a/src/python/singa/device.py b/src/python/singa/device.py index aff3587..eff6783 100644 --- a/src/python/singa/device.py +++ b/src/python/singa/device.py @@ -68,21 +68,52 @@ def device_query(id, verbose=False): def create_cuda_gpus(num): + '''Create a list of CudaGPU devices. + + Args: + num (int): number of device to create. + Returns: + a list of swig converted CudaGPU devices. + ''' + return singa.Platform.CreateCudaGPUs(num) def create_cuda_gpu(): + '''Create a single CudaGPU device. + + Returns: + a swig converted CudaGPU device. + ''' + return singa.Platform.CreateCudaGPUs(1)[0] def create_cuda_gpus_on(device_ids): + '''Create a list of CudaGPU devices. + + Args: + device_ids (list): a list of GPU card IDs. + + Returns: + a list of swig converted CudaGPU devices. + ''' return singa.Platform.CreateCudaGPUsOn(device_ids) def create_cuda_gpu_on(device_id): + '''Create a CudaGPU device on the given device ID. + + Args: + device_id (int): GPU card ID. + + Returns: + a swig converted CudaGPU device. + ''' devices = create_cuda_gpus_on([device_id]) return devices[0] def get_default_device(): + '''Get the default host device which is a CppCPU device''' return singa.Platform.GetDefaultDevice() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/src/python/singa/initializer.py ---------------------------------------------------------------------- diff --git a/src/python/singa/initializer.py b/src/python/singa/initializer.py index 15caed3..277fd2f 100644 --- a/src/python/singa/initializer.py +++ b/src/python/singa/initializer.py @@ -15,29 +15,113 @@ # specific language governing permissions and limitations # under the License. # ============================================================================= -"""Popular initialization methods for parameter values (Tensor ojects)""" +'''Popular initialization methods for parameter values (Tensor objects). 
+ +Example usages:: + + from singa import tensor + from singa import initializer + + x = tensor.Tensor((3, 5)) + initializer.xavier(x) +''' import math +''' +TODO(wangwei) update the uniform and gaussian initializers + +def uniform(t, fan_in=0, fan_out=0): + typically, for conv layer weight: fan_in = nb_filter * kh * kw, + fan_out = nb_channel * kh * kw + for dense layer weight, fan_in = input_feature_length, + fan_out = output_feature_length + # Ref: [Bengio and Glorot 2010]: Understanding the difficulty of + training deep feedforward neuralnetworks. + + assert fan_in >0 or fan_out > 0, \ + 'fan_in and fan_out cannot be 0 at the same time' + avg = 1 + if fan_in * fan_out == 0: + avg = 2 + x = math.sqrt(3.0f * avg / (fan_in + fan_out)) + t.uniform(-x, x) + + +def gaussian(t, fan_in=0, fan_out=0): + typically, for conv layer weight: fan_in = nb_filter * kh * kw, + fan_out = nb_channel * kh * kw + for dense layer weight, fan_in = input_feature_length, + fan_out = output_feature_length + + Ref Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Delving Deep into + Rectifiers: Surpassing Human-Level Performance on ImageNet Classification + + assert fan_in >0 or fan_out > 0, \ + 'fan_in and fan_out cannot be 0 at the same time' + avg = 1 + if fan_in * fan_out == 0: + avg = 2 + std = math.sqrt(2.0f * avg / (fan_in + fan_out)) + t.gaussian(0, std) +''' + + def uniform(t, low=0, high=1): + '''Initialize the parameter values following an Uniform distribution. + + Args: + t (Tensor): the parater tensor + low (float): lower bound + high (float): higher bound + ''' t.uniform(low, high) def gaussian(t, mean=0, std=0.01): + '''Initialize the parameter values following an Gaussian distribution. + + Args: + t (Tensor): the parater tensor + mean (float): mean of the distribution + std (float): standard variance + ''' t.gaussian(mean, std) def xavier(t): + '''Initialize the matrix parameter follow a Uniform distribution from + [-sqrt(6/(fan_in + fan_out)), sqrt(6/(fan_in + fan_out))]. + + Args: + t (Tensor): the parater tensor + ''' + scale = math.sqrt(6.0 / (t.shape[0] + t.shape[1])) t.uniform(-scale, scale) def glorot(t): + '''Initialize the matrix parameter follow a Gaussian distribution with + mean = 0 and std = sqrt(2.0 / (nb_row + nb_col)) + + Args: + t (Tensor): the parater tensor + ''' scale = math.sqrt(2.0 / (t.shape[0] + t.shape[1])) t.gaussian(0, 1) t *= scale def msra(t): + '''Initialize the matrix parameter follow a Guassian distribution with + mean = 0, std = math.sqrt(2.0 / nb_row). + + Ref [He, Zhang, Ren and Sun 2015]: Specifically accounts for ReLU + nonlinearities. + + Args: + t (Tensor): the parater tensor + ''' t.gaussian(0, math.sqrt(2.0 / t.shape[0])) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/src/python/singa/layer.py ---------------------------------------------------------------------- diff --git a/src/python/singa/layer.py b/src/python/singa/layer.py index c8c8c05..0759716 100644 --- a/src/python/singa/layer.py +++ b/src/python/singa/layer.py @@ -14,7 +14,30 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= -""" Python layers which wraps the C++ layers by providing easy to construct APIs +""" Python layers wrap the C++ layers to provide simpler construction APIs. 
+ +Example usages:: + + from singa import layer + from singa import tensor + from singa import device + from singa.model_pb2 import kTrain + + layer.engine = 'cudnn' # to use cudnn layers + dev = device.create_cuda_gpu() + + # create a convolution layer + conv = layer.Conv2D('conv', 32, 3, 1, pad=1, input_sample_shape=(3, 32, 32)) + conv.to_device(dev) # move the layer data onto a CudaGPU device + x = tensor.Tensor((3, 32, 32), dev) + x.uniform(-1, 1) + y = conv.foward(kTrain, x) + + dy = tensor.Tensor() + dy.reset_like(y) + dy.set_value(0.1) + # dp is a list of tensors for parameter gradients + dx, dp = conv.backward(kTrain, dy) """ from sets import Set @@ -22,23 +45,37 @@ from . import singa_wrap from .proto import model_pb2 import tensor -# engine could be 'cudnn', 'singa', which is used to create layers. -# e.g., CudnnConvolution layer is identified by 'cudnn_convolution' -# Convolution layer is identified by 'singa_convolution' -# engine is case insensitive + engine = 'cudnn' +'''engine is the prefix of layer identifier. + +The value could be one of [**'cudnn', 'singacpp', 'singacuda', 'singacl'**], for +layers implemented using the cudnn library, Cpp, Cuda and OpenCL respectively. +For example, CudnnConvolution layer is identified by 'cudnn_convolution'; +'singacpp_convolution' is for Convolution layer; +Some layers' implementation use only Tensor functions, thererfore they are +transparent to the underlying devices. For threse layers, they would have +multiple identifiers, e.g., singacpp_dropout, singacuda_dropout and +singacl_dropout are all for the Dropout layer. + +engine is case insensitive. Each python layer would create the correct specific +layer using the engine attribute. +''' class Layer(object): - """Base Python layer class. + '''Base Python layer class. - Usages: - 1. construct layer without input_sample_shapes, goto 2; - construct layer with input_sample_shapes, goto 3; + Typically, the life cycle of a layer instance includes: + 1. construct layer without input_sample_shapes, goto 2; + construct layer with input_sample_shapes, goto 3; 2. call setup to create the parameters and setup other meta fields 3. call forward or access layer members 4. call backward and get parameters for update - """ + + Args: + name (str): layer name + ''' def __init__(self, name, **kwargs): self.layer = None # layer converted by swig @@ -49,20 +86,24 @@ class Layer(object): self.has_setup = False def param_names(self): + ''' + Returns: + a list of strings, one for the name of one parameter Tensor + ''' names = [] for x in self.param_specs: names.append(x['name']) return names def setup(self, in_shapes): - """Call the C++ setup function to create params and set some meta data. + '''Call the C++ setup function to create params and set some meta data. Args: in_shapes: if the layer accepts a single input Tensor, in_shapes is a single tuple specifying the inpute Tensor shape; if the layer accepts multiple input Tensor (e.g., the concatenation layer), - in_shapes is a tuple of tuples, each for one input Tensor shape - """ + in_shapes is a tuple of tuples, each for one input Tensor + ''' if self.has_setup: return self.layer.Setup(list(in_shapes), @@ -70,54 +111,92 @@ class Layer(object): self.has_setup = True def get_output_sample_shape(self): + '''Called after setup to get the shape of the output sample(s). 
+ + Returns: + a tuple for a single output Tensor or a list of tuples if this layer + has multiple outputs + ''' assert self.has_setup, \ 'Must call setup() before get_output_sample_shape()' return self.layer.GetOutputSampleShape() def param_values(self): - """Return param value tensors. + '''Return param value tensors. - Do not store these tensors as layer members because cpp Tensor could be - moved onto diff devices due to the change of layer device. However, the - py tensors would not update its internal cpp tensor automatically. - """ + Parameter tensors are not stored as layer members because cpp Tensor + could be moved onto diff devices due to the change of layer device, + which would result in inconsistency. + + Returns: + a list of tensors, one for each paramter + ''' return tensor.from_raw_tensors(self.layer.param_values()) - def forward(self, flag, input): + def forward(self, flag, x): '''Forward propagate through this layer. Args: - flag, kTrain or kEval - input, an input tensor + flag (int): kTrain or kEval + x (Tensor or list<Tensor>): an input tensor if the layer is + connected from a single layer; a list of tensors if the layer + is connected from multiple layers. Return: - a tensor for the transformed feature + a tensor if the layer is connected to a single layer; a list of + tensors if the layer is connected to multiple layers; ''' assert self.has_setup, 'Must call setup() before forward()' - assert isinstance(input, tensor.Tensor), 'input must be py Tensor' - y = self.layer.Forward(flag, input.singa_tensor) - return tensor.from_raw_tensor(y) + if type(x) == list: + xs = [] + for t in x: + x.append(t.singa_tensor) + else: + assert isinstance(input, tensor.Tensor), \ + 'input must be a Tensor or a list of Tensor' + xs = x + y = self.layer.Forward(flag, xs) + if type(y) == list: + return tensor.from_raw_tensors(y) + else: + return tensor.from_raw_tensor(y) - def backward(self, flag, grad): - '''Backward propagate through this layer. + def backward(self, flag, dy): + '''Backward propagate gradients through this layer. Args: - flag, for future use. - grad, gradient of the returned values of the forward function. - + flag (int): for future use. + dy (Tensor or list<Tensor>): the gradient tensor(s) y w.r.t the + objective loss Return: - <dx, <dp1, dp2..>>, dx is the gradient of the input of the - forward function, dpi is the gradient of the i-th parameter + <dx, <dp1, dp2..>>, dx is a (set of) tensor(s) for the gradient of x + , dpi is the gradient of the i-th parameter ''' - assert isinstance(grad, tensor.Tensor), 'grad must be py Tensor' - ret = self.layer.Backward(flag, grad.singa_tensor) - return tensor.from_raw_tensor(ret[0]), tensor.from_raw_tensors(ret[1]) + if type(dy) == list: + dys = [] + for t in dy: + dys.append(t.singa_tensor) + else: + assert isinstance(dy, tensor.Tensor), \ + 'the input must be a Tensor or a set of Tensor' + dys = dy.singa_tensor + ret = self.layer.Backward(flag, dys) + if type(ret[0]) == list: + dxs = tensor.from_raw_tensors(ret[0]) + else: + dxs = tensor.from_raw_tensor(ret[0]) + return dxs, tensor.from_raw_tensors(ret[1]) def to_device(self, device): + '''Move layer state tensors onto the given device. + + Args: + device: swig converted device, created using singa.device + ''' self.layer.ToDevice(device) def as_type(self, dtype): - self.layer.AsType(dtype) + pass def __copy__(self): pass @@ -127,43 +206,42 @@ class Layer(object): class Conv2D(Layer): + """Construct a layer for 2D convolution. 
+ Args: + nb_kernels (int): num of the channels (kernels) of the input Tensor + kernel: an integer or a pair of integers for kernel height and width + stride: an integer or a pair of integers for stride height and width + border_mode (string): padding mode, case in-sensitive, + 'valid' -> padding is 0 for height and width + 'same' -> padding is half of the kernel (floor), the kernel must be + odd number. + cudnn_prefer (string): the preferred algorithm for cudnn convolution + which could be 'fatest', 'autotune', 'limited_workspace' and + 'no_workspace' + data_format (string): either 'NCHW' or 'NHWC' + use_bias (bool): True or False + pad: an integer or a pair of integers for padding height and width + W_specs (dict): used to specify the weight matrix specs, fields + include, + 'name' for parameter name + 'lr_mult' for learning rate multiplier + 'decay_mult' for weight decay multiplier + 'init' for init method, which could be 'gaussian', 'uniform', + 'xavier' and '' + 'std', 'mean', 'high', 'low' for corresponding init methods + TODO(wangwei) 'clamp' for gradient constraint, value is scalar + 'regularizer' for regularization, currently support 'l2' + b_specs (dict): hyper-parameters for bias vector, similar as W_specs + name (string): layer name. + input_sample_shape: 3d tuple for the shape of the input Tensor + without the batchsize, e.g., (channel, height, width) or + (height, width, channel) + """ def __init__(self, name, nb_kernels, kernel=3, stride=1, border_mode='same', cudnn_prefer='fatest', data_format='NCHW', use_bias=True, W_specs=None, b_specs=None, pad=None, input_sample_shape=None): - """Construct a layer for 2D convolution. - - Args: - nb_kernels (int): num of the channels (kernels) of the input Tensor - kernel: an integer or a pair of integers for kernel height and width - stride: an integer or a pair of integers for stride height and width - border_mode (string): padding mode, case in-sensitive, - 'valid' -> padding is 0 for height and width - 'same' -> padding is half of the kernel (floor), - the kernel must be odd number. - cudnn_prefer (string): the preferred algorithm for cudnn convolution - which could be 'fatest', 'autotune', 'limited_workspace' and - 'no_workspace' - data_format (string): either 'NCHW' or 'NHWC' - use_bias (bool): True or False - pad: an integer or a pair of integers for padding height and width - W_specs (dict): used to specify the weight matrix specs, fields - include, - 'name' for parameter name - 'lr_mult' for learning rate multiplier - 'decay_mult' for weight decay multiplier - 'init' for init method, which could be 'gaussian', 'uniform', - 'xavier' and '' - 'std', 'mean', 'high', 'low' for corresponding init methods - TODO(wangwei) 'clamp' for gradient constraint, value is scalar - 'regularizer' for regularization, currently support 'l2' - b_specs (dict): hyper-parameters for bias vector, similar as W_specs - name (string): layer name. - input_sample_shape: 3d tuple for the shape of the input Tensor - without the batchsize, e.g., (channel, height, width) or - (height, width, channel) - """ super(Conv2D, self).__init__(name) assert data_format == 'NCHW', 'Not supported data format: %s ' \ 'only "NCHW" is enabled currently' % (data_format) @@ -195,19 +273,19 @@ class Conv2D(Layer): class Conv1D(Conv2D): + """Construct a layer for 1D convolution. + + Most of the args are the same as those for Conv2D except the kernel, + stride, pad, which is a scalar instead of a tuple. 
+ input_sample_shape is a tuple with a single value for the input feature + length + """ def __init__(self, name, nb_kernels, kernel=3, stride=1, border_mode='same', cudnn_prefer='fatest', use_bias=True, W_specs={'init': 'Xavier'}, b_specs={'init': 'Constant', 'value': 0}, pad=None, input_sample_shape=None): - """Construct a layer for 1D convolution. - - Most of the args are the same as those for Conv2D except the kernel, - stride, pad, which is a scalar instead of a tuple. - input_sample_shape is a tuple with a single value for the input feature - length - """ pad = None if pad is not None: pad = (0, pad) @@ -227,7 +305,15 @@ class Conv1D(Conv2D): class Pooling2D(Layer): + '''2D pooling layer providing max/avg pooling. + + All args are the same as those for Conv2D, except the following one + Args: + mode: pooling type, model_pb2.PoolingConf.MAX or + model_pb2.PoolingConf.AVE + + ''' def __init__(self, name, mode, kernel=3, stride=2, border_mode='same', pad=None, data_format='NCHW', input_sample_shape=None): super(Pooling2D, self).__init__(name) @@ -312,28 +398,26 @@ class AvgPooling1D(AvgPooling2D): class BatchNormalization(Layer): - # TODO(wangwei) add mode and epsilon arguments + """Batch-normalization. + Args: + momentum (float): for running average mean and variance. + beta_specs (dict): dictionary includes the fields for the beta + param: + 'name' for parameter name + 'lr_mult' for learning rate multiplier + 'decay_mult' for weight decay multiplier + 'init' for init method, which could be 'gaussian', 'uniform', + 'xavier' and '' + 'std', 'mean', 'high', 'low' for corresponding init methods + 'clamp' for gradient constraint, value is scalar + 'regularizer' for regularization, currently support 'l2' + gamma_specs (dict): similar to beta_specs, but for the gamma param. + name (string): layer name + input_sample_shape (tuple): with at least one integer + """ def __init__(self, name, momentum=0.9, beta_specs=None, gamma_specs=None, input_sample_shape=None): - """Batch-normalization. - - Args: - momentum (float): for running average mean and variance. - beta_specs (dict): dictionary includes the fields for the beta - param: - 'name' for parameter name - 'lr_mult' for learning rate multiplier - 'decay_mult' for weight decay multiplier - 'init' for init method, which could be 'gaussian', 'uniform', - 'xavier' and '' - 'std', 'mean', 'high', 'low' for corresponding init methods - 'clamp' for gradient constraint, value is scalar - 'regularizer' for regularization, currently support 'l2' - gamma_specs (dict): similar to beta_specs, but for the gamma param. - name (string): layer name - input_sample_shape (tuple): with at least one integer - """ super(BatchNormalization, self).__init__(name) conf = self.conf.batchnorm_conf conf.factor = momentum @@ -362,16 +446,17 @@ class BatchNormalization(Layer): class LRN(Layer): + """Local response normalization. + + Args: + size (int): # of channels to be crossed + normalization. + mode (string): 'cross_channel' + input_sample_shape (tuple): 3d tuple, (channel, height, width) + """ + def __init__(self, name, size=5, alpha=1, beta=0.75, mode='cross_channel', k=1, input_sample_shape=None): - """Local response normalization. - - Args: - size (int): # of channels to be crossed - normalization. 
- mode (string): 'cross_channel' - input_sample_shape (tuple): 3d tuple, (channel, height, width) - """ super(LRN, self).__init__(name) conf = self.conf.lrn_conf conf.local_size = size @@ -388,29 +473,28 @@ class LRN(Layer): class Dense(Layer): + """Apply linear/affine transformation, also called inner-product or + fully connected layer. + Args: + num_output (int): output feature length. + use_bias (bool): add a bias vector or not to the transformed feature + W_specs (dict): specs for the weight matrix + 'name' for parameter name + 'lr_mult' for learning rate multiplier + 'decay_mult' for weight decay multiplier + 'init' for init method, which could be 'gaussian', 'uniform', + 'xavier' and '' + 'std', 'mean', 'high', 'low' for corresponding init methods + 'clamp' for gradient constraint, value is scalar + 'regularizer' for regularization, currently support 'l2' + b_specs (dict): specs for the bias vector, same fields as W_specs. + W_transpose (bool): if true, output=x*W.T+b; + input_sample_shape (tuple): input feature length + """ def __init__(self, name, num_output, use_bias=True, W_specs=None, b_specs=None, W_transpose=True, input_sample_shape=None): - """Apply linear/affine transformation, also called inner-product or - fully connected layer. - - Args: - num_output (int): output feature length. - use_bias (bool): add a bias vector or not to the transformed feature - W_specs (dict): specs for the weight matrix - 'name' for parameter name - 'lr_mult' for learning rate multiplier - 'decay_mult' for weight decay multiplier - 'init' for init method, which could be 'gaussian', 'uniform', - 'xavier' and '' - 'std', 'mean', 'high', 'low' for corresponding init methods - 'clamp' for gradient constraint, value is scalar - 'regularizer' for regularization, currently support 'l2' - b_specs (dict): specs for the bias vector, same fields as W_specs. - W_transpose (bool): if true, output=x*W.T+b; - input_sample_shape (tuple): input feature length - """ super(Dense, self).__init__(name) conf = self.conf.dense_conf conf.num_output = num_output @@ -435,14 +519,14 @@ class Dense(Layer): class Dropout(Layer): + """Droput layer. - def __init__(self, name, p=0.5, input_sample_shape=None): - """Droput layer. + Args: + p (float): probability for dropping out the element, i.e., set to 0 + name (string): layer name + """ - Args: - p (float): probability for dropping out the element, i.e., set to 0 - name (string): layer name - """ + def __init__(self, name, p=0.5, input_sample_shape=None): super(Dropout, self).__init__(name) conf = self.conf.dropout_conf conf.dropout_ratio = p @@ -456,15 +540,14 @@ class Dropout(Layer): class Activation(Layer): + """Activation layers. + Args: + name (string): layer name + mode (string): 'relu', 'sigmoid', or 'tanh' + input_sample_shape (tuple): shape of a single sample + """ def __init__(self, name, mode='relu', input_sample_shape=None): - """Activation layers. - - Args: - name (string): layer name - mode (string): 'relu', 'sigmoid', or 'tanh' - input_sample_shape (tuple): shape of a single sample - """ super(Activation, self).__init__(name) self.conf.type = (engine + '_' + mode).lower() _check_engine(engine, ['cudnn', 'singa']) @@ -474,15 +557,14 @@ class Activation(Layer): class Softmax(Layer): + """Apply softmax. + Args: + axis (int): reshape the input as a matrix with the dimension + [0,axis) as the row, the [axis, -1) as the column. + input_sample_shape (tuple): shape of a single sample + """ def __init__(self, name, axis=1, input_sample_shape=None): - """Apply softmax. 
- - Args: - axis (int): reshape the input as a matrix with the dimension - [0,axis) as the row, the [axis, -1) as the column. - input_sample_shape (tuple): shape of a single sample - """ super(Softmax, self).__init__(name) # conf = self.conf.softmax_conf # conf.axis = axis @@ -493,14 +575,14 @@ class Softmax(Layer): class Flatten(Layer): + """Reshape the input tensor into a matrix. + Args: + axis (int): reshape the input as a matrix with the dimension + [0,axis) as the row, the [axis, -1) as the column. + input_sample_shape (tuple): shape for a single sample + """ def __init__(self, name, axis=1, input_sample_shape=None): - """Reshape the input tensor into a matrix. - Args: - axis (int): reshape the input as a matrix with the dimension - [0,axis) as the row, the [axis, -1) as the column. - input_sample_shape (tuple): shape for a single sample - """ super(Flatten, self).__init__(name) conf = self.conf.flatten_conf conf.axis = axis @@ -511,26 +593,27 @@ class Flatten(Layer): class RNN(Layer): + '''Recurrent layer with 4 types of units, namely lstm, gru, tanh and relu. + + Args: + hidden_size: hidden feature size, the same for all stacks of layers. + rnn_mode: decides the rnn unit, which could be one of 'lstm', 'gru', + 'tanh' and 'relu', refer to cudnn manual for each mode. + num_stacks: num of stacks of rnn layers. It is different to the + unrolling seqence length. + input_mode: 'linear' convert the input feature x by by a linear + transformation to get a feature vector of size hidden_size; + 'skip' does nothing but requires the input feature size equals + hidden_size + bidirection: True for bidirectional RNN + param_specs: config for initializing the RNN parameters. + input_sample_shape: includes a single integer for the input sample + feature size. + ''' + def __init__(self, name, hidden_size, rnn_mode='lstm', dropout=0.0, num_stacks=1, input_mode='linear', bidirectional=False, param_specs=None, input_sample_shape=None): - '''Wrapper for singa::RNN class. - - Args: - hidden_size, hidden feature size, the same for all stacks of layers. - rnn_mode, decides the rnn unit, which could be one of 'lstm', 'gru', - 'tanh' and 'relu', refer to cudnn manual for each mode. - num_stacks, num of stacks of rnn layers. It is different to the - unrolling seqence length. - input_mode, 'linear' convert the input feature x by by a linear - transformation to get a feature vector of size hidden_size; - 'skip' does nothing but requires the input feature size equals - hidden_size - bidirection, True for bidirectional RNN - param_specs, config for initializing the RNN parameters. - input_sample_shape, includes a single integer for the input sample - feature size. - ''' super(RNN, self).__init__(name) conf = self.conf.rnn_conf assert hidden_size > 0, 'Hidden feature size must > 0' @@ -605,7 +688,7 @@ class RNN(Layer): Returns: <dx1, dx2, ... dxn, dhx, dcx>, where dxi is the gradient tensor for - the i-th input, its shape is (batch_size, + the i-th input, its shape is (batch_size, input_feature_length). dhx is the gradient for the initial hidden state. dcx is the gradient for the initial cell state, which is valid only for lstm. 
@@ -741,5 +824,7 @@ def _construct_param_specs_from_dict(specs): def get_layer_list(): - """ Return a list of strings reprensenting the all supported layers""" + """ Return a list of strings which include the identifiers (tags) of all + supported layers + """ return singa_wrap.GetRegisteredLayers() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/src/python/singa/loss.py ---------------------------------------------------------------------- diff --git a/src/python/singa/loss.py b/src/python/singa/loss.py index acfb813..c88290b 100644 --- a/src/python/singa/loss.py +++ b/src/python/singa/loss.py @@ -15,32 +15,127 @@ # specific language governing permissions and limitations # under the License. # ============================================================================= -""" Python wrappers for optimizers implemented by C++.""" + +''' +Loss module includes a set of training loss implmentations. Some are converted +from C++ implementation, and the rest are implemented directly using python +Tensor. + +Example usage:: + + from singa import tensor + from singa import loss + from singa.proto import model_pb2 + + x = tensor.Tensor((3, 5)) + x.uniform(0, 1) # randomly genearte the prediction activation + y = tensor.from_numpy(np.array([0, 1, 3], dtype=np.int)) # set the truth + + f = loss.SoftmaxCrossEntropy() + l = f.forward(model_pb2.kTrain, x, y) # l is tensor with 3 loss values + g = f.backward() # g is a tensor containing all gradients of x w.r.t l +''' + from . import singa_wrap as singa import tensor class Loss(object): + '''Base loss class. + + Subclasses that wrap the C++ loss classes can use the inherited foward, + backward, and evaluate functions of this base class. Other subclasses need + to override these functions + ''' def __init__(self): self.swig_loss = None def forward(self, flag, x, y): - """Return a tensor of floats, one per sample""" + '''Compute the loss values. + + Args: + flag (int): kTrain or kEval. If it is kTrain, then the backward + function must be called before calling forward again. + x (Tensor): the prediction Tensor + y (Tensor): the ground truch Tensor, x.shape[0] must = y.shape[0] + + Returns: + a tensor of floats for the loss values, one per sample + ''' return tensor.from_raw_tensor( self.swig_loss.Forward(flag, x.singa_tensor, y.singa_tensor)) def backward(self): - """Return the grad of x w.r.t. the loss obj""" + ''' + Returns: + the grad of x w.r.t. the loss + ''' return tensor.from_raw_tensor(self.swig_loss.Backward()) - def evaluate(self, flag, x, y): - """Return the averaged loss for all samples in x""" + def evaluate(self, flag, x, y): # TODO(wangwei) remove flag + ''' + Args: + flag (int): must be kEval, to be removed + x (Tensor): the prediction Tensor + y (Tensor): the ground truth Tnesor + + Returns: + the averaged loss for all samples in x. + ''' return self.swig_loss.Evaluate(flag, x.singa_tensor, y.singa_tensor) class SoftmaxCrossEntropy(Loss): + '''This loss function is a combination of SoftMax and Cross-Entropy loss. + + It converts the inputs via SoftMax function and then + computes the cross-entropy loss against the ground truth values. + ''' def __init__(self): self.swig_loss = singa.SoftmaxCrossEntropy() + + +class SquaredError(Loss): + '''This loss evaluates the squared error between the prediction and the + truth values. + + It is implemented using Python Tensor operations. 
+ ''' + def __init__(self): + super(Loss, SquaredError).__init__() + self.err = None + + def forward(self, flag, x, y): + '''Compute the error as 0.5 * ||x-y||^2. + + Args: + flag (int): kTrain or kEval; if kTrain, then the backward must be + called before calling forward again. + x (Tensor): the prediction Tensor + y (Tensor): the truth Tensor, an integer value per sample, whose + value is [0, x.shape[1]) + + Returns: + a Tensor with one error value per sample + ''' + self.err = x - y + return 0.5 * tensor.squared(self.err) + + def backward(self): + '''Compute the gradient of x w.r.t the error. + + Returns: + x - y + ''' + return self.err + + def evaluate(self, flag, x, y): + '''Compuate the averaged error. + + Returns: + a float value as the averaged error + ''' + return tensor.sum(0.5 * tensor.squared(x - y)) / x.size() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/src/python/singa/metric.py ---------------------------------------------------------------------- diff --git a/src/python/singa/metric.py b/src/python/singa/metric.py index 31b6892..3a5750d 100644 --- a/src/python/singa/metric.py +++ b/src/python/singa/metric.py @@ -15,28 +15,71 @@ # specific language governing permissions and limitations # under the License. # ============================================================================= -""" Python wrappers for optimizers implemented by C++.""" +'''This module includes a set of metric classes for evaluating the model's +performance. The specific metric classes could be converted from C++ +implmentation or implemented directly using Python. + + +Example usage:: + + from singa import tensor + from singa import metric + + x = tensor.Tensor((3, 5)) + x.uniform(0, 1) # randomly genearte the prediction activation + x = tensor.SoftMax(x) # normalize the prediction into probabilities + y = tensor.from_numpy(np.array([0, 1, 3], dtype=np.int)) # set the truth + + f = metric.Accuracy() + acc = f.evaluate(x, y) # averaged accuracy over all 3 samples in x + +''' from . import singa_wrap as singa import tensor class Metric(object): + '''Base metric class. + + Subclasses that wrap the C++ loss classes can use the inherited foward, + and evaluate functions of this base class. Other subclasses need + to override these functions. Users need to feed in the **predictions** and + ground truth to get the metric values. + ''' def __init__(self): self.swig_metric = None def forward(self, x, y): - """Return a tensor of floats, one per sample""" + '''Compute the metric for each sample. + + Args: + x (Tensor): predictions, one row per sample + y (Tensor): ground truth values, one row per sample + + Returns: + a tensor of floats, one per sample + ''' return tensor.from_raw_tensor( self.swig_metric.Forward(x.singa_tensor, y.singa_tensor)) def evaluate(self, x, y): - """Return the averaged metric for all samples in x""" + '''Compute the averaged metric over all samples. + + Args: + x (Tensor): predictions, one row per sample + y (Tensor): ground truth values, one row per sample + Returns: + a float value for the averaged metric + ''' return self.swig_metric.Evaluate(x.singa_tensor, y.singa_tensor) class Accuracy(Metric): + '''Compute the top one accuracy for singel label prediction tasks. + It calls the C++ functions to do the calculation. 
+ ''' def __init__(self): self.swig_metric = singa.Accuracy() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8cd55300/src/python/singa/optimizer.py ---------------------------------------------------------------------- diff --git a/src/python/singa/optimizer.py b/src/python/singa/optimizer.py index 503527f..5d38997 100644 --- a/src/python/singa/optimizer.py +++ b/src/python/singa/optimizer.py @@ -15,7 +15,22 @@ # specific language governing permissions and limitations # under the License. # ============================================================================= -""" Python wrappers for optimizers implemented by C++.""" +'''This module includes a set of optimizers for updating model parameters. + +Example usage:: + + from singa import optimizer + from singa import tensor + + sgd = optimizer.SGD(lr=0.01, momentum=0.9, weight_decay=1e-4) + p = tensor.Tensor((3,5)) + p.uniform(-1, 1) + g = tensor.Tensor((3,5)) + g.gaussian(0, 0.01) + + sgd.apply(1, g, p, 'param') # use the global lr=0.1 for epoch 1 + sgd.apply_with_lr(2, 0.03, g, p, 'param') # use lr=0.03 for epoch 2 +''' from . import singa_wrap as singa import tensor @@ -23,53 +38,44 @@ from proto import model_pb2 class Optimizer(object): - """Base python optimizer. - - Usages: - 1. construct the optimizer - 2. (optional) register each parameter with its specs. - 3. use the optimizer to update parameter values given parameter - gradients and other optional info - """ - + '''The base python optimizer class. + + Typically, an optimizer is used as follows: + + 1. construct the optimizer + 2. (optional) register each parameter with its specs. + 3. use the optimizer to update parameter values given parameter + gradients and other optional info + + The subclasses should override the apply_with_lr function to do the real + parameter udpate. + + Args: + lr (float): a constant for the learning rate, mutually exclusive with + 'lr_gen'. + momentum (float): a constant for the momentum value + decay (float): the coefficent for L2 regularizer, which is mutually + exclusive with 'regularizer'. + lr_gen (function): a function returns the learning rate given + the current training step/epoch. It is mutually exclusive with lr. + If both are not set, the apply_with_lr function should be used for + param updating. + regularizer: an instance of Regularizer or RegularizerConf; If set, + regularization would be applied in apply_with_lr(). + Users can also do regularization outside. + constraint: an instance of Constraint or ConstraintConf; If set, + constraint would be applied inside apply_with_lr(). Users can + also do regularization outside. + ''' def __init__(self, lr=None, momentum=None, decay=None, lr_gen=None, - momentum_gen=None, regularizer=None, constraint=None): - """Constructor. - - Args: - lr: a constant or a function that generates learning rate given a - step, which is mutually exclusive with 'lr_gen'. - momentum: a constant or a function that generates the momentum value - given a step. - decay (float): the coefficent for L2 regularizer, which is mutually - exclusive with 'regularizer'. - lr_gen (function): a function returns the learning rate given - the current training step. It is mutually exclusive with lr. If - both are not set, the apply_with_lr function should be used for - param updating. - momentum_gen (function): a function returns the momentum value given - the current training step. It is mutually exclusive with - momentum. 
- regularizer: an instance of Regularizer or RegularizerConf; If set, - regularization would be applied in apply_with_lr(). - Users can also do regularization outside. - constraint: an instance of Constraint or ConstraintConf; If set, - constraint would be applied inside apply_with_lr(). Users can - also do regularization outside. - """ + regularizer=None, constraint=None): if lr is not None: assert lr_gen is None, 'Cannot set lr and lr_gen at the same time' - def lr_gen(step): + def lr_gen(epoch): return lr self.lr_gen = lr_gen - if momentum is not None: - assert momentum_gen is None, 'Cannot set momentum and momentum_gen'\ - ' at the same time' - - def momentum_gen(step): - return momentum - self.momentum_gen = momentum_gen + self.momentum = momentum if decay is not None: assert regularizer is None, \ 'Cannot set decay and regularizer at the same time' @@ -94,14 +100,15 @@ class Optimizer(object): self.learning_rate_multiplier = {} def register(self, name, specs): - """Register the param specs, including creating regularizer and + '''Register the param specs, including creating regularizer and constraint per param object. Param specific regularizer and constraint have higher priority than the global ones. Args: name (str): parameter name - specs (ParamSpec): protobuf obj - """ + specs (ParamSpec): protobuf obj, including regularizer and + constraint, multipliers for learning rate and weight decay. + ''' if specs.HasField('regularizer'): self.regularizers[name] = CppRegularizer(specs.constraint) if specs.HasField('constraint'): @@ -111,8 +118,8 @@ class Optimizer(object): if specs.decay_mult != 1: self.decay_multiplier[name] = specs.decay_mult - def apply_regularizer_constraint(self, value, grad, name=None, step=None): - """Apply regularization and constraint if available. + def apply_regularizer_constraint(self, value, grad, name=None, epoch=None): + '''Apply regularization and constraint if available. If there are both global regularizer (constraint) and param specific regularizer (constraint), it would use the param specific one. @@ -121,46 +128,48 @@ class Optimizer(object): value (Tensor): parameter value Tensor grad (Tensor): parameter gradient Tensor name (string): to get parameter specific regularizer or constraint - step (int): some regularizer or constraint would use step + epoch (int): some regularizer or constraint would use epoch - Return: + Returns: the updated gradient Tensor - """ + ''' if name is not None and name in self.constraints: - self.constraints[name].apply(value, grad, step) + self.constraints[name].apply(value, grad, epoch) elif self.constraint is not None: - self.constraint.apply(step, value, grad) + self.constraint.apply(epoch, value, grad) if name is not None and name in self.regularizers: - self.regularizers[name].apply(value, grad, step) + self.regularizers[name].apply(value, grad, epoch) elif self.regularizer is not None: - self.regularizer.apply(step, value, grad) + self.regularizer.apply(epoch, value, grad) return grad - def apply_with_lr(self, step, lr, grad, value, name=None): - """Do update with given learning rate. + def apply_with_lr(self, epoch, lr, grad, value, name=None): + '''Do update with given learning rate. The subclass optimizer must override this function. 
+ Args: - step (int): training step (could be iteration or epoch) + epoch (int): training epoch (could be iteration or epoch) lr (float): learning rate grad (Tensor): parameter gradient value (Tesnor): parameter value name (string): paramter name to retrieval parameter specific updating rules (including regularizer and constraint) - Return: + Returns: updated parameter value - """ + ''' assert False, 'This is the base function, pls call the subclass func' return value - def apply(self, step, grad, value, name=None): - """Do update assume the learning rate generator is set. + def apply(self, epoch, grad, value, name=None): + '''Do update assuming the learning rate generator is set. The subclass optimizer does not need to override this function. + Args: - step (int): training step (could be iteration or epoch) + epoch (int): training epoch (could be iteration or epoch) grad (Tensor): parameter gradient value (Tesnor): parameter value name (string): paramter name to retrieval parameter specific @@ -168,97 +177,109 @@ class Optimizer(object): Return: updated parameter value - """ - + ''' assert self.lr_gen is not None, 'Learning rate generator is not set.'\ 'Either set the lr_gen in constructor or call apply_with_lr' - lr = self.lr_gen(step) - return self.apply_with_lr(step, lr, grad, value, name) + lr = self.lr_gen(epoch) + return self.apply_with_lr(epoch, lr, grad, value, name) class SGD(Optimizer): + '''The vallina Stochasitc Gradient Descent algorithm with momentum. - def __init__(self, lr=None, momentum=None, decay=None, **kwargs): - """The vallina Stochasitc Gradient Descent algorithm. + See the base Optimizer for all arguments. + ''' - See the base Optimizer for all arguments. - """ - super(SGD, self).__init__(lr, momentum, decay) + def __init__(self, lr=None, momentum=None, decay=None, lr_gen=None, + regularizer=None, constraint=None): + super(SGD, self).__init__(lr, momentum, decay, lr_gen, regularizer, + constraint) conf = model_pb2.OptimizerConf() + conf.momentum = self.momentum + conf.type = 'sgd' self.opt = singa.CreateOptimizer('SGD') self.opt.Setup(conf.SerializeToString()) - def apply_with_lr(self, step, lr, grad, value, name): - self.apply_regularizer_constraint(step, value, grad, name) - self.opt.Apply(step, lr, name, grad.singa_tensor, value.singa_tensor) + def apply_with_lr(self, epoch, lr, grad, value, name): + self.apply_regularizer_constraint(epoch, value, grad, name) + self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor) return value class Nesterov(Optimizer): + '''The SGD with Nesterov momentum. - def __init__(self, lr=None, momentum=0.9, decay=None, **kwargs): - """The SGD with Nesterov momentum + See the base Optimizer for all arguments. + ''' - See the base Optimizer for all arguments. 
- """ - super(Nesterov, self).__init__(lr, momentum, decay, kwargs) + def __init__(self, lr=None, momentum=0.9, decay=None, lr_gen=None, + regularizer=None, constraint=None): + super(Nesterov, self).__init__(lr, momentum, decay, lr_gen, regularizer, + constraint) conf = model_pb2.OptimizerConf() + conf.momentum = momentum + conf.type = 'nesterov' self.opt = singa.CreateOptimizer('Nesterov') self.opt.Setup(conf.SerializeToString()) - def apply_with_lr(self, step, lr, grad, value, name): - self.apply_regularizer_constraint(step, value, grad, name) - self.opt.Apply(step, lr, name, grad.singa_tensor, value.singa_tensor) + def apply_with_lr(self, epoch, lr, grad, value, name): + self.apply_regularizer_constraint(epoch, value, grad, name) + self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor) return value class AdaGrad(Optimizer): + '''AdaGrad optimizer. - def __init__(self, epsilon=1e-8, lr=None, decay=None, **kwargs): - """AdaGrad optimizer. + See the base Optimizer for all constructor args. - See the base Optimizer for all constructor args. - Args: - epsilon (float): small number for preventing numeric error. - """ - super(RMSProp, self).__init__(lr, decay, **kwargs) + Args: + epsilon (float): small number for preventing numeric error. + ''' + def __init__(self, epsilon=1e-8, lr=None, decay=None, lr_gen=None, + regularizer=None, constraint=None): + super(RMSProp, self).__init__(lr, decay, lr_gen, regularizer, + constraint) conf = model_pb2.OptimizerConf() conf.delta = epsilon + conf.type = 'adagrad' self.opt = singa.CreateOptimizer('AdaGrad') self.opt.Setup(conf.SerializeToString()) - def apply_with_lr(self, step, lr, grad, value, name): - grad = self.apply_regularizer_constraint(step, value, grad, name) - self.opt.Apply(step, lr, name, grad.singa_tensor, value.singa_tensor) + def apply_with_lr(self, epoch, lr, grad, value, name): + grad = self.apply_regularizer_constraint(epoch, value, grad, name) + self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor) return value class RMSProp(Optimizer): + '''RMSProp optimizer. - def __init__(self, rho=0.9, epsilon=1e-8, lr=None, decay=None, **kwargs): - """RMSProp optimizer. + See the base Optimizer for all constructor args. - See the base Optimizer for all constructor args. - Args: - rho (float): float within [0, 1] - epsilon (float): small value for preventing numeric error - """ - super(RMSProp, self).__init__(lr, decay, kwargs) + Args: + rho (float): float within [0, 1] + epsilon (float): small value for preventing numeric error + ''' + + def __init__(self, rho=0.9, epsilon=1e-8, lr=None, decay=None, lr_gen=None, + regularizer=None, constraint=None): + super(RMSProp, self).__init__(lr, decay, lr_gen, regularizer, + constraint) conf = model_pb2.OptimizerConf() conf.rho = rho conf.delta = epsilon self.opt = singa.CreateOptimizer('RMSProp') self.opt.Setup(conf.SerializeToString()) - def apply_with_lr(self, step, lr, grad, value, name): - grad = self.apply_regularizer_constraint(step, value, grad, name) - self.opt.Apply(step, lr, name, grad.singa_tensor, value.singa_tensor) + def apply_with_lr(self, epoch, lr, grad, value, name): + grad = self.apply_regularizer_constraint(epoch, value, grad, name) + self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor) return value class Regularizer(object): - """Base Python regularizer for parameter gradients. - """ + '''Base Python regularizer for parameter gradients.''' def apply(self, value, grad): assert False, 'Not Implemented. Call the subclass function.' 
@@ -266,34 +287,32 @@ class Regularizer(object): class CppRegularizer(Regularizer): - """Wrapper for regularizer implemented using C++. - """ + '''Wrapper for regularizer implemented using C++. - def __init__(self, conf): - """Constructor. + Args: + conf (RegularizerConf): protobuf message for the configuration. + ''' - Args: - conf (RegularizerConf): protobuf message for the configuration. - """ + def __init__(self, conf): self.reg = singa.CreateRegularizer(conf.type) self.reg.Setup(conf.SerializeToString()) - def apply(self, step, value, grad): - self.reg.Apply(step, value.singa_tensor, grad.singa_tensor) + def apply(self, epoch, value, grad): + self.reg.Apply(epoch, value.singa_tensor, grad.singa_tensor) return grad class L2Regularizer(Regularizer): - """L2 regularization""" + '''L2 regularization + + Args: + coefficient (float): regularization coefficient. + ''' def __init__(self, coefficient): - """ - Args: - coefficient (float): regularization coefficient. - """ self.coefficient = coefficient - def apply(self, step, value, grad, coefficient=None): + def apply(self, epoch, value, grad, coefficient=None): if coefficient is None: assert self.coefficient is not None, 'Must set the coefficient' coefficient = self.coefficient @@ -302,39 +321,34 @@ class L2Regularizer(Regularizer): class Constraint(object): - """Base Python constraint class for paramter gradients. - """ + '''Base Python constraint class for paramter gradients''' - def apply(self, step, value, grad): + def apply(self, epoch, value, grad): return grad class CppConstraint(Constraint): - """Wrapper for constraints implemented using C++. - """ + '''Wrapper for constraints implemented using C++. + Args: + conf (ConstraintConf): protobuf message for the configuration. + ''' def __init__(self, conf): - """Constructor. - - Args: - conf (ConstraintConf): protobuf message for the configuration. - """ self.constraint = singa.CreateConstraint(conf.type) self.constraint.Setup(conf.SerializeToString()) - def apply(self, step, value, grad): - self.constraint.Apply(step, value.singa_tensor, grad.singa_tensor) + def apply(self, epoch, value, grad): + self.constraint.Apply(epoch, value.singa_tensor, grad.singa_tensor) return grad class L2Constraint(Constraint): - """Rescale the gradient to make the L2 norm <= a given threshold. - """ + '''Rescale the gradient to make the L2 norm <= a given threshold''' def __init__(self, threshold=None): self.threshold = threshold - def apply(self, step, value, grad, threshold=None): + def apply(self, epoch, value, grad, threshold=None): if threshold is None: assert self.threshold is not None, 'Must set the threshold' threshold = self.threshold