SINGA-41 Support single node single GPU training

Remove the hard-coded argument for Param::GenUpdateMsg().
Currently we copy the gradients from GPU to CPU, and then copy the
updated parameter values from CPU back to GPU. This is necessary when
multiple GPU cards are used, as the gradients must be aggregated on the
CPU (or on one GPU).
For a single GPU card, however, we could add a special check to avoid
the memory copy, since the parameter update can be done on the same GPU
card that the worker runs on.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/cb30eadc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/cb30eadc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/cb30eadc

Branch: refs/heads/gpu
Commit: cb30eadc8d4203b9ec071198b65ded95941a2fa3
Parents: 2818605
Author: Wei Wang <[email protected]>
Authored: Tue Sep 15 22:45:18 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Tue Sep 29 10:22:12 2015 +0800

----------------------------------------------------------------------
 Makefile.gpu                      |   2 +-
 include/mshadow/tensor.h          |   9 +-
 include/mshadow/tensor_random.h   | 447 ++++++++++-----------
 include/singa.h                   |   6 +
 include/utils/blob.h              |   1 +
 include/utils/param.h             |   5 +
 src/neuralnet/connection_layer.cc |   6 +-
 src/neuralnet/neuron_layer.cc     | 684 ---------------------------------
 src/neuralnet/neuron_layer.cu     | 678 ++++++++++++++++++++++++++++++++
 src/trainer/server.cc             | 256 ++++++++++++
 src/trainer/trainer.cc            |  23 +-
 src/utils/blob.cc                 |   2 +
 src/utils/param.cc                |   5 +
 src/utils/updater.cc              |  38 +-
 14 files changed, 1226 insertions(+), 936 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/Makefile.gpu
----------------------------------------------------------------------
diff --git a/Makefile.gpu b/Makefile.gpu
index 0a3bd74..6bcd361 100644
--- a/Makefile.gpu
+++ b/Makefile.gpu
@@ -61,7 +61,7 @@ TEST_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, 
$(TEST_CUDA_SRCS:.cu=.o)))
 -include $(TEST_CUDA_OBJS:%.o=%.P)
 
 
-SINGA_CUDA_SRCS :=$(shell find src/neuralnet/ -maxdepth 1 -name "*.cu")
+SINGA_CUDA_SRCS :=$(shell find src/ -maxdepth 2 -name "*.cu")
 SINGA_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, 
$(SINGA_CUDA_SRCS:.cu=.o)))
 -include $(SINGA_CUDA_OBJS:%.o=%.P)
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/include/mshadow/tensor.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor.h b/include/mshadow/tensor.h
index 42d13d3..e4ba0aa 100644
--- a/include/mshadow/tensor.h
+++ b/include/mshadow/tensor.h
@@ -196,6 +196,13 @@ namespace mshadow {
         const static int kDevMask = 1<<1;
     };
 
+#ifndef CPU_ONLY
+    #define xpu gpu
+#else
+    #define xpu cpu
+#endif
+
+
     // more compact template
     /*!
      * \brief general tensor
@@ -463,7 +470,7 @@ namespace mshadow{
 #include "tensor_gpu-inl.hpp"
 // extension of expressions
 #include "tensor_expr_ext.h"
-// io 
+// io
 #include "tensor_io.h"
 // container
 #include "tensor_container.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/include/mshadow/tensor_random.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_random.h b/include/mshadow/tensor_random.h
index 59ef082..c82e8aa 100644
--- a/include/mshadow/tensor_random.h
+++ b/include/mshadow/tensor_random.h
@@ -29,232 +29,241 @@ namespace mshadow {
         /*!
          * \brief constructor of random engine using default seed
          */
-        Random<cpu> (){
-          // obtain a seed from the system clock:
-          unsigned s= 
std::chrono::system_clock::now().time_since_epoch().count();
-          Seed(s);
-        }
-        /*!
-         * \brief constructor of random engine
-         * \param seed random number seed
-         */
-        Random<cpu>( int seed ){
-            #if MSHADOW_USE_MKL
-            int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed);
-            utils::Assert( status == VSL_STATUS_OK, "MKL VSL Random engine 
failed to be initialized.\n" );
-            #else
-            //srand(seed);
-            gen_.seed(seed);
-            #endif
-            buffer_.Resize( Shape1( kRandBufferSize ) );
-        }
-        ~Random<cpu>() {
-            #if MSHADOW_USE_MKL
-            vslDeleteStream(&vStream_);
-            #endif
-        }
-        /*!
-         * \brief seed random number generator using this seed
-         * \param seed seed of prng
-         */
-        inline void Seed( int seed ){
-            #if MSHADOW_USE_MKL
-            int status = vslDeleteStream(&vStream_);
-            utils::Assert(status == VSL_STATUS_OK);
-            status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed);
-            utils::Assert(status == VSL_STATUS_OK);
-            #else
-            // srand( seed );
-            gen_.seed(seed);
-            #endif
-        }
-        template<int dim>
-        inline void SampleBinary(Tensor<cpu, dim> &src) {
-          SampleBinary(src, src);
-        }
+      Random<cpu> (){
+        // obtain a seed from the system clock:
+        unsigned s= 
std::chrono::system_clock::now().time_since_epoch().count();
+        Seed(s);
+      }
+      /*!
+        * \brief constructor of random engine
+        * \param seed random number seed
+        */
+      Random<cpu>( int seed ){
+          #if MSHADOW_USE_MKL
+          int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed);
+          utils::Assert( status == VSL_STATUS_OK, "MKL VSL Random engine 
failed to be initialized.\n" );
+          #else
+          //srand(seed);
+          gen_.seed(seed);
+          #endif
+          buffer_.Resize( Shape1( kRandBufferSize ) );
+      }
+      ~Random<cpu>() {
+          #if MSHADOW_USE_MKL
+          vslDeleteStream(&vStream_);
+          #endif
+      }
+      /*!
+        * \brief seed random number generator using this seed
+        * \param seed seed of prng
+        */
+      inline void Seed( int seed ){
+          #if MSHADOW_USE_MKL
+          int status = vslDeleteStream(&vStream_);
+          utils::Assert(status == VSL_STATUS_OK);
+          status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed);
+          utils::Assert(status == VSL_STATUS_OK);
+          #else
+          // srand( seed );
+          gen_.seed(seed);
+          #endif
+      }
+      template<int dim>
+      inline void SampleBinary(Tensor<cpu, dim> &src) {
+        SampleBinary(src, src);
+      }
 
-        /*!
-         * \brief generate binary data according to a probability matrix
-         * \param src source
-         * \param dst destination
-         * \param a lower bound of uniform
-         * \param b upper bound of uniform
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline void SampleBinary(Tensor<cpu, dim> &dst, Tensor<cpu, dim> &src) 
{
-            real_t a=0.0f;
-            real_t b=1.0f;
-            Tensor<cpu, 2> dmat = dst.FlatTo2D();
-            Tensor<cpu, 2> smat = src.FlatTo2D();
-            std::uniform_real_distribution<real_t> distribution (a,b);
-            for ( index_t i = 0; i < dmat.shape[1]; ++i ) {
-                #if MSHADOW_USE_MKL
-                #if MSHADOW_SINGLE_PRECISION
-                int status = vsRngUniform( 0, vStream_, mat.shape[0], 
mat[i].dptr, a, b );
-                #else
-                int status = vdRngUniform( 0, vStream_, mat.shape[0], 
mat[i].dptr, a, b );
-                #endif
-                utils::Assert(status == VSL_STATUS_OK, "Failed to generate 
random number by MKL.\n" );
-                #else
-                // use stdlib
-                /*
-                for ( index_t j = 0; j < mat.shape[0]; ++j ) {
-                    mat[i][j] = this->RandNext()*(b-a) + a;
-                }
-                */
-                for ( index_t j = 0; j < dmat.shape[0]; ++j ) {
-                    dmat[i][j] = distribution(gen_) > smat[i][j] ? 0.0f: 1.0f;
-                }
-                #endif
-            }
-        }
-        /*!
-         * \brief generate data from uniform [a,b)
-         * \param dst destination
-         * \param a lower bound of uniform
-         * \param b upper bound of uniform
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline void SampleUniform( Tensor<cpu, dim> &dst, real_t a=0.0f, 
real_t b=1.0f ) {
-            Tensor<cpu, 2> mat = dst.FlatTo2D();
-            std::uniform_real_distribution<real_t> distribution (a,b);
-            for ( index_t i = 0; i < mat.shape[1]; ++i ) {
-                #if MSHADOW_USE_MKL
-                #if MSHADOW_SINGLE_PRECISION
-                int status = vsRngUniform( 0, vStream_, mat.shape[0], 
mat[i].dptr, a, b );
-                #else
-                int status = vdRngUniform( 0, vStream_, mat.shape[0], 
mat[i].dptr, a, b );
-                #endif
-                utils::Assert(status == VSL_STATUS_OK, "Failed to generate 
random number by MKL.\n" );
-                #else
-                // use stdlib
-                /*
-                for ( index_t j = 0; j < mat.shape[0]; ++j ) {
-                    mat[i][j] = this->RandNext()*(b-a) + a;
-                }
-                */
-                for ( index_t j = 0; j < mat.shape[0]; ++j ) {
-                    mat[i][j] = distribution(gen_);
-                }
-                #endif
-            }
-        }
-        /*!
-         * \brief generate data from standard gaussian
-         * \param dst destination
-         * \param mu mean variable
-         * \param sigma standard deviation
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline void SampleGaussian( Tensor<cpu, dim> &dst, real_t mu = 0.0f, 
real_t sigma = 1.0f ) {
-            if( sigma <= 0.0f ) {
-                dst = mu; return;
-            }
-            Tensor<cpu, 2> mat = dst.FlatTo2D();
-            std::normal_distribution<real_t> distribution (mu, sigma);
-            for (index_t i = 0; i < mat.shape[1]; ++i) {
-                #if MSHADOW_USE_MKL
-                #if MSHADOW_SINGLE_PRECISION
-                int status = vsRngGaussian( 0, vStream_, mat.shape[0], 
mat[i].dptr, mu, sigma );
-                #else
-                int status = vdRngGaussian( 0, vStream_, mat.shape[0], 
mat[i].dptr, mu, sigma );
-                #endif
-                utils::Assert(status == VSL_STATUS_OK, "Failed to generate 
random number by MKL.\n" );
-                #else
-                /*
-                real_t g1 = 0.0f, g2 = 0.0f;
-                for (index_t j = 0; j < mat.shape[0]; ++j) {
-                    if( (j & 1) == 0 ){
-                        this->SampleNormal2D( g1, g2 );
-                        mat[i][j] = mu + g1 * sigma;
-                    }else{
-                        mat[i][j] = mu + g2 * sigma;
-                    }
-                }
-                */
-                for (index_t j = 0; j < mat.shape[0]; ++j) {
+      /*!
+        * \brief generate binary data according to a probability matrix
+        * \param src source
+        * \param dst destination
+        * \param a lower bound of uniform
+        * \param b upper bound of uniform
+        * \tparam dim dimension of tensor
+        */
+      template<int dim>
+      inline void SampleBinary(Tensor<cpu, dim> &dst, Tensor<cpu, dim> &src) {
+          real_t a=0.0f;
+          real_t b=1.0f;
+          Tensor<cpu, 2> dmat = dst.FlatTo2D();
+          Tensor<cpu, 2> smat = src.FlatTo2D();
+          std::uniform_real_distribution<real_t> distribution (a,b);
+          for ( index_t i = 0; i < dmat.shape[1]; ++i ) {
+              #if MSHADOW_USE_MKL
+              #if MSHADOW_SINGLE_PRECISION
+              int status = vsRngUniform( 0, vStream_, mat.shape[0], 
mat[i].dptr, a, b );
+              #else
+              int status = vdRngUniform( 0, vStream_, mat.shape[0], 
mat[i].dptr, a, b );
+              #endif
+              utils::Assert(status == VSL_STATUS_OK, "Failed to generate 
random number by MKL.\n" );
+              #else
+              // use stdlib
+              /*
+              for ( index_t j = 0; j < mat.shape[0]; ++j ) {
+                  mat[i][j] = this->RandNext()*(b-a) + a;
+              }
+              */
+              for ( index_t j = 0; j < dmat.shape[0]; ++j ) {
+                  dmat[i][j] = distribution(gen_) > smat[i][j] ? 0.0f: 1.0f;
+              }
+              #endif
+          }
+      }
+      /*!
+        * \brief generate data from uniform [a,b)
+        * \param dst destination
+        * \param a lower bound of uniform
+        * \param b upper bound of uniform
+        * \tparam dim dimension of tensor
+        */
+      template<int dim>
+      inline void SampleUniform( Tensor<cpu, dim> &dst, real_t a=0.0f, real_t 
b=1.0f ) {
+          Tensor<cpu, 2> mat = dst.FlatTo2D();
+          std::uniform_real_distribution<real_t> distribution (a,b);
+          for ( index_t i = 0; i < mat.shape[1]; ++i ) {
+              #if MSHADOW_USE_MKL
+              #if MSHADOW_SINGLE_PRECISION
+              int status = vsRngUniform( 0, vStream_, mat.shape[0], 
mat[i].dptr, a, b );
+              #else
+              int status = vdRngUniform( 0, vStream_, mat.shape[0], 
mat[i].dptr, a, b );
+              #endif
+              utils::Assert(status == VSL_STATUS_OK, "Failed to generate 
random number by MKL.\n" );
+              #else
+              // use stdlib
+              /*
+              for ( index_t j = 0; j < mat.shape[0]; ++j ) {
+                  mat[i][j] = this->RandNext()*(b-a) + a;
+              }
+              */
+              for ( index_t j = 0; j < mat.shape[0]; ++j ) {
                   mat[i][j] = distribution(gen_);
-                }
-                #endif
-            }
-        }
-        /*!
-         * \brief return a temporal expression storing standard gaussian 
random variables
-         *        the temporal tensor is only valid before next call of 
gaussian or uniform
-         *        can be used as part of expression
-         *  Caution: this means expression such as A = gaussian(s1) * 
gaussian(s2) will give invalid result,
-         *           since second call of gaussian(s2) makes gaussian(s1) 
invalid
-         *           A = gaussian(s1)*B+C; is correct; use one 
gaussian/uniform in each expression
-         * \param shape shape of the tensor
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline expr::ReshapeExp<Tensor<cpu,1>,dim,1> gaussian( Shape<dim> 
shape ){
-            buffer_.Resize( Shape1( shape.Size() ) );
-            this->SampleGaussian( buffer_, 0.0f, 1.0f );
-            return expr::reshape( buffer_, shape );
-        }
-        /*!
-         * \brief return a temporal expression storing standard uniform [0,1)
-         *        the temporal tensor is only valid before next call of 
gaussian or uniform
-         *        can be used as part of expression
-         *  Caution: this means expression such as A = gaussian(s1) * 
gaussian(s2) will give invalid result,
-         *           since second call of gaussian(s2) makes gaussian(s1) 
invalid
-         *           A = gaussian(s1)*B+C; is correct; use one 
gaussian/uniform in each expression
-         * \param shape shape of the tensor
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline expr::ReshapeExp<Tensor<cpu,1>,dim,1> uniform( Shape<dim> shape 
){
-            buffer_.Resize( Shape1( shape.Size() ) );
-            this->SampleUniform( buffer_, 0.0f, 1.0f );
-            return expr::reshape( buffer_, shape );
-        }
-    private:
-        /*! \brief get next random number from rand */
-        inline real_t RandNext( void ){
-            return static_cast<real_t>(rand()) / 
(static_cast<real_t>(RAND_MAX)+1.0f);
-        }
-        /*! \brief return a real numer uniform in (0,1) */
-        inline real_t RandNext2( void ){
-            return (static_cast<real_t>( rand() ) + 1.0 ) / 
(static_cast<real_t>(RAND_MAX) + 2.0);
-        }
-        /*!
-         * \brief sample iid xx,yy ~N(0,1)
-         * \param xx first  gaussian output
-         * \param yy second gaussian output
-         */
-        inline void SampleNormal2D( real_t &xx, real_t &yy ){
-            real_t x,y,s;
-            do{
-                x = 2.0f * RandNext2() - 1.0f;
-                y = 2.0f * RandNext2() - 1.0f;
-                s = x*x + y*y;
-            }while( s >= 1.0f || s == 0.0f );
-            real_t t = std::sqrt( -2.0f * std::log( s ) / s ) ;
-            xx = x * t; yy = y * t;
-        }
-    private:
-        #if MSHADOW_USE_MKL
-        /*! \brief stream used by MKL VSL */
-        VSLStreamStatePtr vStream_;
-        #endif
-        /*! \brief temporal space used to store random numbers */
-        TensorContainer<cpu,1> buffer_;
+              }
+              #endif
+          }
+      }
+      /*!
+        * \brief generate data from standard gaussian
+        * \param dst destination
+        * \param mu mean variable
+        * \param sigma standard deviation
+        * \tparam dim dimension of tensor
+        */
+      template<int dim>
+      inline void SampleGaussian( Tensor<cpu, dim> &dst, real_t mu = 0.0f, 
real_t sigma = 1.0f ) {
+          if( sigma <= 0.0f ) {
+              dst = mu; return;
+          }
+          Tensor<cpu, 2> mat = dst.FlatTo2D();
+          std::normal_distribution<real_t> distribution (mu, sigma);
+          for (index_t i = 0; i < mat.shape[1]; ++i) {
+              #if MSHADOW_USE_MKL
+              #if MSHADOW_SINGLE_PRECISION
+              int status = vsRngGaussian( 0, vStream_, mat.shape[0], 
mat[i].dptr, mu, sigma );
+              #else
+              int status = vdRngGaussian( 0, vStream_, mat.shape[0], 
mat[i].dptr, mu, sigma );
+              #endif
+              utils::Assert(status == VSL_STATUS_OK, "Failed to generate 
random number by MKL.\n" );
+              #else
+              /*
+              real_t g1 = 0.0f, g2 = 0.0f;
+              for (index_t j = 0; j < mat.shape[0]; ++j) {
+                  if( (j & 1) == 0 ){
+                      this->SampleNormal2D( g1, g2 );
+                      mat[i][j] = mu + g1 * sigma;
+                  }else{
+                      mat[i][j] = mu + g2 * sigma;
+                  }
+              }
+              */
+              for (index_t j = 0; j < mat.shape[0]; ++j) {
+                mat[i][j] = distribution(gen_);
+              }
+              #endif
+          }
+      }
+      /*!
+        * \brief return a temporal expression storing standard gaussian random 
variables
+        *        the temporal tensor is only valid before next call of 
gaussian or uniform
+        *        can be used as part of expression
+        *  Caution: this means expression such as A = gaussian(s1) * 
gaussian(s2) will give invalid result,
+        *           since second call of gaussian(s2) makes gaussian(s1) 
invalid
+        *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform 
in each expression
+        * \param shape shape of the tensor
+        * \tparam dim dimension of tensor
+        */
+      template<int dim>
+      inline expr::ReshapeExp<Tensor<cpu,1>,dim,1> gaussian( Shape<dim> shape 
){
+          buffer_.Resize( Shape1( shape.Size() ) );
+          this->SampleGaussian( buffer_, 0.0f, 1.0f );
+          return expr::reshape( buffer_, shape );
+      }
+      /*!
+        * \brief return a temporal expression storing standard uniform [0,1)
+        *        the temporal tensor is only valid before next call of 
gaussian or uniform
+        *        can be used as part of expression
+        *  Caution: this means expression such as A = gaussian(s1) * 
gaussian(s2) will give invalid result,
+        *           since second call of gaussian(s2) makes gaussian(s1) 
invalid
+        *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform 
in each expression
+        * \param shape shape of the tensor
+        * \tparam dim dimension of tensor
+        */
+      template<int dim>
+      inline expr::ReshapeExp<Tensor<cpu,1>,dim,1> uniform( Shape<dim> shape ){
+          buffer_.Resize( Shape1( shape.Size() ) );
+          this->SampleUniform( buffer_, 0.0f, 1.0f );
+          return expr::reshape( buffer_, shape );
+      }
+  private:
+      /*! \brief get next random number from rand */
+      inline real_t RandNext( void ){
+          return static_cast<real_t>(rand()) / 
(static_cast<real_t>(RAND_MAX)+1.0f);
+      }
+      /*! \brief return a real numer uniform in (0,1) */
+      inline real_t RandNext2( void ){
+          return (static_cast<real_t>( rand() ) + 1.0 ) / 
(static_cast<real_t>(RAND_MAX) + 2.0);
+      }
+      /*!
+        * \brief sample iid xx,yy ~N(0,1)
+        * \param xx first  gaussian output
+        * \param yy second gaussian output
+        */
+      inline void SampleNormal2D( real_t &xx, real_t &yy ){
+          real_t x,y,s;
+          do{
+              x = 2.0f * RandNext2() - 1.0f;
+              y = 2.0f * RandNext2() - 1.0f;
+              s = x*x + y*y;
+          }while( s >= 1.0f || s == 0.0f );
+          real_t t = std::sqrt( -2.0f * std::log( s ) / s ) ;
+          xx = x * t; yy = y * t;
+      }
+  private:
+      #if MSHADOW_USE_MKL
+      /*! \brief stream used by MKL VSL */
+      VSLStreamStatePtr vStream_;
+      #endif
+      /*! \brief temporal space used to store random numbers */
+      TensorContainer<cpu,1> buffer_;
 
-        /*! \brief c++11 random generator, added for SINGA use */
-        std::mt19937 gen_;
-    }; // class Random<cpu>
+      /*! \brief c++11 random generator, added for SINGA use */
+      std::mt19937 gen_;
+  }; // class Random<cpu>
 
 #ifdef __CUDACC__
 
-    /*! \brief GPU random number generator */
-    template<>
-    class Random<gpu> {
+  /*! \brief GPU random number generator */
+  template<>
+  class Random<gpu> {
     public:
+        Random<gpu>() {
+           // obtain a seed from the system clock:
+          unsigned seed= 
std::chrono::system_clock::now().time_since_epoch().count();
+          curandStatus_t status;
+          status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT);
+          utils::Assert(status == CURAND_STATUS_SUCCESS, "Can not create 
CURAND Generator");
+          this->Seed( seed );
+          buffer_.Resize( Shape1(kRandBufferSize) );
+        }
         /*!
          * \brief constructor of random engine
          * \param seed random number seed

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/include/singa.h
----------------------------------------------------------------------
diff --git a/include/singa.h b/include/singa.h
index 6c801ab..a913ab7 100644
--- a/include/singa.h
+++ b/include/singa.h
@@ -33,4 +33,10 @@
 #include "utils/factory.h"
 #include "./driver.h"
 
+#ifndef CPU_ONLY
+    #define xpu mshadow::gpu
+#else
+    #define xpu mshadow::cpu
+#endif
+
 #endif  // SINGA_SINGA_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/include/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/utils/blob.h b/include/utils/blob.h
index a7079de..903845d 100644
--- a/include/utils/blob.h
+++ b/include/utils/blob.h
@@ -169,6 +169,7 @@ class Blob {
     CHECK(data);
     data_->set_cpu_data(data);
   }
+
   inline const Dtype* gpu_data() const {
     CHECK(data_);
     return static_cast<const Dtype*>(data_->gpu_data());

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/include/utils/param.h
----------------------------------------------------------------------
diff --git a/include/utils/param.h b/include/utils/param.h
index 9829334..0c3716e 100644
--- a/include/utils/param.h
+++ b/include/utils/param.h
@@ -211,13 +211,18 @@ class Param {
   inline Blob<float>* mutable_data() { return data_.get(); }
   inline const Blob<float> &grad() const { return grad_; }
   inline Blob<float> *mutable_grad() { return &grad_; }
+  inline const Blob<float> &history() const { return history_; }
+  inline Blob<float> *mutable_history() { return &history_; }
+
   inline float* mutable_cpu_data() { return data_->mutable_cpu_data(); }
   inline float* mutable_cpu_grad() { return grad_.mutable_cpu_data(); }
   inline float* mutable_cpu_history() { return history_.mutable_cpu_data(); }
 
+  /*
   inline float* mutable_xpu_data() { return data_->mutable_xpu_data(); }
   inline float* mutable_xpu_grad() { return grad_.mutable_xpu_data(); }
   inline float* mutable_xpu_history() { return history_.mutable_xpu_data(); }
+  */
   /**
    * @return slice start ID
    */

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/src/neuralnet/connection_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/connection_layer.cc 
b/src/neuralnet/connection_layer.cc
index acf243d..750a511 100644
--- a/src/neuralnet/connection_layer.cc
+++ b/src/neuralnet/connection_layer.cc
@@ -27,9 +27,9 @@ using std::vector;
 /********* Implementation for BridgeDstLayer **************/
 void BridgeDstLayer::Setup(const LayerProto& proto,
     const vector<Layer*>& srclayers) {
-  Layer::Setup(proto, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  data_.Reshape(srclayers[0]->data(this).shape());
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(), 1);
+  data_.Reshape(srclayers_[0]->data(this).shape());
   grad_.ReshapeLike(data_);
 }
 /************* Implementation for ConcateLayer ***********/

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/src/neuralnet/neuron_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer.cc b/src/neuralnet/neuron_layer.cc
deleted file mode 100644
index 2d84fea..0000000
--- a/src/neuralnet/neuron_layer.cc
+++ /dev/null
@@ -1,684 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "neuralnet/neuron_layer.h"
-
-#include <glog/logging.h>
-#include <algorithm>
-#include "utils/singleton.h"
-#include "mshadow/tensor.h"
-#include "mshadow/cxxnet_op.h"
-
-namespace singa {
-
-using namespace mshadow;
-using namespace mshadow::expr;
-using mshadow::cpu;
-
-using mshadow::Shape;
-using mshadow::Shape1;
-using mshadow::Shape2;
-using mshadow::Shape3;
-using mshadow::Shape4;
-using mshadow::Tensor;
-
-using std::string;
-using std::vector;
-
-#ifndef CPU_ONLY
-    #define xpu mshadow::gpu
-#else
-    #define xpu mshadow::cpu
-#endif
-
-inline Tensor<cpu, 4> Tensor4(Blob<float>* blob) {
-  const vector<int>& shape = blob->shape();
-  Tensor<cpu, 4> tensor(blob->mutable_cpu_data(),
-      Shape4(shape[0], shape[1], shape[2], shape[3]));
-  return tensor;
-}
-
-inline Tensor<cpu, 3> Tensor3(Blob<float>* blob) {
-  const vector<int>& shape = blob->shape();
-  Tensor<cpu, 3> tensor(blob->mutable_cpu_data(),
-      Shape3(shape[0], shape[1], blob->count() / shape[0] / shape[1]));
-  return tensor;
-}
-
-inline Tensor<cpu, 2> Tensor2(Blob<float>* blob) {
-  const vector<int>& shape = blob->shape();
-  Tensor<cpu, 2> tensor(blob->mutable_cpu_data(),
-      Shape2(shape[0], blob->count() / shape[0]));
-  return tensor;
-}
-
-inline Tensor<cpu, 1> Tensor1(Blob<float>* blob) {
-  Tensor<cpu, 1> tensor(blob->mutable_cpu_data(), Shape1(blob->count()));
-  return tensor;
-}
-
-inline Tensor<xpu, 4> Tensor4XPU(Blob<float>* blob) {
-  const vector<int>& shape = blob->shape();
-  Tensor<xpu, 4> tensor(blob->mutable_xpu_data(),
-         Shape4(shape[0], shape[1], shape[2], shape[3]));
-  return tensor;
-}
-
-inline Tensor<xpu, 3> Tensor3XPU(Blob<float>* blob){
-  const vector<int>& shape = blob->shape();
-  Tensor<xpu, 3> tensor(blob->mutable_xpu_data(),
-         Shape3(shape[0], shape[1], blob->count() / shape[0] / shape[1]));
-  return tensor;
-}
-inline Tensor<xpu, 2> Tensor2XPU(Blob<float>* blob){
-  const vector<int>& shape = blob->shape();
-  Tensor<xpu, 2> tensor(blob->mutable_xpu_data(),
-         Shape2(shape[0], blob->count() / shape[0]));
-  return tensor;
-}
-inline Tensor<xpu, 1> Tensor1XPU(Blob<float>* blob){
-  Tensor<xpu, 1> tensor(blob->mutable_xpu_data(), Shape1(blob->count()));
-  return tensor;
-}
-
-/************ Implementation for ConvolutionLayer*************************/
-ConvolutionLayer::~ConvolutionLayer() {
-  delete weight_;
-  delete bias_;
-}
-void ConvolutionLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  Layer::Setup(conf, srclayers);
-  ConvolutionProto conv_conf = conf.convolution_conf();
-  kernel_ = conv_conf.kernel();
-  CHECK_GT(kernel_, 0) << "Filter size cannot be zero.";
-  pad_ = conv_conf.pad();
-  stride_ = conv_conf.stride();
-  num_filters_ = conv_conf.num_filters();
-  if (partition_dim() > 0)
-    num_filters_ /= srclayers.at(0)->num_partitions();
-  const vector<int>& srcshape = srclayers[0]->data(this).shape();
-  int dim = srcshape.size();
-  CHECK_GT(dim, 2);
-  width_ = srcshape[dim - 1];
-  height_ = srcshape[dim - 2];
-  if (dim > 3)
-    channels_ = srcshape[dim - 3];
-  else if (dim > 2)
-    channels_ = 1;
-  batchsize_ = srcshape[0];
-  conv_height_ = (height_ + 2 * pad_ - kernel_) / stride_ + 1;
-  conv_width_ = (width_ + 2 * pad_ - kernel_) / stride_ + 1;
-  col_height_ = channels_ * kernel_ * kernel_;
-  col_width_ = conv_height_ * conv_width_;
-  vector<int> shape{batchsize_, num_filters_, conv_height_, conv_width_};
-  data_.Reshape(shape);
-  grad_.Reshape(shape);
-  col_data_.Reshape(vector<int>{col_height_, col_width_});
-  col_grad_.Reshape(vector<int>{col_height_, col_width_});
-  weight_ = Param::Create(conf.param(0));
-  bias_ = Param::Create(conf.param(1));
-  weight_->Setup(vector<int>{num_filters_, col_height_});
-  bias_->Setup(vector<int>{num_filters_});
-}
-
-void ConvolutionLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor4XPU(srclayers_[0]->mutable_data(this));
-  auto data = Tensor3XPU(&data_);
-  auto col = Tensor2XPU(&col_data_);
-  auto weight = Tensor2XPU(weight_->mutable_data());
-  auto bias = Tensor1XPU(bias_->mutable_data());
-  for (int n = 0; n < batchsize_; n++) {
-    if (pad_ > 0)
-      col = expr::unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
-    else
-      col = expr::unpack_patch2col(src[n], kernel_, stride_);
-    data[n] = dot(weight, col);
-  }
-  data += expr::broadcast<1>(bias, data.shape);
-}
-
-void ConvolutionLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor4XPU(srclayers_[0]->mutable_data(this));
-  auto col = Tensor2XPU(&col_data_);
-  auto weight = Tensor2XPU(weight_->mutable_data());
-  auto grad = Tensor3XPU(&grad_);
-  auto gcol = Tensor2XPU(&col_grad_);
-  auto gweight = Tensor2XPU(weight_->mutable_grad());
-  auto gbias = Tensor1XPU(bias_->mutable_grad());
-  Blob<float>* gsrcblob = srclayers_[0]->mutable_grad(this);
-  Tensor<xpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
-  if (gsrcblob != nullptr)
-    gsrc.dptr = gsrcblob->mutable_xpu_data();
-  gbias = expr::sumall_except_dim<1>(grad);
-  gweight = 0.0f;
-  Shape<3> padshp(gsrc.shape.SubShape());
-  padshp[0] += 2 * pad_;
-  padshp[1] += 2 * pad_;
-  Shape<2> imgshp = Shape2(height_, width_);
-  for (int n = 0; n < batchsize_; n++) {
-    if (pad_ > 0)
-      col = expr::unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
-    else
-      col = expr::unpack_patch2col(src[n], kernel_, stride_);
-    gweight += dot(grad[n], col.T());
-    if (gsrcblob != nullptr) {
-      gcol = dot(weight.T(), grad[n]);
-      gsrc[n] = crop(expr::pack_col2patch(gcol, padshp, kernel_, stride_),
-          imgshp);
-    }
-  }
- // weight_->mutable_data()->mutable_cpu_data();
-}
-
-/******************* Implementation for CConvolutionLayer *********/
-void CConvolutionLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto data = Tensor3(&data_);
-  auto col = Tensor2(&col_data_);
-  auto weight = Tensor2(weight_->mutable_data());
-  auto bias = Tensor1(bias_->mutable_data());
-
-  for (int n = 0; n < batchsize_; n++) {
-    Im2col(src[n].dptr, channels_, height_, width_,
-        kernel_, kernel_, pad_, pad_, stride_, stride_, col.dptr);
-    data[n] = dot(weight, col);
-  }
-  data += expr::broadcast<1>(bias, data.shape);
-}
-
-void CConvolutionLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto col = Tensor2(&col_data_);
-  auto weight = Tensor2(weight_->mutable_data());
-
-  auto grad = Tensor3(&grad_);
-  auto gcol = Tensor2(&col_grad_);
-  auto gweight = Tensor2(weight_->mutable_grad());
-  auto gbias = Tensor1(bias_->mutable_grad());
-  gweight = 0.f;
-  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
-  Tensor<cpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
-  if (gsrcblob != nullptr)
-    gsrc.dptr = gsrcblob->mutable_cpu_data();
-  gbias = expr::sumall_except_dim<1>(grad);
-  for (int n = 0; n < batchsize_; n++) {
-    Im2col(src[n].dptr, channels_, height_, width_,
-        kernel_, kernel_, pad_, pad_, stride_, stride_, col.dptr);
-    gweight += dot(grad[n], col.T());
-    if (gsrcblob != nullptr) {
-      gcol = dot(weight.T(), grad[n]);
-      Col2im(gcol.dptr, channels_, height_, width_,
-          kernel_, kernel_, pad_, pad_, stride_, stride_, gsrc[n].dptr);
-    }
-  }
-}
-
-/****************** Implementation for DropoutLayer ***********************/
-void DropoutLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(*srclayers[0]->mutable_grad(this));
-  mask_.Reshape(srclayers[0]->data(this).shape());
-  pdrop_ = conf.dropout_conf().dropout_ratio();
-}
-
-void DropoutLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  // check training
-  if ((flag & kTrain) != kTrain) {
-    data_.CopyFrom(srclayers[0]->data(this));
-    return;
-  }
-  float pkeep = 1 - pdrop_;
-  auto mask = Tensor1(&mask_);
-  mask = expr::F<op::threshold>(TSingleton<Random<cpu>>::Instance() \
-                      ->uniform(mask.shape), pkeep) * (1.0f/pkeep);
-  auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers[0]->mutable_data(this));
-  data = src * mask;
-}
-
-void DropoutLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)  
{
-  auto mask = Tensor1(&mask_);
-  auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
-  gsrc = grad * mask;
-}
-
-
-/**************** Implementation for RBMLayer********************/
-Blob<float>* RBMLayer::Sample(int flag) {
-  Tensor<cpu, 2> sample, data;
-  if ((flag & kPositive) == kPositive || first_gibbs_) {
-    data = Tensor2(&data_);
-    sample = Tensor2(&sample_);
-  } else {
-    data = Tensor2(&neg_data_);
-    sample = Tensor2(&neg_sample_);
-  }
-  auto random = TSingleton<Random<cpu>>::Instance();
-  if (gaussian_) {
-    random->SampleGaussian(sample, 0.0f, 1.0f);
-    sample += data;
-  } else {
-    random->SampleBinary(sample, data);
-  }
-  return (flag & kPositive) == kPositive || first_gibbs_ ?
-    &sample_ : &neg_sample_;
-}
-void RBMLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  hdim_ = conf.rbm_conf().hdim();
-  gaussian_ = conf.rbm_conf().gaussian();
-  first_gibbs_ = true;
-}
-/**************** Implementation for RBMVisLayer********************/
-RBMVisLayer::~RBMVisLayer() {
-  delete weight_;
-  delete bias_;
-}
-
-void RBMVisLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 2);
-  RBMLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 2);
-  hid_layer_ = nullptr;
-  for (auto src : srclayers) {
-    if (typeid(*src) == typeid(RBMHidLayer)) {
-      // note the hid layer has may not been set up.
-      CHECK(hid_layer_ == nullptr);
-      hid_layer_ = dynamic_cast<RBMHidLayer*>(src);
-    }
-  }
-  input_layer_ = srclayers[0] != hid_layer_ ? srclayers[0]: srclayers[1];
-  const auto& src = input_layer_->data(this);
-  batchsize_ = src.shape()[0];
-  data_.ReshapeLike(src);
-  neg_data_.ReshapeLike(data_);
-  neg_sample_.ReshapeLike(data_);
-  vdim_ = src.count() / batchsize_;
-  weight_ = Param::Create(conf.param(0));
-  weight_ ->Setup(vector<int>{hdim_, vdim_});
-  bias_ = Param::Create(conf.param(1));
-  bias_->Setup(vector<int>{vdim_});
-}
-
-void RBMVisLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if ((flag & kPositive) == kPositive) {
-    data_.CopyFrom(input_layer_->data(this), true);
-    first_gibbs_ = true;
-  } else if ((flag & kNegative) == kNegative) {
-    // fetch sampling results from hidden layer
-    auto hid_sample = Tensor2(hid_layer_->Sample(flag));
-    auto data = Tensor2(&neg_data_);
-    auto weight = Tensor2(weight_->mutable_data());
-    auto bias = Tensor1(bias_->mutable_data());
-    data = dot(hid_sample, weight);
-    data += expr::repmat(bias, batchsize_);
-    data = expr::F<op::sigmoid>(data);
-    if ((flag & kTest) == kTest) {
-      const float *dptr = data_.cpu_data(), *rcns = neg_data_.cpu_data();
-      float err = 0.f;
-      for (int i = 0; i < data_.count(); i++) {
-        err += (dptr[i] - rcns[i]) * (dptr[i] - rcns[i]);
-      }
-      metric_.Add("Squared Error", err / batchsize_);
-    }
-    first_gibbs_ = false;
-  }
-}
-
-void RBMVisLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto vis_pos = Tensor2(&data_);
-  auto vis_neg = Tensor2(&neg_data_);
-  auto hid_pos = Tensor2(hid_layer_->mutable_data(this));
-  auto hid_neg = Tensor2(hid_layer_->mutable_neg_data(this));
-
-  auto gbias = Tensor1(bias_->mutable_grad());
-  gbias = expr::sum_rows(vis_neg);
-  gbias -= expr::sum_rows(vis_pos);
-  gbias /= batchsize_;
-
-  auto gweight = Tensor2(weight_->mutable_grad());
-  gweight = dot(hid_neg.T(), vis_neg);
-  gweight -= dot(hid_pos.T(), vis_pos);
-  gweight /= batchsize_;
-}
-/**************** Implementation for RBMHidLayer********************/
-RBMHidLayer::~RBMHidLayer() {
-  delete weight_;
-  delete bias_;
-}
-
-void RBMHidLayer::Setup(const LayerProto& conf,
-      const vector<Layer*>& srclayers) {
-  RBMLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  const auto& src_data = srclayers[0]->data(this);
-  batchsize_ = src_data.shape()[0];
-  vdim_ = src_data.count() / batchsize_;
-  data_.Reshape(vector<int>{batchsize_, hdim_});
-  neg_data_.ReshapeLike(data_);
-  sample_.ReshapeLike(data_);
-  neg_sample_.ReshapeLike(data_);
-  weight_ = Param::Create(conf.param(0));
-  weight_->Setup(vector<int>{hdim_, vdim_});
-  bias_ = Param::Create(conf.param(1));
-  bias_->Setup(vector<int>{hdim_});
-  vis_layer_ = dynamic_cast<RBMVisLayer*> (srclayers[0]);
-}
-
-void RBMHidLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto weight = Tensor2(weight_->mutable_data());
-  auto bias = Tensor1(bias_->mutable_data());
-
-  Tensor<cpu, 2> data, src;
-  if ((flag & kPositive) == kPositive) {
-    data = Tensor2(&data_);
-    src = Tensor2(vis_layer_->mutable_data(this));
-    first_gibbs_ = true;
-  } else {
-    data = Tensor2(&neg_data_);
-    // hinton's science paper does not sample the vis layer
-    src = Tensor2(vis_layer_->mutable_neg_data(this));
-    first_gibbs_ = false;
-  }
-  data = dot(src, weight.T());
-  data += expr::repmat(bias, batchsize_);
-
-  if (!gaussian_)
-    data = expr::F<op::sigmoid>(data);
-}
-
-void RBMHidLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto hid_pos = Tensor2(&data_);
-  auto hid_neg = Tensor2(&neg_data_);
-  auto gbias = Tensor1(bias_->mutable_grad());
-  gbias = expr::sum_rows(hid_neg);
-  gbias -= expr::sum_rows(hid_pos);
-  gbias /= batchsize_;
-}
-/*********** Implementation for InnerProductLayer**********/
-InnerProductLayer::~InnerProductLayer() {
-  delete weight_;
-  delete bias_;
-}
-
-void InnerProductLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  const auto& src = srclayers[0]->data(this);
-  batchsize_ = src.shape()[0];
-  vdim_ = src.count() / batchsize_;
-  hdim_ = layer_conf_.innerproduct_conf().num_output();
-  transpose_ = conf.innerproduct_conf().transpose();
-  if (partition_dim() > 0)
-    hdim_ /= srclayers.at(0)->num_partitions();
-  data_.Reshape(vector<int>{batchsize_, hdim_});
-  grad_.ReshapeLike(data_);
-  weight_ = Param::Create(conf.param(0));
-  bias_ = Param::Create(conf.param(1));
-  if (transpose_)
-    weight_->Setup(vector<int>{vdim_, hdim_});
-  else
-    weight_->Setup(vector<int>{hdim_, vdim_});
-  bias_->Setup(vector<int>{hdim_});
-}
-
-void InnerProductLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  auto data = Tensor2XPU(&data_);
-  auto src = Tensor2XPU(srclayers_[0]->mutable_data(this));
-  auto weight = Tensor2XPU(weight_->mutable_data());
-  auto bias = Tensor1XPU(bias_->mutable_data());
-  if (transpose_)
-    data = dot(src, weight);
-  else
-    data = dot(src, weight.T());
-  // repmat: repeat bias vector into batchsize rows
-  data += expr::repmat(bias, batchsize_);
-}
-
-void InnerProductLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor2XPU(srclayers_[0]->mutable_data(this));
-  auto grad = Tensor2XPU(&grad_);
-  auto weight = Tensor2XPU(weight_->mutable_data());
-  auto gweight = Tensor2XPU(weight_->mutable_grad());
-  auto gbias = Tensor1XPU(bias_->mutable_grad());
-
-  gbias = expr::sum_rows(grad);
-  if (transpose_)
-    gweight = dot(src.T(), grad);
-  else
-    gweight = dot(grad.T(), src);
-  if (srclayers[0]->mutable_grad(this) != nullptr) {
-    auto gsrc = Tensor2XPU(srclayers_[0]->mutable_grad(this));
-    if (transpose_)
-      gsrc = dot(grad, weight.T());
-    else
-      gsrc = dot(grad, weight);
-  }
-}
-/***************** Implementation for LRNLayer *************************/
-void LRNLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  lsize_ = conf.lrn_conf().local_size();
-  CHECK_EQ(lsize_ % 2, 1) << "LRN only supports odd values for Localvol";
-  knorm_ = conf.lrn_conf().knorm();
-  alpha_ = conf.lrn_conf().alpha();
-  beta_ = conf.lrn_conf().beta();
-  const vector<int>& s = srclayers[0]->data(this).shape();
-  data_.Reshape(s);
-  grad_.Reshape(s);
-  norm_.Reshape(s);
-  batchsize_ = s[0];
-  channels_ = s[1];
-  height_ = s[2];
-  width_ = s[3];
-}
-
-void LRNLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  const float salpha = alpha_ / lsize_;
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto data = Tensor4(&data_);
-  auto norm = Tensor4(&norm_);
-  // stores normalizer without power
-  norm = expr::chpool<red::sum>(expr::F<op::square>(src), lsize_) * salpha
-    + knorm_;
-  data = src * expr::F<op::power>(norm, -beta_);
-}
-
-void LRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  const float salpha = alpha_ / lsize_;
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto norm = Tensor4(&norm_);
-  auto grad = Tensor4(&grad_);
-  auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
-
-  gsrc = grad * expr::F<op::power>(norm, -beta_);
-  gsrc += (- 2.0f * beta_ * salpha) * expr::chpool<red::sum>(
-      grad * src * expr::F<op::power>(norm, -beta_ - 1.0f), lsize_)  * src;
-}
-
-/******************** Implementation for PoolingLayer******************/
-void PoolingLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  PoolingProto pool_conf = conf.pooling_conf();
-  kernel_ = pool_conf.kernel();
-  stride_ = pool_conf.stride();
-  CHECK_LT(pad_, kernel_);
-  pool_ = conf.pooling_conf().pool();
-  CHECK(pool_ == PoolingProto_PoolMethod_AVG
-        || pool_ == PoolingProto_PoolMethod_MAX)
-        << "Padding implemented only for average and max pooling.";
-  const auto& srcshape = srclayers[0]->data(this).shape();
-  int dim = srcshape.size();
-  CHECK_GT(dim, 2);
-  width_ = srcshape[dim - 1];
-  height_ = srcshape[dim - 2];
-  if (dim > 3)
-    channels_ = srcshape[dim-3];
-  else
-    channels_ = 1;
-  batchsize_ = srcshape[0];
-  pooled_height_ = static_cast<int>((height_ - kernel_) / stride_) + 1;
-  pooled_width_ = static_cast<int>((width_ - kernel_) / stride_) + 1;
-  data_.Reshape(vector<int>{batchsize_, channels_, pooled_height_,
-                            pooled_width_});
-  grad_.ReshapeLike(data_);
-}
-
-void PoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto data = Tensor4(&data_);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    data = expr::pool<red::maximum>(src, kernel_, stride_);
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
-    data = expr::pool<red::sum>(src, kernel_, stride_)
-      * (1.0f / (kernel_ * kernel_));
-}
-
-/*
- * partition only on num/channel dim
- * assume grad and data have the same paritition
- */
-void PoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
-  auto data = Tensor4(&data_);
-  auto grad = Tensor4(&grad_);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    gsrc = expr::unpool<red::maximum>(src, data, grad, kernel_, stride_);
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
-    gsrc = expr::unpool<red::sum>(src, data, grad, kernel_, stride_)
-           * (1.0f / (kernel_ * kernel_));
-}
-
-/***************** Implementation of CPoolingLayer ***************/
-
-void CPoolingLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  PoolingLayer::Setup(conf, srclayers);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-      mask_.ReshapeLike(data_);
-}
-void CPoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    ForwardMaxPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-        batchsize_, channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_, data_.mutable_cpu_data(), mask_.mutable_cpu_data());
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
-    ForwardAvgPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-        batchsize_, channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_, data_.mutable_cpu_data());
-  else
-    LOG(FATAL) << "unknow pooling method";
-}
-
-void CPoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) 
{
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    BackwardMaxPooling(grad_.cpu_data(), mask_.cpu_data(), batchsize_,
-        channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_, 
srclayers[0]->mutable_grad(this)->mutable_cpu_data());
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
-    BackwardAvgPooling(grad_.cpu_data(), batchsize_,
-        channels_, height_, width_, kernel_, kernel_, pad_, pad_,
-        stride_, stride_, 
srclayers[0]->mutable_grad(this)->mutable_cpu_data());
-  else
-    LOG(FATAL) << "unknow pooling method";
-}
-
-/***************** Implementation for ReLULayer *****************************/
-void ReLULayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(*(srclayers[0]->mutable_grad(this)));
-}
-
-void ReLULayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers[0]->mutable_data(this));
-  data = expr::F<op::relu>(src);
-}
-
-void ReLULayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
-  gsrc = expr::F<op::relu_grad>(data)*grad;
-}
-
-/*******************Implementation of SigmoidLayer***************************/
-void SigmoidLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(srclayers[0]->grad(this));
-}
-
-void SigmoidLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers[0]->mutable_data(this));
-  data = expr::F<op::sigmoid>(src);
-}
-
-void SigmoidLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
-  gsrc = expr::F<op::sigmoid_grad>(data) * grad;
-}
-/*******************Implementation of TanLayer***************************/
-void STanhLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(srclayers[0]->grad(this));
-}
-
-void STanhLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers[0]->mutable_data(this));
-  data = expr::F<op::stanh>(src);
-}
-
-void STanhLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
-  gsrc = expr::F<op::stanh_grad>(data) * grad;
-}
-
-
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/src/neuralnet/neuron_layer.cu
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer.cu b/src/neuralnet/neuron_layer.cu
new file mode 100644
index 0000000..affb02f
--- /dev/null
+++ b/src/neuralnet/neuron_layer.cu
@@ -0,0 +1,678 @@
+/************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+#include "neuralnet/neuron_layer.h"
+
+#include <glog/logging.h>
+#include <algorithm>
+#include "utils/singleton.h"
+#include "mshadow/tensor.h"
+#include "mshadow/cxxnet_op.h"
+
+namespace singa {
+
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using mshadow::cpu;
+  using mshadow::xpu;
+
+  using mshadow::Shape;
+  using mshadow::Shape1;
+  using mshadow::Shape2;
+  using mshadow::Shape3;
+  using mshadow::Shape4;
+  using mshadow::Tensor;
+
+  using std::string;
+  using std::vector;
+
+  inline Tensor<cpu, 4> Tensor4CPU(Blob<float>* blob) {
+    const vector<int>& shape = blob->shape();
+    Tensor<cpu, 4> tensor(blob->mutable_cpu_data(),
+        Shape4(shape[0], shape[1], shape[2], shape[3]));
+    return tensor;
+  }
+
+  inline Tensor<cpu, 3> Tensor3CPU(Blob<float>* blob) {
+    const vector<int>& shape = blob->shape();
+    Tensor<cpu, 3> tensor(blob->mutable_cpu_data(),
+        Shape3(shape[0], shape[1], blob->count() / shape[0] / shape[1]));
+    return tensor;
+  }
+
+  inline Tensor<cpu, 2> Tensor2CPU(Blob<float>* blob) {
+    const vector<int>& shape = blob->shape();
+    Tensor<cpu, 2> tensor(blob->mutable_cpu_data(),
+        Shape2(shape[0], blob->count() / shape[0]));
+    return tensor;
+  }
+
+  inline Tensor<cpu, 1> Tensor1CPU(Blob<float>* blob) {
+    Tensor<cpu, 1> tensor(blob->mutable_cpu_data(), Shape1(blob->count()));
+    return tensor;
+  }
+
+  inline Tensor<xpu, 4> Tensor4(Blob<float>* blob) {
+    const vector<int>& shape = blob->shape();
+    Tensor<xpu, 4> tensor(blob->mutable_xpu_data(),
+        Shape4(shape[0], shape[1], shape[2], shape[3]));
+    return tensor;
+  }
+
+  inline Tensor<xpu, 3> Tensor3(Blob<float>* blob){
+    const vector<int>& shape = blob->shape();
+    Tensor<xpu, 3> tensor(blob->mutable_xpu_data(),
+        Shape3(shape[0], shape[1], blob->count() / shape[0] / shape[1]));
+    return tensor;
+  }
+  inline Tensor<xpu, 2> Tensor2(Blob<float>* blob){
+    const vector<int>& shape = blob->shape();
+    Tensor<xpu, 2> tensor(blob->mutable_xpu_data(),
+        Shape2(shape[0], blob->count() / shape[0]));
+    return tensor;
+  }
+  inline Tensor<xpu, 1> Tensor1(Blob<float>* blob){
+    Tensor<xpu, 1> tensor(blob->mutable_xpu_data(), Shape1(blob->count()));
+    return tensor;
+  }
+
+  /************ Implementation for ConvolutionLayer*************************/
+  ConvolutionLayer::~ConvolutionLayer() {
+    delete weight_;
+    delete bias_;
+  }
+  void ConvolutionLayer::Setup(const LayerProto& conf,
+      const vector<Layer*>& srclayers) {
+    CHECK_EQ(srclayers.size(), 1);
+    Layer::Setup(conf, srclayers);
+    ConvolutionProto conv_conf = conf.convolution_conf();
+    kernel_ = conv_conf.kernel();
+    CHECK_GT(kernel_, 0) << "Filter size cannot be zero.";
+    pad_ = conv_conf.pad();
+    stride_ = conv_conf.stride();
+    num_filters_ = conv_conf.num_filters();
+    if (partition_dim() > 0)
+      num_filters_ /= srclayers.at(0)->num_partitions();
+    const vector<int>& srcshape = srclayers[0]->data(this).shape();
+    int dim = srcshape.size();
+    CHECK_GT(dim, 2);
+    width_ = srcshape[dim - 1];
+    height_ = srcshape[dim - 2];
+    if (dim > 3)
+      channels_ = srcshape[dim - 3];
+    else if (dim > 2)
+      channels_ = 1;
+    batchsize_ = srcshape[0];
+    conv_height_ = (height_ + 2 * pad_ - kernel_) / stride_ + 1;
+    conv_width_ = (width_ + 2 * pad_ - kernel_) / stride_ + 1;
+    col_height_ = channels_ * kernel_ * kernel_;
+    col_width_ = conv_height_ * conv_width_;
+    vector<int> shape{batchsize_, num_filters_, conv_height_, conv_width_};
+    data_.Reshape(shape);
+    grad_.Reshape(shape);
+    col_data_.Reshape(vector<int>{col_height_, col_width_});
+    col_grad_.Reshape(vector<int>{col_height_, col_width_});
+    weight_ = Param::Create(conf.param(0));
+    bias_ = Param::Create(conf.param(1));
+    weight_->Setup(vector<int>{num_filters_, col_height_});
+    bias_->Setup(vector<int>{num_filters_});
+  }
+
+void ConvolutionLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto data = Tensor3(&data_);
+  auto col = Tensor2(&col_data_);
+  auto weight = Tensor2(weight_->mutable_data());
+  auto bias = Tensor1(bias_->mutable_data());
+  for (int n = 0; n < batchsize_; n++) {
+    if (pad_ > 0)
+      col = expr::unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
+    else
+      col = expr::unpack_patch2col(src[n], kernel_, stride_);
+    data[n] = dot(weight, col);
+  }
+  data += expr::broadcast<1>(bias, data.shape);
+}
+
+void ConvolutionLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto col = Tensor2(&col_data_);
+  auto weight = Tensor2(weight_->mutable_data());
+  auto grad = Tensor3(&grad_);
+  auto gcol = Tensor2(&col_grad_);
+  auto gweight = Tensor2(weight_->mutable_grad());
+  auto gbias = Tensor1(bias_->mutable_grad());
+  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
+  Tensor<xpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
+  if (gsrcblob != nullptr)
+    gsrc.dptr = gsrcblob->mutable_xpu_data();
+  gbias = expr::sumall_except_dim<1>(grad);
+  gweight = 0.0f;
+  Shape<3> padshp(gsrc.shape.SubShape());
+  padshp[0] += 2 * pad_;
+  padshp[1] += 2 * pad_;
+  Shape<2> imgshp = Shape2(height_, width_);
+  for (int n = 0; n < batchsize_; n++) {
+    if (pad_ > 0)
+      col = expr::unpack_patch2col(pad(src[n], pad_), kernel_, stride_);
+    else
+      col = expr::unpack_patch2col(src[n], kernel_, stride_);
+    gweight += dot(grad[n], col.T());
+    if (gsrcblob != nullptr) {
+      gcol = dot(weight.T(), grad[n]);
+      gsrc[n] = crop(expr::pack_col2patch(gcol, padshp, kernel_, stride_),
+          imgshp);
+    }
+  }
+  // weight_->mutable_data()->mutable_cpu_data();
+}
+
+/******************* Implementation for CConvolutionLayer *********/
+void CConvolutionLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto data = Tensor3(&data_);
+  auto col = Tensor2(&col_data_);
+  auto weight = Tensor2(weight_->mutable_data());
+  auto bias = Tensor1(bias_->mutable_data());
+
+  for (int n = 0; n < batchsize_; n++) {
+    Im2col(src[n].dptr, channels_, height_, width_,
+        kernel_, kernel_, pad_, pad_, stride_, stride_, col.dptr);
+    data[n] = dot(weight, col);
+  }
+  data += expr::broadcast<1>(bias, data.shape);
+}
+
+void CConvolutionLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto col = Tensor2(&col_data_);
+  auto weight = Tensor2(weight_->mutable_data());
+
+  auto grad = Tensor3(&grad_);
+  auto gcol = Tensor2(&col_grad_);
+  auto gweight = Tensor2(weight_->mutable_grad());
+  auto gbias = Tensor1(bias_->mutable_grad());
+  gweight = 0.f;
+  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
+  Tensor<cpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
+  if (gsrcblob != nullptr)
+    gsrc.dptr = gsrcblob->mutable_cpu_data();
+  gbias = expr::sumall_except_dim<1>(grad);
+  for (int n = 0; n < batchsize_; n++) {
+    Im2col(src[n].dptr, channels_, height_, width_,
+        kernel_, kernel_, pad_, pad_, stride_, stride_, col.dptr);
+    gweight += dot(grad[n], col.T());
+    if (gsrcblob != nullptr) {
+      gcol = dot(weight.T(), grad[n]);
+      Col2im(gcol.dptr, channels_, height_, width_,
+          kernel_, kernel_, pad_, pad_, stride_, stride_, gsrc[n].dptr);
+    }
+  }
+}
+
+/****************** Implementation for DropoutLayer ***********************/
+void DropoutLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(*srclayers[0]->mutable_grad(this));
+  mask_.Reshape(srclayers[0]->data(this).shape());
+  pdrop_ = conf.dropout_conf().dropout_ratio();
+}
+
+void DropoutLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  // check training
+  if ((flag & kTrain) != kTrain) {
+    data_.CopyFrom(srclayers[0]->data(this));
+    return;
+  }
+  float pkeep = 1 - pdrop_;
+  auto mask = Tensor1(&mask_);
+  mask = expr::F<op::threshold>(TSingleton<Random<xpu>>::Instance()
+                      ->uniform(mask.shape), pkeep) * (1.0f / pkeep);
+  auto data = Tensor1(&data_);
+  auto src = Tensor1(srclayers[0]->mutable_data(this));
+  data = src * mask;
+}
+
+void DropoutLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  auto mask = Tensor1(&mask_);
+  auto grad = Tensor1(&grad_);
+  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
+  gsrc = grad * mask;
+}
+
+
+/**************** Implementation for RBMLayer********************/
+Blob<float>* RBMLayer::Sample(int flag) {
+  Tensor<cpu, 2> sample, data;
+  if ((flag & kPositive) == kPositive || first_gibbs_) {
+    data = Tensor2CPU(&data_);
+    sample = Tensor2CPU(&sample_);
+  } else {
+    data = Tensor2CPU(&neg_data_);
+    sample = Tensor2CPU(&neg_sample_);
+  }
+  auto random = TSingleton<Random<cpu>>::Instance();
+  if (gaussian_) {
+    random->SampleGaussian(sample, 0.0f, 1.0f);
+    sample += data;
+  } else {
+    random->SampleBinary(sample, data);
+  }
+  return (flag & kPositive) == kPositive || first_gibbs_ ?
+    &sample_ : &neg_sample_;
+}
+void RBMLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  hdim_ = conf.rbm_conf().hdim();
+  gaussian_ = conf.rbm_conf().gaussian();
+  first_gibbs_ = true;
+}
+/**************** Implementation for RBMVisLayer********************/
+RBMVisLayer::~RBMVisLayer() {
+  delete weight_;
+  delete bias_;
+}
+
+void RBMVisLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  CHECK_EQ(srclayers.size(), 2);
+  RBMLayer::Setup(conf, srclayers);
+  hid_layer_ = nullptr;
+  for (auto src : srclayers) {
+    if (typeid(*src) == typeid(RBMHidLayer)) {
+      // note: the hid layer may not have been set up yet.
+      CHECK(hid_layer_ == nullptr);
+      hid_layer_ = dynamic_cast<RBMHidLayer*>(src);
+    }
+  }
+  input_layer_ = srclayers[0] != hid_layer_ ? srclayers[0] : srclayers[1];
+  const auto& src = input_layer_->data(this);
+  batchsize_ = src.shape()[0];
+  data_.ReshapeLike(src);
+  neg_data_.ReshapeLike(data_);
+  neg_sample_.ReshapeLike(data_);
+  vdim_ = src.count() / batchsize_;
+  weight_ = Param::Create(conf.param(0));
+  weight_->Setup(vector<int>{hdim_, vdim_});
+  bias_ = Param::Create(conf.param(1));
+  bias_->Setup(vector<int>{vdim_});
+}
+
+void RBMVisLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  if ((flag & kPositive) == kPositive) {
+    data_.CopyFrom(input_layer_->data(this), true);
+    first_gibbs_ = true;
+  } else if ((flag & kNegative) == kNegative) {
+    // fetch sampling results from hidden layer
+    auto hid_sample = Tensor2CPU(hid_layer_->Sample(flag));
+    auto data = Tensor2CPU(&neg_data_);
+    auto weight = Tensor2CPU(weight_->mutable_data());
+    auto bias = Tensor1CPU(bias_->mutable_data());
+    data = dot(hid_sample, weight);
+    data += expr::repmat(bias, batchsize_);
+    data = expr::F<op::sigmoid>(data);
+    if ((flag & kTest) == kTest) {
+      const float *dptr = data_.cpu_data(), *rcns = neg_data_.cpu_data();
+      float err = 0.f;
+      for (int i = 0; i < data_.count(); i++) {
+        err += (dptr[i] - rcns[i]) * (dptr[i] - rcns[i]);
+      }
+      metric_.Add("Squared Error", err / batchsize_);
+    }
+    first_gibbs_ = false;
+  }
+}
+
+void RBMVisLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  auto vis_pos = Tensor2CPU(&data_);
+  auto vis_neg = Tensor2CPU(&neg_data_);
+  auto hid_pos = Tensor2CPU(hid_layer_->mutable_data(this));
+  auto hid_neg = Tensor2CPU(hid_layer_->mutable_neg_data(this));
+
+  auto gbias = Tensor1CPU(bias_->mutable_grad());
+  gbias = expr::sum_rows(vis_neg);
+  gbias -= expr::sum_rows(vis_pos);
+  gbias /= batchsize_;
+
+  auto gweight = Tensor2CPU(weight_->mutable_grad());
+  gweight = dot(hid_neg.T(), vis_neg);
+  gweight -= dot(hid_pos.T(), vis_pos);
+  gweight /= batchsize_;
+}
+/**************** Implementation for RBMHidLayer********************/
+RBMHidLayer::~RBMHidLayer() {
+  delete weight_;
+  delete bias_;
+}
+
+void RBMHidLayer::Setup(const LayerProto& conf,
+      const vector<Layer*>& srclayers) {
+  RBMLayer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  const auto& src_data = srclayers[0]->data(this);
+  batchsize_ = src_data.shape()[0];
+  vdim_ = src_data.count() / batchsize_;
+  data_.Reshape(vector<int>{batchsize_, hdim_});
+  neg_data_.ReshapeLike(data_);
+  sample_.ReshapeLike(data_);
+  neg_sample_.ReshapeLike(data_);
+  weight_ = Param::Create(conf.param(0));
+  weight_->Setup(vector<int>{hdim_, vdim_});
+  bias_ = Param::Create(conf.param(1));
+  bias_->Setup(vector<int>{hdim_});
+  vis_layer_ = dynamic_cast<RBMVisLayer*> (srclayers[0]);
+}
+
+void RBMHidLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  auto weight = Tensor2CPU(weight_->mutable_data());
+  auto bias = Tensor1CPU(bias_->mutable_data());
+
+  Tensor<cpu, 2> data, src;
+  if ((flag & kPositive) == kPositive) {
+    data = Tensor2CPU(&data_);
+    src = Tensor2CPU(vis_layer_->mutable_data(this));
+    first_gibbs_ = true;
+  } else {
+    data = Tensor2CPU(&neg_data_);
+    // Hinton's Science paper does not sample the visible layer
+    src = Tensor2CPU(vis_layer_->mutable_neg_data(this));
+    first_gibbs_ = false;
+  }
+  data = dot(src, weight.T());
+  data += expr::repmat(bias, batchsize_);
+
+  if (!gaussian_)
+    data = expr::F<op::sigmoid>(data);
+}
+
+void RBMHidLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  auto hid_pos = Tensor2CPU(&data_);
+  auto hid_neg = Tensor2CPU(&neg_data_);
+  auto gbias = Tensor1CPU(bias_->mutable_grad());
+  gbias = expr::sum_rows(hid_neg);
+  gbias -= expr::sum_rows(hid_pos);
+  gbias /= batchsize_;
+}
+/*********** Implementation for InnerProductLayer**********/
+InnerProductLayer::~InnerProductLayer() {
+  delete weight_;
+  delete bias_;
+}
+
+void InnerProductLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  const auto& src = srclayers[0]->data(this);
+  batchsize_ = src.shape()[0];
+  vdim_ = src.count() / batchsize_;
+  hdim_ = layer_conf_.innerproduct_conf().num_output();
+  transpose_ = conf.innerproduct_conf().transpose();
+  if (partition_dim() > 0)
+    hdim_ /= srclayers.at(0)->num_partitions();
+  data_.Reshape(vector<int>{batchsize_, hdim_});
+  grad_.ReshapeLike(data_);
+  weight_ = Param::Create(conf.param(0));
+  bias_ = Param::Create(conf.param(1));
+  if (transpose_)
+    weight_->Setup(vector<int>{vdim_, hdim_});
+  else
+    weight_->Setup(vector<int>{hdim_, vdim_});
+  bias_->Setup(vector<int>{hdim_});
+}
+
+void InnerProductLayer::ComputeFeature(int flag,
+    const vector<Layer*>& srclayers) {
+  auto data = Tensor2(&data_);
+  auto src = Tensor2(srclayers[0]->mutable_data(this));
+  auto weight = Tensor2(weight_->mutable_data());
+  auto bias = Tensor1(bias_->mutable_data());
+  if (transpose_)
+    data = dot(src, weight);
+  else
+    data = dot(src, weight.T());
+  // repmat: repeat bias vector into batchsize rows
+  data += expr::repmat(bias, batchsize_);
+}
+
+void InnerProductLayer::ComputeGradient(int flag,
+    const vector<Layer*>& srclayers) {
+  auto src = Tensor2(srclayers[0]->mutable_data(this));
+  auto grad = Tensor2(&grad_);
+  auto weight = Tensor2(weight_->mutable_data());
+  auto gweight = Tensor2(weight_->mutable_grad());
+  auto gbias = Tensor1(bias_->mutable_grad());
+
+  gbias = expr::sum_rows(grad);
+  if (transpose_)
+    gweight = dot(src.T(), grad);
+  else
+    gweight = dot(grad.T(), src);
+  if (srclayers[0]->mutable_grad(this) != nullptr) {
+    auto gsrc = Tensor2(srclayers[0]->mutable_grad(this));
+    if (transpose_)
+      gsrc = dot(grad, weight.T());
+    else
+      gsrc = dot(grad, weight);
+  }
+}
+/***************** Implementation for LRNLayer *************************/
+void LRNLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  lsize_ = conf.lrn_conf().local_size();
+  CHECK_EQ(lsize_ % 2, 1) << "LRN only supports odd values for local_size";
+  knorm_ = conf.lrn_conf().knorm();
+  alpha_ = conf.lrn_conf().alpha();
+  beta_ = conf.lrn_conf().beta();
+  const vector<int>& s = srclayers[0]->data(this).shape();
+  data_.Reshape(s);
+  grad_.Reshape(s);
+  norm_.Reshape(s);
+  batchsize_ = s[0];
+  channels_ = s[1];
+  height_ = s[2];
+  width_ = s[3];
+}
+
+void LRNLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  const float salpha = alpha_ / lsize_;
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto data = Tensor4(&data_);
+  auto norm = Tensor4(&norm_);
+  // stores normalizer without power
+  norm = expr::chpool<red::sum>(expr::F<op::square>(src), lsize_) * salpha
+    + knorm_;
+  data = src * expr::F<op::power>(norm, -beta_);
+}
+
+void LRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  const float salpha = alpha_ / lsize_;
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto norm = Tensor4(&norm_);
+  auto grad = Tensor4(&grad_);
+  auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
+
+  gsrc = grad * expr::F<op::power>(norm, -beta_);
+  gsrc += (- 2.0f * beta_ * salpha) * expr::chpool<red::sum>(
+      grad * src * expr::F<op::power>(norm, -beta_ - 1.0f), lsize_)  * src;
+}
+
+/******************** Implementation for PoolingLayer******************/
+void PoolingLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  CHECK_EQ(srclayers.size(), 1);
+  PoolingProto pool_conf = conf.pooling_conf();
+  kernel_ = pool_conf.kernel();
+  stride_ = pool_conf.stride();
+  CHECK_LT(pad_, kernel_);
+  pool_ = conf.pooling_conf().pool();
+  CHECK(pool_ == PoolingProto_PoolMethod_AVG
+        || pool_ == PoolingProto_PoolMethod_MAX)
+        << "Padding implemented only for average and max pooling.";
+  const auto& srcshape = srclayers[0]->data(this).shape();
+  int dim = srcshape.size();
+  CHECK_GT(dim, 2);
+  width_ = srcshape[dim - 1];
+  height_ = srcshape[dim - 2];
+  if (dim > 3)
+    channels_ = srcshape[dim-3];
+  else
+    channels_ = 1;
+  batchsize_ = srcshape[0];
+  pooled_height_ = static_cast<int>((height_ - kernel_) / stride_) + 1;
+  pooled_width_ = static_cast<int>((width_ - kernel_) / stride_) + 1;
+  data_.Reshape(vector<int>{batchsize_, channels_, pooled_height_,
+                            pooled_width_});
+  grad_.ReshapeLike(data_);
+}
+
+void PoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto data = Tensor4(&data_);
+  if (pool_ == PoolingProto_PoolMethod_MAX)
+    data = expr::pool<red::maximum>(src, kernel_, stride_);
+  else if (pool_ == PoolingProto_PoolMethod_AVG)
+    data = expr::pool<red::sum>(src, kernel_, stride_)
+      * (1.0f / (kernel_ * kernel_));
+}
+
+/*
+ * partition only on num/channel dim
+ * assume grad and data have the same partition
+ */
+void PoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  auto src = Tensor4(srclayers[0]->mutable_data(this));
+  auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
+  auto data = Tensor4(&data_);
+  auto grad = Tensor4(&grad_);
+  if (pool_ == PoolingProto_PoolMethod_MAX)
+    gsrc = expr::unpool<red::maximum>(src, data, grad, kernel_, stride_);
+  else if (pool_ == PoolingProto_PoolMethod_AVG)
+    gsrc = expr::unpool<red::sum>(src, data, grad, kernel_, stride_)
+           * (1.0f / (kernel_ * kernel_));
+}
+
+/***************** Implementation of CPoolingLayer ***************/
+
+void CPoolingLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  PoolingLayer::Setup(conf, srclayers);
+  if (pool_ == PoolingProto_PoolMethod_MAX)
+      mask_.ReshapeLike(data_);
+}
+void CPoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  if (pool_ == PoolingProto_PoolMethod_MAX)
+    ForwardMaxPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
+        batchsize_, channels_, height_, width_, kernel_, kernel_, pad_, pad_,
+        stride_, stride_, data_.mutable_cpu_data(), mask_.mutable_cpu_data());
+  else if (pool_ == PoolingProto_PoolMethod_AVG)
+    ForwardAvgPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
+        batchsize_, channels_, height_, width_, kernel_, kernel_, pad_, pad_,
+        stride_, stride_, data_.mutable_cpu_data());
+  else
+    LOG(FATAL) << "unknow pooling method";
+}
+
+void CPoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  if (pool_ == PoolingProto_PoolMethod_MAX)
+    BackwardMaxPooling(grad_.cpu_data(), mask_.cpu_data(), batchsize_,
+        channels_, height_, width_, kernel_, kernel_, pad_, pad_,
+        stride_, stride_,
+        srclayers[0]->mutable_grad(this)->mutable_cpu_data());
+  else if (pool_ == PoolingProto_PoolMethod_AVG)
+    BackwardAvgPooling(grad_.cpu_data(), batchsize_,
+        channels_, height_, width_, kernel_, kernel_, pad_, pad_,
+        stride_, stride_,
+        srclayers[0]->mutable_grad(this)->mutable_cpu_data());
+  else
+    LOG(FATAL) << "unknow pooling method";
+}
+
+/***************** Implementation for ReLULayer *****************************/
+void ReLULayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(*(srclayers[0]->mutable_grad(this)));
+}
+
+void ReLULayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  auto data = Tensor1(&data_);
+  auto src = Tensor1(srclayers[0]->mutable_data(this));
+  data = expr::F<op::relu>(src);
+}
+
+void ReLULayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  auto data = Tensor1(&data_);
+  auto grad = Tensor1(&grad_);
+  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
+  gsrc = expr::F<op::relu_grad>(data)*grad;
+}
+
+/*******************Implementation of SigmoidLayer***************************/
+void SigmoidLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(srclayers[0]->grad(this));
+}
+
+void SigmoidLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  auto data = Tensor1(&data_);
+  auto src = Tensor1(srclayers[0]->mutable_data(this));
+  data = expr::F<op::sigmoid>(src);
+}
+
+void SigmoidLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  auto data = Tensor1(&data_);
+  auto grad = Tensor1(&grad_);
+  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
+  gsrc = expr::F<op::sigmoid_grad>(data) * grad;
+}
+/*******************Implementation of TanLayer***************************/
+void STanhLayer::Setup(const LayerProto& conf,
+    const vector<Layer*>& srclayers) {
+  Layer::Setup(conf, srclayers);
+  data_.ReshapeLike(srclayers[0]->data(this));
+  grad_.ReshapeLike(srclayers[0]->grad(this));
+}
+
+void STanhLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
+  auto data = Tensor1(&data_);
+  auto src = Tensor1(srclayers[0]->mutable_data(this));
+  data = expr::F<op::stanh>(src);
+}
+
+void STanhLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
+  auto data = Tensor1(&data_);
+  auto grad = Tensor1(&grad_);
+  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
+  gsrc = expr::F<op::stanh_grad>(data) * grad;
+}
+
+}  // namespace singa
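
For readers checking the shape bookkeeping in ConvolutionLayer::Setup and
PoolingLayer::Setup above: the output sizes follow the usual sliding-window
arithmetic, conv_out = (in + 2 * pad - kernel) / stride + 1, and the im2col
buffer is (channels * kernel * kernel) x (conv_height * conv_width). Below is
a minimal standalone sketch of that arithmetic (plain C++, independent of
SINGA and mshadow; the names are illustrative only):

    #include <cstdio>

    // Number of window positions along one spatial axis, given the input
    // extent, symmetric zero padding, a square kernel and a stride.
    inline int OutDim(int in, int pad, int kernel, int stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      // e.g. a 32x32x3 image, 5x5 kernel, pad 2, stride 1
      int height = 32, width = 32, channels = 3, kernel = 5, pad = 2, stride = 1;
      int conv_height = OutDim(height, pad, kernel, stride);  // 32
      int conv_width = OutDim(width, pad, kernel, stride);    // 32
      int col_height = channels * kernel * kernel;            // 75
      int col_width = conv_height * conv_width;               // 1024
      printf("conv %dx%d, col buffer %dx%d\n",
             conv_height, conv_width, col_height, col_width);
      // PoolingLayer::Setup uses the same formula with pad = 0.
      return 0;
    }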

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/src/trainer/server.cc
----------------------------------------------------------------------
diff --git a/src/trainer/server.cc b/src/trainer/server.cc
new file mode 100644
index 0000000..f5a0560
--- /dev/null
+++ b/src/trainer/server.cc
@@ -0,0 +1,256 @@
+#include <thread>
+#include <chrono>
+#include "mshadow/tensor.h"
+#include "trainer/server.h"
+#include "utils/param.h"
+#include "utils/singleton.h"
+#include "utils/factory.h"
+#include "utils/cluster.h"
+#include "proto/common.pb.h"
+
+namespace singa {
+
+using namespace mshadow;
+using std::vector;
+
+Server::Server(int thread_id, int group_id, int server_id):
+  thread_id_(thread_id), grp_id_(group_id), id_(server_id) {
+}
+
+void Server::Setup(const UpdaterProto& proto,
+    std::unordered_map<int, ParamEntry*>* shard,
+    const vector<int>& slice2group) {
+  updater_ = Updater::Create(proto);
+  shard_ = shard;
+  slice2group_ = slice2group;
+}
+
+Server::~Server() {
+  delete updater_;
+}
+
+void Stop(void* running) {
+  *static_cast<bool *>(running) = false;
+}
+
+void Server::Run() {
+  LOG(ERROR) << "Server (group = " << grp_id_ <<", id = " << id_ << ") start";
+  auto dealer = new Dealer(2*thread_id_);
+  CHECK(dealer->Connect(kInprocRouterEndpoint));
+  Msg* ping = new Msg(Addr(grp_id_, id_, kServer), Addr(-1, -1, kStub));
+  ping->set_type(kConnect);
+  dealer->Send(&ping);
+
+  auto cluster = Cluster::Get();
+  bool running = true;
+  CHECK(cluster->runtime()->WatchSGroup(grp_id_, id_, Stop, &running));
+
+  int nserver_grps = cluster->nserver_groups();
+  vector<Param*> master_params;
+  size_t syncEntry = 0;
+  Poller poll(dealer);
+  // start recv loop and process requests
+  while (running) {
+    auto *sock = poll.Wait(cluster->poll_time());
+    if (poll.Terminated()) {
+      LOG(ERROR) << "Connection broken!";
+      exit(0);
+    } else if (sock == nullptr) {
+      continue;
+    }
+    Msg* msg = dealer->Receive();
+    if (msg == nullptr) break;
+    Msg* response = nullptr;
+    int type = msg->type();
+    int slice_id = SliceID(msg->trgt_val());
+    if (type == kPut) {
+      response = HandlePut(&msg);
+      if (slice2group_[slice_id] == grp_id_)
+        master_params.push_back(shard_->at(slice_id)->shares.at(0));
+    } else {
+      if (shard_->find(slice_id) == shard_->end()) {
+        // delay the processing by re-queuing the msg.
+        response = msg;
+      } else if (type == kSyncReminder) {
+        DeleteMsg(&msg);
+        if (syncEntry >= master_params.size())
+          continue;
+        auto param = master_params.at(syncEntry);
+        // control the frequency of synchronization
+        // currently sync is triggered only when the slice has been updated
+        // by the local worker or other workers at least nserver_groups times.
+        // TODO(wangwei): may optimize the trigger condition.
+        if (abs(param->local_version() - param->version()) >= nserver_grps) {
+          for (auto msg : GenSyncMsgs(param))
+            dealer->Send(&msg);
+          syncEntry = (syncEntry + 1) % master_params.size();
+        }
+      } else {
+        switch (type) {
+          case kGet:
+            response = HandleGet(&msg);
+            break;
+          case kUpdate:
+            for (auto reply : HandleUpdate(&msg))
+              dealer->Send(&reply);
+            break;
+          case kSyncRequest:
+            response = HandleSyncRequest(&msg);
+            break;
+          default:
+            LOG(ERROR)<<"Unknown message type "<<type;
+            break;
+        }
+      }
+    }
+    if (response != nullptr)
+      dealer->Send(&response);
+  }
+
+  // send stop msg to stub
+  Msg* msg = new Msg(Addr(grp_id_, id_, kServer), Addr(-1, -1, kStub));
+  msg->set_type(kStop);
+  dealer->Send(&msg);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+  LOG(ERROR) << "Server (group = " << grp_id_ << ", id = " << id_ << ") stops";
+  delete dealer;
+}
+
+const vector<Msg*> Server::GenSyncMsgs(Param* param) {
+  vector<Msg*> ret;
+  // TODO: replace the arguments (0, 0) to sync a chunk instead of a slice
+  auto msg = param->GenSyncMsg(0, 0);
+  auto cluster = Cluster::Get();
+  for (int i = 0; i < cluster->nserver_groups(); i++) {
+    if (i != grp_id_) {
+      Msg* tmp = msg;
+      if (i < cluster->nserver_groups() - 1)
+        tmp = new Msg(*msg);
+      // assume only one server per group; TODO: generalize it
+      tmp->set_dst(Addr(i, 0, kServer));
+      tmp->set_src(Addr(grp_id_, id_, kServer));
+      ret.push_back(tmp);
+      param->set_version(param->local_version());
+      //LOG(ERROR)<<"sync slice="<<param->id()<<" to procs "<<i;
+    }
+  }
+  return ret;
+}
+
+Msg* Server::HandlePut(Msg **msg) {
+  int version = (*msg)->trgt_version();
+  int slice_id = SliceID((*msg)->trgt_val());
+  if (shard_->find(slice_id) != shard_->end())
+    LOG(FATAL) << "Param (" << slice_id << ") is put more than once";
+
+  // TODO(wangwei) replace hard coded param type 0
+  auto param = Singleton<Factory<Param>>::Instance()->Create(0);
+  auto response = param->HandlePutMsg(msg, true);
+  // parse num of shares of this param from a worker group
+  int num_shares = 1;
+  if ((*msg)->NextFrame())
+    (*msg)->ParseFormatFrame("i", &num_shares);
+  DeleteMsg(msg);
+  (*shard_)[slice_id] = new ParamEntry(num_shares, param);
+  // must set version after HandlePutMsg which allocates the memory
+  param->set_version(version);
+  param->set_local_version(version);
+  param->set_id(slice_id);
+  //LOG(ERROR)<<"put norm "<<param->data().asum_data()<<", "<<pid;
+  // allocate blob for param sync between groups.
+  if (Cluster::Get()->nserver_groups() > 1
+      && slice2group_[slice_id] != grp_id_) {
+    last_data_[slice_id] = std::make_shared<Blob<float>>();
+    last_data_[slice_id]->ReshapeLike(param->data());
+    last_data_[slice_id]->CopyFrom(param->data());
+  }
+  LOG(INFO)<<"server (group = " << grp_id_ << ", id = " << id_ <<") put slice="
+    << slice_id << " size=" << param->size();
+  return response;
+}
+
+Msg* Server::HandleGet(Msg **msg) {
+  int val = (*msg)->trgt_val();
+  auto param = shard_->at(SliceID(val))->shares.at(0);
+  // re-queue the request if the param is not updated to the required version
+  if (param->version() < (*msg)->trgt_version())
+    return *msg;
+  else {
+    // LOG(ERROR) << "get " << slice << " from "<<(*msg)->src_first();
+    auto reply = param->HandleGetMsg(msg, false);
+    reply->set_trgt(val, param->version());
+    return reply;
+  }
+}
+
+const vector<Msg*> Server::HandleUpdate(Msg **msg) {
+  vector<Msg*> ret;
+  int sliceid = SliceID((*msg)->trgt_val());
+  auto entry = shard_->at(sliceid);
+  buffer_requests_[sliceid].push_back(*msg);
+  int num_update;
+  (*msg)->LastFrame();
+  (*msg)->ParseFormatFrame("i", &num_update);
+  (*msg)->FirstFrame();
+  entry->num_update += num_update;
+  // LOG(ERROR) << "update "<<sliceid<< " from "<<(*msg)->src_second()
+  //  << ", " << num_update << " total " << entry->num_total;
+  // do update until recv gradients from all shares of this param/slice
+  if (entry->num_update >= entry->num_total) {
+    CHECK_EQ(entry->num_update, entry->num_total);
+    auto& request = buffer_requests_.at(sliceid);
+    int step = (*msg)->trgt_version();
+    auto param = entry->shares.at(0);
+    // extract and aggregate gradients
+    param->ParseUpdateMsgs(request);
+    updater_->Update(step, param, 1.0f / entry->num_total);
+    param->set_local_version(param->local_version() + 1);
+    // response to all shares of this param
+    for (auto response : param->GenUpdateResponseMsgs(&request, false)) {
+      response->set_trgt((*msg)->trgt_val(), param->local_version());
+      ret.push_back(response);
+    }
+    entry->num_update = 0;
+  }
+  *msg = nullptr;
+  return ret;
+}
+
+Msg* Server::HandleSyncRequest(Msg **msg) {
+  Msg* msgg = *msg;
+  int slice = SliceID(msgg->trgt_val());
+  auto param = shard_->at(slice)->shares.at(0);
+  Msg* response = nullptr;
+  auto shape = Shape1(param->size());
+  CHECK_EQ(msgg->FrameSize(), param->size() * sizeof(float));
+  Tensor<cpu, 1> tmp(static_cast<float*>(msgg->FrameData()), shape);
+  Tensor<cpu, 1> cur(param->mutable_data()->mutable_cpu_data(), shape);
+  //LOG(ERROR)<<"Recv sync for "<<param->id();
+  if (slice2group_[slice] == grp_id_) {
+    // recv sync msg on slice I am mastering
+    cur += tmp;
+    param->set_local_version(param->local_version() + 1);
+  } else {  // recv sync msg on slice mastered by others
+    TensorContainer<cpu, 1> diff(shape);
+    Tensor<cpu, 1> prev(last_data_[param->id()]->mutable_cpu_data(), shape);
+    diff = cur - prev;
+    msgg->NextFrame();
+    int bandwidth;
+    msgg->ParseFormatFrame("i", &bandwidth);
+    if (bandwidth > 0) {
+      // send back my updates to the server group mastering this param
+      response = new Msg(msgg->dst(), msgg->src());
+      response->set_type(kSyncRequest);
+      response->set_trgt(param->id(), param->version());
+      response->AddFrame(diff.dptr, param->size() * sizeof(float));
+      prev = diff + tmp;
+      Copy(cur, prev);
+    } else {  // no bandwidth, aggregate my updates for next sync
+      Copy(prev, tmp);
+      cur = tmp + diff;
+    }
+  }
+  DeleteMsg(msg);
+  return response;
+}
+} /* singa */
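
Server::HandleUpdate above buffers update requests per slice until every
share of that slice has reported (entry->num_update >= entry->num_total), and
only then aggregates the gradients and runs the updater once with a
1.0f / num_total scale. A simplified, self-contained sketch of that buffering
pattern follows (the types and names below are illustrative, not SINGA's
actual Msg/ParamEntry API):

    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    struct UpdateMsg { int slice_id; int num_updates; };

    struct SliceEntry {
      int num_total = 0;                // shares expected per update round
      int num_update = 0;               // shares received so far
      std::vector<UpdateMsg> buffered;  // requests held back until all arrive
    };

    // Buffer one request; return true when the slice is ready for an update.
    bool BufferAndCheck(std::unordered_map<int, SliceEntry>* shard,
                        const UpdateMsg& msg) {
      SliceEntry& e = (*shard)[msg.slice_id];
      e.buffered.push_back(msg);
      e.num_update += msg.num_updates;
      if (e.num_update < e.num_total) return false;
      // All shares arrived: aggregate gradients, run the updater with a
      // 1 / num_total scale, reply to every buffered request, then reset.
      e.num_update = 0;
      e.buffered.clear();
      return true;
    }

    int main() {
      std::unordered_map<int, SliceEntry> shard;
      shard[7].num_total = 2;                          // two worker shares
      printf("%d\n", BufferAndCheck(&shard, {7, 1}));  // 0: still waiting
      printf("%d\n", BufferAndCheck(&shard, {7, 1}));  // 1: ready to update
      return 0;
    }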

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/src/trainer/trainer.cc
----------------------------------------------------------------------
diff --git a/src/trainer/trainer.cc b/src/trainer/trainer.cc
index 22b5757..c62e0d1 100644
--- a/src/trainer/trainer.cc
+++ b/src/trainer/trainer.cc
@@ -9,7 +9,6 @@
 #include "utils/common.h"
 #include "proto/common.pb.h"
 #include "trainer/trainer.h"
-#include "mshadow/tensor.h"
 
 
 namespace singa {
@@ -440,15 +439,15 @@ void Trainer::GenMsgs(int type, int version, ParamEntry* entry,
     Msg* new_msg = nullptr;
     if (type == kPut) {
       CHECK_GT(entry->num_total, 0);
-      //new_msg = param->GenPutMsg(procs != procs_id_, idx);
-      new_msg = param->GenPutMsg(true, idx);
+      new_msg = param->GenPutMsg(procs != procs_id_, idx);
+      // new_msg = param->GenPutMsg(true, idx);
       new_msg->AddFormatFrame("i", entry->num_total);
     } else if (type == kGet) {
-      //new_msg = param->GenGetMsg(procs != procs_id_, idx);
-      new_msg = param->GenGetMsg(true, idx);
+      new_msg = param->GenGetMsg(procs != procs_id_, idx);
+      // new_msg = param->GenGetMsg(true, idx);
     } else if (type == kUpdate) {
-      //new_msg = param->GenUpdateMsg(procs != procs_id_, idx);
-      new_msg = param->GenUpdateMsg(true, idx);
+      new_msg = param->GenUpdateMsg(procs != procs_id_, idx);
+      // new_msg = param->GenUpdateMsg(true, idx);
       new_msg->AddFormatFrame("i", entry->num_local);
     } else {
       LOG(FATAL) << "Wrong type";
@@ -478,13 +477,13 @@ const vector<Msg*> Trainer::HandleUpdate(ParamEntry *entry, Msg** msg) {
     // average local gradient
     if (entry->num_local > 1) {
       auto it = entry->shares.begin();
-      auto shape=mshadow::Shape1((*it)->size());
-      mshadow::Tensor<mshadow::cpu,1> sum((*it)->mutable_cpu_grad(), shape);
+      float* sum = (*it)->mutable_grad()->mutable_cpu_data();
       for (++it; it != entry->shares.end(); it++) {
-        mshadow::Tensor<mshadow::cpu,1> grad((*it)->mutable_cpu_grad(), shape);
-        sum += grad;
+        float* grad = (*it)->mutable_grad()->mutable_cpu_data();
+        for (int i = 0; i < (*it)->size(); i++) {
+          sum[i] += grad[i];
+        }
       }
-      sum /= entry->num_total;
     }
     int step = (*msg)->trgt_version();
     GenMsgs(kUpdate, step, entry, *msg, &ret);
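
Note that dropping "sum /= entry->num_total" here does not lose the
averaging: the worker now only sums the gradients of its local shares, and
Server::HandleUpdate (earlier in this commit) passes 1.0f / entry->num_total
to updater_->Update(), so the division presumably happens once on the server
after all shares have been aggregated. A tiny sketch of that equivalence
(illustrative names, plain C++):

    #include <cassert>
    #include <vector>

    // Worker side: sum the gradient vectors of the local shares in place.
    void SumShares(std::vector<float>* sum, const std::vector<float>& grad) {
      for (size_t i = 0; i < sum->size(); i++) (*sum)[i] += grad[i];
    }

    int main() {
      std::vector<float> share0{2.0f, 4.0f}, share1{6.0f, 8.0f};
      std::vector<float> sum = share0;
      SumShares(&sum, share1);      // {8, 12}
      float scale = 1.0f / 2;       // server side: 1.0f / num_total
      // The server-side update sees scale * sum, i.e. the average gradient.
      assert(scale * sum[0] == 4.0f && scale * sum[1] == 6.0f);
      return 0;
    }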

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/src/utils/blob.cc
----------------------------------------------------------------------
diff --git a/src/utils/blob.cc b/src/utils/blob.cc
index ef582fa..b27f7db 100644
--- a/src/utils/blob.cc
+++ b/src/utils/blob.cc
@@ -61,7 +61,9 @@
 #include "utils/blob.h"
 
 #include <cblas.h>
+#ifndef CPU_ONLY
 #include <cuda_runtime.h>
+#endif
 #include <math.h>
 #include <utility>
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cb30eadc/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 83bd818..7c09397 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -237,6 +237,10 @@ Msg* Param::GenUpdateMsg(bool copy, int idx) {
   msg->set_type(kUpdate);
   msg->AddFormatFrame("i", copy);
   void* ptr = grad_.mutable_cpu_data() + slice_offset_[idx];
+  // to change the head of SyncMem to cpu; otherwise, the updated parameter
+  // values would not be synced to gpu (since the head is at gpu).
+  mutable_cpu_data();
+  // LOG(ERROR) << id() << ptr;
   if (copy) {
     msg->AddFrame(ptr, slice_size_[idx]*sizeof(float));
   } else {
@@ -341,6 +345,7 @@ void Param::ParseUpdateMsgs(const vector<Msg*>& msgs) {
       }
     }
   }
+  // LOG(ERROR) << id() << server_grad;
   grad_.set_cpu_data(server_grad);
 }
 

