http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr_engine-inl.hpp ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_expr_engine-inl.hpp b/include/mshadow/tensor_expr_engine-inl.hpp new file mode 100644 index 0000000..9c5f2c7 --- /dev/null +++ b/include/mshadow/tensor_expr_engine-inl.hpp @@ -0,0 +1,416 @@ +#ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP +#define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP +/*! + * \file tensor_expr_engine-inl.hpp + * \brief definitions of how expressions should be evaluated + * \author Tianqi Chen, Bing Xu + */ +#include "tensor_expr.h" +#include "tensor.h" + +namespace mshadow{ + namespace expr{ + /*! + * \brief a general class that allows extension that makes tensors of some shape + * \tparam SubType type of subclass + * \tparam SrcExp source expression of the MakeTensorExp, the source of operation + * \tparam dim dimension of the expression + */ + template<typename SubType, typename SrcExp, int dim> + struct MakeTensorExp: public Exp< MakeTensorExp<SubType,SrcExp,dim>, type::kMapper >{ + /*! \brief the shape of this expression */ + Shape<dim> shape_; + /*! \brief true self of subtype */ + inline const SubType& real_self( void ) const{ + return *static_cast<const SubType*>(this); + } + }; + }; + + namespace expr{ + /*! \brief This part of code gives plan that can be used to carry out execution */ + template<typename ExpType> + class Plan{ + public: + /*! 
+ * \brief evaluate the expression at index [y][x] + * to be implemented by SubType + */ + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const; + }; + + template <typename Device, int dim> + class Plan< Tensor<Device,dim> >{ + public: + Plan( const Tensor<Device,dim> &t ) + :dptr_(t.dptr),stride_(t.shape.stride_){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ y * stride_ + x ]; + } + private: + const real_t *dptr_; + index_t stride_; + }; + // special evaluation case for 1d tensor + template <typename Device> + class Plan< Tensor<Device,1> >{ + public: + Plan( const Tensor<Device,1> &t ):dptr_(t.dptr){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ x ]; + } + private: + const real_t *dptr_; + }; + + template<> + class Plan<ScalarExp>{ + public: + Plan( real_t scalar ):scalar_(scalar){} + /*! \brief evaluate at [y][x] */ + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return scalar_; + } + private: + real_t scalar_; + }; + + template<typename OP, typename TA, typename TB,int etype> + class Plan< BinaryMapExp<OP,TA,TB,etype> >{ + public: + Plan( const Plan<TA> &lhs, const Plan<TB> &rhs ) + :lhs_(lhs), rhs_(rhs){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) ); + } + private: + Plan<TA> lhs_; + Plan<TB> rhs_; + }; + + template<typename OP, typename TA, int etype> + class Plan< UnaryMapExp<OP,TA,etype> >{ + public: + Plan( const Plan<TA> &src ):src_(src){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return OP::Map( src_.Eval( y, x ) ); + } + private: + Plan<TA> src_; + }; + + + template<typename SubType, typename SrcExp, int dim> + struct Plan< MakeTensorExp<SubType,SrcExp,dim> >{ + public: + Plan( const Plan<SubType> &src ):src_(src){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return src_.Eval( y, x ); + } + private: + Plan<SubType> src_; + }; + + // allow UnaryMap see the 
plan + template<typename OP, typename TA, typename TB, int etype> + inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e ); + + // translate from exp to execution plan + inline Plan<ScalarExp> MakePlan( const ScalarExp &e ){ + return Plan<ScalarExp>( e.scalar_ ); + } + + template<typename T> + inline Plan<T> MakePlan( const ContainerExp<T> &e ){ + return Plan<T>( e.self() ); + } + + template<typename T, typename SrcExp, int dim> + inline Plan< T > MakePlan( const MakeTensorExp<T,SrcExp,dim> &e ){ + return Plan< T >( e.real_self() ); + } + + template<typename OP, typename TA, int etype> + inline Plan< UnaryMapExp<OP,TA,etype> > MakePlan( const UnaryMapExp<OP,TA,etype> &e ){ + return Plan< UnaryMapExp<OP,TA,etype> >( MakePlan(e.src_) ); + } + + template<typename OP, typename TA, typename TB, int etype> + inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e ){ + return Plan< BinaryMapExp<OP,TA,TB,etype> >( MakePlan(e.lhs_), MakePlan(e.rhs_) ); + } + }; // namespace expr + + namespace expr{ + /*! + * \brief static type inference template, + * used to get the dimension of each expression, + * if ExpInfo<E>::kDim == -1, this means here are mismatch in expression + * if ( ExpInfo<E>::kDevMask & cpu::kDevMask ) != 0, this means this expression can be assigned to cpu + * \tparam E expression + */ + template<typename E> + struct ExpInfo{ + const static int kDim = -1; + const static int kDevMask = 0; + }; + template<> + struct ExpInfo<ScalarExp>{ + const static int kDim = 0; + const static int kDevMask = 0xffff; + }; + template<typename Device, int dim> + struct ExpInfo< Tensor<Device,dim> >{ + const static int kDim = dim; + const static int kDevMask = Device::kDevMask; + }; + template<typename T, typename SrcExp, int dim> + struct ExpInfo< MakeTensorExp<T,SrcExp,dim> >{ + const static int kDimSrc = ExpInfo<SrcExp>::kDim; + const static int kDim = kDimSrc >= 0 ? 
dim : -1; + const static int kDevMask = ExpInfo<SrcExp>::kDevMask; + }; + template<typename OP, typename TA, int etype> + struct ExpInfo< UnaryMapExp<OP,TA,etype> >{ + const static int kDim = ExpInfo<TA>::kDim; + const static int kDevMask = ExpInfo<TA>::kDevMask; + }; + template<typename OP, typename TA, typename TB, int etype> + struct ExpInfo< BinaryMapExp<OP,TA,TB,etype> >{ + const static int kDimLhs = ExpInfo<TA>::kDim; + const static int kDimRhs = ExpInfo<TB>::kDim; + const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \ + ( kDimLhs==0 ? kDimRhs : ( (kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1 ) ):-1; + const static int kDevMask = ExpInfo<TA>::kDevMask & ExpInfo<TB>::kDevMask; + }; + + /*! \brief template to do type check */ + template<typename Device, int dim, typename E> + struct TypeCheck{ + /*! \brief dimension of expression*/ + const static int kExpDim = ExpInfo<E>::kDim; + /*! \brief whether the expression device type matches */ + const static bool kDevPass = (ExpInfo<E>::kDevMask & Device::kDevMask) != 0; + /*! \brief whether the expression can be mapped to expression of dim */ + const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; + /*! 
\brief whether the expression can be reduced to expression of dim */ + const static bool kRedPass = (kExpDim > dim) && kDevPass; + }; + + template<bool kPass> + struct TypeCheckPass; + template<> + struct TypeCheckPass<false>{}; + template<> + struct TypeCheckPass<true>{ + inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){} + inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){} + inline static void Error_Expression_Does_Not_Meet_Dimension_Req( void ){} + }; + }; // namespace expr + + namespace expr{ + // check shape consistency + template<int dim,typename E> + struct ShapeCheck{ + inline static Shape<dim> Check( const E &t ); + }; + + template<int dim> + struct ShapeCheck<dim,ScalarExp>{ + inline static Shape<dim> Check( const ScalarExp &exp ){ + // use lowest dimension to mark scalar exp + Shape<dim> shape; shape[0] = 0; + return shape; + } + }; + template<int dim,typename Device> + struct ShapeCheck<dim,Tensor<Device,dim> >{ + inline static Shape<dim> Check( const Tensor<Device,dim> &t ){ + return t.shape; + } + }; + template<int dim,typename SrcExp,typename T> + struct ShapeCheck<dim,MakeTensorExp<T,SrcExp,dim> >{ + inline static Shape<dim> Check( const MakeTensorExp<T,SrcExp,dim> &t ){ + return t.shape_; + } + }; + template<int dim, typename OP, typename TA, int etype> + struct ShapeCheck< dim,UnaryMapExp<OP,TA,etype> >{ + inline static Shape<dim> Check( const UnaryMapExp<OP,TA,etype> &t ){ + Shape<dim> s = ShapeCheck<dim,TA>::Check( t.src_ ); + return s; + } + }; + template<int dim, typename OP, typename TA, typename TB, int etype> + struct ShapeCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{ + inline static Shape<dim> Check( const BinaryMapExp<OP,TA,TB,etype> &t ){ + Shape<dim> shape1 = ShapeCheck<dim,TA>::Check( t.lhs_ ); + Shape<dim> shape2 = ShapeCheck<dim,TB>::Check( t.rhs_ ); + if( shape1[0] == 0 ) return shape2; + if( shape2[0] == 0 ) return shape1; + utils::Assert( shape1 == shape2, "BinaryMapExp: Shapes of two tensors 
in BinaryMapExp expression is not the same"); + return shape1; + } + }; + }; // namespace expr + + // the matrix OP depends on BLAS + namespace expr{ + template<typename SV,typename Device, int ddim, int ldim, int rdim, bool ltrans, bool rtrans> + struct DotEngine{ + inline static void Eval( Tensor<Device,ddim> &dst, const Tensor<Device,ldim> &lhs, const Tensor<Device,rdim> &rhs, real_t scale ); + }; + + // handles the dot + template<typename Device> + struct BLASEngine; + + #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) + template<> + struct BLASEngine<cpu>{ + inline static CBLAS_TRANSPOSE GetT( bool t ){ + return t ? CblasTrans : CblasNoTrans; + } + inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, \ + const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ + cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, \ + const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ + cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ + const float *X, int incX, float beta, float *Y, int incY ){ + cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ + const double *X, int incX, double beta, double *Y, int incY ){ + cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ + cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + } + inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda 
){ + cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + } + }; + #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL + + #if MSHADOW_USE_CUDA + // All CuBLAS goes to here, use legacy API: not threadsafe + template<> + struct BLASEngine<gpu>{ + inline static char GetT( bool t ){ + return t ? 'T' : 'N'; + } + inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ + cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, + const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ + cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ + const float *X, int incX, float beta, float *Y, int incY ){ + cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ + const double *X, int incX, double beta, double *Y, int incY ){ + cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ + cublasSger(m,n,alpha,X,incX,Y,incY,A,lda); + } + inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){ + cublasDger(m,n,alpha,X,incX,Y,incY,A,lda); + } + }; + #endif + + // helper function to decide which shape we are in + inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){ + return transpose ? 
Shape2(shape[0],shape[1]) : shape; + } + // dst = dot( lhs[.T], rhs[.T] ) + template<typename SV, typename xpu, bool transpose_left, bool transpose_right> + struct DotEngine<SV,xpu,2,2,2,transpose_left,transpose_right>{ + inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,2> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) { + Shape<2> sleft = GetShape( lhs.shape, transpose_left ); + Shape<2> sright = GetShape( rhs.shape, transpose_right ); + utils::Assert( dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \ + && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch" ); + // use column major argument to compatible with most BLAS + BLASEngine<xpu>::gemm + ( transpose_right , transpose_left, + transpose_right ? rhs.shape[1] : rhs.shape[0], + transpose_left ? lhs.shape[0] : lhs.shape[1], + transpose_right ? rhs.shape[0] : rhs.shape[1], + scale * SV::kAlphaBLAS, + rhs.dptr, rhs.shape.stride_, + lhs.dptr, lhs.shape.stride_, + SV::kBetaBLAS, + dst.dptr, dst.shape.stride_ ); + } + }; + template<typename SV, typename xpu, bool transpose_right> + struct DotEngine<SV,xpu,1,1,2,false,transpose_right>{ + inline static void Eval( Tensor<xpu,1> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) { + Shape<2> sright = GetShape( rhs.shape, transpose_right ); + utils::Assert( dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch"); + BLASEngine<xpu>::gemv + ( transpose_right, + rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS, + rhs.dptr, rhs.shape.stride_, + lhs.dptr, 1, SV::kBetaBLAS, + dst.dptr, 1 ); + } + }; + template<typename SV, typename xpu> + struct DotEngine<SV,xpu,2,1,1,true,false>{ + inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,1> &rhs, real_t scale ) { + utils::Assert( dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch" ); + if( SV::kBetaBLAS < 1e-6f ){ + BLASEngine<xpu>::ger + ( rhs.shape[0], lhs.shape[0], 
scale * SV::kAlphaBLAS, + rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_ ); + }else{ + DotEngine<SV,xpu,2,2,2,true,false>::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale ); + } + } + }; + + }; // namespace expr + + namespace expr{ + /*! \brief some engine that evaluate complex expression */ + template<typename SV, typename Device, int dim, typename E> + struct ExpComplexEngine{ + inline static void Eval( Tensor<Device,dim>& dst, const E &exp ); + }; + template<typename SV, typename Device, int dim> + struct ExpEngine<SV, Tensor<Device,dim> >{ + template<typename E> + inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kMapper> &exp ){ + MapExp<SV,dim,E>( dst, exp ); + } + template<typename E> + inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kContainer> &exp ){ + MapExp<SV,dim,E>( dst, exp ); + } + template<typename E> + inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kComplex> &exp ){ + ExpComplexEngine<SV,Device,dim,E>::Eval( dst, exp.self() ); + } + }; + template<typename SV, typename Device, int dim, int ldim,int rdim,bool ltrans,bool rtrans> + struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > >{ + inline static void Eval( Tensor<Device,dim> &dst, const DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > &exp ){ + DotEngine<SV,Device,dim,ldim,rdim,ltrans,rtrans>::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ ); + } + }; + }; // namespace expr +}; +#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr_ext.h ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_expr_ext.h b/include/mshadow/tensor_expr_ext.h new file mode 100644 index 0000000..8399b1b --- /dev/null +++ b/include/mshadow/tensor_expr_ext.h @@ -0,0 +1,978 @@ +#ifndef MSHADOW_TENSOR_EXPR_EXT_H +#define MSHADOW_TENSOR_EXPR_EXT_H +/*! + * \file tensor_expr_ext.h + * \brief some extension of expressions, used to support something beyond elementwise op + * \author Tianqi Chen, Bing Xu + */ +#include "tensor_expr_engine-inl.hpp" +namespace mshadow{ + // Declaration of expressions goes here + namespace expr{ + /*! + * \brief broadcast Tensor1D into a higher dimension Tensor + * input: Tensor<Device,1>: ishape[0] + * output: Tensor<Device,dimdst> : oshape[dimcast] = ishape[0] + * \tparam Device which device it lies + * \tparam dimdst target tensor dimension + * \tparam dimcast the dimension where the 1D tensor fills in by index + */ + template<typename Device, int dimdst, int dimcast> + struct Broadcast1DExp: public MakeTensorExp< Broadcast1DExp<Device,dimdst,dimcast>,Tensor<Device,1>,dimdst>{ + /*! \brief source operand */ + const Tensor<Device,1> src_; + /*! \brief constructor */ + Broadcast1DExp( const Tensor<Device,1> &src, Shape<dimdst> shape ):src_(src){ + this->shape_ = shape; + } + }; + + /*! + * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution, this expression allow unpack of a batch + * this is a version support unpacking multiple images + * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: + * \tparam SrcExp source expression + * \tparam dstdim destination dimension + */ + template<typename SrcExp, int srcdim> + struct UnpackPatchToColXExp: public MakeTensorExp< UnpackPatchToColXExp<SrcExp,srcdim>, SrcExp, 2>{ + /*! 
\brief source operand */ + const SrcExp& img_; + /*! \brief patch size */ + index_t psize_; + /*! \brief patch stride */ + index_t pstride_; + /*! \brief number of input channel */ + index_t i_channel_; + /*! \brief height of img */ + index_t i_height_; + /*! \brief width of img */ + index_t i_width_; + /*! \brief constructor */ + UnpackPatchToColXExp( const SrcExp &img, index_t psize, index_t pstride ) + :img_(img), psize_(psize), pstride_(pstride){ + Shape<srcdim> imshape = ShapeCheck<srcdim,SrcExp>::Check( img_ ); + utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "UnpackPatchToCol:image shape smaller than patch size"); + this->i_channel_ = imshape[2]; + this->i_height_ = imshape[1]; + this->i_width_ = imshape[0]; + // calculate number of batches + const index_t num = imshape.ProdShape( 3, srcdim ); + const index_t o_height = ( i_height_ - psize ) / pstride + 1; + const index_t o_width = ( i_width_ - psize ) / pstride + 1; + this->shape_[0] = o_height * o_width * num; + this->shape_[1] = psize * psize * imshape[2]; + } + }; + + /*! + * \brief reverse operation of UnpackPatchToCol, used to backprop gradient back + * this is a version supporting multiple images + * \tparam Device which device it lies + * \tparam dstdim destination dimension + */ + template<typename Device, int dstdim> + struct PackColToPatchXExp: public MakeTensorExp< PackColToPatchXExp<Device,dstdim>, Tensor<Device,2>, dstdim>{ + /*! \brief source operand */ + const Tensor<Device,2>& mat_; + /*! \brief patch size */ + index_t psize_; + /*! \brief patch stride */ + index_t pstride_; + /*! 
\brief constructor */ + PackColToPatchXExp( const Tensor<Device,2> &mat, Shape<dstdim> imshape, index_t psize, index_t pstride ) + :mat_(mat), psize_(psize), pstride_(pstride){ + this->shape_ = imshape; + const index_t o_height = ( imshape[1] - psize ) / pstride + 1; + const index_t o_width = ( imshape[0] - psize ) / pstride + 1; + utils::Assert( mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch" ); + utils::Assert( mat.shape[1] == psize * psize * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch" ); + } + }; + + /*! + * \brief reshape the content to another shape + * input: Tensor<Device,dimsrc>: ishape + * output: Tensor<Device,dimdst> ishape.Size() == oshape.Size() + * \tparam SrcExp source expression + * \tparam dimdst target dimension + * \tparam dimsrc source dimension + */ + template<typename SrcExp, int dimdst, int dimsrc> + struct ReshapeExp: public MakeTensorExp< ReshapeExp<SrcExp,dimdst,dimsrc>, SrcExp, dimdst>{ + /*! \brief source expression */ + const SrcExp& src_; + /*! \brief smallest dimension of input */ + index_t ishape0_; + /*! \brief constructor */ + ReshapeExp( const SrcExp &src, Shape<dimdst> shape ):src_(src){ + Shape<dimsrc> ishape = ShapeCheck<dimsrc,SrcExp>::Check( src_ ); + utils::Assert( ishape.Size() == shape.Size(), "reshape size must match" ); + ishape0_ = ishape[0]; + this->shape_ = shape; + } + }; + + /*! + * \brief swap two axis of a tensor + * input: Tensor<Device,dim>: ishape + * output: Tensor<Device,dimdst> oshape[a1],oshape[a2] = ishape[a2],oshape[a1] + * + * \tparam SrcExp type of source expression + * \tparam dimsrc source dimension + * \tparam a1 smaller dimension to be swapped + * \tparam a2 larger dimension to be swapped + */ + template<typename SrcExp,int dimsrc, int a1, int a2> + struct SwapAxisExp: public MakeTensorExp< SwapAxisExp<SrcExp,dimsrc,a1,a2>, SrcExp, dimsrc>{ + /*! \brief source expression */ + const SrcExp& src_; + /*! 
\brief constructor */ + SwapAxisExp( const SrcExp &src ):src_(src){ + this->shape_ = ShapeCheck<dimsrc,SrcExp>::Check(src); + std::swap( this->shape_[a1], this->shape_[a2] ); + } + }; + + /*! + * \brief reduction to 1 dimension tensor + * input: Tensor<Device,k>: ishape + * output: Tensor<Device,1> shape[0] = ishape[dimkeep]; + * + * \tparam EType type of expression to be reduced + * \tparam Reducer which reducer to use + * \tparam srcdim dimension of source + * \tparam dimkeep which dimension to be kept, + */ + template<typename EType, typename Reducer,int dimkeep> + struct ReduceTo1DExp: public Exp< ReduceTo1DExp<EType,Reducer, dimkeep>, type::kComplex >{ + /*! \brief source operand */ + const EType& src_; + /*! \brief scale applied to the reduction result */ + real_t scale_; + /*! \brief construct a reduce-to-1D expression from src and scale */ + ReduceTo1DExp( const EType& src, real_t scale ):src_(src),scale_(scale){} + }; + + /*! + * \brief pooling expression, do reduction over local patches of an image + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ + template<typename Reducer, typename SrcExp, int srcdim> + struct PoolingExp: public MakeTensorExp< PoolingExp<Reducer, SrcExp,srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief kernel size */ + index_t ksize_; + /*! \brief kernel stride */ + index_t kstride_; + /*! \brief source height shape[1] */ + index_t src_height_; + /*! \brief source width shape[0] */ + index_t src_width_; + /*! 
\brief constructor */ + PoolingExp( const SrcExp &src, index_t ksize, index_t kstride ) + : src_(src), ksize_(ksize), kstride_(kstride) { + Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); + utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); + this->src_height_ = sshape[1]; + this->src_width_ = sshape[0]; + this->shape_ = sshape; + this->shape_[1] = (src_height_ - ksize) / kstride + 1; + this->shape_[0] = (src_width_ - ksize) / kstride + 1; + } + /*! \brief constructor, specify shape */ + PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize, index_t kstride ) + : src_(src), ksize_(ksize), kstride_(kstride) { + Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); + utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); + this->src_height_ = sshape[1]; + this->src_width_ = sshape[0]; + this->shape_ = sshape; + this->shape_[1] = pshape[1]; + this->shape_[0] = pshape[0]; + } + }; + + /*! + * \brief unpooling expr reverse operation of pooling, used to pass gradient back + * \tparam Reducer specifies reduction operation during pooling + * \tparam Device which device it lies + */ + template<typename Reducer, typename Device> + struct UnPoolingExp: public MakeTensorExp< UnPoolingExp<Reducer, Device>, Tensor<Device,4>, 4> { + /*! \brief source input, corresponds to src in pooling */ + const Tensor<Device, 4>& data_src_; + /*! \brief result of pooled data, corresponds to result of pooling */ + const Tensor<Device, 4>& data_pooled_; + /*! \brief gradient data of pooled part, to be propgate down */ + const Tensor<Device, 4>& grad_pooled_; + /*! \brief kernel size */ + index_t ksize_; + /*! \brief kernel stride */ + index_t kstride_; + /*! 
\brief constructor */ + UnPoolingExp( const Tensor<Device,4> &data_src, const Tensor<Device,4> &data_pooled, + const Tensor<Device,4> &grad_pooled, index_t ksize, index_t kstride ) + : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled), + ksize_(ksize), kstride_(kstride) { + utils::Assert( grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch" ); + utils::Assert( grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch" ); + utils::Assert( grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch" ); + this->shape_ = data_src_.shape; + } + }; + + /*! + * \brief padding expression, pad a image with zeros + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ + template<typename SrcExp, int srcdim> + struct PaddingExp : public MakeTensorExp<PaddingExp<SrcExp, srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief pad size */ + index_t pad_; + /*! \brief source tensor height */ + index_t src_height_; + /*! \brief source tensor width */ + index_t src_width_; + /*! \brief constructor */ + PaddingExp( const SrcExp &src, index_t pad ) + : src_(src), pad_(pad) { + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + src_height_ = this->shape_[1]; + src_width_ = this->shape_[0]; + this->shape_[1] += pad * 2; // height + this->shape_[0] += pad * 2; // width + } + }; + + /*! + * \brief crop expression, cut off the boundary region, reverse operation of padding + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ + template<typename SrcExp, int srcdim> + struct CroppingExp : public MakeTensorExp< CroppingExp<SrcExp, srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief pad height */ + index_t pad_height_; + /*! \brief pad height */ + index_t pad_width_; + /*! \brief src height */ + index_t src_height_; + /*! 
\brief constructor */ + CroppingExp(const SrcExp &src, Shape<2> cshape ): src_(src) { + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); + utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); + pad_height_ = (this->shape_[1] - cshape[1]) / 2; + pad_width_ = (this->shape_[0] - cshape[0]) / 2; + src_height_ = this->shape_[1]; + this->shape_[1] = cshape[1]; // height + this->shape_[0] = cshape[0]; // width + } + /*! \brief constructor */ + CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width ) + : src_(src), pad_height_(start_height), pad_width_(start_width) { + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); + utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); + src_height_ = this->shape_[1]; + this->shape_[1] = cshape[1]; // height + this->shape_[0] = cshape[0]; // width + } + + }; // struct CroppingExp + + + /*! + * \brief mirror expression, mirror an image in width + * \tparam SrcExp source expression to be mirrored + * \tparam srcdim dimension of src + */ + template<typename SrcExp, int srcdim> + struct MirroringExp : public MakeTensorExp<MirroringExp<SrcExp, srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief constructor */ + MirroringExp( const SrcExp &src ): src_(src) { + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + } + }; + + /*! 
+ * \brief channel pooling expression, do reduction over (local nearby) channels, used to implement local response normalization + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ + template<typename Reducer, typename SrcExp, int srcdim> + struct ChannelPoolingExp: public MakeTensorExp< ChannelPoolingExp<Reducer, SrcExp,srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief neighbor size */ + index_t nsize_; + /*! \brief constructor */ + ChannelPoolingExp( const SrcExp &src, index_t nsize ): src_(src), nsize_(nsize){ + utils::Assert( nsize % 2 == 1, "ChannelPoolingExp: local size must be odd, to make it symmetric" ); + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + utils::Assert( this->shape_[2] >= nsize_, "ChannelPoolingExp: local size need to be smaller than number of channels" ); + } + }; + }; // namespace expr + + + // Declaration of all functions go here + namespace expr{ + /*! \brief operator overload */ + template<typename E, typename R,int d> + inline ReduceTo1DExp<E,R,d> operator*( const ReduceTo1DExp<E,R,d> &e, real_t scale ){ + return ReduceTo1DExp<E,R,d>( e.src_, e.scale_*scale ); + } + /*! \brief operator overload */ + template<typename E, typename R,int d> + inline ReduceTo1DExp<E,R,d> operator*( real_t scale, const ReduceTo1DExp<E,R,d> &e ){ + return ReduceTo1DExp<E,R,d>( e.src_, e.scale_*scale ); + } + + /*! 
+ * \brief a expression that replicate a 1 dimension tensor in dimension dimcast + * \param src Tensor<Device,1>: shape[0] + * \param shape shape of output + * \return a expresion with type Tensor<Device,dimdst> + * \tparam dimcast target dimension where the 1D tensor will be broadcasted + * \tparam Device which device it lies + * \tparam dimdst dimension of destination tensor + */ + template<int dimcast,typename Device,int dimdst> + inline Broadcast1DExp<Device,dimdst,dimcast> broadcast( const Tensor<Device,1> &src, Shape<dimdst> shape ){ + TypeCheckPass< dimcast<dimdst >::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Assert( src.shape[0] == shape[dimcast], "broadcast, shape mismatch" ); + return Broadcast1DExp<Device,dimdst,dimcast>( src, shape ); + } + + /*! + * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution + * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: + * + * weight; shape[1]: out_channel, shape[0]: ichannel*psize*psize + * output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images + * out_height = ( in_height - psize ) / pstride + 1, this means we pad inperfect patch with 0 + * out_width = ( in_width - psize ) / pstride + 1 + * + * \return mat target matrix; shape[1]: in_channel*psize*psize shape[0]: out_height*out_width * num_of_images + * \param img source image; shape[2]: in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images) + * \param psize height and width of each patch + * \param pstride stride of each patch + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline UnpackPatchToColXExp<SrcExp, ExpInfo<SrcExp>::kDim > unpack_patch2col( const Exp<SrcExp,etype> &img, index_t psize, index_t pstride ){ + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + 
return UnpackPatchToColXExp<SrcExp, ExpInfo<SrcExp>::kDim >( img.self(), psize, pstride ); + } + + /*! + * \brief reverse operation of pack_col2patch, can be used to implement deconvolution + * \return packed img expression + * \param mat source matrix + * \param imshape shape of target img + * \param psize height and width of each patch + * \param pstride stride of each patch + * \tparam Device the Device where input data lies + */ + template<typename Device, int dstdim> + inline PackColToPatchXExp<Device,dstdim> pack_col2patch( const Tensor<Device,2> &mat, Shape<dstdim> imshape, index_t psize, index_t pstride ){ + utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "PackColToPatch:image shape smaller than patch size"); + return PackColToPatchXExp<Device,dstdim>( mat, imshape, psize, pstride ); + } + /*! + * \brief a expression that reshapes a tensor to another shape + * \param src Tensor<Device,dimsrc>: + * \param oshape target shape + * \return a expresion with type Tensor<Device,dimdst> + * \tparam SrcExp source expression + * \tparam etype source expression type + * \tparam dimdst target dimension + */ + template<typename SrcExp, int etype, int dimdst> + inline ReshapeExp< SrcExp,dimdst, ExpInfo<SrcExp>::kDim > reshape( const Exp<SrcExp,etype> &src, Shape<dimdst> oshape ){ + return ReshapeExp< SrcExp,dimdst, ExpInfo<SrcExp>::kDim >( src.self(), oshape ); + } + + /*! 
+ * \brief an expression that swaps two axes of a tensor + * \param src Tensor<Device,dimsrc>: + * \return an expression with type Tensor<Device,dimdst> + * \tparam a1 smaller dimension to be swapped + * \tparam a2 larger dimension to be swapped + * \tparam SrcExp source expression + * \tparam etype source expression type + */ + template<int a1, int a2, typename SrcExp, int etype> + inline SwapAxisExp< SrcExp, ExpInfo<SrcExp>::kDim, a1,a2> swapaxis( const Exp<SrcExp,etype> &src ){ + typedef ExpInfo<SrcExp> Info; + TypeCheckPass< Info::kDim>=a1+1 && Info::kDim >= a2+1 && a1+1 <= a2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return SwapAxisExp< SrcExp,Info::kDim,a1,a2>( src.self() ); + } + + /*! + * \brief a sum over all dimensions, except dimkeep + * \param exp input expression to be reduced + * \return an expression with type Tensor<Device,1> + * \tparam dimkeep the dimension that will be kept + * \tparam SrcExp expression + * \tparam etype type of expression + */ + template<int dimkeep, typename SrcExp, int etype> + inline ReduceTo1DExp<SrcExp, red::sum, dimkeep > sumall_except_dim( const Exp<SrcExp,etype> &exp ){ + return ReduceTo1DExp<SrcExp,red::sum,dimkeep>( exp.self(), 1.0f ); + } + + /*! + * \brief pooling subregion results together + * \param src source image, shape[3]: batch, shape[2]: channel shape[1]: height shape[0]:width + * \param ksize kernel size + * \param kstride stride for each kernel + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename Reducer, typename SrcExp, int etype> + inline PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp,etype> &src, index_t ksize, index_t kstride ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), ksize, kstride); + } + /*! 
+ * \brief same as pool, except the output shape is specified by pshape + * \param src source image + * \param pshape ouput shape + * \param ksize kernel size + * \param kstride stride for each kernel + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename Reducer, typename SrcExp, int etype> + inline PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp,etype> &src, Shape<2> pshape, index_t ksize, index_t kstride ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), pshape, ksize, kstride); + } + /*! + * \brief unpooling gradient for 4D, backprop gradient value back, revserse operation of pooling + * \param data_src source input, corresponds to src in pooling + * \param data_pooled result of pooled data, corresponds to result of pooling + * \param grad_pooled gradient data of pooled part, to be propgate down + * \param ksize kernel size + * \param kstride stride for each kernel + * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient + * \tparam Reducer reducer type + * \tparam Device device where data lies + */ + template<typename Reducer, typename Device> + inline UnPoolingExp<Reducer, Device> unpool( const Tensor<Device,4>&data_src, const Tensor<Device,4> &data_pooled, + const Tensor<Device,4> &grad_pooled, index_t ksize, index_t kstride ) { + return UnPoolingExp<Reducer, Device>(data_src, data_pooled, grad_pooled,ksize, kstride); + } + + /*! 
+ * \brief padding expression, pad an image with zeros on boundaries, padding affects shape[0], and shape[1] + * \param src original image batches + * \param pad padding size + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline PaddingExp<SrcExp, ExpInfo<SrcExp>::kDim> pad(const Exp<SrcExp, etype> &src, index_t pad) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PaddingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), pad); + } + + /*! + * \brief reverse operation of padding, cut off boundaries, crop output from center of input + * \param src original image batches + * \param oshape output shape to be cropped + * \return expression corresponding to cropped result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> crop( const Exp<SrcExp, etype> &src, Shape<2> oshape ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), oshape); + } + /*! 
+ * \brief same as crop, but can specify starting position to do cropping + * \param src original image batches + * \param oshape output shape to be cropped + * \param start_height start height position to do cropping + * \param start_width start width position to do cropping + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> crop( const Exp<SrcExp, etype> &src, Shape<2> oshape, index_t start_height, index_t start_width ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), oshape, start_height, start_width); + } + + /*! + * \brief mirroring expression, mirror images in width + * \param src original image batches + * \return expression corresponding to mirrored result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline MirroringExp<SrcExp, ExpInfo<SrcExp>::kDim> mirror(const Exp<SrcExp, etype> &src) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return MirroringExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self()); + } + + /*! 
+ * \brief channel pooling, do reduction over (local nearby) channels, used to implement local response normalization + * \param src source data + * \param nsize neighbor size + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename Reducer, typename SrcExp, int etype> + inline ChannelPoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > chpool( const Exp<SrcExp,etype> &src, index_t nsize ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelPoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(),nsize); + } + // short cut functions + /*! + * \brief a expression that replicate a 1 dimension tensor for nrow times + * \param src Tensor<Device,1>: shape[0] + * \param nrow number of rows to replicate + * \return a expresion with type Tensor<Device,2> shape[0], shape[1] = nrow + * \tparam Device which device it lies + */ + template<typename Device> + inline Broadcast1DExp<Device,2,0> repmat( const Tensor<Device,1> &src, index_t nrow ){ + return broadcast<0>( src, Shape2( nrow, src.shape[0] ) ); + } + /*! 
+ * \brief a expression that sum over rows of a matrix + * \param exp input expression that must be a matrix Tensor<?,2> + * \return a expresion with type Tensor<Device,1> + * \tparam SrcExp expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline ReduceTo1DExp<SrcExp, red::sum, 0 > sum_rows( const Exp<SrcExp,etype> &exp ){ + return sumall_except_dim<0>( exp ); + } + + }; // namespace expr +}; // namespace mshadow + +// ================================================== +// implementations afterwards, +// no need to read if only use the functions +// -------------------------------------------------- +namespace mshadow{ + namespace expr{ + template<typename SV, typename Device, typename EType, typename Reducer, int dimkeep> + struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp<EType,Reducer,dimkeep> >{ + inline static void Eval( Tensor<Device,1> &dst, const ReduceTo1DExp<EType,Reducer,dimkeep> &exp ){ + TypeCheckPass< dimkeep!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + MapReduceKeepHighDim<SV,Reducer,dimkeep>( dst, exp.src_, exp.scale_ ); + } + }; + + template<typename SV, typename Device, typename EType, typename Reducer> + struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp<EType,Reducer,0> >{ + inline static void Eval( Tensor<Device,1> &dst, const ReduceTo1DExp<EType,Reducer,0> &exp ){ + MapReduceKeepLowest<SV,Reducer>( dst, exp.src_, exp.scale_ ); + } + }; + }; // namespace expr + + namespace expr{ + /*! 
\brief execution plan of Broadcast1DExp */ + template<typename Device, int dimdst, int dimcast> + struct Plan< Broadcast1DExp<Device,dimdst,dimcast> >{ + public: + Plan( const Broadcast1DExp<Device,dimdst,dimcast> &e ) + : dptr_( e.src_.dptr ), + ystride_( e.shape_.ProdShape(1,dimcast) ), + length_(e.shape_[dimcast]){ + TypeCheckPass< dimcast!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + } + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ (y / ystride_) % length_ ]; + } + private: + const real_t *dptr_; + const index_t ystride_, length_; + }; + + /*! \brief execution plan of Broadcast1DExp */ + template<typename Device, int dimdst> + struct Plan< Broadcast1DExp<Device,dimdst,0> >{ + public: + Plan( const Broadcast1DExp<Device,dimdst,0> &e ): dptr_( e.src_.dptr ){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ x ]; + } + private: + const real_t *dptr_; + }; + }; // namespace expr + + namespace expr{ + template<typename SrcExp, int srcdim> + struct Plan< UnpackPatchToColXExp<SrcExp,srcdim> >{ + public: + Plan( const UnpackPatchToColXExp<SrcExp,srcdim> &e ) + :src_(MakePlan(e.img_)),psize_(e.psize_), pstride_(e.pstride_), + i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), + o_height_(( i_height_ - psize_ ) / pstride_ + 1), + o_width_ (( i_width_ - psize_ ) / pstride_ + 1){ + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + const index_t x_offset = i % psize_; + const index_t idivp = i / psize_; + const index_t y_offset = idivp % psize_; + const index_t c = idivp / psize_; + const index_t x = (j % o_width_) * pstride_ + x_offset; + const index_t jdivw = j / o_width_; + const index_t y = (jdivw % o_height_) * pstride_ + y_offset; + const index_t n = jdivw / o_height_; + + if( x < i_width_ && y < i_height_ ){ + return src_.Eval( ( n * i_channel_ + c ) * i_height_ + y, x ); + }else{ + return 0.0f; + } + } + private: + Plan<SrcExp> src_; + const index_t psize_, 
pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_; + }; + + template<typename Device, int dstdim> + struct Plan< PackColToPatchXExp<Device, dstdim> >{ + public: + Plan( const PackColToPatchXExp<Device, dstdim> &e ) + :mat_(e.mat_), psize_(e.psize_), pstride_(e.pstride_), + i_channel_(e.shape_[2]), i_height_(e.shape_[1]), + o_width_(( e.shape_[0] - psize_ ) / pstride_ + 1), + o_height_(( e.shape_[1] - psize_ ) / pstride_ + 1){ + // note: i/o convention are same as unpack + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + using namespace std; + const index_t y = i % i_height_; + const index_t idivh = i / i_height_; + const index_t c = idivh % i_channel_; + const index_t n = idivh / i_channel_; + const index_t x = j; + const index_t py_min = y < psize_ ? 0 : (y-psize_+pstride_)/pstride_; + const index_t px_min = x < psize_ ? 0 : (x-psize_+pstride_)/pstride_; + const index_t py_max = min( (y+pstride_)/pstride_, o_height_); + const index_t px_max = min( (x+pstride_)/pstride_, o_width_ ); + real_t res = 0.0f; + for( index_t py = py_min; py < py_max; ++py ){ + for( index_t px = px_min; px < px_max; ++px ){ + res += mat_[ (c * psize_ + y - py*pstride_) * psize_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ]; + } + } + return res; + } + private: + Tensor<Device,2> mat_; + const index_t psize_, pstride_, i_channel_, i_height_, o_width_, o_height_; + }; + }; + + namespace expr{ + template<typename SrcExp, int dimdst, int dimsrc> + struct Plan< ReshapeExp<SrcExp,dimdst,dimsrc> >{ + public: + Plan( const ReshapeExp<SrcExp,dimdst,dimsrc> &e ) + : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]), ishape0_(e.ishape0_){ + } + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + const index_t idx = y * oshape0_ + x; + return src_.Eval( idx / ishape0_, idx % ishape0_ ); + } + private: + Plan<SrcExp> src_; + const index_t oshape0_, ishape0_; + }; + // special work plan for 1 dimensional data + template<typename SrcExp,int dimdst> + struct 
Plan< ReshapeExp<SrcExp,dimdst,1> >{ + public: + Plan( const ReshapeExp<SrcExp,dimdst,1> &e ) + : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]){ + } + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return src_.Eval( 0, y * oshape0_ + x ); + } + private: + Plan<SrcExp> src_; + const index_t oshape0_; + }; + }; + + namespace expr{ + template<typename SrcExp,int dimsrc, int a1, int a2> + struct Plan< SwapAxisExp<SrcExp,dimsrc,a1,a2> >{ + public: + Plan( const SwapAxisExp<SrcExp,dimsrc,a1,a2> &e ) + : src_(MakePlan(e.src_)), + shape1_( e.shape_.ProdShape( 1, a1 ) ), + shape2_( e.shape_[a1] ), + shape3_( e.shape_.ProdShape( a1+1, a2 ) ), + shape4_( e.shape_[a2] ){ + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + const index_t y = i % shape1_; + i /= shape1_; + const index_t z = i % shape2_; + i /= shape2_; + const index_t c = i % shape3_; + i /= shape3_; + const index_t n = i % shape4_; + // swap z and n + return src_.Eval( ((((i/shape4_)*shape2_ + z) * shape3_+c) * shape4_ + n ) * shape1_ + y, j ); + } + private: + Plan<SrcExp> src_; + const index_t shape1_, shape2_, shape3_, shape4_; + }; + + template<typename SrcExp,int dimsrc, int a2> + struct Plan< SwapAxisExp<SrcExp,dimsrc,0,a2> >{ + public: + Plan( const SwapAxisExp<SrcExp,dimsrc,0,a2> &e ) + : src_(MakePlan(e.src_)), + shape0_( e.shape_[0] ), + shape1_( e.shape_.ProdShape(1,a2) ), + shape2_( e.shape_[a2] ){ + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t x ) const{ + // swap x and z + const index_t y = i % shape1_; + i /= shape1_; + const index_t z = i % shape2_; + const index_t n = i / shape2_; + return src_.Eval( ( n*shape0_ + x ) * shape1_ + y , z ); + } + private: + Plan<SrcExp> src_; + const index_t shape0_, shape1_, shape2_; + }; + }; + + namespace expr{ + template<typename Reducer, typename SrcExp, int srcdim> + struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > { + public: + Plan( const PoolingExp<Reducer, SrcExp, srcdim> &e ) + : src_( MakePlan( e.src_ ) ), 
ksize_(e.ksize_), kstride_(e.kstride_), + src_height_(e.src_height_),src_width_(e.src_width_), new_height_(e.shape_[1]) { + } + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + using namespace std; + const index_t py = i % new_height_; + const index_t y_start = py * kstride_; + const index_t y_end = min( y_start + ksize_, src_height_ ); + const index_t px = j; + const index_t x_start = px * kstride_; + const index_t x_end = min( x_start + ksize_, src_width_ ); + const index_t c = i / new_height_; + + real_t res = Reducer::kInitV; + for (index_t y = y_start; y < y_end; ++y) { + for (index_t x = x_start; x < x_end; ++x) { + Reducer::Reduce( res, src_.Eval( c*src_height_+y, x ) ); + } + } + return res; + } + private: + Plan<SrcExp> src_; + const index_t ksize_, kstride_; + const index_t src_height_, src_width_; + const index_t new_height_; + }; + + template<typename Reducer, typename Device> + struct Plan<UnPoolingExp<Reducer, Device> > { + public: + Plan(const UnPoolingExp<Reducer, Device> &e) + : data_src_(e.data_src_), data_pooled_(e.data_pooled_), grad_pooled_(e.grad_pooled_), + ksize_(e.ksize_), kstride_(e.kstride_) {} + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + using namespace std; + const index_t x = j; + const index_t y = i % data_src_.shape[1]; + const index_t c = i / data_src_.shape[1]; + const real_t vsrc = data_src_[0][c][y][x]; + + const index_t py_min = y < ksize_ ? 0 : (y-ksize_+kstride_)/kstride_; + const index_t px_min = x < ksize_ ? 
0 : (x-ksize_+kstride_)/kstride_; + const index_t py_max = min( (y+kstride_)/kstride_, data_pooled_.shape[1]); + const index_t px_max = min( (x+kstride_)/kstride_, data_pooled_.shape[0]); + + real_t val = 0; + for( index_t py = py_min; py < py_max; ++py ){ + for( index_t px = px_min; px < px_max; ++px ){ + val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px]; + } + } + return val; + } + private: + Tensor<Device, 4> data_src_, data_pooled_, grad_pooled_; + const index_t ksize_; + const index_t kstride_; + }; + }; // namespace expr + + namespace expr{ + template<typename SrcExp, int srcdim> + struct Plan< PaddingExp<SrcExp, srcdim> > { + public: + Plan(const PaddingExp<SrcExp, srcdim> &e) + : src_(MakePlan(e.src_)), pad_(e.pad_), new_height_(e.shape_[1]), + src_height_(e.src_height_), src_width_(e.src_width_) {} + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + if (y < pad_ || x < pad_) return 0.0f; + const index_t h = y - pad_; + const index_t w = x - pad_; + if (h < src_height_ && w < src_width_) { + return src_.Eval(c * src_height_ + h, w); + } else { + return 0.0f; + } + } + private: + Plan<SrcExp> src_; + const index_t pad_; + const index_t new_height_; + const index_t src_height_; + const index_t src_width_; + }; + + template<typename SrcExp, int srcdim> + struct Plan<CroppingExp<SrcExp, srcdim> > { + public: + Plan(const CroppingExp<SrcExp, srcdim> &e) + : src_(MakePlan(e.src_)), pad_height_(e.pad_height_),pad_width_(e.pad_width_), + new_height_(e.shape_[1]), src_height_(e.src_height_) {} + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + const index_t h = y + pad_height_; + const index_t w = x + pad_width_; + return src_.Eval(c * src_height_ + h, w); + } + private: + Plan<SrcExp> src_; + const index_t 
pad_height_, pad_width_; + const index_t new_height_; + const index_t src_height_; + }; + + template<typename SrcExp, int srcdim> + struct Plan< MirroringExp<SrcExp, srcdim> > { + public: + Plan(const MirroringExp<SrcExp, srcdim> &e) + : src_(MakePlan(e.src_)), width_(e.shape_[0]){} + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + return src_.Eval( i, width_ - j - 1 ); + } + private: + Plan<SrcExp> src_; + const index_t width_; + }; + }; // namespace expr + + namespace expr{ + template<typename Reducer, typename SrcExp, int srcdim> + struct Plan< ChannelPoolingExp< Reducer, SrcExp, srcdim> > { + public: + Plan( const ChannelPoolingExp<Reducer, SrcExp, srcdim> &e ) + : src_( MakePlan( e.src_ ) ), channel_(e.shape_[2]), + height_(e.shape_[1]),width_(e.shape_[0]), hnsize_(e.nsize_/2){ + } + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + using namespace std; + const index_t y = i % height_; + i /= height_; + const index_t c = i % channel_; + const index_t n = i / channel_; + const index_t x = j; + const index_t cstart = c < hnsize_ ? 
0 : c - hnsize_; + const index_t cend = min( c + hnsize_ + 1, channel_ ); + real_t res = Reducer::kInitV; + for( index_t cc = cstart; cc < cend; ++ cc ){ + Reducer::Reduce( res, src_.Eval( (n*channel_+cc)*height_ + y, x ) ); + } + return res; + } + private: + Plan<SrcExp> src_; + const index_t channel_, height_, width_, hnsize_; + }; + }; +}; // namespace mshadow + +#if MSHADOW_USE_SSE +// implementations of SSE support, if possible +#include "tensor_sse-inl.hpp" +namespace mshadow{ + namespace expr{ + template<int dimdst> + struct SSECheck< Broadcast1DExp<cpu,dimdst,0> >{ + const static bool kPass = true; + }; + template<int dimdst> + struct SSEAlignCheck<2, Broadcast1DExp<cpu,dimdst,0> >{ + inline static bool Check( const Broadcast1DExp<cpu,dimdst,0> &exp ){ + return sse2::CheckAlign( exp.src_.dptr ); + } + }; + template<int dimdst> + class SSEPlan< Broadcast1DExp<cpu,dimdst,0> >{ + public: + SSEPlan( const Broadcast1DExp<cpu,dimdst,0> &t ) + :dptr_(t.src_.dptr){} + MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{ + return sse2::FVec<real_t>( &dptr_[ x ] ); + } + MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ x ]; + } + private: + const real_t *dptr_; + }; + }; +}; +#endif + +#endif + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_gpu-inl.hpp ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_gpu-inl.hpp b/include/mshadow/tensor_gpu-inl.hpp new file mode 100644 index 0000000..a2c1fc4 --- /dev/null +++ b/include/mshadow/tensor_gpu-inl.hpp @@ -0,0 +1,148 @@ +#ifndef MSHADOW_TENSOR_GPU_INL_HPP +#define MSHADOW_TENSOR_GPU_INL_HPP +/*! 
+ * \file tensor_gpu-inl.hpp + * \brief implementation of GPU host code + * \author Bing Xu, Tianqi Chen + */ +#include "tensor.h" + +#if !(MSHADOW_USE_CUDA) +namespace mshadow { + // do nothing if no GPU operation is involved + inline void InitTensorEngine( int dev_id ){ + } + inline void ShutdownTensorEngine( void ){ + } +}; +#else +namespace mshadow { + #if (MSHADOW_USE_NVML) + inline int AutoSelectDevice(int device_count) { + // TODO nvml device id and cuda device id are not consistent + return 0; + } + #endif + inline void InitTensorEngine(int dev_id){ + cudaDeviceProp prop; + int device_id = 0; + int device_count = 0; + cudaGetDeviceCount(&device_count); + utils::Assert(device_count > 0, "Cannot find CUDA device. Please check CUDA-Configuration"); + if (dev_id < 0) { + #if (MSHADOW_USE_NVML) + device_id = AutoSelectDevice(device_count); + #endif + } else { + device_id = dev_id; + } + utils::Assert( device_id < device_count, "Incorrect Device ID" ); + utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" ); + cudaGetDeviceProperties(&prop, device_id); + printf("Use CUDA Device %d: %s\n", device_id, prop.name); + cublasInit(); + } + inline void ShutdownTensorEngine( void ){ + cublasShutdown(); + } + + template<int dim> + inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad){ + size_t pitch; + // common choice for cuda mem align unit is 32 + if( pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32 ){ + cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ + obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); + utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); + obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) ); + }else{ + obj.shape.stride_ = obj.shape[0]; + cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ + obj.shape.Size() * sizeof(real_t), 1 ); + utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); + } + } + + template<int dim> + inline void FreeSpace(Tensor<gpu,dim> 
&obj){ + cudaFree( obj.dptr ); obj.dptr = NULL; + } + + template<typename A,typename B, int dim> + inline void Copy(Tensor<A,dim> _dst, Tensor<B,dim> _src, cudaMemcpyKind kind){ + utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); + Tensor<A,2> dst = _dst.FlatTo2D(); + Tensor<B,2> src = _src.FlatTo2D(); + cudaError_t err = cudaMemcpy2D( dst.dptr, dst.shape.stride_ * sizeof(real_t), + src.dptr, src.shape.stride_ * sizeof(real_t), + dst.shape[0] * sizeof(real_t), + dst.shape[1], kind ); + utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); + } + template<int dim> + inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src){ + Copy( dst, src, cudaMemcpyDeviceToHost ); + } + template<int dim> + inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src){ + Copy( dst, src, cudaMemcpyDeviceToDevice ); + } + template<int dim> + inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src){ + Copy( dst, src, cudaMemcpyHostToDevice ); + } +}; + +#ifdef __CUDACC__ +// the following part is included only if compiler is nvcc +#include "cuda/tensor_gpu-inl.cuh" + +namespace mshadow{ + template<typename Saver, typename E, int dim> + inline void MapPlan(Tensor<gpu,dim> _dst, const expr::Plan<E> &plan){ + cuda::MapPlan<Saver>( _dst.FlatTo2D(), plan ); + } + + template<typename Saver, int dim, typename E, int etype> + inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp ){ + using namespace expr; + TypeCheckPass< TypeCheck<gpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() ); + utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); + MapPlan<Saver>( dst, MakePlan( exp.self() ) ); + } + + template<typename Saver, typename Reducer, typename E, int etype> + inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){ + using namespace 
expr; + TypeCheckPass< TypeCheck<gpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D(); + + utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); + utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); + cuda::MapReduceKeepLowest<Saver,Reducer>( dst, MakePlan( exp.self() ), scale, eshape ); + } + + template<typename Saver, typename Reducer, int dimkeep, typename E, int etype> + inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){ + using namespace expr; + TypeCheckPass< TypeCheck<gpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape< ExpInfo<E>::kDim > EShape; + EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ); + utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); + // use equvalent form + Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], + eshape.ProdShape(1,dimkeep), eshape[0] ); + // call equavalent map red dim 2 + cuda::MapReduceKeepDim2<Saver,Reducer>( dst, MakePlan( exp.self() ), scale, pshape ); + } + + inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2>& src ){ + cuda::Softmax( dst, src ); + } +}; // namespace mshadow + +#endif // __CUDACC__ + +#endif // MSHADOW_USE_CUDA +#endif // TENSOR_GPU_INL_HPP http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_io.h ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_io.h b/include/mshadow/tensor_io.h new file mode 100644 index 0000000..2ce28b3 --- /dev/null +++ b/include/mshadow/tensor_io.h @@ -0,0 +1,137 @@ +#ifndef MSHADOW_TENSOR_IO_H +#define MSHADOW_TENSOR_IO_H +/*! 
+ * \file tensor_io.h + * \brief definitions of I/O functions for mshadow tensor + * \author Tianqi Chen + */ +#include <cstdio> +#include "tensor.h" + +namespace mshadow{ + namespace utils{ + /*! + * \brief interface of stream I/O, used to serialize data, + * it is not restricted to only this interface in SaveBinary/LoadBinary + * mshadow accept all class that implements Read and Write + */ + class IStream{ + public: + /*! + * \brief read data from stream + * \param ptr pointer to memory buffer + * \param size size of block + * \return usually is the size of data readed + */ + virtual size_t Read( void *ptr, size_t size ) = 0; + /*! + * \brief write data to stream + * \param ptr pointer to memory buffer + * \param size size of block + */ + virtual void Write( const void *ptr, size_t size ) = 0; + /*! \brief virtual destructor */ + virtual ~IStream( void ){} + }; + }; + + /*! + * \brief CPU/GPU: save a tensor by binary format, for GPU version, a temp Tensor<cpu,dim> storage will be allocated + * \param fo output binary stream + * \param src source data file + * \tparam dim dimension of tensor + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template<int dim,typename TStream> + inline void SaveBinary( TStream &fo, const Tensor<cpu,dim> &src ); + /*! \brief refer to comment of cpu ver \sa SaveBinary */ + template<int dim,typename TStream> + inline void SaveBinary( TStream &fo, const Tensor<gpu,dim> &src ); + + /*! 
+ * \brief CPU/GPU: load a tensor by binary format, for GPU version, a temp Tensor<cpu,dim> storage will be allocated + * if pre_alloc is true , then space in dst is preallocated, and must have same shape of the tensor loaded + * if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst + * \param fi output binary stream + * \param dst destination file + * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen + * \tparam dim dimension of tensor + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template<int dim,typename TStream> + inline void LoadBinary( TStream &fi, Tensor<cpu,dim> &dst, bool pre_alloc ); + /*! \brief refer to comment of cpu ver \sa LoadBinary */ + template<int dim,typename TStream> + inline void LoadBinary( TStream &fi, Tensor<gpu,dim> &dst, bool pre_alloc ); + + namespace utils{ + /*! \brief implementation of file i/o stream */ + class FileStream: public IStream{ + public: + /*! \brief constructor */ + FileStream( FILE *fp ):fp_(fp){} + virtual size_t Read( void *ptr, size_t size ){ + return fread( ptr, size, 1, fp_ ); + } + virtual void Write( const void *ptr, size_t size ){ + fwrite( ptr, size, 1, fp_ ); + } + /*! 
\brief close file */ + inline void Close( void ){ + fclose( fp_ ); + } + private: + FILE *fp_; + }; + }; +}; + +namespace mshadow{ + // implementations + template<int dim, typename TStream> + inline void SaveBinary( TStream &fo, const Tensor<cpu,dim> &src_ ){ + fo.Write( src_.shape.shape_, sizeof(index_t) * dim ); + Tensor<cpu,2> src = src_.FlatTo2D(); + for( index_t i = 0; i < src.shape[1]; ++ i ){ + fo.Write( src[i].dptr, sizeof(real_t)*src.shape[0] ); + } + } + template<int dim, typename TStream> + inline void SaveBinary( TStream &fo, const Tensor<gpu,dim> &src ){ + // copy to CPU, then save + Tensor<cpu,dim> tmp( src.shape ); + AllocSpace( tmp ); + Copy( tmp, src ); + SaveBinary( fo, tmp ); + FreeSpace( tmp ); + } + + template<int dim, typename TStream> + inline void LoadBinary( TStream &fi, Tensor<cpu,dim> &dst_, bool pre_alloc ){ + Shape<dim> shape; + utils::Assert( fi.Read( shape.shape_, sizeof(index_t) * dim ) != 0, "mshadow::LoadBinary" ); + if( pre_alloc ){ + utils::Assert( shape == dst_.shape ); + }else{ + dst_.shape = shape; AllocSpace( dst_ ); + } + Tensor<cpu,2> dst = dst_.FlatTo2D(); + if( dst.shape[0] == 0 ) return; + for( index_t i = 0; i < dst.shape[1]; ++ i ){ + utils::Assert( fi.Read( dst[i].dptr, sizeof(real_t)*dst.shape[0] ) != 0, "mshadow::LoadBinary" ); + } + } + template<int dim, typename TStream> + inline void LoadBinary( TStream &fi, Tensor<gpu,dim> &dst, bool pre_alloc ){ + Tensor<cpu,dim> tmp; + LoadBinary( fi, tmp, false ); + if( pre_alloc ){ + utils::Assert( tmp.shape == dst.shape ); + }else{ + dst.shape = tmp.shape; AllocSpace( dst ); + } + Copy( dst, tmp ); + FreeSpace( tmp ); + } +}; +#endif // TENSOR_IO_H
