http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr_engine-inl.hpp ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_expr_engine-inl.hpp b/include/mshadow/tensor_expr_engine-inl.hpp new file mode 100644 index 0000000..9c5f2c7 --- /dev/null +++ b/include/mshadow/tensor_expr_engine-inl.hpp @@ -0,0 +1,416 @@ +#ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP +#define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP +/*! + * \file tensor_expr_engine-inl.hpp + * \brief definitions of how expressions should be evaluated + * \author Tianqi Chen, Bing Xu + */ +#include "tensor_expr.h" +#include "tensor.h" + +namespace mshadow{ + namespace expr{ + /*! + * \brief a general class that allows extension that makes tensors of some shape + * \tparam SubType type of subclass + * \tparam SrcExp source expression of the MakeTensorExp, the source of operation + * \tparam dim dimension of the expression + */ + template<typename SubType, typename SrcExp, int dim> + struct MakeTensorExp: public Exp< MakeTensorExp<SubType,SrcExp,dim>, type::kMapper >{ + /*! \brief the shape of this expression */ + Shape<dim> shape_; + /*! \brief true self of subtype */ + inline const SubType& real_self( void ) const{ + return *static_cast<const SubType*>(this); + } + }; + }; + + namespace expr{ + /*! \brief This part of code gives plan that can be used to carry out execution */ + template<typename ExpType> + class Plan{ + public: + /*! 
+ * \brief evaluate the expression at index [y][x] + * to be implemented by SubType + */ + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const; + }; + + template <typename Device, int dim> + class Plan< Tensor<Device,dim> >{ + public: + Plan( const Tensor<Device,dim> &t ) + :dptr_(t.dptr),stride_(t.shape.stride_){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ y * stride_ + x ]; + } + private: + const real_t *dptr_; + index_t stride_; + }; + // special evaluation case for 1d tensor + template <typename Device> + class Plan< Tensor<Device,1> >{ + public: + Plan( const Tensor<Device,1> &t ):dptr_(t.dptr){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ x ]; + } + private: + const real_t *dptr_; + }; + + template<> + class Plan<ScalarExp>{ + public: + Plan( real_t scalar ):scalar_(scalar){} + /*! \brief evaluate at [y][x] */ + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return scalar_; + } + private: + real_t scalar_; + }; + + template<typename OP, typename TA, typename TB,int etype> + class Plan< BinaryMapExp<OP,TA,TB,etype> >{ + public: + Plan( const Plan<TA> &lhs, const Plan<TB> &rhs ) + :lhs_(lhs), rhs_(rhs){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) ); + } + private: + Plan<TA> lhs_; + Plan<TB> rhs_; + }; + + template<typename OP, typename TA, int etype> + class Plan< UnaryMapExp<OP,TA,etype> >{ + public: + Plan( const Plan<TA> &src ):src_(src){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return OP::Map( src_.Eval( y, x ) ); + } + private: + Plan<TA> src_; + }; + + + template<typename SubType, typename SrcExp, int dim> + struct Plan< MakeTensorExp<SubType,SrcExp,dim> >{ + public: + Plan( const Plan<SubType> &src ):src_(src){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return src_.Eval( y, x ); + } + private: + Plan<SubType> src_; + }; + + // allow UnaryMap see the 
plan + template<typename OP, typename TA, typename TB, int etype> + inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e ); + + // translate from exp to execution plan + inline Plan<ScalarExp> MakePlan( const ScalarExp &e ){ + return Plan<ScalarExp>( e.scalar_ ); + } + + template<typename T> + inline Plan<T> MakePlan( const ContainerExp<T> &e ){ + return Plan<T>( e.self() ); + } + + template<typename T, typename SrcExp, int dim> + inline Plan< T > MakePlan( const MakeTensorExp<T,SrcExp,dim> &e ){ + return Plan< T >( e.real_self() ); + } + + template<typename OP, typename TA, int etype> + inline Plan< UnaryMapExp<OP,TA,etype> > MakePlan( const UnaryMapExp<OP,TA,etype> &e ){ + return Plan< UnaryMapExp<OP,TA,etype> >( MakePlan(e.src_) ); + } + + template<typename OP, typename TA, typename TB, int etype> + inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e ){ + return Plan< BinaryMapExp<OP,TA,TB,etype> >( MakePlan(e.lhs_), MakePlan(e.rhs_) ); + } + }; // namespace expr + + namespace expr{ + /*! + * \brief static type inference template, + * used to get the dimension of each expression, + * if ExpInfo<E>::kDim == -1, this means here are mismatch in expression + * if ( ExpInfo<E>::kDevMask & cpu::kDevMask ) != 0, this means this expression can be assigned to cpu + * \tparam E expression + */ + template<typename E> + struct ExpInfo{ + const static int kDim = -1; + const static int kDevMask = 0; + }; + template<> + struct ExpInfo<ScalarExp>{ + const static int kDim = 0; + const static int kDevMask = 0xffff; + }; + template<typename Device, int dim> + struct ExpInfo< Tensor<Device,dim> >{ + const static int kDim = dim; + const static int kDevMask = Device::kDevMask; + }; + template<typename T, typename SrcExp, int dim> + struct ExpInfo< MakeTensorExp<T,SrcExp,dim> >{ + const static int kDimSrc = ExpInfo<SrcExp>::kDim; + const static int kDim = kDimSrc >= 0 ? 
dim : -1; + const static int kDevMask = ExpInfo<SrcExp>::kDevMask; + }; + template<typename OP, typename TA, int etype> + struct ExpInfo< UnaryMapExp<OP,TA,etype> >{ + const static int kDim = ExpInfo<TA>::kDim; + const static int kDevMask = ExpInfo<TA>::kDevMask; + }; + template<typename OP, typename TA, typename TB, int etype> + struct ExpInfo< BinaryMapExp<OP,TA,TB,etype> >{ + const static int kDimLhs = ExpInfo<TA>::kDim; + const static int kDimRhs = ExpInfo<TB>::kDim; + const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \ + ( kDimLhs==0 ? kDimRhs : ( (kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1 ) ):-1; + const static int kDevMask = ExpInfo<TA>::kDevMask & ExpInfo<TB>::kDevMask; + }; + + /*! \brief template to do type check */ + template<typename Device, int dim, typename E> + struct TypeCheck{ + /*! \brief dimension of expression*/ + const static int kExpDim = ExpInfo<E>::kDim; + /*! \brief whether the expression device type matches */ + const static bool kDevPass = (ExpInfo<E>::kDevMask & Device::kDevMask) != 0; + /*! \brief whether the expression can be mapped to expression of dim */ + const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass; + /*! 
\brief whether the expression can be reduced to expression of dim */ + const static bool kRedPass = (kExpDim > dim) && kDevPass; + }; + + template<bool kPass> + struct TypeCheckPass; + template<> + struct TypeCheckPass<false>{}; + template<> + struct TypeCheckPass<true>{ + inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){} + inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){} + inline static void Error_Expression_Does_Not_Meet_Dimension_Req( void ){} + }; + }; // namespace expr + + namespace expr{ + // check shape consistency + template<int dim,typename E> + struct ShapeCheck{ + inline static Shape<dim> Check( const E &t ); + }; + + template<int dim> + struct ShapeCheck<dim,ScalarExp>{ + inline static Shape<dim> Check( const ScalarExp &exp ){ + // use lowest dimension to mark scalar exp + Shape<dim> shape; shape[0] = 0; + return shape; + } + }; + template<int dim,typename Device> + struct ShapeCheck<dim,Tensor<Device,dim> >{ + inline static Shape<dim> Check( const Tensor<Device,dim> &t ){ + return t.shape; + } + }; + template<int dim,typename SrcExp,typename T> + struct ShapeCheck<dim,MakeTensorExp<T,SrcExp,dim> >{ + inline static Shape<dim> Check( const MakeTensorExp<T,SrcExp,dim> &t ){ + return t.shape_; + } + }; + template<int dim, typename OP, typename TA, int etype> + struct ShapeCheck< dim,UnaryMapExp<OP,TA,etype> >{ + inline static Shape<dim> Check( const UnaryMapExp<OP,TA,etype> &t ){ + Shape<dim> s = ShapeCheck<dim,TA>::Check( t.src_ ); + return s; + } + }; + template<int dim, typename OP, typename TA, typename TB, int etype> + struct ShapeCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{ + inline static Shape<dim> Check( const BinaryMapExp<OP,TA,TB,etype> &t ){ + Shape<dim> shape1 = ShapeCheck<dim,TA>::Check( t.lhs_ ); + Shape<dim> shape2 = ShapeCheck<dim,TB>::Check( t.rhs_ ); + if( shape1[0] == 0 ) return shape2; + if( shape2[0] == 0 ) return shape1; + utils::Assert( shape1 == shape2, "BinaryMapExp: Shapes of two tensors 
in BinaryMapExp expression is not the same"); + return shape1; + } + }; + }; // namespace expr + + // the matrix OP depends on BLAS + namespace expr{ + template<typename SV,typename Device, int ddim, int ldim, int rdim, bool ltrans, bool rtrans> + struct DotEngine{ + inline static void Eval( Tensor<Device,ddim> &dst, const Tensor<Device,ldim> &lhs, const Tensor<Device,rdim> &rhs, real_t scale ); + }; + + // handles the dot + template<typename Device> + struct BLASEngine; + + #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL) + template<> + struct BLASEngine<cpu>{ + inline static CBLAS_TRANSPOSE GetT( bool t ){ + return t ? CblasTrans : CblasNoTrans; + } + inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, \ + const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ + cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, \ + const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ + cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ + const float *X, int incX, float beta, float *Y, int incY ){ + cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ + const double *X, int incX, double beta, double *Y, int incY ){ + cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ + cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + } + inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda 
){ + cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda); + } + }; + #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL + + #if MSHADOW_USE_CUDA + // All CuBLAS goes to here, use legacy API: not threadsafe + template<> + struct BLASEngine<gpu>{ + inline static char GetT( bool t ){ + return t ? 'T' : 'N'; + } + inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){ + cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, + const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){ + cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); + } + inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \ + const float *X, int incX, float beta, float *Y, int incY ){ + cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \ + const double *X, int incX, double beta, double *Y, int incY ){ + cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY); + } + inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){ + cublasSger(m,n,alpha,X,incX,Y,incY,A,lda); + } + inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){ + cublasDger(m,n,alpha,X,incX,Y,incY,A,lda); + } + }; + #endif + + // helper function to decide which shape we are in + inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){ + return transpose ? 
Shape2(shape[0],shape[1]) : shape; + } + // dst = dot( lhs[.T], rhs[.T] ) + template<typename SV, typename xpu, bool transpose_left, bool transpose_right> + struct DotEngine<SV,xpu,2,2,2,transpose_left,transpose_right>{ + inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,2> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) { + Shape<2> sleft = GetShape( lhs.shape, transpose_left ); + Shape<2> sright = GetShape( rhs.shape, transpose_right ); + utils::Assert( dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \ + && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch" ); + // use column major argument to compatible with most BLAS + BLASEngine<xpu>::gemm + ( transpose_right , transpose_left, + transpose_right ? rhs.shape[1] : rhs.shape[0], + transpose_left ? lhs.shape[0] : lhs.shape[1], + transpose_right ? rhs.shape[0] : rhs.shape[1], + scale * SV::kAlphaBLAS, + rhs.dptr, rhs.shape.stride_, + lhs.dptr, lhs.shape.stride_, + SV::kBetaBLAS, + dst.dptr, dst.shape.stride_ ); + } + }; + template<typename SV, typename xpu, bool transpose_right> + struct DotEngine<SV,xpu,1,1,2,false,transpose_right>{ + inline static void Eval( Tensor<xpu,1> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) { + Shape<2> sright = GetShape( rhs.shape, transpose_right ); + utils::Assert( dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch"); + BLASEngine<xpu>::gemv + ( transpose_right, + rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS, + rhs.dptr, rhs.shape.stride_, + lhs.dptr, 1, SV::kBetaBLAS, + dst.dptr, 1 ); + } + }; + template<typename SV, typename xpu> + struct DotEngine<SV,xpu,2,1,1,true,false>{ + inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,1> &rhs, real_t scale ) { + utils::Assert( dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch" ); + if( SV::kBetaBLAS < 1e-6f ){ + BLASEngine<xpu>::ger + ( rhs.shape[0], lhs.shape[0], 
scale * SV::kAlphaBLAS, + rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_ ); + }else{ + DotEngine<SV,xpu,2,2,2,true,false>::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale ); + } + } + }; + + }; // namespace expr + + namespace expr{ + /*! \brief some engine that evaluate complex expression */ + template<typename SV, typename Device, int dim, typename E> + struct ExpComplexEngine{ + inline static void Eval( Tensor<Device,dim>& dst, const E &exp ); + }; + template<typename SV, typename Device, int dim> + struct ExpEngine<SV, Tensor<Device,dim> >{ + template<typename E> + inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kMapper> &exp ){ + MapExp<SV,dim,E>( dst, exp ); + } + template<typename E> + inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kContainer> &exp ){ + MapExp<SV,dim,E>( dst, exp ); + } + template<typename E> + inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kComplex> &exp ){ + ExpComplexEngine<SV,Device,dim,E>::Eval( dst, exp.self() ); + } + }; + template<typename SV, typename Device, int dim, int ldim,int rdim,bool ltrans,bool rtrans> + struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > >{ + inline static void Eval( Tensor<Device,dim> &dst, const DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > &exp ){ + DotEngine<SV,Device,dim,ldim,rdim,ltrans,rtrans>::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ ); + } + }; + }; // namespace expr +}; +#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr_ext.h ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_expr_ext.h b/include/mshadow/tensor_expr_ext.h new file mode 100644 index 0000000..8399b1b --- /dev/null +++ b/include/mshadow/tensor_expr_ext.h @@ -0,0 +1,978 @@ +#ifndef MSHADOW_TENSOR_EXPR_EXT_H +#define MSHADOW_TENSOR_EXPR_EXT_H +/*! + * \file tensor_expr_ext.h + * \brief some extension of expressions, used to support something beyond elementwise op + * \author Tianqi Chen, Bing Xu + */ +#include "tensor_expr_engine-inl.hpp" +namespace mshadow{ + // Declaration of expressions goes here + namespace expr{ + /*! + * \brief broadcast Tensor1D into a higher dimension Tensor + * input: Tensor<Device,1>: ishape[0] + * output: Tensor<Device,dimdst> : oshape[dimcast] = ishape[0] + * \tparam Device which device it lies + * \tparam dimdst target tensor dimension + * \tparam dimcast the dimension where the 1D tensor fills in by index + */ + template<typename Device, int dimdst, int dimcast> + struct Broadcast1DExp: public MakeTensorExp< Broadcast1DExp<Device,dimdst,dimcast>,Tensor<Device,1>,dimdst>{ + /*! \brief source operand */ + const Tensor<Device,1> src_; + /*! \brief constructor */ + Broadcast1DExp( const Tensor<Device,1> &src, Shape<dimdst> shape ):src_(src){ + this->shape_ = shape; + } + }; + + /*! + * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution, this expression allow unpack of a batch + * this is a version support unpacking multiple images + * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: + * \tparam SrcExp source expression + * \tparam dstdim destination dimension + */ + template<typename SrcExp, int srcdim> + struct UnpackPatchToColXExp: public MakeTensorExp< UnpackPatchToColXExp<SrcExp,srcdim>, SrcExp, 2>{ + /*! 
\brief source operand */ + const SrcExp& img_; + /*! \brief patch size */ + index_t psize_; + /*! \brief patch stride */ + index_t pstride_; + /*! \brief number of input channel */ + index_t i_channel_; + /*! \brief height of img */ + index_t i_height_; + /*! \brief width of img */ + index_t i_width_; + /*! \brief constructor */ + UnpackPatchToColXExp( const SrcExp &img, index_t psize, index_t pstride ) + :img_(img), psize_(psize), pstride_(pstride){ + Shape<srcdim> imshape = ShapeCheck<srcdim,SrcExp>::Check( img_ ); + utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "UnpackPatchToCol:image shape smaller than patch size"); + this->i_channel_ = imshape[2]; + this->i_height_ = imshape[1]; + this->i_width_ = imshape[0]; + // calculate number of batches + const index_t num = imshape.ProdShape( 3, srcdim ); + const index_t o_height = ( i_height_ - psize ) / pstride + 1; + const index_t o_width = ( i_width_ - psize ) / pstride + 1; + this->shape_[0] = o_height * o_width * num; + this->shape_[1] = psize * psize * imshape[2]; + } + }; + + /*! + * \brief reverse operation of UnpackPatchToCol, used to backprop gradient back + * this is a version supporting multiple images + * \tparam Device which device it lies + * \tparam dstdim destination dimension + */ + template<typename Device, int dstdim> + struct PackColToPatchXExp: public MakeTensorExp< PackColToPatchXExp<Device,dstdim>, Tensor<Device,2>, dstdim>{ + /*! \brief source operand */ + const Tensor<Device,2>& mat_; + /*! \brief patch size */ + index_t psize_; + /*! \brief patch stride */ + index_t pstride_; + /*! 
\brief constructor */ + PackColToPatchXExp( const Tensor<Device,2> &mat, Shape<dstdim> imshape, index_t psize, index_t pstride ) + :mat_(mat), psize_(psize), pstride_(pstride){ + this->shape_ = imshape; + const index_t o_height = ( imshape[1] - psize ) / pstride + 1; + const index_t o_width = ( imshape[0] - psize ) / pstride + 1; + utils::Assert( mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch" ); + utils::Assert( mat.shape[1] == psize * psize * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch" ); + } + }; + + /*! + * \brief reshape the content to another shape + * input: Tensor<Device,dimsrc>: ishape + * output: Tensor<Device,dimdst> ishape.Size() == oshape.Size() + * \tparam SrcExp source expression + * \tparam dimdst target dimension + * \tparam dimsrc source dimension + */ + template<typename SrcExp, int dimdst, int dimsrc> + struct ReshapeExp: public MakeTensorExp< ReshapeExp<SrcExp,dimdst,dimsrc>, SrcExp, dimdst>{ + /*! \brief source expression */ + const SrcExp& src_; + /*! \brief smallest dimension of input */ + index_t ishape0_; + /*! \brief constructor */ + ReshapeExp( const SrcExp &src, Shape<dimdst> shape ):src_(src){ + Shape<dimsrc> ishape = ShapeCheck<dimsrc,SrcExp>::Check( src_ ); + utils::Assert( ishape.Size() == shape.Size(), "reshape size must match" ); + ishape0_ = ishape[0]; + this->shape_ = shape; + } + }; + + /*! + * \brief swap two axis of a tensor + * input: Tensor<Device,dim>: ishape + * output: Tensor<Device,dimdst> oshape[a1],oshape[a2] = ishape[a2],oshape[a1] + * + * \tparam SrcExp type of source expression + * \tparam dimsrc source dimension + * \tparam a1 smaller dimension to be swapped + * \tparam a2 larger dimension to be swapped + */ + template<typename SrcExp,int dimsrc, int a1, int a2> + struct SwapAxisExp: public MakeTensorExp< SwapAxisExp<SrcExp,dimsrc,a1,a2>, SrcExp, dimsrc>{ + /*! \brief source expression */ + const SrcExp& src_; + /*! 
\brief constructor */ + SwapAxisExp( const SrcExp &src ):src_(src){ + this->shape_ = ShapeCheck<dimsrc,SrcExp>::Check(src); + std::swap( this->shape_[a1], this->shape_[a2] ); + } + }; + + /*! + * \brief reduction to 1 dimension tensor + * input: Tensor<Device,k>: ishape + * output: Tensor<Device,1> shape[0] = ishape[dimkeep]; + * + * \tparam EType type of expression to be reduced + * \tparam Reducer which reducer to use + * \tparam srcdim dimension of source + * \tparam dimkeep which dimension to be kept, + */ + template<typename EType, typename Reducer,int dimkeep> + struct ReduceTo1DExp: public Exp< ReduceTo1DExp<EType,Reducer, dimkeep>, type::kComplex >{ + /*! \brief source operand */ + const EType& src_; + /*! \brief scale applied to the reduction result */ + real_t scale_; + /*! \brief construct a reduce-to-1D expression from src and scale */ + ReduceTo1DExp( const EType& src, real_t scale ):src_(src),scale_(scale){} + }; + + /*! + * \brief pooling expression, do reduction over local patches of an image + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ + template<typename Reducer, typename SrcExp, int srcdim> + struct PoolingExp: public MakeTensorExp< PoolingExp<Reducer, SrcExp,srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief kernel size */ + index_t ksize_; + /*! \brief kernel stride */ + index_t kstride_; + /*! \brief source height shape[1] */ + index_t src_height_; + /*! \brief source width shape[0] */ + index_t src_width_; + /*! 
\brief constructor */ + PoolingExp( const SrcExp &src, index_t ksize, index_t kstride ) + : src_(src), ksize_(ksize), kstride_(kstride) { + Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); + utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); + this->src_height_ = sshape[1]; + this->src_width_ = sshape[0]; + this->shape_ = sshape; + this->shape_[1] = (src_height_ - ksize) / kstride + 1; + this->shape_[0] = (src_width_ - ksize) / kstride + 1; + } + /*! \brief constructor, specify shape */ + PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize, index_t kstride ) + : src_(src), ksize_(ksize), kstride_(kstride) { + Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ ); + utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" ); + this->src_height_ = sshape[1]; + this->src_width_ = sshape[0]; + this->shape_ = sshape; + this->shape_[1] = pshape[1]; + this->shape_[0] = pshape[0]; + } + }; + + /*! + * \brief unpooling expr reverse operation of pooling, used to pass gradient back + * \tparam Reducer specifies reduction operation during pooling + * \tparam Device which device it lies + */ + template<typename Reducer, typename Device> + struct UnPoolingExp: public MakeTensorExp< UnPoolingExp<Reducer, Device>, Tensor<Device,4>, 4> { + /*! \brief source input, corresponds to src in pooling */ + const Tensor<Device, 4>& data_src_; + /*! \brief result of pooled data, corresponds to result of pooling */ + const Tensor<Device, 4>& data_pooled_; + /*! \brief gradient data of pooled part, to be propgate down */ + const Tensor<Device, 4>& grad_pooled_; + /*! \brief kernel size */ + index_t ksize_; + /*! \brief kernel stride */ + index_t kstride_; + /*! 
\brief constructor */ + UnPoolingExp( const Tensor<Device,4> &data_src, const Tensor<Device,4> &data_pooled, + const Tensor<Device,4> &grad_pooled, index_t ksize, index_t kstride ) + : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled), + ksize_(ksize), kstride_(kstride) { + utils::Assert( grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch" ); + utils::Assert( grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch" ); + utils::Assert( grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch" ); + this->shape_ = data_src_.shape; + } + }; + + /*! + * \brief padding expression, pad a image with zeros + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ + template<typename SrcExp, int srcdim> + struct PaddingExp : public MakeTensorExp<PaddingExp<SrcExp, srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief pad size */ + index_t pad_; + /*! \brief source tensor height */ + index_t src_height_; + /*! \brief source tensor width */ + index_t src_width_; + /*! \brief constructor */ + PaddingExp( const SrcExp &src, index_t pad ) + : src_(src), pad_(pad) { + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + src_height_ = this->shape_[1]; + src_width_ = this->shape_[0]; + this->shape_[1] += pad * 2; // height + this->shape_[0] += pad * 2; // width + } + }; + + /*! + * \brief crop expression, cut off the boundary region, reverse operation of padding + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ + template<typename SrcExp, int srcdim> + struct CroppingExp : public MakeTensorExp< CroppingExp<SrcExp, srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief pad height */ + index_t pad_height_; + /*! \brief pad height */ + index_t pad_width_; + /*! \brief src height */ + index_t src_height_; + /*! 
\brief constructor */ + CroppingExp(const SrcExp &src, Shape<2> cshape ): src_(src) { + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); + utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); + pad_height_ = (this->shape_[1] - cshape[1]) / 2; + pad_width_ = (this->shape_[0] - cshape[0]) / 2; + src_height_ = this->shape_[1]; + this->shape_[1] = cshape[1]; // height + this->shape_[0] = cshape[0]; // width + } + /*! \brief constructor */ + CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width ) + : src_(src), pad_height_(start_height), pad_width_(start_width) { + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met"); + utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met"); + src_height_ = this->shape_[1]; + this->shape_[1] = cshape[1]; // height + this->shape_[0] = cshape[0]; // width + } + + }; // struct CroppingExp + + + /*! + * \brief mirror expression, mirror an image in width + * \tparam SrcExp source expression to be mirrored + * \tparam srcdim dimension of src + */ + template<typename SrcExp, int srcdim> + struct MirroringExp : public MakeTensorExp<MirroringExp<SrcExp, srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief constructor */ + MirroringExp( const SrcExp &src ): src_(src) { + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + } + }; + + /*! 
+ * \brief channel pooling expression, do reduction over (local nearby) channels, used to implement local response normalization + * \tparam Reducer reduction method during pooling + * \tparam SrcExp source expression to be pooled from + * \tparam srcdim dimension of src + */ + template<typename Reducer, typename SrcExp, int srcdim> + struct ChannelPoolingExp: public MakeTensorExp< ChannelPoolingExp<Reducer, SrcExp,srcdim>, SrcExp, srcdim> { + /*! \brief source operand */ + const SrcExp& src_; + /*! \brief neighbor size */ + index_t nsize_; + /*! \brief constructor */ + ChannelPoolingExp( const SrcExp &src, index_t nsize ): src_(src), nsize_(nsize){ + utils::Assert( nsize % 2 == 1, "ChannelPoolingExp: local size must be odd, to make it symmetric" ); + this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ ); + utils::Assert( this->shape_[2] >= nsize_, "ChannelPoolingExp: local size need to be smaller than number of channels" ); + } + }; + }; // namespace expr + + + // Declaration of all functions go here + namespace expr{ + /*! \brief operator overload */ + template<typename E, typename R,int d> + inline ReduceTo1DExp<E,R,d> operator*( const ReduceTo1DExp<E,R,d> &e, real_t scale ){ + return ReduceTo1DExp<E,R,d>( e.src_, e.scale_*scale ); + } + /*! \brief operator overload */ + template<typename E, typename R,int d> + inline ReduceTo1DExp<E,R,d> operator*( real_t scale, const ReduceTo1DExp<E,R,d> &e ){ + return ReduceTo1DExp<E,R,d>( e.src_, e.scale_*scale ); + } + + /*! 
+ * \brief a expression that replicate a 1 dimension tensor in dimension dimcast + * \param src Tensor<Device,1>: shape[0] + * \param shape shape of output + * \return a expresion with type Tensor<Device,dimdst> + * \tparam dimcast target dimension where the 1D tensor will be broadcasted + * \tparam Device which device it lies + * \tparam dimdst dimension of destination tensor + */ + template<int dimcast,typename Device,int dimdst> + inline Broadcast1DExp<Device,dimdst,dimcast> broadcast( const Tensor<Device,1> &src, Shape<dimdst> shape ){ + TypeCheckPass< dimcast<dimdst >::Error_Expression_Does_Not_Meet_Dimension_Req(); + utils::Assert( src.shape[0] == shape[dimcast], "broadcast, shape mismatch" ); + return Broadcast1DExp<Device,dimdst,dimcast>( src, shape ); + } + + /*! + * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution + * after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations: + * + * weight; shape[1]: out_channel, shape[0]: ichannel*psize*psize + * output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images + * out_height = ( in_height - psize ) / pstride + 1, this means we pad inperfect patch with 0 + * out_width = ( in_width - psize ) / pstride + 1 + * + * \return mat target matrix; shape[1]: in_channel*psize*psize shape[0]: out_height*out_width * num_of_images + * \param img source image; shape[2]: in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images) + * \param psize height and width of each patch + * \param pstride stride of each patch + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline UnpackPatchToColXExp<SrcExp, ExpInfo<SrcExp>::kDim > unpack_patch2col( const Exp<SrcExp,etype> &img, index_t psize, index_t pstride ){ + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + 
return UnpackPatchToColXExp<SrcExp, ExpInfo<SrcExp>::kDim >( img.self(), psize, pstride ); + } + + /*! + * \brief reverse operation of pack_col2patch, can be used to implement deconvolution + * \return packed img expression + * \param mat source matrix + * \param imshape shape of target img + * \param psize height and width of each patch + * \param pstride stride of each patch + * \tparam Device the Device where input data lies + */ + template<typename Device, int dstdim> + inline PackColToPatchXExp<Device,dstdim> pack_col2patch( const Tensor<Device,2> &mat, Shape<dstdim> imshape, index_t psize, index_t pstride ){ + utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "PackColToPatch:image shape smaller than patch size"); + return PackColToPatchXExp<Device,dstdim>( mat, imshape, psize, pstride ); + } + /*! + * \brief a expression that reshapes a tensor to another shape + * \param src Tensor<Device,dimsrc>: + * \param oshape target shape + * \return a expresion with type Tensor<Device,dimdst> + * \tparam SrcExp source expression + * \tparam etype source expression type + * \tparam dimdst target dimension + */ + template<typename SrcExp, int etype, int dimdst> + inline ReshapeExp< SrcExp,dimdst, ExpInfo<SrcExp>::kDim > reshape( const Exp<SrcExp,etype> &src, Shape<dimdst> oshape ){ + return ReshapeExp< SrcExp,dimdst, ExpInfo<SrcExp>::kDim >( src.self(), oshape ); + } + + /*! 
+ * \brief an expression that swaps two axes of a tensor + * \param src Tensor<Device,dimsrc>: + * \return an expression with type Tensor<Device,dimdst> + * \tparam a1 smaller dimension to be swapped + * \tparam a2 larger dimension to be swapped + * \tparam SrcExp source expression + * \tparam etype source expression type + */ + template<int a1, int a2, typename SrcExp, int etype> + inline SwapAxisExp< SrcExp, ExpInfo<SrcExp>::kDim, a1,a2> swapaxis( const Exp<SrcExp,etype> &src ){ + typedef ExpInfo<SrcExp> Info; + TypeCheckPass< Info::kDim>=a1+1 && Info::kDim >= a2+1 && a1+1 <= a2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return SwapAxisExp< SrcExp,Info::kDim,a1,a2>( src.self() ); + } + + /*! + * \brief a sum over all dimensions, except dimkeep + * \param exp input expression to be reduced + * \return an expression with type Tensor<Device,1> + * \tparam dimkeep the dimension that will be kept + * \tparam SrcExp expression + * \tparam etype type of expression + */ + template<int dimkeep, typename SrcExp, int etype> + inline ReduceTo1DExp<SrcExp, red::sum, dimkeep > sumall_except_dim( const Exp<SrcExp,etype> &exp ){ + return ReduceTo1DExp<SrcExp,red::sum,dimkeep>( exp.self(), 1.0f ); + } + + /*! + * \brief pooling subregion results together + * \param src source image, shape[3]: batch, shape[2]: channel shape[1]: height shape[0]:width + * \param ksize kernel size + * \param kstride stride for each kernel + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename Reducer, typename SrcExp, int etype> + inline PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp,etype> &src, index_t ksize, index_t kstride ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), ksize, kstride); + } + /*! 
+ * \brief same as pool, except the output shape is specified by pshape + * \param src source image + * \param pshape ouput shape + * \param ksize kernel size + * \param kstride stride for each kernel + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename Reducer, typename SrcExp, int etype> + inline PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp,etype> &src, Shape<2> pshape, index_t ksize, index_t kstride ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), pshape, ksize, kstride); + } + /*! + * \brief unpooling gradient for 4D, backprop gradient value back, revserse operation of pooling + * \param data_src source input, corresponds to src in pooling + * \param data_pooled result of pooled data, corresponds to result of pooling + * \param grad_pooled gradient data of pooled part, to be propgate down + * \param ksize kernel size + * \param kstride stride for each kernel + * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient + * \tparam Reducer reducer type + * \tparam Device device where data lies + */ + template<typename Reducer, typename Device> + inline UnPoolingExp<Reducer, Device> unpool( const Tensor<Device,4>&data_src, const Tensor<Device,4> &data_pooled, + const Tensor<Device,4> &grad_pooled, index_t ksize, index_t kstride ) { + return UnPoolingExp<Reducer, Device>(data_src, data_pooled, grad_pooled,ksize, kstride); + } + + /*! 
+ * \brief padding expression, pad an image with zeros on boundaries, padding affects shape[0], and shape[1] + * \param src original image batches + * \param pad padding size + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline PaddingExp<SrcExp, ExpInfo<SrcExp>::kDim> pad(const Exp<SrcExp, etype> &src, index_t pad) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return PaddingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), pad); + } + + /*! + * \brief reverse operation of padding, cut off boundaries, crop output from center of input + * \param src original image batches + * \param oshape output shape to be cropped + * \return expression corresponding to cropped result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> crop( const Exp<SrcExp, etype> &src, Shape<2> oshape ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), oshape); + } + /*! 
+ * \brief same as crop, but can specify starting position to do cropping + * \param src original image batches + * \param oshape output shape to be cropped + * \param start_height start height position to do cropping + * \param start_width start width position to do cropping + * \return expression corresponding to padded result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> crop( const Exp<SrcExp, etype> &src, Shape<2> oshape, index_t start_height, index_t start_width ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), oshape, start_height, start_width); + } + + /*! + * \brief mirroring expression, mirror images in width + * \param src original image batches + * \return expression corresponding to mirrored result + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline MirroringExp<SrcExp, ExpInfo<SrcExp>::kDim> mirror(const Exp<SrcExp, etype> &src) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return MirroringExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self()); + } + + /*! 
+ * \brief channel pooling, do reduction over (local nearby) channels, used to implement local response normalization + * \param src source data + * \param nsize neighbor size + * \return expression of pooled result + * \tparam Reducer reducer type + * \tparam SrcExp source expression + * \tparam etype type of expression + */ + template<typename Reducer, typename SrcExp, int etype> + inline ChannelPoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > chpool( const Exp<SrcExp,etype> &src, index_t nsize ) { + TypeCheckPass< ExpInfo<SrcExp>::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + return ChannelPoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(),nsize); + } + // short cut functions + /*! + * \brief a expression that replicate a 1 dimension tensor for nrow times + * \param src Tensor<Device,1>: shape[0] + * \param nrow number of rows to replicate + * \return a expresion with type Tensor<Device,2> shape[0], shape[1] = nrow + * \tparam Device which device it lies + */ + template<typename Device> + inline Broadcast1DExp<Device,2,0> repmat( const Tensor<Device,1> &src, index_t nrow ){ + return broadcast<0>( src, Shape2( nrow, src.shape[0] ) ); + } + /*! 
+ * \brief a expression that sum over rows of a matrix + * \param exp input expression that must be a matrix Tensor<?,2> + * \return a expresion with type Tensor<Device,1> + * \tparam SrcExp expression + * \tparam etype type of expression + */ + template<typename SrcExp, int etype> + inline ReduceTo1DExp<SrcExp, red::sum, 0 > sum_rows( const Exp<SrcExp,etype> &exp ){ + return sumall_except_dim<0>( exp ); + } + + }; // namespace expr +}; // namespace mshadow + +// ================================================== +// implementations afterwards, +// no need to read if only use the functions +// -------------------------------------------------- +namespace mshadow{ + namespace expr{ + template<typename SV, typename Device, typename EType, typename Reducer, int dimkeep> + struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp<EType,Reducer,dimkeep> >{ + inline static void Eval( Tensor<Device,1> &dst, const ReduceTo1DExp<EType,Reducer,dimkeep> &exp ){ + TypeCheckPass< dimkeep!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + MapReduceKeepHighDim<SV,Reducer,dimkeep>( dst, exp.src_, exp.scale_ ); + } + }; + + template<typename SV, typename Device, typename EType, typename Reducer> + struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp<EType,Reducer,0> >{ + inline static void Eval( Tensor<Device,1> &dst, const ReduceTo1DExp<EType,Reducer,0> &exp ){ + MapReduceKeepLowest<SV,Reducer>( dst, exp.src_, exp.scale_ ); + } + }; + }; // namespace expr + + namespace expr{ + /*! 
\brief execution plan of Broadcast1DExp */ + template<typename Device, int dimdst, int dimcast> + struct Plan< Broadcast1DExp<Device,dimdst,dimcast> >{ + public: + Plan( const Broadcast1DExp<Device,dimdst,dimcast> &e ) + : dptr_( e.src_.dptr ), + ystride_( e.shape_.ProdShape(1,dimcast) ), + length_(e.shape_[dimcast]){ + TypeCheckPass< dimcast!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req(); + } + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ (y / ystride_) % length_ ]; + } + private: + const real_t *dptr_; + const index_t ystride_, length_; + }; + + /*! \brief execution plan of Broadcast1DExp */ + template<typename Device, int dimdst> + struct Plan< Broadcast1DExp<Device,dimdst,0> >{ + public: + Plan( const Broadcast1DExp<Device,dimdst,0> &e ): dptr_( e.src_.dptr ){} + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ x ]; + } + private: + const real_t *dptr_; + }; + }; // namespace expr + + namespace expr{ + template<typename SrcExp, int srcdim> + struct Plan< UnpackPatchToColXExp<SrcExp,srcdim> >{ + public: + Plan( const UnpackPatchToColXExp<SrcExp,srcdim> &e ) + :src_(MakePlan(e.img_)),psize_(e.psize_), pstride_(e.pstride_), + i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_), + o_height_(( i_height_ - psize_ ) / pstride_ + 1), + o_width_ (( i_width_ - psize_ ) / pstride_ + 1){ + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + const index_t x_offset = i % psize_; + const index_t idivp = i / psize_; + const index_t y_offset = idivp % psize_; + const index_t c = idivp / psize_; + const index_t x = (j % o_width_) * pstride_ + x_offset; + const index_t jdivw = j / o_width_; + const index_t y = (jdivw % o_height_) * pstride_ + y_offset; + const index_t n = jdivw / o_height_; + + if( x < i_width_ && y < i_height_ ){ + return src_.Eval( ( n * i_channel_ + c ) * i_height_ + y, x ); + }else{ + return 0.0f; + } + } + private: + Plan<SrcExp> src_; + const index_t psize_, 
pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_; + }; + + template<typename Device, int dstdim> + struct Plan< PackColToPatchXExp<Device, dstdim> >{ + public: + Plan( const PackColToPatchXExp<Device, dstdim> &e ) + :mat_(e.mat_), psize_(e.psize_), pstride_(e.pstride_), + i_channel_(e.shape_[2]), i_height_(e.shape_[1]), + o_width_(( e.shape_[0] - psize_ ) / pstride_ + 1), + o_height_(( e.shape_[1] - psize_ ) / pstride_ + 1){ + // note: i/o convention are same as unpack + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + using namespace std; + const index_t y = i % i_height_; + const index_t idivh = i / i_height_; + const index_t c = idivh % i_channel_; + const index_t n = idivh / i_channel_; + const index_t x = j; + const index_t py_min = y < psize_ ? 0 : (y-psize_+pstride_)/pstride_; + const index_t px_min = x < psize_ ? 0 : (x-psize_+pstride_)/pstride_; + const index_t py_max = min( (y+pstride_)/pstride_, o_height_); + const index_t px_max = min( (x+pstride_)/pstride_, o_width_ ); + real_t res = 0.0f; + for( index_t py = py_min; py < py_max; ++py ){ + for( index_t px = px_min; px < px_max; ++px ){ + res += mat_[ (c * psize_ + y - py*pstride_) * psize_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ]; + } + } + return res; + } + private: + Tensor<Device,2> mat_; + const index_t psize_, pstride_, i_channel_, i_height_, o_width_, o_height_; + }; + }; + + namespace expr{ + template<typename SrcExp, int dimdst, int dimsrc> + struct Plan< ReshapeExp<SrcExp,dimdst,dimsrc> >{ + public: + Plan( const ReshapeExp<SrcExp,dimdst,dimsrc> &e ) + : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]), ishape0_(e.ishape0_){ + } + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + const index_t idx = y * oshape0_ + x; + return src_.Eval( idx / ishape0_, idx % ishape0_ ); + } + private: + Plan<SrcExp> src_; + const index_t oshape0_, ishape0_; + }; + // special work plan for 1 dimensional data + template<typename SrcExp,int dimdst> + struct 
Plan< ReshapeExp<SrcExp,dimdst,1> >{ + public: + Plan( const ReshapeExp<SrcExp,dimdst,1> &e ) + : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]){ + } + MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{ + return src_.Eval( 0, y * oshape0_ + x ); + } + private: + Plan<SrcExp> src_; + const index_t oshape0_; + }; + }; + + namespace expr{ + template<typename SrcExp,int dimsrc, int a1, int a2> + struct Plan< SwapAxisExp<SrcExp,dimsrc,a1,a2> >{ + public: + Plan( const SwapAxisExp<SrcExp,dimsrc,a1,a2> &e ) + : src_(MakePlan(e.src_)), + shape1_( e.shape_.ProdShape( 1, a1 ) ), + shape2_( e.shape_[a1] ), + shape3_( e.shape_.ProdShape( a1+1, a2 ) ), + shape4_( e.shape_[a2] ){ + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{ + const index_t y = i % shape1_; + i /= shape1_; + const index_t z = i % shape2_; + i /= shape2_; + const index_t c = i % shape3_; + i /= shape3_; + const index_t n = i % shape4_; + // swap z and n + return src_.Eval( ((((i/shape4_)*shape2_ + z) * shape3_+c) * shape4_ + n ) * shape1_ + y, j ); + } + private: + Plan<SrcExp> src_; + const index_t shape1_, shape2_, shape3_, shape4_; + }; + + template<typename SrcExp,int dimsrc, int a2> + struct Plan< SwapAxisExp<SrcExp,dimsrc,0,a2> >{ + public: + Plan( const SwapAxisExp<SrcExp,dimsrc,0,a2> &e ) + : src_(MakePlan(e.src_)), + shape0_( e.shape_[0] ), + shape1_( e.shape_.ProdShape(1,a2) ), + shape2_( e.shape_[a2] ){ + } + MSHADOW_XINLINE real_t Eval( index_t i, index_t x ) const{ + // swap x and z + const index_t y = i % shape1_; + i /= shape1_; + const index_t z = i % shape2_; + const index_t n = i / shape2_; + return src_.Eval( ( n*shape0_ + x ) * shape1_ + y , z ); + } + private: + Plan<SrcExp> src_; + const index_t shape0_, shape1_, shape2_; + }; + }; + + namespace expr{ + template<typename Reducer, typename SrcExp, int srcdim> + struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > { + public: + Plan( const PoolingExp<Reducer, SrcExp, srcdim> &e ) + : src_( MakePlan( e.src_ ) ), 
ksize_(e.ksize_), kstride_(e.kstride_), + src_height_(e.src_height_),src_width_(e.src_width_), new_height_(e.shape_[1]) { + } + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + using namespace std; + const index_t py = i % new_height_; + const index_t y_start = py * kstride_; + const index_t y_end = min( y_start + ksize_, src_height_ ); + const index_t px = j; + const index_t x_start = px * kstride_; + const index_t x_end = min( x_start + ksize_, src_width_ ); + const index_t c = i / new_height_; + + real_t res = Reducer::kInitV; + for (index_t y = y_start; y < y_end; ++y) { + for (index_t x = x_start; x < x_end; ++x) { + Reducer::Reduce( res, src_.Eval( c*src_height_+y, x ) ); + } + } + return res; + } + private: + Plan<SrcExp> src_; + const index_t ksize_, kstride_; + const index_t src_height_, src_width_; + const index_t new_height_; + }; + + template<typename Reducer, typename Device> + struct Plan<UnPoolingExp<Reducer, Device> > { + public: + Plan(const UnPoolingExp<Reducer, Device> &e) + : data_src_(e.data_src_), data_pooled_(e.data_pooled_), grad_pooled_(e.grad_pooled_), + ksize_(e.ksize_), kstride_(e.kstride_) {} + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + using namespace std; + const index_t x = j; + const index_t y = i % data_src_.shape[1]; + const index_t c = i / data_src_.shape[1]; + const real_t vsrc = data_src_[0][c][y][x]; + + const index_t py_min = y < ksize_ ? 0 : (y-ksize_+kstride_)/kstride_; + const index_t px_min = x < ksize_ ? 
0 : (x-ksize_+kstride_)/kstride_; + const index_t py_max = min( (y+kstride_)/kstride_, data_pooled_.shape[1]); + const index_t px_max = min( (x+kstride_)/kstride_, data_pooled_.shape[0]); + + real_t val = 0; + for( index_t py = py_min; py < py_max; ++py ){ + for( index_t px = px_min; px < px_max; ++px ){ + val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px]; + } + } + return val; + } + private: + Tensor<Device, 4> data_src_, data_pooled_, grad_pooled_; + const index_t ksize_; + const index_t kstride_; + }; + }; // namespace expr + + namespace expr{ + template<typename SrcExp, int srcdim> + struct Plan< PaddingExp<SrcExp, srcdim> > { + public: + Plan(const PaddingExp<SrcExp, srcdim> &e) + : src_(MakePlan(e.src_)), pad_(e.pad_), new_height_(e.shape_[1]), + src_height_(e.src_height_), src_width_(e.src_width_) {} + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + if (y < pad_ || x < pad_) return 0.0f; + const index_t h = y - pad_; + const index_t w = x - pad_; + if (h < src_height_ && w < src_width_) { + return src_.Eval(c * src_height_ + h, w); + } else { + return 0.0f; + } + } + private: + Plan<SrcExp> src_; + const index_t pad_; + const index_t new_height_; + const index_t src_height_; + const index_t src_width_; + }; + + template<typename SrcExp, int srcdim> + struct Plan<CroppingExp<SrcExp, srcdim> > { + public: + Plan(const CroppingExp<SrcExp, srcdim> &e) + : src_(MakePlan(e.src_)), pad_height_(e.pad_height_),pad_width_(e.pad_width_), + new_height_(e.shape_[1]), src_height_(e.src_height_) {} + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + const index_t x = j; + const index_t y = i % new_height_; + const index_t c = i / new_height_; + const index_t h = y + pad_height_; + const index_t w = x + pad_width_; + return src_.Eval(c * src_height_ + h, w); + } + private: + Plan<SrcExp> src_; + const index_t 
pad_height_, pad_width_; + const index_t new_height_; + const index_t src_height_; + }; + + template<typename SrcExp, int srcdim> + struct Plan< MirroringExp<SrcExp, srcdim> > { + public: + Plan(const MirroringExp<SrcExp, srcdim> &e) + : src_(MakePlan(e.src_)), width_(e.shape_[0]){} + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + return src_.Eval( i, width_ - j - 1 ); + } + private: + Plan<SrcExp> src_; + const index_t width_; + }; + }; // namespace expr + + namespace expr{ + template<typename Reducer, typename SrcExp, int srcdim> + struct Plan< ChannelPoolingExp< Reducer, SrcExp, srcdim> > { + public: + Plan( const ChannelPoolingExp<Reducer, SrcExp, srcdim> &e ) + : src_( MakePlan( e.src_ ) ), channel_(e.shape_[2]), + height_(e.shape_[1]),width_(e.shape_[0]), hnsize_(e.nsize_/2){ + } + MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const { + using namespace std; + const index_t y = i % height_; + i /= height_; + const index_t c = i % channel_; + const index_t n = i / channel_; + const index_t x = j; + const index_t cstart = c < hnsize_ ? 
0 : c - hnsize_; + const index_t cend = min( c + hnsize_ + 1, channel_ ); + real_t res = Reducer::kInitV; + for( index_t cc = cstart; cc < cend; ++ cc ){ + Reducer::Reduce( res, src_.Eval( (n*channel_+cc)*height_ + y, x ) ); + } + return res; + } + private: + Plan<SrcExp> src_; + const index_t channel_, height_, width_, hnsize_; + }; + }; +}; // namespace mshadow + +#if MSHADOW_USE_SSE +// implementations of SSE support, if possible +#include "tensor_sse-inl.hpp" +namespace mshadow{ + namespace expr{ + template<int dimdst> + struct SSECheck< Broadcast1DExp<cpu,dimdst,0> >{ + const static bool kPass = true; + }; + template<int dimdst> + struct SSEAlignCheck<2, Broadcast1DExp<cpu,dimdst,0> >{ + inline static bool Check( const Broadcast1DExp<cpu,dimdst,0> &exp ){ + return sse2::CheckAlign( exp.src_.dptr ); + } + }; + template<int dimdst> + class SSEPlan< Broadcast1DExp<cpu,dimdst,0> >{ + public: + SSEPlan( const Broadcast1DExp<cpu,dimdst,0> &t ) + :dptr_(t.src_.dptr){} + MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{ + return sse2::FVec<real_t>( &dptr_[ x ] ); + } + MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{ + return dptr_[ x ]; + } + private: + const real_t *dptr_; + }; + }; +}; +#endif + +#endif + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_gpu-inl.hpp ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_gpu-inl.hpp b/include/mshadow/tensor_gpu-inl.hpp new file mode 100644 index 0000000..a2c1fc4 --- /dev/null +++ b/include/mshadow/tensor_gpu-inl.hpp @@ -0,0 +1,148 @@ +#ifndef MSHADOW_TENSOR_GPU_INL_HPP +#define MSHADOW_TENSOR_GPU_INL_HPP +/*! 
+ * \file tensor_gpu-inl.hpp + * \brief implementation of GPU host code + * \author Bing Xu, Tianqi Chen + */ +#include "tensor.h" + +#if !(MSHADOW_USE_CUDA) +namespace mshadow { + // do nothing if no GPU operation is involved + inline void InitTensorEngine( int dev_id ){ + } + inline void ShutdownTensorEngine( void ){ + } +}; +#else +namespace mshadow { + #if (MSHADOW_USE_NVML) + inline int AutoSelectDevice(int device_count) { + // TODO nvml device id and cuda device id are not consistent + return 0; + } + #endif + inline void InitTensorEngine(int dev_id){ + cudaDeviceProp prop; + int device_id = 0; + int device_count = 0; + cudaGetDeviceCount(&device_count); + utils::Assert(device_count > 0, "Cannot find CUDA device. Please check CUDA-Configuration"); + if (dev_id < 0) { + #if (MSHADOW_USE_NVML) + device_id = AutoSelectDevice(device_count); + #endif + } else { + device_id = dev_id; + } + utils::Assert( device_id < device_count, "Incorrect Device ID" ); + utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" ); + cudaGetDeviceProperties(&prop, device_id); + printf("Use CUDA Device %d: %s\n", device_id, prop.name); + cublasInit(); + } + inline void ShutdownTensorEngine( void ){ + cublasShutdown(); + } + + template<int dim> + inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad){ + size_t pitch; + // common choice for cuda mem align unit is 32 + if( pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32 ){ + cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ + obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); + utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); + obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) ); + }else{ + obj.shape.stride_ = obj.shape[0]; + cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \ + obj.shape.Size() * sizeof(real_t), 1 ); + utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); + } + } + + template<int dim> + inline void FreeSpace(Tensor<gpu,dim> 
&obj){ + cudaFree( obj.dptr ); obj.dptr = NULL; + } + + template<typename A,typename B, int dim> + inline void Copy(Tensor<A,dim> _dst, Tensor<B,dim> _src, cudaMemcpyKind kind){ + utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); + Tensor<A,2> dst = _dst.FlatTo2D(); + Tensor<B,2> src = _src.FlatTo2D(); + cudaError_t err = cudaMemcpy2D( dst.dptr, dst.shape.stride_ * sizeof(real_t), + src.dptr, src.shape.stride_ * sizeof(real_t), + dst.shape[0] * sizeof(real_t), + dst.shape[1], kind ); + utils::Assert( err == cudaSuccess, cudaGetErrorString(err) ); + } + template<int dim> + inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src){ + Copy( dst, src, cudaMemcpyDeviceToHost ); + } + template<int dim> + inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src){ + Copy( dst, src, cudaMemcpyDeviceToDevice ); + } + template<int dim> + inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src){ + Copy( dst, src, cudaMemcpyHostToDevice ); + } +}; + +#ifdef __CUDACC__ +// the following part is included only if compiler is nvcc +#include "cuda/tensor_gpu-inl.cuh" + +namespace mshadow{ + template<typename Saver, typename E, int dim> + inline void MapPlan(Tensor<gpu,dim> _dst, const expr::Plan<E> &plan){ + cuda::MapPlan<Saver>( _dst.FlatTo2D(), plan ); + } + + template<typename Saver, int dim, typename E, int etype> + inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp ){ + using namespace expr; + TypeCheckPass< TypeCheck<gpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() ); + utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); + MapPlan<Saver>( dst, MakePlan( exp.self() ) ); + } + + template<typename Saver, typename Reducer, typename E, int etype> + inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){ + using namespace 
expr; + TypeCheckPass< TypeCheck<gpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D(); + + utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); + utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); + cuda::MapReduceKeepLowest<Saver,Reducer>( dst, MakePlan( exp.self() ), scale, eshape ); + } + + template<typename Saver, typename Reducer, int dimkeep, typename E, int etype> + inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){ + using namespace expr; + TypeCheckPass< TypeCheck<gpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape< ExpInfo<E>::kDim > EShape; + EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ); + utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); + // use equvalent form + Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], + eshape.ProdShape(1,dimkeep), eshape[0] ); + // call equavalent map red dim 2 + cuda::MapReduceKeepDim2<Saver,Reducer>( dst, MakePlan( exp.self() ), scale, pshape ); + } + + inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2>& src ){ + cuda::Softmax( dst, src ); + } +}; // namespace mshadow + +#endif // __CUDACC__ + +#endif // MSHADOW_USE_CUDA +#endif // TENSOR_GPU_INL_HPP http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_io.h ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_io.h b/include/mshadow/tensor_io.h new file mode 100644 index 0000000..2ce28b3 --- /dev/null +++ b/include/mshadow/tensor_io.h @@ -0,0 +1,137 @@ +#ifndef MSHADOW_TENSOR_IO_H +#define MSHADOW_TENSOR_IO_H +/*! 
+ * \file tensor_io.h + * \brief definitions of I/O functions for mshadow tensor + * \author Tianqi Chen + */ +#include <cstdio> +#include "tensor.h" + +namespace mshadow{ + namespace utils{ + /*! + * \brief interface of stream I/O, used to serialize data, + * it is not restricted to only this interface in SaveBinary/LoadBinary + * mshadow accept all class that implements Read and Write + */ + class IStream{ + public: + /*! + * \brief read data from stream + * \param ptr pointer to memory buffer + * \param size size of block + * \return usually is the size of data readed + */ + virtual size_t Read( void *ptr, size_t size ) = 0; + /*! + * \brief write data to stream + * \param ptr pointer to memory buffer + * \param size size of block + */ + virtual void Write( const void *ptr, size_t size ) = 0; + /*! \brief virtual destructor */ + virtual ~IStream( void ){} + }; + }; + + /*! + * \brief CPU/GPU: save a tensor by binary format, for GPU version, a temp Tensor<cpu,dim> storage will be allocated + * \param fo output binary stream + * \param src source data file + * \tparam dim dimension of tensor + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template<int dim,typename TStream> + inline void SaveBinary( TStream &fo, const Tensor<cpu,dim> &src ); + /*! \brief refer to comment of cpu ver \sa SaveBinary */ + template<int dim,typename TStream> + inline void SaveBinary( TStream &fo, const Tensor<gpu,dim> &src ); + + /*! 
+ * \brief CPU/GPU: load a tensor by binary format, for GPU version, a temp Tensor<cpu,dim> storage will be allocated + * if pre_alloc is true , then space in dst is preallocated, and must have same shape of the tensor loaded + * if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst + * \param fi output binary stream + * \param dst destination file + * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen + * \tparam dim dimension of tensor + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template<int dim,typename TStream> + inline void LoadBinary( TStream &fi, Tensor<cpu,dim> &dst, bool pre_alloc ); + /*! \brief refer to comment of cpu ver \sa LoadBinary */ + template<int dim,typename TStream> + inline void LoadBinary( TStream &fi, Tensor<gpu,dim> &dst, bool pre_alloc ); + + namespace utils{ + /*! \brief implementation of file i/o stream */ + class FileStream: public IStream{ + public: + /*! \brief constructor */ + FileStream( FILE *fp ):fp_(fp){} + virtual size_t Read( void *ptr, size_t size ){ + return fread( ptr, size, 1, fp_ ); + } + virtual void Write( const void *ptr, size_t size ){ + fwrite( ptr, size, 1, fp_ ); + } + /*! 
\brief close file */ + inline void Close( void ){ + fclose( fp_ ); + } + private: + FILE *fp_; + }; + }; +}; + +namespace mshadow{ + // implementations + template<int dim, typename TStream> + inline void SaveBinary( TStream &fo, const Tensor<cpu,dim> &src_ ){ + fo.Write( src_.shape.shape_, sizeof(index_t) * dim ); + Tensor<cpu,2> src = src_.FlatTo2D(); + for( index_t i = 0; i < src.shape[1]; ++ i ){ + fo.Write( src[i].dptr, sizeof(real_t)*src.shape[0] ); + } + } + template<int dim, typename TStream> + inline void SaveBinary( TStream &fo, const Tensor<gpu,dim> &src ){ + // copy to CPU, then save + Tensor<cpu,dim> tmp( src.shape ); + AllocSpace( tmp ); + Copy( tmp, src ); + SaveBinary( fo, tmp ); + FreeSpace( tmp ); + } + + template<int dim, typename TStream> + inline void LoadBinary( TStream &fi, Tensor<cpu,dim> &dst_, bool pre_alloc ){ + Shape<dim> shape; + utils::Assert( fi.Read( shape.shape_, sizeof(index_t) * dim ) != 0, "mshadow::LoadBinary" ); + if( pre_alloc ){ + utils::Assert( shape == dst_.shape ); + }else{ + dst_.shape = shape; AllocSpace( dst_ ); + } + Tensor<cpu,2> dst = dst_.FlatTo2D(); + if( dst.shape[0] == 0 ) return; + for( index_t i = 0; i < dst.shape[1]; ++ i ){ + utils::Assert( fi.Read( dst[i].dptr, sizeof(real_t)*dst.shape[0] ) != 0, "mshadow::LoadBinary" ); + } + } + template<int dim, typename TStream> + inline void LoadBinary( TStream &fi, Tensor<gpu,dim> &dst, bool pre_alloc ){ + Tensor<cpu,dim> tmp; + LoadBinary( fi, tmp, false ); + if( pre_alloc ){ + utils::Assert( tmp.shape == dst.shape ); + }else{ + dst.shape = tmp.shape; AllocSpace( dst ); + } + Copy( dst, tmp ); + FreeSpace( tmp ); + } +}; +#endif // TENSOR_IO_H
