http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/gtest/gtest_main.cc ---------------------------------------------------------------------- diff --git a/include/gtest/gtest_main.cc b/include/gtest/gtest_main.cc new file mode 100644 index 0000000..f302822 --- /dev/null +++ b/include/gtest/gtest_main.cc @@ -0,0 +1,38 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <stdio.h> + +#include "gtest/gtest.h" + +GTEST_API_ int main(int argc, char **argv) { + printf("Running main() from gtest_main.cc\n"); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cuda/cuda_reduce.cuh ---------------------------------------------------------------------- diff --git a/include/mshadow/cuda/cuda_reduce.cuh b/include/mshadow/cuda/cuda_reduce.cuh new file mode 100644 index 0000000..b7808a6 --- /dev/null +++ b/include/mshadow/cuda/cuda_reduce.cuh @@ -0,0 +1,117 @@ +#ifndef MSHADOW_CUDA_REDUCE_CUH +#define MSHADOW_CUDA_REDUCE_CUH +/*! + * \file cuda_reduce.cuh + * \brief helper functions to do reduction + * \author Tianqi Chen + */ +namespace mshadow{ + namespace cuda{ + /* + * \brief reduce over the dimension x + * \tparam Reducer reducer + * \tparam x_bits dimension = 1<<x_bits + */ + template<typename Reducer,int x_bits> + inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] ); + /* + * \brief reduce over the dimension x + * \tparam Reducer reducer + * \tparam xmax_bits maximum size of buffer + * \param xsize size of x dimension, not sure if aligned + */ + template<typename Reducer, int xmax_bits> + inline __device__ void Reduce1DNotAlign( volatile real_t buf[1<<xmax_bits], int xsize ); + }; +}; + +// ===============================================x=== +// implementations afterwards, +// no need to read if only use the functions +// -------------------------------------------------- +#ifdef __DEVICE_EMULATION__ +#define __MSHADOW_EMUSYNC__ __syncthreads() +#else +#define __MSHADOW_EMUSYNC__ +#endif + +namespace mshadow{ + namespace cuda{ + template<typename Reducer, int x_bits> + inline __device__ void ReduceX( volatile real_t buf[], int tid ){ + if( x_bits >= 10 ){ + if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] ); + __syncthreads(); + } + if( x_bits >= 9 ){ + if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] ); + __syncthreads(); + } + if( x_bits >= 8 ){ + if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] ); + __syncthreads(); + } + if( x_bits >= 7 ){ + if( tid < 64 ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] ); + __syncthreads(); + } + if( x_bits >= 6 ){ + if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] ); + __syncthreads(); + } + // in warp optimization + if( x_bits >= 5 ){ + if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] ); + __MSHADOW_EMUSYNC__; + } + if( x_bits >= 4 ){ + if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] ); + __MSHADOW_EMUSYNC__; + } + if( x_bits >= 3 ){ + if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] ); + __MSHADOW_EMUSYNC__; + } + if( x_bits >= 2 ){ + if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] ); + __MSHADOW_EMUSYNC__; + } + if( x_bits >= 1 ){ + if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] ); + __MSHADOW_EMUSYNC__; + } + }; + + template<typename Reducer,int x_bits> + inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] ){ + ReduceX<Reducer,x_bits>( buf, threadIdx.x ); + } + + // reduce with a upper bound + #define __RD_NON_ALIGN(els,x_bits) \ + els \ + if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){ \ + if( tid < (1 << x_bits) && tid + (1<<x_bits) < x_size ){ \ + Reducer::Reduce( buf[tid] , buf[tid + (1<<x_bits)] ); \ + } \ + __syncthreads(); \ + ReduceX<Reducer, x_bits>( buf, tid ); \ + } \ + + template<typename Reducer, int xmax_bits> + inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){ + int tid = threadIdx.x; + __RD_NON_ALIGN(, 8) + __RD_NON_ALIGN(else, 7) + __RD_NON_ALIGN(else, 6) + __RD_NON_ALIGN(else, 5) + __RD_NON_ALIGN(else, 4) + __RD_NON_ALIGN(else, 3) + __RD_NON_ALIGN(else, 2) + __RD_NON_ALIGN(else, 1) + } + }; +}; + +#endif // MSHADOW_CUDA_REDUCE_CUH + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cuda/tensor_gpu-inl.cuh ---------------------------------------------------------------------- diff --git a/include/mshadow/cuda/tensor_gpu-inl.cuh b/include/mshadow/cuda/tensor_gpu-inl.cuh new file mode 100644 index 0000000..61e477c --- /dev/null +++ b/include/mshadow/cuda/tensor_gpu-inl.cuh @@ -0,0 +1,231 @@ +#ifndef MSHADOW_TENSOR_GPU_INL_CUH +#define MSHADOW_TENSOR_GPU_INL_CUH +/*! + * \file tensor_gpu-inl.cuh + * \brief implementation of GPU code using CUDA + * \author Bing Xu, Tianqi Chen + */ +#include "../tensor.h" +#include "cuda_reduce.cuh" + +namespace mshadow{ + namespace cuda{ + #ifndef __CUDA_ARCH__ + #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0" + #endif + /* load unit for memory access */ + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 + const int kMemUnitBits = 5; + const int kMaxThreadsPerBlock = 1024; + #else + const int kMemUnitBits = 4; + const int kMaxThreadsPerBlock = 512; + #endif + /*! \brief number of units that can do synchronized update, half warp size */ + const int kMemUnit = 1 << kMemUnitBits; + /*! \brief mask that could be helpful sometime */ + const int kMemUnitMask = kMemUnit - 1; + /*! \brief suggested thread number(logscale) for mapping kernel */ + const int kBaseThreadBits = 8; + /*! \brief suggested thread number for mapping kernel */ + const int kBaseThreadNum = 1 << kBaseThreadBits; + /*! \brief maximum value of grid */ + const int kMaxGridNum = 65535; + /*! \brief suggested grid number for mapping kernel */ + const int kBaseGridNum = 1024; + + /*! \brief get align stride for given size in x dimension */ + inline index_t GetAlignStride( index_t xsize, index_t xstride ){ + if( (xstride & (kMemUnit-1)) == 0 ){ + return ( (xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits; + }else{ + // if originally space is not aligned, no necessary to to alligned thread allocation + return xsize; + } + } + inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char *estr = "" ){ + if( dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock || + dimGrid.x > 65535 || dimGrid.y > 65535 ){ + fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z ); + utils::Error( "too large launch parameter\n"); + } + } + }; + + namespace cuda { + template<typename Saver, typename Plan, int block_dim_bits> + __device__ void MapPlanProc( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int block_idx ){ + const index_t tid = (block_idx << block_dim_bits) + threadIdx.x; + const int y = tid / xstride; + const int x = tid % xstride; + if (y < dst.shape[1] && x < dst.shape[0]) { + Saver::Save(dst[y][x], exp.Eval(y,x)); + } + } + template<typename Saver, typename Plan, int block_dim_bits> + __global__ void MapPlanKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp ){ + MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, blockIdx.x ); + } + template<typename Saver, typename Plan, int block_dim_bits, int grid_size> + __global__ void MapPlanLargeKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int repeat ){ + for( int i = 0; i < repeat; ++i ){ + MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, blockIdx.x + i*grid_size ); + } + } + + template<typename Saver, typename E> + inline void MapPlan( Tensor<gpu,2> dst, const expr::Plan<E> &plan ){ + const index_t xstride = GetAlignStride( dst.shape[0], dst.shape.stride_ ); + const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum; + dim3 dimBlock(kBaseThreadNum, 1, 1); + + if (num_block < kMaxGridNum) { + dim3 dimGrid(num_block, 1, 1); + MapPlanKernel<Saver, expr::Plan<E>, kBaseThreadBits> \ + <<<dimGrid,dimBlock>>>(dst, xstride, plan); + } else { + int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum; + dim3 dimGrid( kBaseGridNum, 1 , 1 ); + MapPlanLargeKernel<Saver,expr::Plan<E>, kBaseThreadBits, kBaseGridNum> \ + <<<dimGrid,dimBlock>>>(dst, xstride, plan, repeat ); + } + } + }; // namespace cuda + + namespace cuda{ + template<typename Saver,typename Reducer, int warp_bits, typename Plan> + __global__ void MapRedKeepLowestKernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<2> eshape ){ + const unsigned warp_size = 1 << warp_bits; + const unsigned x = (blockIdx.x<<warp_bits) + threadIdx.x; + // to avoid bank conflict + __shared__ real_t s_res[ warp_size ][ warp_size + 1 ]; + + // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization + if( threadIdx.y < eshape[1] && x < eshape[0] ){ + s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, x ); + } + for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){ + if( threadIdx.y + y < eshape[1] && x < eshape[0] ){ + Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], plan.Eval( threadIdx.y + y, x ) ); + } + } + __syncthreads(); + if( eshape[1] >= warp_size ){ + Reduce1D<Reducer,warp_bits>( s_res[ threadIdx.y ] ); + }else{ + Reduce1DNotAlign<Reducer,warp_bits>( s_res[ threadIdx.y ], eshape[1] ); + } + __syncthreads(); + + if( threadIdx.y == 0 && x < eshape[0] ){ + Saver::Save( dst[x], s_res[ threadIdx.x ][ 0 ] * scale ); + } + } + + template<typename Saver, typename Reducer, typename E> + inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Plan<E> &plan, real_t scale, Shape<2> eshape ){ + dim3 dimBlock( kMemUnit, kMemUnit ); + dim3 dimGrid ( (eshape[0]+kMemUnit-1) >> kMemUnitBits ); + CheckLaunchParam( dimGrid, dimBlock, "MapRedKeepLowestKernel" ); + MapRedKeepLowestKernel<Saver,Reducer,kMemUnitBits><<<dimGrid,dimBlock>>>( dst, plan, scale, eshape ); + } + }; // namespace cuda + + namespace cuda{ + template<typename Saver,typename Reducer, int block_dim_bits, typename Plan> + __global__ void MapReduceKeepDim2Kernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<4> pshape ){ + const int block_size = 1 << block_dim_bits; + __shared__ real_t s_rec[ block_size ]; + const int c = blockIdx.x; + const index_t tot = pshape[0]*pshape[1]*pshape[3]; + + real_t res = Reducer::kInitV; + for( index_t i_offset = 0; i_offset < tot; i_offset += block_size ){ + index_t i = i_offset + threadIdx.x; + if( i< tot ){ + const index_t x = i % pshape[0]; + i /= pshape[0]; + const index_t y = i % pshape[1]; + const index_t n = i / pshape[1]; + Reducer::Reduce( res, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) ); + } + } + s_rec[ threadIdx.x ] = res; + __syncthreads(); + Reduce1D<Reducer,block_dim_bits>( s_rec ); + if( threadIdx.x == 0 ){ + Saver::Save( dst[c], s_rec[0]*scale ); + } + } + + template<typename Saver, typename Reducer, typename Plan> + inline void MapReduceKeepDim2( Tensor<gpu,1> dst, const Plan &plan, real_t scale, Shape<4> pshape ){ + dim3 dimBlock( kBaseThreadNum ); + dim3 dimGrid ( dst.shape[0] ); + CheckLaunchParam( dimGrid, dimBlock, "MapReduceKeepDim2" ); + MapReduceKeepDim2Kernel<Saver,Reducer,kBaseThreadBits> + <<<dimGrid,dimBlock>>>( dst, plan, scale, pshape ); + } + }; + + namespace cuda{ + template<int x_bits> + __global__ void SoftmaxKernel( Tensor<gpu,2> dst, Tensor<gpu,2> src ){ + const unsigned x_size = 1 << x_bits; + const int y = blockIdx.x; + __shared__ real_t s_rec[ x_size ]; + + // step 1: get max + if( threadIdx.x < dst.shape[ 0 ] ){ + s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; + } + for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){ + if( x + threadIdx.x < dst.shape[0] ){ + real_t a = src[ y ][ x + threadIdx.x ]; + s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] ); + } + } + __syncthreads(); + if( threadIdx.x >= dst.shape[0] ){ + s_rec[ threadIdx.x ] = s_rec[0]; + } + __syncthreads(); + Reduce1D<red::maximum,x_bits>( s_rec ); + __syncthreads(); + real_t smax = s_rec[0]; + __syncthreads(); + s_rec[ threadIdx.x ] = 0.0f; + __syncthreads(); + + // calculate normalizer, with writeback + for( unsigned x = 0; x < dst.shape[0]; x += x_size ){ + if( x + threadIdx.x < dst.shape[0] ){ + real_t p = expf( src[ y ][ x + threadIdx.x ] - smax ); + s_rec[ threadIdx.x ] += p; + // write back first, will fetch later + dst[ y ][ x + threadIdx.x ] = p; + } + } + // calculate normalizer + __syncthreads(); + Reduce1D<red::sum,x_bits>( s_rec ); + __syncthreads(); + real_t ssum = s_rec[0]; + + for( unsigned x = 0; x < dst.shape[0]; x += x_size ){ + if( x + threadIdx.x < dst.shape[0] ){ + dst[ y ][ x + threadIdx.x ] /= ssum; + } + } + } + + inline void Softmax( Tensor<gpu,2> &dst, const Tensor<gpu,2> &src ){ + dim3 dimBlock( kBaseThreadNum ); + dim3 dimGrid ( dst.shape[1] ); + utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" ); + CheckLaunchParam( dimGrid, dimBlock, "Softmax" ); + SoftmaxKernel<kBaseThreadBits><<<dimGrid,dimBlock>>>( dst, src ); + } + }; // namespace cuda +}; // namespace mshadow +#endif // TENSOR_GPU_INL_H http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cxxnet_op.h ---------------------------------------------------------------------- diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h new file mode 100644 index 0000000..930caf2 --- /dev/null +++ b/include/mshadow/cxxnet_op.h @@ -0,0 +1,116 @@ +#ifndef CXXNET_OP_H +#define CXXNET_OP_H +#pragma once +/*! + * \file cxxnet_op.h + * \brief extra mshadow operation for cxxnet + * \author Bing Xu + */ +#include "mshadow/tensor.h" + +namespace mshadow { + /*! \brief operations for algorithm */ + namespace op { + struct sigmoid { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f / (1.0f + expf(-a)); + } + }; + struct sigmoid_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a * ( 1.0f - a ); + } + }; + + /*! \brief Rectified Linear Operation */ + struct relu { + MSHADOW_XINLINE static real_t Map(real_t a) { + using namespace std; + return max( a, 0.0f ); + } + }; + struct relu_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a > 0.0f ? 1.0f : 0.0f; + } + }; + + struct tanh { + MSHADOW_XINLINE static real_t Map(real_t a) { + return tanhf( a ); + } + }; + struct tanh_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f - a * a; + } + }; + struct softplus { + MSHADOW_XINLINE static real_t Map(real_t a) { + return logf(1 + expf(a)); + } + }; + struct softplus_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.0f / (1.0f + expf(-a)); + } + }; + struct bnll { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a > 0.0f ? a + logf(1.0f + expf(-a)) : logf(1.0f + expf(a)); + } + }; + struct bnll_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + real_t expval = a > 50.0f ? 50.0f : a; // kBNLL_THRESHOLD = 50.0f + expval = expf(-expval); + return 1.0f / (1.0f + expval); + } + }; + + struct square { + MSHADOW_XINLINE static real_t Map(real_t a) { + return a * a; + } + }; + /*! \brief scaled tanh, hard code the scale factor*/ + struct stanh { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 1.7159047*tanhf(0.66666667 *a); + } + }; + /*! \breif back prop for scaled tanh: */ + struct stanh_grad { + MSHADOW_XINLINE static real_t Map(real_t a) { + return 0.66666667*1.7159047 -0.66666667/1.7159047*a*a; + } + }; + + }; //namespace op + +}; //namespace mshadow + +namespace mshadow { + namespace op { + /*! \brief used for generate Bernoulli mask */ + struct threshold { + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return a < b ? 1.0f : 0.0f; + } + }; + + /*! \brief used for generate element of power */ + struct power { + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return powf( a, b ); + } + }; + struct sqrtop { + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return sqrt(a+b); + } + }; + }; // namespace op +}; // namespace mshadow + +#endif // CXXNET_OP_H http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor.h ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor.h b/include/mshadow/tensor.h new file mode 100644 index 0000000..42d13d3 --- /dev/null +++ b/include/mshadow/tensor.h @@ -0,0 +1,472 @@ +#ifndef MSHADOW_TENSOR_H +#define MSHADOW_TENSOR_H +/*! + * \file tensor.h + * \brief header file of tensor data structure and functions + * covention: this lib requires explicit memory allocation and de-allocation + * all the data structure Tensor<cpu,1>, Tensor<gpu,1> are like handles(pointers), + * no memory allocation is happening during calculation + * \author Bing Xu, Tianqi Chen + */ +#include "tensor_base.h" +#include "tensor_expr.h" + +namespace mshadow { + /*! + * \brief shape of a tensor + * IMPORTANT NOTE: this shape is different from numpy.shape + * shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension + * shape[k] corresponds to k-th dimension of tensor + * \tparam dimension dimension of tensor + */ + template<int dimension> + struct Shape { + public: + /*! \brief maximum dimension of tensor */ + const static int kMaxShape = dimension; + /*! \brief maximum dimension minus 1 */ + const static int kSubShape = dimension - 1; + public: + /*! \brief default constructor, do nothing */ + MSHADOW_XINLINE Shape(void) {} + /*! \brief constuctor */ + MSHADOW_XINLINE Shape( const Shape<dimension> &s ){ + #pragma unroll + for( int i = 0; i < kMaxShape; ++i ){ + this->shape_[i] = s[i]; + } + this->stride_ = s.stride_; + } + /*! + * \brief get corresponding index + * \param idx dimension index + * \return the corresponding dimension size + */ + MSHADOW_XINLINE index_t& operator[](index_t idx) { + return shape_[ idx ]; + } + /*! + * \brief get corresponding index + * \param idx dimension index + * \return the corresponding dimension size + */ + MSHADOW_XINLINE const index_t& operator[](index_t idx) const { + return shape_[ idx ]; + } + /*! \return whether two shape equals */ + MSHADOW_XINLINE bool operator==(const Shape<kMaxShape> &s) const { + #pragma unroll + for ( int i = 0; i < kMaxShape; ++i ) { + if (s.shape_[i] != this->shape_[i]) return false; + } + return true; + } + /*! + * flatten the higher dimension to second dimension, return a 2D shape + * \return the flat 2d shape + */ + MSHADOW_XINLINE Shape<2> FlatTo2D(void) const { + Shape<2> s; + s.stride_ = this->stride_; + s.shape_[ 0 ] = this->shape_[ 0 ]; + index_t ymax = 1; + + #pragma unroll + for (int i = 1; i < kMaxShape; ++i) { + ymax *= this->shape_[ i ]; + } + s.shape_[1] = ymax; + return s; + } + /*! \return number of valid elements */ + MSHADOW_XINLINE size_t Size(void) const{ + size_t memsz = this->shape_[ 0 ]; + #pragma unroll + for (int i = 1; i < kMaxShape; ++i) { + memsz *= this->shape_[ i ]; + } + return memsz; + } + /*! \return memory size, including the aligned x dimension */ + MSHADOW_XINLINE size_t MSize(void) const { + size_t memsz = this->stride_; + #pragma unroll + for (int i = 1; i < kMaxShape; ++i) { + memsz *= this->shape_[ i ]; + } + return memsz; + } + /*! + * \return product shape in [dimstart,dimend) + * \param dimstart start dimension + * \param dimend end dimension + */ + MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{ + index_t num = 1; + #pragma unroll + for (int i = dimstart; i < dimend; ++i) { + num *= this->shape_[ i ]; + } + return num; + } + /*! + * \brief get subshape + * \return subshape + */ + MSHADOW_XINLINE Shape<kSubShape> SubShape(void) const { + Shape<kSubShape> s; + s.stride_ = this->stride_; + // for cuda + #pragma unroll + for (int i = 0; i < kSubShape; ++i) { + s.shape_[ i ] = this->shape_[ i ]; + } + return s; + } + + public: + /*! \brief storing the dimension information */ + index_t shape_[ kMaxShape ]; + /*! + * \brief storing the stride information in x dimension + * this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency + */ + index_t stride_; + }; + // useful construction functions to generate shape + /*! + * \brief construct a one dimension shape, stride will equal s0 + * \param s0 size of dimension 0 + * \return the shape construction + */ + MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){ + Shape<1> s; s[0] = s0; s.stride_ = s0; + return s; + } + /*! + * \brief construct a two dimension shape, stride will equal s0 + * \param s1 size of dimension 1 + * \param s0 size of dimension 0 + * \return the shape construction + */ + MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){ + Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0; + return s; + } + /*! + * \brief construct a three dimension shape, stride will equal s0 + * \param s2 size of dimension 2 + * \param s1 size of dimension 1 + * \param s0 size of dimension 0 + * \return the shape construction + */ + MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){ + Shape<3> s; + s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0; + return s; + } + /*! + * \brief construct a four dimension shape, stride will equal s0 + * \param s3 size of dimension 3 + * \param s2 size of dimension 2 + * \param s1 size of dimension 1 + * \param s0 size of dimension 0 + * \return the shape construction + */ + MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){ + Shape<4> s; + s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0; + return s; + } +}; // namespace mshadow + +namespace mshadow { + /*! \brief device name CPU */ + struct cpu { + /*! \brief whether this device is CPU or not */ + const static bool kDevCPU = true; + /*! \brief device flag number, identifies this device */ + const static int kDevMask = 1<<0; + }; + /*! \brief device name CPU */ + struct gpu { + /*! \brief whether this device is CPU or not */ + const static bool kDevCPU = false; + /*! \brief device flag number, identifies this device */ + const static int kDevMask = 1<<1; + }; + + // more compact template + /*! + * \brief general tensor + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + */ + template<typename Device, int dimension> + struct Tensor: public expr::ContainerExp< Tensor<Device,dimension> >{ + public: + /*! \brief whether current type lies in cpu */ + const static bool kDevCPU = Device::kDevCPU; + /*! \brief dimension of subtype */ + const static int kSubdim = dimension - 1; + + public: + /*! \brief pointer to the data */ + real_t *dptr; + /*! \brief shape of the tensor */ + Shape<dimension> shape; + public: + /*! \brief default constructor */ + MSHADOW_XINLINE Tensor(void) {} + /*! \brief constructor from shape */ + MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape(shape) {} + /*! \brief constructor from data pointer and shape */ + MSHADOW_XINLINE Tensor(real_t *dptr, const Shape<dimension> &shape): dptr((real_t*)dptr), shape(shape) {} + /*! + * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together + * \return tensor after flatten + */ + MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const { + return Tensor<Device, 2>(reinterpret_cast<real_t*> \ + (dptr), shape.FlatTo2D()); + } + /*! + * \brief get a element of dimension - 1 + * \param idx index + * \return the result tensor + */ + MSHADOW_XINLINE Tensor<Device, kSubdim> operator[](index_t idx) const { + Shape<kSubdim> s = shape.SubShape(); + return Tensor<Device, kSubdim>(reinterpret_cast<real_t*> \ + (dptr) + s.MSize() * idx, s); + } + /*! + * \brief slice the tensor in highest dimension [begin,end) + * \param begin begin position of slice + * \param end end position of slice + * \return tensor after slice + */ + MSHADOW_XINLINE Tensor<Device, dimension> Slice(index_t begin, index_t end) const { + Shape<dimension> s = this->shape; + s[ dimension - 1 ] = end - begin; + return Tensor<Device, dimension>(reinterpret_cast<real_t*>\ + (dptr) + s.SubShape().MSize() * begin, s); + } + public: + /*!\brief functions to fit expression template */ + inline Tensor<Device,dimension>& operator=( real_t s ){ + return this->__assign( s ); + } + /*!\brief functions to fit expression template */ + template<typename E> + inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){ + return this->__assign( exp ); + } + /*!\brief functions to fit expression template */ + template<typename E> + inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){ + return this->__assign( exp ); + } + }; + + /* + * respecialized class Tensor1D,thei is due to different implementation in operator[] + */ + template<typename Device> + struct Tensor<Device,1>: public expr::ContainerExp< Tensor<Device,1> >{ + public: + real_t *dptr; + Shape<1> shape; + public: + MSHADOW_XINLINE Tensor(void) {} + MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {} + MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), shape(shape) {} + + MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const { + return Tensor<Device, 2>(reinterpret_cast<real_t*> \ + (dptr), shape.FlatTo2D()); + } + MSHADOW_XINLINE Tensor<Device, 1> Slice(index_t begin, index_t end) const { + Shape<1> s; + s[0] = s.stride_ = end - begin; + return Tensor<Device, 1>(reinterpret_cast<real_t*> \ + (dptr) + begin, s); + } + MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; } + MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; } + public: + // functions to fit expression template + inline Tensor<Device,1>& operator=( double s ){ + return this->__assign( s ); + } + template<typename E> + inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){ + return this->__assign( exp ); + } + template<typename E> + inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){ + return this->__assign( exp ); + } + }; +}; // namespace mshadow + +// add unroll loops for the shape +namespace mshadow { + // function declarations + /*! + * \brief initialize tensor engine, used to call intialization functions of dependent libs + * this function should be called before all GPU tensor operations, + * for using tensors in CPU, this call is actually not needed + * \param device_id GPU device id to be choosed + */ + inline void InitTensorEngine( int device_id=0 ); + /*! + * \brief Shutdown tensor engine, + * this function should be called after all GPU tensor operations, + * for using tensors in CPU, this call is actually not needed + */ + inline void ShutdownTensorEngine( void ); + + /*! + * \brief CPU/CPU: allocate space for CTensor, according to the shape in the obj + * this function is responsible to set the stride_ in each obj.shape + * \tparam dim specify the dim of tensor + * \param obj the tensor object, with shape specified + * \param pad whether padding dimension 0, to make last dimension aligned, + * padding may help improve efficiency of matrix multiplications + * if true, will allocate space with stride_ that may not equals shape[0] + * if false, will allocate continuous space + */ + template<int dim> + inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD); + /*! \brief refer to comment of cpu ver \sa AllocSpace */ + template<int dim> + inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD); + + /*! + * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL + * \tparam dim specify the dim of tensor + * \param obj the tensor object + */ + template<int dim> + inline void FreeSpace(Tensor<cpu,dim> &obj); + /*! \brief refer to comment of cpu ver \sa FreeSpace */ + template<int dim> + inline void FreeSpace(Tensor<gpu,dim> &obj); + + /*! + * \brief CPU/GPU: short cut to allocate and initialize a Tensor + * \tparam Device device of tensor + * \tparam dim dimention of tensor + * \param shape: shape of tensor + * \param initv: initialization value + * \param pad : padding option + * \sa AllocSpace + */ + template<typename Device, int dim> + inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD); + + /*! + * \brief copy data from one tensor to another, with same shape + * \tparam dim specify the dim of tensor + * \param dst target tensor + * \param src source tensor + */ + template<int dim> + inline void Copy(Tensor<cpu,dim> dst, const Tensor<cpu,dim> &src ); + /*! \brief refer to comment of cpu ver \sa Copy */ + template<int dim> + inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src ); + /*! \brief refer to comment of cpu ver \sa Copy */ + template<int dim> + inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src ); + /*! \brief refer to comment of cpu ver \sa Copy */ + template<int dim> + inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src ); + + + /*! + * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( sum_j exp( energy[i][j] ) ) + * \param dst destination + * \param energy input energy + */ + inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2> &energy ); + /*! \brief refer to comment of cpu ver \sa Softmax */ + inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2> &energy ); + +}; // namespace mshadow + + +namespace mshadow{ + // function declarations to support expression, no need to understand them + // these functions do not need to be directly used + + /*! + * \brief CPU/GPU: map a expression to a tensor, this function calls MapPlan + * \tparam Saver specify storage method + * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \sa namespace mshadow:sv, mshadow::op, mshadow::expr + */ + template<typename Saver, int dim, typename E, int etype> + inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ); + /*! \brief refer to comment of cpu ver \sa MapExp */ + template<typename Saver, int dim, typename E, int etype> + inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp ); + + /*! + * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0) + * \tparam Saver specify storage method + * \tparam Reducer specify a reducer method + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \param scale scale the result before save + * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr + */ + template<typename Saver, typename Reducer, typename E, int etype> + inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f ); + /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */ + template<typename Saver, typename Reducer, typename E, int etype> + inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f ); + + + /*! + * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2) + * \tparam Saver specify storage method + * \tparam Reducer specify a reducer method + * \tparam E specifies the expression type, not need to specify this parameter during usage + * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest + * \tparam etype expression type + * \param dst destination + * \param exp expression + * \param scale scale the result before save + * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr + */ + template<typename Saver, typename Reducer, int dimkeep, typename E, int etype> + inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f ); + /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */ + template<typename Saver, typename Reducer, int dimkeep, typename E, int etype> + inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f ); + +};// namespace mshadow + +// execution implementation of expression evaluations +#include "tensor_expr_engine-inl.hpp" +// cpu implementation of functions +#include "tensor_cpu-inl.hpp" +// gpu implementation of functions +#include "tensor_gpu-inl.hpp" +// extension of expressions +#include "tensor_expr_ext.h" +// io +#include "tensor_io.h" +// container +#include "tensor_container.h" +// random number generator +#include "tensor_random.h" +#endif // TENSOR_H http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_base.h ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_base.h b/include/mshadow/tensor_base.h new file mode 100644 index 0000000..b251cba --- /dev/null +++ b/include/mshadow/tensor_base.h @@ -0,0 +1,298 @@ +#ifndef MSHADOW_TENSOR_BASE_H +#define MSHADOW_TENSOR_BASE_H +/*! + * \file tensor_base.h + * \brief definitions of base types, macros functions + * + * \author Bing Xu, Tianqi Chen + */ +#include <cmath> +#include <cstdio> +#include <cfloat> +#include <climits> +#include <algorithm> +// macro defintiions + +/*!\brief if this macro is define to be 1, mshadow should compile without any of other libs */ +#ifndef MSHADOW_STAND_ALONE + #define MSHADOW_STAND_ALONE 0 +#endif + +/*! \brief whether do padding during allocation */ +#ifndef MSHADOW_ALLOC_PAD + #define MSHADOW_ALLOC_PAD true +#endif + +/*! + * \brief x dimension of data must be bigger pad_size * ratio to be alloced padded memory, otherwise use tide allocation + * for example, if pad_ratio=2, GPU memory alignement size is 32, then we will only allocate padded memory if x dimension > 64 + * set it to 0 then we will always allocate padded memory + */ +#ifndef MSHADOW_MIN_PAD_RATIO + #define MSHADOW_MIN_PAD_RATIO 2 +#endif + +#if MSHADOW_STAND_ALONE + #define MSHADOW_USE_CBLAS 0 + #define MSHADOW_USE_MKL 0 + #define MSHADOW_USE_CUDA 0 +#endif + +/*! \brief use CBLAS for CBLAS */ +#ifndef MSHADOW_USE_CBLAS + #define MSHADOW_USE_CBLAS 0 +#endif +/*! \brief use MKL for BLAS */ +#ifndef MSHADOW_USE_MKL + #define MSHADOW_USE_MKL 1 +#endif +/*! \brief use CUDA support, must ensure that the cuda include path is correct, or directly compile using nvcc */ +#ifndef MSHADOW_USE_CUDA + #define MSHADOW_USE_CUDA 1 +#endif +/*! \brief use single precition float */ +#ifndef MSHADOW_SINGLE_PRECISION + #define MSHADOW_SINGLE_PRECISION 1 +#endif +/*! \brief whether use SSE */ +#ifndef MSHADOW_USE_SSE + #define MSHADOW_USE_SSE 1 +#endif +/*! \brief whether use NVML to get dynamic info */ +#ifndef MSHADOW_USE_NVML + #define MSHADOW_USE_NVML 0 +#endif +// SSE is conflict with cudacc +#ifdef __CUDACC__ + #undef MSHADOW_USE_SSE + #define MSHADOW_USE_SSE 0 +#endif + +#if MSHADOW_USE_CBLAS +extern "C"{ + #include <cblas.h> +} +#elif MSHADOW_USE_MKL + #include <mkl.h> + #include <mkl_cblas.h> + #include <mkl_vsl.h> + #include <mkl_vsl_functions.h> +#endif + +#if MSHADOW_USE_CUDA + #include <cublas.h> + #include <curand.h> +#endif + +#if MSHADOW_USE_NVML + #include <nvml.h> +#endif +// -------------------------------- +// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code. +#ifdef MSHADOW_XINLINE + #error "MSHADOW_XINLINE must not be defined" +#endif +#ifdef __CUDACC__ + #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__ +#else + #define MSHADOW_XINLINE inline __attribute__((always_inline)) +#endif +/*! \brief cpu force inline */ +#define MSHADOW_CINLINE inline __attribute__((always_inline)) + +#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L + #define MSHADOW_CONSTEXPR constexpr +#else + #define MSHADOW_CONSTEXPR const +#endif + +/*! \brief namespace for mshadow */ +namespace mshadow { + /*! \brief buffer size for each random number generator */ + const unsigned kRandBufferSize = 1000000; + /*! \brief pi */ + const float kPi = 3.1415926f; + +#if MSHADOW_SINGLE_PRECISION + /*! \brief type that will be used for content */ + typedef float real_t; +#else + typedef double real_t; +#endif + /*! \brief type that will be used for index */ + typedef unsigned index_t; +}; // namespace mshadow + +namespace mshadow { + /*! \brief namespace for operators */ + namespace op { + // binary operator + /*! \brief mul operator */ + struct mul{ + /*! \brief map a, b to result using defined operation */ + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return a * b; + } + }; + /*! \brief plus operator */ + struct plus { + /*! \brief map a, b to result using defined operation */ + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return a + b; + } + }; + /*! \brief minus operator */ + struct minus { + /*! \brief map a, b to result using defined operation */ + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return a - b; + } + }; + /*! \brief divide operator */ + struct div { + /*! \brief map a, b to result using defined operation */ + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return a / b; + } + }; + /*! \brief get rhs */ + struct right { + /*! \brief map a, b to result using defined operation */ + MSHADOW_XINLINE static real_t Map(real_t a, real_t b) { + return b; + } + }; + }; // namespace op + + /*! \brief namespace for savers */ + namespace sv { + /*! \brief save to saver: = */ + struct saveto { + /*! \brief save b to a using save method */ + MSHADOW_XINLINE static void Save(real_t& a, real_t b) { + a = b; + } + /*! \brief helper constant to use BLAS, alpha */ + MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f; + /*! \brief helper constant to use BLAS, beta */ + MSHADOW_CONSTEXPR static real_t kBetaBLAS = 0.0f; + /*! \brief corresponding binary operator type */ + typedef op::right OPType; + }; + /*! \brief save to saver: += */ + struct plusto { + /*! \brief save b to a using save method */ + MSHADOW_XINLINE static void Save(real_t& a, real_t b) { + a += b; + } + /*! \brief helper constant to use BLAS, alpha */ + MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f; + /*! \brief helper constant to use BLAS, beta */ + MSHADOW_CONSTEXPR static real_t kBetaBLAS = 1.0f; + /*! \brief corresponding binary operator type */ + typedef op::plus OPType; + }; + /*! \brief minus to saver: -= */ + struct minusto { + /*! \brief save b to a using save method */ + MSHADOW_XINLINE static void Save(real_t& a, real_t b) { + a -= b; + } + /*! \brief helper constant to use BLAS, alpha */ + MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f; + /*! \brief helper constant to use BLAS, beta */ + MSHADOW_CONSTEXPR static real_t kBetaBLAS = 1.0f; + /*! \brief corresponding binary operator type */ + typedef op::minus OPType; + }; + /*! \brief multiply to saver: *= */ + struct multo { + /*! \brief save b to a using save method */ + MSHADOW_XINLINE static void Save(real_t& a, real_t b) { + a *= b; + } + /*! \brief corresponding binary operator type */ + typedef op::mul OPType; + }; + /*! \brief divide to saver: /= */ + struct divto { + /*! \brief save b to a using save method */ + MSHADOW_XINLINE static void Save(real_t& a, real_t b) { + a /= b; + } + /*! \brief corresponding binary operator type */ + typedef op::div OPType; + }; + }; // namespace sv + + + namespace op { + // unary operator/ function: example + // these operators can be defined by user, in the same style as binary and unary operator + // to use, simply write F<op::identity>( src ) + /*! \brief identity function that maps a real number to it self */ + struct identity{ + /*! \brief map a to result using defined operation */ + MSHADOW_XINLINE static real_t Map(real_t a) { + return a; + } + }; + }; // namespace op + + /*! \brief namespace for potential reducer operations */ + namespace red { + /*! \brief sum reducer */ + struct sum { + /*! \brief do reduction into dst */ + MSHADOW_XINLINE static void Reduce( volatile real_t& dst, volatile real_t src ) { + dst += src; + } + /*! \brief calculate gradient of redres with respect to redsrc, redres: reduced result, redsrc: one of reduction element */ + MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) { + return 1.0f; + } + /*! \brief an intial value of reducer */ + MSHADOW_CONSTEXPR static real_t kInitV = 0.0f; + }; + /*! \brief maximum reducer */ + struct maximum { + /*! \brief do reduction into dst */ + MSHADOW_XINLINE static void Reduce( volatile real_t& dst, volatile real_t src ) { + using namespace std; + dst = max( dst, src ); + } + /*! \brief calculate gradient of redres with respect to redsrc, redres: reduced result, redsrc: one of reduction element */ + MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) { + return redres == redsrc ? 1.0f: 0.0f; + } + /*! \brief an intial value of reducer */ +#if MSHADOW_SINGLE_PRECISION + MSHADOW_CONSTEXPR static real_t kInitV = -FLT_MAX; +#else + MSHADOW_CONSTEXPR static real_t kInitV = -DBL_MAX; +#endif + }; + }; + + /*! \brief namespace for helper utils of the project */ + namespace utils{ + /*! \brief send error message then exit */ + inline void Error( const char *msg ){ + fprintf( stderr, "Error:%s\n",msg ); + exit( -1 ); + } + /*! \brief assert a expression is true */ + inline void Assert( bool exp ){ + if( !exp ) Error( "AssertError" ); + } + /*! \brief assert a expression is true */ + inline void Assert( bool exp, const char *msg ){ + if( !exp ) Error( msg ); + } + /*! \brief warning */ + inline void Warning( const char *msg ){ + fprintf( stderr, "warning:%s\n",msg ); + } + }; // namespace utils +}; // namespace mshadow +#endif // TENSOR_BASE_H http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_container.h ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_container.h b/include/mshadow/tensor_container.h new file mode 100644 index 0000000..f0699e7 --- /dev/null +++ b/include/mshadow/tensor_container.h @@ -0,0 +1,152 @@ +#ifndef MSHADOW_TENSOR_CONTAINER_H +#define MSHADOW_TENSOR_CONTAINER_H +/*! + * \file tensor_container.h + * \brief tensor container that does memory allocation and resize like STL + * \author Tianqi Chen + */ +#include "tensor.h" +#include "tensor_io.h" + +namespace mshadow{ + /*! + * \brief tensor container that does memory allocation and resize like STL, + * use it to save the lines of FreeSpace in class. + * Do not abuse it, efficiency can come from pre-allocation and no re-allocation + * + * \tparam Device which device the tensor is on + * \tparam dimension dimension of the tensor + */ + template<typename Device, int dimension> + class TensorContainer: public Tensor<Device,dimension>{ + public: + /*! + * \brief constructor + * \param pad whether use padding alignment in space allocation + */ + TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){ + this->pad_ = pad; + this->dptr = data_.dptr = NULL; + this->shape[0] = 0; + this->shape.stride_ = 0; + this->data_.shape.stride_ = 0; + this->data_.shape[1] = 0; + } + /*! + * \brief constructor + * \param shape intial shape + */ + TensorContainer( const Shape<dimension> &shape ){ + this->pad_ = MSHADOW_ALLOC_PAD; + data_.dptr = NULL; + this->AllocByShape( shape ); + } + /*! + * \brief constructor + * \param shape intial shape + * \param initv intial value + */ + TensorContainer( const Shape<dimension> &shape, real_t initv ){ + this->pad_ = MSHADOW_ALLOC_PAD; + data_.dptr = NULL; + this->AllocByShape( shape ); + (*this) = initv; + } + ~TensorContainer( void ){ + this->FreeSpace(); + } + /*! + * \brief resize the container to given shape, content is NOT preserved + * \param shape target shape + */ + inline void Resize( const Shape<dimension> &shape ){ + Shape<2> s2 = shape.FlatTo2D(); + if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > data_.shape[1] ){ + this->AllocByShape( shape ); + }else{ + this->shape = shape; + if( this->pad_ ){ + this->shape.stride_ = data_.shape.stride_; + }else{ + this->shape.stride_ = this->shape[ 0 ]; + } + } + } + /*! + * \brief resize the container to given shape, and initialize, content is NOT preserved + * \param shape target shape + * \param initv initialization value + */ + inline void Resize( const Shape<dimension> &shape, real_t initv ){ + this->Resize( shape ); + (*this) = initv; + } + /*! \brief set whether padding is allowed in tensor */ + inline void set_pad( bool pad ){ + this->pad_ = pad; + } + /*! + * \brief save by binary format + * \param fo output binary stream + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template<typename TStream> + inline void SaveBinary( TStream &fo ) const{ + mshadow::SaveBinary( fo, *this ); + } + /*! + * \brief load by binary format, a temp Tensor<cpu,dim> storage will be allocated + * \param fi input binary stream + * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream. + */ + template<typename TStream> + inline void LoadBinary( TStream &fi ) { + Tensor<cpu,dimension> tmp; + mshadow::LoadBinary( fi, tmp, false ); + this->Resize( tmp.shape ); + Copy( *this, tmp ); + mshadow::FreeSpace( tmp ); + } + public: + // functions to fit exp template + inline Tensor<Device,dimension>& operator=( real_t s ){ + return this->__assign( s ); + } + template<typename E> + inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){ + return this->__assign( exp ); + } + template<typename E> + inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){ + return this->__assign( exp ); + } + private: + /*! \brief whether we do padding in the space */ + bool pad_; + /*! \brief the shape of data_ is actually current data space */ + Tensor<Device, 2> data_; + private: + inline void FreeSpace (void){ + if( data_.dptr != NULL ){ + mshadow::FreeSpace( data_ ); + data_.dptr = this->dptr = NULL; + } + } + inline void AllocByShape (const Shape<dimension>& shape){ + if( data_.dptr != NULL ){ + this->FreeSpace(); + } + data_.shape = shape.FlatTo2D(); + mshadow::AllocSpace( data_, pad_ ); + this->dptr = data_.dptr; + this->shape = shape; + if( this->pad_ ){ + this->shape.stride_ = data_.shape.stride_; + }else{ + this->shape.stride_ = shape[0]; + } + } + }; +};// namespace mshadow + +#endif http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_cpu-inl.hpp ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_cpu-inl.hpp b/include/mshadow/tensor_cpu-inl.hpp new file mode 100644 index 0000000..0fa3cfa --- /dev/null +++ b/include/mshadow/tensor_cpu-inl.hpp @@ -0,0 +1,168 @@ +#ifndef MSHADOW_TENSOR_CPU_INL_HPP +#define MSHADOW_TENSOR_CPU_INL_HPP +/*! + * \file tensor_cpu-inl.hpp + * \brief implementation of CPU host code + * \author Bing Xu, Tianqi Chen + */ +#include <cstring> +#include "tensor_base.h" +#include "tensor_sse-inl.hpp" + +namespace mshadow { + template<int dim> + inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad ){ + size_t pitch; + if( pad ){ + obj.dptr = (real_t*)sse2::AlignedMallocPitch + ( pitch, obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] ); + obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) ); + }else{ + obj.shape.stride_ = obj.shape[0]; + obj.dptr = (real_t*)sse2::AlignedMallocPitch + ( pitch, obj.shape.Size() * sizeof(real_t), 1 ); + } + } + + template<typename Device, int dim> + inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad ){ + Tensor<Device, dim> obj( shape ); + AllocSpace( obj, pad ); + MapExp<sv::saveto>( obj, expr::ScalarExp( initv ) ); + return obj; + } + + template<int dim> + inline void FreeSpace(Tensor<cpu,dim> &obj){ + sse2::AlignedFree( obj.dptr ); + obj.dptr = NULL; + } + + template<int dim> + inline void Copy(Tensor<cpu,dim> _dst, const Tensor<cpu,dim> &_src ){ + utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" ); + Tensor<cpu,2> dst = _dst.FlatTo2D(); + Tensor<cpu,2> src = _src.FlatTo2D(); + for (index_t y = 0; y < dst.shape[1]; ++y ) { + memcpy( dst[y].dptr, src[y].dptr, sizeof(real_t) * dst.shape[0] ); + } + } + + template<typename Saver, typename E, int dim> + inline void MapPlan(Tensor<cpu,dim> _dst, const expr::Plan<E> &plan){ + Tensor<cpu,2> dst = _dst.FlatTo2D(); + for (index_t y = 0; y < dst.shape[1]; ++y ) { + for (index_t x = 0; x < dst.shape[0]; ++x ) { + // trust your compiler! -_- they will optimize it + Saver::Save(dst[y][x], plan.Eval( y, x ) ); + } + } + } + + // code to handle SSE optimization + template<bool pass_check,typename Saver, int dim, typename E, int etype> + struct MapExpCPUEngine; + template<typename SV, int dim, typename E, int etype> + struct MapExpCPUEngine<false,SV,dim,E,etype>{ + inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){ + MapPlan<SV>( dst, MakePlan( exp.self() ) ); + } + }; + + #if MSHADOW_USE_SSE + template<typename SV, int dim, typename E, int etype> + struct MapExpCPUEngine<true,SV,dim,E,etype>{ + inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){ + using namespace expr; + if( SSEAlignCheck<dim,E>::Check( exp.self() ) && SSEAlignCheck< dim,Tensor<cpu,dim> >::Check(dst) ){ + MapSSEPlan<SV>( dst, MakeSSEPlan( exp.self() ) ); + }else{ + MapPlan<SV>( dst, MakePlan( exp.self() ) ); + } + } + }; + #endif + + template<typename Saver, int dim, typename E, int etype> + inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){ + using namespace expr; + TypeCheckPass< TypeCheck<cpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type(); + Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() ); + utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" ); + #if MSHADOW_USE_SSE + MapExpCPUEngine< SSECheck<E>::kPass,Saver,dim,E,etype >::Map( dst, exp ); + #else + MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp ); + #endif + } + + template<typename Saver, typename Reducer, typename E, int etype> + inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){ + using namespace expr; + TypeCheckPass< TypeCheck<cpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D(); + + utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" ); + utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" ); + // execution + expr::Plan<E> plan = MakePlan( exp.self() ); + for( index_t x = 0; x < eshape[0]; ++x ){ + real_t res = plan.Eval( 0, x ); + for( index_t y = 1; y < eshape[1]; ++y ){ + Reducer::Reduce( res, plan.Eval( y, x ) ); + } + Saver::Save( dst[x], res*scale ); + } + } + + template<typename Saver, typename Reducer, int dimkeep, typename E, int etype> + inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){ + using namespace expr; + TypeCheckPass< TypeCheck<cpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp(); + typedef Shape< ExpInfo<E>::kDim > EShape; + EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ); + utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" ); + // use equvalent form + Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], + eshape.ProdShape(1,dimkeep), eshape[0] ); + + // execution + expr::Plan<E> plan = MakePlan( exp.self() ); + + for( index_t c = 0; c < pshape[2]; ++c ){ + real_t res = Reducer::kInitV; + for( index_t n = 0; n < pshape[3]; ++n ){ + real_t tres = Reducer::kInitV; + for( index_t y = 0; y < pshape[1]; ++y ){ + for( index_t x = 0; x < pshape[0]; ++x ){ + Reducer::Reduce( tres, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) ); + } + } + Reducer::Reduce( res, tres ); + } + Saver::Save( dst[c], res*scale ); + } + } + + inline void Softmax( Tensor<cpu,1> dst, const Tensor<cpu,1>& energy ){ + real_t mmax = energy[0]; + for( real_t x = 1; x < dst.shape[0]; ++x ) + if( mmax < energy[x] ) mmax = energy[x]; + real_t sum = 0.0f; + for( index_t x = 0; x < dst.shape[0]; ++x ){ + dst[x] = std::exp( energy[x] - mmax ); + sum += dst[x]; + } + for( index_t x = 0; x < dst.shape[0]; ++x ){ + dst[x] /= sum; + } + } + inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2>& energy ){ + utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" ); + for( index_t y = 0; y < dst.shape[1]; ++y ){ + Softmax( dst[y], energy[y] ); + } + } +}; // namespace mshadow + +#endif // TENSOR_CPU_INL_HPP http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr.h ---------------------------------------------------------------------- diff --git a/include/mshadow/tensor_expr.h b/include/mshadow/tensor_expr.h new file mode 100644 index 0000000..ac8fde7 --- /dev/null +++ b/include/mshadow/tensor_expr.h @@ -0,0 +1,367 @@ +#ifndef MSHADOW_TENSOR_EXPR_H +#define MSHADOW_TENSOR_EXPR_H +/*! + * \file tensor_expr.h + * \brief definitions of abstract expressions and expressions template + * \author Tianqi Chen, Bing Xu + */ +#include "tensor_base.h" + +namespace mshadow{ + /*! + * \brief namespace for abstract expressions and expressions template, + * have no dependecy on tensor.h, + * These data structure takes no charge in computations, + * they are only used to define operations and represent expression in a symbolic way + */ + namespace expr{ + + /*! \brief type of expressions */ + namespace type{ + /*! \brief this expression directly correspnds to a data class */ + const int kContainer = 0; + /*! \brief this only contains element-wise vector operations */ + const int kMapper = 1; + /*! \brief othercase: e.g dot product */ + const int kComplex = 3; + }; + + /*! + * \brief expression engine that actually interprets these expressions + * this is a function template that needed to be implemented for specific expressions + */ + template<typename Saver,typename Container> + struct ExpEngine{ + template<typename EType> + inline static void Eval( Container& dst, const EType &exp ); + }; + + template<typename Container> + class ContainerExp; + class ScalarExp; + + /*! + * \brief base class for expression + * \tparam SubType inheritated class must put their type into this parameter + * \tparam exp_type expression type, see namespace type + */ + template<typename SubType, int exp_type> + struct Exp{ + public: + /*! \return subtype instance of current class */ + inline const SubType& self( void ) const{ + return *static_cast<const SubType*>(this); + } + /*! \return reference of subtype instance of current class */ + inline SubType& refself( void ){ + return *static_cast<SubType*>(this); + } + }; + + /*! \brief scalar expression */ + struct ScalarExp: public Exp<ScalarExp, type::kMapper>{ + /*! \brief scalar value */ + real_t scalar_; + /*! \brief constructor */ + ScalarExp( real_t scalar ):scalar_(scalar){} + }; + + /*! \brief represent a transpose expression of a container */ + template<typename EType> + struct TransposeExp: public Exp< TransposeExp<EType>, type::kComplex >{ + public: + /*! \brief expression to be transposed */ + const EType &exp; + /*! \brief constructor */ + TransposeExp( const EType &e ):exp(e){} + /*! \brief transpose expression */ + inline const EType & T( void ) const{ + return exp; + } + }; + + /*! + * \brief base class of all variables, that can be assigned to values + * \tparam Container the actually class of data container, e.g. CTensor1D + */ + template<typename Container> + class ContainerExp: public Exp< Container, type::kContainer >{ + public: + /*! + *\brief transpose of a matrix + *\return transpose of current expression + */ + inline const TransposeExp<Container> T( void ) const{ + return TransposeExp<Container>( this->self() ); + } + public: + /*! \brief operator overload */ + inline Container &operator+=( real_t s ){ + ExpEngine<sv::plusto,Container>::Eval( this->refself(), ScalarExp(s) ); + return this->refself(); + } + /*! \brief operator overload */ + inline Container &operator-=( real_t s ){ + ExpEngine<sv::minusto,Container>::Eval( this->refself(), ScalarExp(s) ); + return this->refself(); + } + /*! \brief operator overload */ + inline Container &operator*=( real_t s ){ + ExpEngine<sv::multo,Container>::Eval( this->refself(), ScalarExp(s) ); + return this->refself(); + } + /*! \brief operator overload */ + inline Container &operator/=( real_t s ){ + ExpEngine<sv::divto,Container>::Eval( this->refself(), ScalarExp(s) ); + return this->refself(); + } + /*! \brief operator overload */ + inline Container &__assign( real_t s ){ + ExpEngine<sv::saveto,Container>::Eval( this->refself(), ScalarExp(s) ); + return this->refself(); + } + public: + /*! \brief implementation of operator=, note that we can not define container = container */ + template<typename E> + inline Container &__assign( const Exp<E,type::kMapper> &exp ){ + ExpEngine<sv::saveto,Container>::Eval( this->refself(), exp.self() ); + return this->refself(); + } + /*! \brief implementation of operator=, note that we can not define container = container */ + template<typename E> + inline Container &__assign( const Exp<E,type::kComplex> &exp ){ + ExpEngine<sv::saveto,Container>::Eval( this->refself(), exp.self() ); + return this->refself(); + } + /*! \brief implementation of operator+= */ + template<typename E,int etype> + inline Container &operator+=( const Exp<E,etype> &exp ){ + ExpEngine<sv::plusto,Container>::Eval( this->refself(), exp.self() ); + return this->refself(); + } + /*! \brief implementation of operator-= */ + template<typename E,int etype> + inline Container &operator-=( const Exp<E,etype> &exp ){ + ExpEngine<sv::minusto,Container>::Eval( this->refself(), exp.self() ); + return this->refself(); + } + /*! \brief implementation of operator*= */ + template<typename E,int etype> + inline Container &operator*=( const Exp<E,etype> &exp ){ + ExpEngine<sv::multo,Container>::Eval( this->refself(), exp.self() ); + return this->refself(); + } + /*! \brief implementation of operator/= */ + template<typename E,int etype> + inline Container &operator/=( const Exp<E,etype> &exp ){ + ExpEngine<sv::divto,Container>::Eval( this->refself(), exp.self() ); + return this->refself(); + } + }; + }; // namespace expr + + namespace expr{ + /*! + * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] ) + * \tparam TA type of lhs + * \tparam TB type of rhs + * \tparam ltrans whether lhs is transposed + * \tparam rtrans whether rhs is transposed + */ + template<typename TA,typename TB,bool ltrans,bool rtrans> + struct DotExp: public Exp< DotExp<TA,TB,ltrans,rtrans>, type::kComplex >{ + /*! \brief left operand */ + const TA& lhs_; + /*! \brief right operand */ + const TB& rhs_; + /*! \brief scale over result */ + real_t scale_; + /*! \brief constructor */ + DotExp( const TA &lhs, const TB &rhs, real_t scale ) + :lhs_(lhs),rhs_(rhs),scale_(scale){} + }; + + /*! \brief dot operator def */ + template<typename TA, typename TB> + inline DotExp<TA,TB,false,false> dot( const ContainerExp<TA> &lhs, const ContainerExp<TB> &rhs ){ + return DotExp<TA,TB,false,false>( lhs.self(), rhs.self(), 1.0f ); + } + /*! \brief dot operator def */ + template<typename TA, typename TB> + inline DotExp<TA,TB,true,false> dot( const TransposeExp<TA> &lhs, const ContainerExp<TB> &rhs ){ + return DotExp<TA,TB,true,false>( lhs.exp, rhs.self(), 1.0f ); + } + /*! \brief dot operator def */ + template<typename TA, typename TB> + inline DotExp<TA,TB,false,true> dot( const ContainerExp<TA> &lhs, const TransposeExp<TB> &rhs ){ + return DotExp<TA,TB,false,true>( lhs.self(), rhs.exp, 1.0f ); + } + /*! \brief dot operator def */ + template<typename TA, typename TB> + inline DotExp<TA,TB,true,true> dot( const TransposeExp<TA> &lhs, const TransposeExp<TB> &rhs ){ + return DotExp<TA,TB,true,true>( lhs.exp, rhs.exp, 1.0f ); + } + /*! \brief dot operator def */ + template<typename TA, typename TB, bool ltrans, bool rtrans > + inline DotExp<TA,TB,ltrans,rtrans> operator*( const DotExp<TA,TB,ltrans,rtrans> &lhs, real_t rhs ){ + return DotExp<TA,TB,ltrans,rtrans>( lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs ); + } + /*! \brief scale of dot operation */ + template<typename TA, typename TB, bool ltrans, bool rtrans > + inline DotExp<TA,TB,ltrans,rtrans> operator*( real_t lhs, const DotExp<TA,TB,ltrans,rtrans> &rhs ){ + return DotExp<TA,TB,ltrans,rtrans>( rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs ); + } + }; // namespace expr + + namespace expr{ + /*! + * \brief binary map expression lhs [op] rhs + * \tparam OP operator + * \tparam TA type of lhs + * \tparam TB type of rhs + * \tparam etype expression type, sa namespace::type + */ + template<typename OP, typename TA, typename TB, int etype > + struct BinaryMapExp: public Exp< BinaryMapExp<OP,TA,TB,etype>, etype >{ + /*! \brief left operand */ + const TA& lhs_; + /*! \brief right operand */ + const TB& rhs_; + /*! \brief constructor */ + BinaryMapExp( const TA &lhs, const TB &rhs ) + :lhs_(lhs), rhs_(rhs){} + }; + + /*! \brief make expression */ + template<typename OP,typename TA, typename TB, int ta, int tb> + inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > MakeExp( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){ + return BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) >( lhs.self(), rhs.self() ); + } + + /*! + * \brief short hand for MakeExp, usage F<op>(lhs, rhs). create a binary operation expression + * \param lhs left operand + * \param rhs right operand + * \tparam binary operator + * \tparam TA lhs expression + * \tparam ta lhs expression type + * \tparam TB rhs expression + * \tparam tb rhs expression type + * \sa mshadow::op + */ + template<typename OP,typename TA, typename TB, int ta, int tb> + inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > F( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){ + return MakeExp<OP>( lhs, rhs ); + } + /*! \brief operator overload for const */ + template<typename OP,typename TA, int ta> + inline BinaryMapExp<OP,TA,ScalarExp, (ta|type::kMapper) > F( const Exp<TA,ta> &lhs, const ScalarExp &rhs ){ + return MakeExp<OP>( lhs, rhs ); + } + /*! \brief operator overload for const */ + template<typename OP,typename TB, int tb> + inline BinaryMapExp<OP,ScalarExp,TB, (tb|type::kMapper) > F( const ScalarExp &lhs, const Exp<TB,tb>& rhs ){ + return MakeExp<OP>( lhs, rhs ); + } + + // operator rules + /*! \brief operator overload */ + template<typename TA, typename TB, int ta, int tb> + inline BinaryMapExp<op::plus,TA,TB, (ta|tb|type::kMapper) > operator+( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){ + return MakeExp<op::plus>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TA, typename TB, int ta, int tb> + inline BinaryMapExp<op::minus,TA,TB, (ta|tb|type::kMapper) > operator-( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){ + return MakeExp<op::minus>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TA, typename TB, int ta, int tb> + inline BinaryMapExp<op::mul,TA,TB, (ta|tb|type::kMapper) > operator*( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){ + return MakeExp<op::mul>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TA, typename TB, int ta, int tb> + inline BinaryMapExp<op::div,TA,TB, (ta|tb|type::kMapper) > operator/( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){ + return MakeExp<op::div>( lhs, rhs ); + } + // constant operators + /*! \brief operator overload */ + template<typename TA, int ta> + inline BinaryMapExp<op::plus, TA, ScalarExp, (ta|type::kMapper) > operator+( const Exp<TA,ta>& lhs, const ScalarExp& rhs ){ + return MakeExp<op::plus>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TA, int ta> + inline BinaryMapExp<op::minus, TA, ScalarExp, (ta|type::kMapper) > operator-( const Exp<TA,ta>& lhs, const ScalarExp& rhs ){ + return MakeExp<op::minus>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TA, int ta> + inline BinaryMapExp<op::mul, TA, ScalarExp, (ta|type::kMapper) > operator*( const Exp<TA,ta>& lhs, const ScalarExp& rhs ){ + return MakeExp<op::mul>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TA, int ta> + inline BinaryMapExp<op::div, TA, ScalarExp, (ta|type::kMapper) > operator/( const Exp<TA,ta>& lhs, const ScalarExp& rhs ){ + return MakeExp<op::div>( lhs, rhs ); + } + // constant operators 2 + /*! \brief operator overload */ + template<typename TB, int tb> + inline BinaryMapExp<op::plus, ScalarExp, TB, (tb|type::kMapper) > operator+( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){ + return MakeExp<op::plus>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TB, int tb> + inline BinaryMapExp<op::minus, ScalarExp, TB, (tb|type::kMapper) > operator-( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){ + return MakeExp<op::minus>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TB, int tb> + inline BinaryMapExp<op::mul, ScalarExp, TB, (tb|type::kMapper) > operator*( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){ + return MakeExp<op::mul>( lhs, rhs ); + } + /*! \brief operator overload */ + template<typename TB, int tb> + inline BinaryMapExp<op::div, ScalarExp, TB, (tb|type::kMapper) > operator/( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){ + return MakeExp<op::div>( lhs, rhs ); + } + }; + + namespace expr{ + /*! + * \brief unary map expression op(src) + * \tparam OP operator + * \tparam TA type of src + * \tparam etype expression type, sa namespace::type + */ + template<typename OP, typename TA, int etype > + struct UnaryMapExp: public Exp< UnaryMapExp<OP,TA,etype>, etype >{ + /*! \brief source expression */ + const TA& src_; + /*! \brief constructor */ + UnaryMapExp( const TA &src ):src_(src){} + }; + + /*! \brief make expression */ + template<typename OP,typename TA, int ta> + inline UnaryMapExp<OP,TA,(ta|type::kMapper) > MakeExp( const Exp<TA,ta> &src ){ + return UnaryMapExp<OP,TA, (ta|type::kMapper) >( src.self() ); + } + + /*! + * \brief short hand for MakeExp, usage F<op>(src), create a unary operation expression + * \param src source expression + * \tparam operator + * \tparam TA source expression + * \tparam ta source expression type + * \sa mshadow::op + */ + template<typename OP,typename TA, int ta> + inline UnaryMapExp<OP,TA,(ta|type::kMapper) > F( const Exp<TA,ta> &src ){ + return MakeExp<OP>(src); + } + }; +}; +#endif
