http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/gtest/gtest_main.cc
----------------------------------------------------------------------
diff --git a/include/gtest/gtest_main.cc b/include/gtest/gtest_main.cc
new file mode 100644
index 0000000..f302822
--- /dev/null
+++ b/include/gtest/gtest_main.cc
@@ -0,0 +1,38 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+GTEST_API_ int main(int argc, char **argv) {
+  printf("Running main() from gtest_main.cc\n");
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cuda/cuda_reduce.cuh
----------------------------------------------------------------------
diff --git a/include/mshadow/cuda/cuda_reduce.cuh 
b/include/mshadow/cuda/cuda_reduce.cuh
new file mode 100644
index 0000000..b7808a6
--- /dev/null
+++ b/include/mshadow/cuda/cuda_reduce.cuh
@@ -0,0 +1,117 @@
+#ifndef MSHADOW_CUDA_REDUCE_CUH
+#define MSHADOW_CUDA_REDUCE_CUH
+/*!
+ * \file cuda_reduce.cuh
+ * \brief helper functions to do reduction
+ * \author Tianqi Chen
+ */
+namespace mshadow{
+    namespace cuda{
+        /*
+         * \brief reduce over the dimension x
+         * \tparam Reducer reducer
+         * \tparam x_bits dimension = 1<<x_bits
+         */
+        template<typename Reducer,int x_bits>
+        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] );
+        /*
+         * \brief reduce over the dimension x
+         * \tparam Reducer reducer
+         * \tparam xmax_bits maximum size of buffer
+         * \param xsize size of x dimension, not sure if aligned
+         */
+        template<typename Reducer, int xmax_bits>
+        inline __device__ void Reduce1DNotAlign( volatile real_t 
buf[1<<xmax_bits], int xsize );
+    };
+};
+
+// ===============================================x===
+//  implementations afterwards, 
+//  no need to read if only use the functions
+// --------------------------------------------------
+#ifdef  __DEVICE_EMULATION__
+#define __MSHADOW_EMUSYNC__ __syncthreads()
+#else
+#define __MSHADOW_EMUSYNC__ 
+#endif
+
+namespace mshadow{
+    namespace cuda{        
+        template<typename Reducer, int x_bits>
+        inline __device__ void ReduceX( volatile real_t buf[], int tid ){
+            if( x_bits >= 10 ){
+                if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 9 ){
+                if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 8 ){
+                if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 7 ){
+                if( tid < 64  ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] );
+                __syncthreads(); 
+            }            
+            if( x_bits >= 6 ){
+                if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] );
+                __syncthreads();
+            }
+            // in warp optimization
+            if( x_bits >= 5 ){
+                if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 4 ){
+                if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] );
+                __MSHADOW_EMUSYNC__;            
+            }
+            if( x_bits >= 3 ){
+                if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 2 ){
+                if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 1 ){
+                if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] );
+                __MSHADOW_EMUSYNC__;
+            }  
+        };
+        
+        template<typename Reducer,int x_bits>
+        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] ){
+            ReduceX<Reducer,x_bits>( buf, threadIdx.x );
+        }
+
+        // reduce with an upper bound
+        #define __RD_NON_ALIGN(els,x_bits)                              \
+            els                                                         \
+            if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){       \
+                if( tid < (1 << x_bits) && tid + (1<<x_bits) < x_size ){ \
+                    Reducer::Reduce( buf[tid] , buf[tid + (1<<x_bits)] ); \
+                }                                                       \
+                __syncthreads();                                        \
+                ReduceX<Reducer, x_bits>( buf, tid );                   \
+            }                                                           \
+            
+        template<typename Reducer, int xmax_bits>
+        inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int 
x_size ){
+            int tid = threadIdx.x;
+            __RD_NON_ALIGN(, 8)
+            __RD_NON_ALIGN(else, 7)
+            __RD_NON_ALIGN(else, 6)
+            __RD_NON_ALIGN(else, 5) 
+            __RD_NON_ALIGN(else, 4) 
+            __RD_NON_ALIGN(else, 3) 
+            __RD_NON_ALIGN(else, 2) 
+            __RD_NON_ALIGN(else, 1)                     
+        }
+    };
+};
+
+#endif // MSHADOW_CUDA_REDUCE_CUH
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cuda/tensor_gpu-inl.cuh
----------------------------------------------------------------------
diff --git a/include/mshadow/cuda/tensor_gpu-inl.cuh 
b/include/mshadow/cuda/tensor_gpu-inl.cuh
new file mode 100644
index 0000000..61e477c
--- /dev/null
+++ b/include/mshadow/cuda/tensor_gpu-inl.cuh
@@ -0,0 +1,231 @@
+#ifndef MSHADOW_TENSOR_GPU_INL_CUH
+#define MSHADOW_TENSOR_GPU_INL_CUH
+/*!
+ * \file tensor_gpu-inl.cuh
+ * \brief implementation of GPU code using CUDA
+ * \author Bing Xu, Tianqi Chen
+ */
+#include "../tensor.h"
+#include "cuda_reduce.cuh"
+
+namespace mshadow{
+    namespace cuda{
+        #ifndef __CUDA_ARCH__
+        #warning "__CUDA_ARCH__ is not defined, I will assume compiling with 
CUDA version greater than 2.0"
+        #endif
+        /* load unit for memory access */
+        #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200
+        const int kMemUnitBits = 5;
+        const int kMaxThreadsPerBlock = 1024;
+        #else
+        const int kMemUnitBits = 4;
+        const int kMaxThreadsPerBlock = 512;
+        #endif
+        /*! \brief number of units that can do synchronized update, half warp 
size */
+        const int kMemUnit     = 1 << kMemUnitBits;
+        /*! \brief mask that could be helpful sometime */
+        const int kMemUnitMask = kMemUnit - 1;
+        /*! \brief suggested thread number(logscale) for mapping kernel */
+        const int kBaseThreadBits = 8;
+        /*! \brief suggested thread number for mapping kernel */
+        const int kBaseThreadNum  = 1 << kBaseThreadBits;
+        /*! \brief maximum value of grid */
+        const int kMaxGridNum     = 65535;
+        /*! \brief suggested grid number for mapping kernel */
+        const int kBaseGridNum    = 1024;
+        
+        /*! \brief get align stride for given size in x dimension */
+        inline index_t GetAlignStride( index_t xsize, index_t xstride ){ 
+            if( (xstride & (kMemUnit-1)) == 0 ){
+                return ( (xsize  + kMemUnit - 1) >> kMemUnitBits) << 
kMemUnitBits;
+            }else{
+                // if the original space is not aligned, there is no need to use aligned thread allocation
+                return xsize;
+            }
+        }
+        inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char 
*estr = "" ){
+            if( dimBlock.x*dimBlock.y*dimBlock.z > 
(unsigned)kMaxThreadsPerBlock ||
+                dimGrid.x > 65535 || dimGrid.y > 65535 ){
+                fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, 
dimBlock.y, dimBlock.z );
+                utils::Error( "too large launch parameter\n");
+            } 
+        }        
+    };
+
+    namespace cuda {
+        template<typename Saver, typename Plan, int block_dim_bits>
+        __device__ void MapPlanProc( Tensor<gpu,2> dst, const index_t xstride, 
const Plan exp, int block_idx ){
+            const index_t tid = (block_idx << block_dim_bits) + threadIdx.x;
+            const int y   = tid / xstride;
+            const int x   = tid % xstride;
+            if (y < dst.shape[1] && x < dst.shape[0]) {
+                Saver::Save(dst[y][x], exp.Eval(y,x));
+            }
+        }
+        template<typename Saver, typename Plan, int block_dim_bits>
+        __global__ void MapPlanKernel( Tensor<gpu,2> dst, const index_t 
xstride, const Plan exp ){
+            MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, 
blockIdx.x );
+        }
+        template<typename Saver, typename Plan, int block_dim_bits, int 
grid_size>
+        __global__ void MapPlanLargeKernel( Tensor<gpu,2> dst, const index_t 
xstride, const Plan exp, int repeat ){
+            for( int i = 0; i < repeat; ++i ){
+                MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, 
blockIdx.x + i*grid_size );
+            }
+        }        
+        
+        template<typename Saver, typename E>
+        inline void MapPlan( Tensor<gpu,2> dst, const expr::Plan<E> &plan ){
+            const index_t xstride = GetAlignStride( dst.shape[0], 
dst.shape.stride_ );
+            const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / 
kBaseThreadNum;
+            dim3 dimBlock(kBaseThreadNum, 1, 1);
+
+            if (num_block < kMaxGridNum) {
+                dim3 dimGrid(num_block, 1, 1);
+                MapPlanKernel<Saver, expr::Plan<E>, kBaseThreadBits>   \
+                    <<<dimGrid,dimBlock>>>(dst, xstride, plan);
+            } else {
+                int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
+                dim3 dimGrid( kBaseGridNum, 1 , 1 );
+                MapPlanLargeKernel<Saver,expr::Plan<E>, kBaseThreadBits, 
kBaseGridNum> \
+                    <<<dimGrid,dimBlock>>>(dst, xstride, plan, repeat );
+            }
+        }        
+    }; // namespace cuda
+    
+    namespace cuda{
+        template<typename Saver,typename Reducer, int warp_bits, typename Plan>
+        __global__ void MapRedKeepLowestKernel( Tensor<gpu,1> dst, Plan plan, 
real_t scale, Shape<2> eshape ){
+            const unsigned warp_size = 1 << warp_bits;
+            const unsigned x = (blockIdx.x<<warp_bits) + threadIdx.x;
+            // to avoid bank conflict
+            __shared__ real_t s_res[ warp_size ][ warp_size + 1 ];
+
+            // note: reverse store [y][x], so that we can reduce over 
threadIdx.x, use warp optimization
+            if( threadIdx.y < eshape[1] && x < eshape[0] ){
+                s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, 
x );
+            }
+            for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){
+                if( threadIdx.y + y < eshape[1] && x < eshape[0] ){
+                    Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], 
plan.Eval( threadIdx.y + y, x ) );
+                }
+            } 
+            __syncthreads();
+            if( eshape[1] >= warp_size ){
+                Reduce1D<Reducer,warp_bits>( s_res[ threadIdx.y ] );
+            }else{
+                Reduce1DNotAlign<Reducer,warp_bits>( s_res[ threadIdx.y ], 
eshape[1] );
+            }
+            __syncthreads();            
+            
+            if( threadIdx.y == 0 && x < eshape[0] ){
+                Saver::Save( dst[x],  s_res[ threadIdx.x ][ 0 ] * scale );
+            } 
+        }        
+        
+        template<typename Saver, typename Reducer, typename E>
+        inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const 
expr::Plan<E> &plan, real_t scale, Shape<2> eshape ){
+            dim3 dimBlock( kMemUnit, kMemUnit );
+            dim3 dimGrid ( (eshape[0]+kMemUnit-1) >> kMemUnitBits );
+            CheckLaunchParam( dimGrid, dimBlock, "MapRedKeepLowestKernel" );
+            
MapRedKeepLowestKernel<Saver,Reducer,kMemUnitBits><<<dimGrid,dimBlock>>>( dst, 
plan, scale, eshape );
+        } 
+    }; // namespace cuda
+    
+    namespace cuda{
+        template<typename Saver,typename Reducer, int block_dim_bits, typename 
Plan>
+        __global__ void MapReduceKeepDim2Kernel( Tensor<gpu,1> dst, Plan plan, 
real_t scale, Shape<4> pshape ){
+            const int block_size = 1 << block_dim_bits;
+            __shared__ real_t s_rec[ block_size ];
+            const int c = blockIdx.x;            
+            const index_t tot = pshape[0]*pshape[1]*pshape[3];
+
+            real_t res = Reducer::kInitV;
+            for( index_t i_offset = 0; i_offset < tot; i_offset += block_size 
){
+                index_t i = i_offset + threadIdx.x;
+                if( i< tot ){
+                    const index_t x = i % pshape[0];
+                    i /= pshape[0]; 
+                    const index_t y = i % pshape[1];
+                    const index_t n = i / pshape[1];
+                    Reducer::Reduce( res, plan.Eval( (n*pshape[2] + c) * 
pshape[1] + y, x ) );
+                }
+            }                
+            s_rec[ threadIdx.x ] = res;
+            __syncthreads();
+            Reduce1D<Reducer,block_dim_bits>( s_rec );
+            if( threadIdx.x == 0 ){
+                Saver::Save( dst[c], s_rec[0]*scale );
+            }
+        }
+
+        template<typename Saver, typename Reducer, typename Plan>
+        inline void MapReduceKeepDim2( Tensor<gpu,1> dst, const Plan &plan, 
real_t scale, Shape<4> pshape ){  
+            dim3 dimBlock( kBaseThreadNum );
+            dim3 dimGrid ( dst.shape[0] );
+            CheckLaunchParam( dimGrid, dimBlock, "MapReduceKeepDim2" );
+            MapReduceKeepDim2Kernel<Saver,Reducer,kBaseThreadBits>
+                <<<dimGrid,dimBlock>>>( dst, plan, scale, pshape );
+        }
+    };
+    
+    namespace cuda{
+        template<int x_bits>        
+        __global__ void SoftmaxKernel( Tensor<gpu,2> dst, Tensor<gpu,2> src ){
+            const unsigned x_size = 1 << x_bits;  
+            const int y = blockIdx.x;
+            __shared__ real_t s_rec[ x_size ];
+            
+            // step 1: get max
+            if( threadIdx.x < dst.shape[ 0 ] ){
+                s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; 
+            }
+            for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    real_t a = src[ y ][ x + threadIdx.x ];
+                    s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] );
+                }
+            }
+            __syncthreads();
+            if( threadIdx.x >= dst.shape[0] ){
+                s_rec[ threadIdx.x ] = s_rec[0];
+            }
+            __syncthreads();
+            Reduce1D<red::maximum,x_bits>( s_rec );
+            __syncthreads();
+            real_t smax = s_rec[0];            
+            __syncthreads();
+            s_rec[ threadIdx.x ] = 0.0f;
+            __syncthreads();
+
+            // calculate normalizer, with writeback
+            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    real_t p = expf( src[ y ][ x + threadIdx.x ] - smax );
+                    s_rec[ threadIdx.x ] += p;
+                    // write back first, will fetch later
+                    dst[ y ][ x + threadIdx.x ] = p;
+                }
+            }
+            // calculate normalizer
+            __syncthreads();
+            Reduce1D<red::sum,x_bits>( s_rec );
+            __syncthreads();
+            real_t ssum = s_rec[0];
+
+            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    dst[ y ][ x + threadIdx.x ] /= ssum;
+                }
+            }
+        }
+    
+        inline void Softmax( Tensor<gpu,2> &dst, const Tensor<gpu,2> &src ){
+            dim3 dimBlock( kBaseThreadNum );
+            dim3 dimGrid ( dst.shape[1] );
+            utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" );
+            CheckLaunchParam( dimGrid, dimBlock, "Softmax" );
+            SoftmaxKernel<kBaseThreadBits><<<dimGrid,dimBlock>>>( dst, src );
+        }
+    }; // namespace cuda
+}; // namespace mshadow
+#endif // MSHADOW_TENSOR_GPU_INL_CUH

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cxxnet_op.h
----------------------------------------------------------------------
diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h
new file mode 100644
index 0000000..930caf2
--- /dev/null
+++ b/include/mshadow/cxxnet_op.h
@@ -0,0 +1,116 @@
+#ifndef CXXNET_OP_H
+#define CXXNET_OP_H
+#pragma once
+/*!
+ * \file cxxnet_op.h
+ * \brief extra mshadow operation for cxxnet
+ * \author Bing Xu
+ */
+#include "mshadow/tensor.h"
+
+namespace mshadow {
+    /*! \brief operations for algorithm */
+    namespace op {
+        struct sigmoid {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f / (1.0f + expf(-a));
+            }
+        };
+        struct sigmoid_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a * ( 1.0f - a );
+            }
+        };
+
+        /*! \brief Rectified Linear Operation */
+        struct relu {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                using namespace std;
+                return max( a, 0.0f );
+            }
+        };
+        struct relu_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a > 0.0f ? 1.0f : 0.0f;
+            }
+        };
+
+        struct tanh {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return tanhf( a );
+            }
+        };
+        struct tanh_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f - a * a;
+            }
+        };
+        struct softplus {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return logf(1 + expf(a));
+            }
+        };
+        struct softplus_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f / (1.0f + expf(-a));
+            }
+        };
+        struct bnll {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a > 0.0f ? a + logf(1.0f + expf(-a)) : logf(1.0f + 
expf(a));
+            }
+        };
+        struct bnll_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                real_t expval = a > 50.0f ? 50.0f : a; // kBNLL_THRESHOLD = 
50.0f
+                expval = expf(-expval);
+                return 1.0f / (1.0f + expval);
+            }
+        };
+
+        struct square {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a * a;
+            }
+        };
+       /*! \brief scaled tanh, hard code the scale factor*/
+        struct stanh {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+              return  1.7159047*tanhf(0.66666667 *a);
+            }
+        };
+        /*! \brief back prop for scaled tanh */
+        struct stanh_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 0.66666667*1.7159047 -0.66666667/1.7159047*a*a;
+            }
+        };
+
+    }; //namespace op
+
+}; //namespace mshadow
+
+namespace mshadow {
+    namespace op {
+        /*! \brief used for generate Bernoulli mask */
+        struct threshold {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a < b ? 1.0f : 0.0f;
+            }
+        };
+
+        /*! \brief used for generate element of power */
+        struct power {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return powf( a, b );
+            }
+        };
+        struct sqrtop {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return sqrt(a+b);
+            }
+        };
+    }; // namespace op
+}; // namespace mshadow
+
+#endif // CXXNET_OP_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor.h b/include/mshadow/tensor.h
new file mode 100644
index 0000000..42d13d3
--- /dev/null
+++ b/include/mshadow/tensor.h
@@ -0,0 +1,472 @@
+#ifndef MSHADOW_TENSOR_H
+#define MSHADOW_TENSOR_H
+/*!
+ * \file tensor.h
+ * \brief header file of tensor data structure and functions
+ *        convention: this lib requires explicit memory allocation and 
de-allocation
+ *                   all the data structure Tensor<cpu,1>, Tensor<gpu,1> are 
like handles(pointers),
+ *                   no memory allocation is happening during calculation
+ * \author Bing Xu, Tianqi Chen
+ */
+#include "tensor_base.h"
+#include "tensor_expr.h"
+
+namespace mshadow {
+    /*!
+     * \brief shape of a tensor
+     *       IMPORTANT NOTE: this shape is different from numpy.shape
+     *       shape[0] gives the lowest dimension, shape[dimension-1] gives the 
highest dimension
+     *       shape[k] corresponds to k-th dimension of tensor
+     * \tparam dimension dimension of tensor
+     */
+    template<int dimension>
+    struct Shape {
+    public:
+        /*! \brief maximum dimension of tensor */
+        const static int kMaxShape = dimension;
+        /*! \brief maximum dimension minus 1 */
+        const static int kSubShape = dimension - 1;
+    public:
+        /*! \brief default constructor, do nothing */
+        MSHADOW_XINLINE Shape(void) {}
+        /*! \brief copy constructor */
+        MSHADOW_XINLINE Shape( const Shape<dimension> &s ){
+            #pragma unroll
+            for( int i = 0; i < kMaxShape; ++i ){
+                this->shape_[i] = s[i];
+            }
+            this->stride_ = s.stride_;
+        }
+        /*!
+         * \brief get corresponding index
+         * \param idx dimension index
+         * \return the corresponding dimension size
+         */
+        MSHADOW_XINLINE index_t& operator[](index_t idx) {
+            return shape_[ idx ];
+        }
+        /*!
+         * \brief get corresponding index
+         * \param idx dimension index
+         * \return the corresponding dimension size
+         */
+        MSHADOW_XINLINE const index_t& operator[](index_t idx) const {
+            return shape_[ idx ];
+        }
+        /*! \return whether two shape equals */
+        MSHADOW_XINLINE bool operator==(const Shape<kMaxShape> &s) const {
+            #pragma unroll
+            for ( int i = 0; i < kMaxShape; ++i ) {
+                if (s.shape_[i] != this->shape_[i]) return false;
+            }
+            return true;
+        }
+        /*!
+         * flatten the higher dimension to second dimension, return a 2D shape
+         * \return the flat 2d shape
+         */
+        MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
+            Shape<2> s;
+            s.stride_ = this->stride_;
+            s.shape_[ 0 ] = this->shape_[ 0 ];
+            index_t ymax = 1;
+
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                ymax *= this->shape_[ i ];
+            }
+            s.shape_[1] = ymax;
+            return s;
+        }
+        /*! \return number of valid elements */
+        MSHADOW_XINLINE size_t Size(void) const{
+            size_t memsz = this->shape_[ 0 ];
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                memsz *= this->shape_[ i ];
+            }
+            return memsz;
+        }
+        /*! \return memory size, including the aligned x dimension */
+        MSHADOW_XINLINE size_t MSize(void) const {
+            size_t memsz = this->stride_;
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                memsz *= this->shape_[ i ];
+            }
+            return memsz;
+        }
+        /*!
+         * \return product shape in [dimstart,dimend)
+         * \param dimstart start dimension
+         * \param dimend   end dimension
+         */
+        MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{
+            index_t num = 1;
+            #pragma unroll
+            for (int i = dimstart; i < dimend; ++i) {
+                num *= this->shape_[ i ];
+            }
+            return num;
+        }
+        /*!
+         * \brief get subshape
+         * \return subshape
+         */
+        MSHADOW_XINLINE Shape<kSubShape> SubShape(void) const {
+            Shape<kSubShape> s;
+            s.stride_ = this->stride_;
+            // for cuda
+            #pragma unroll
+            for (int i = 0; i < kSubShape; ++i) {
+                s.shape_[ i ] = this->shape_[ i ];
+            }
+            return s;
+        }
+
+    public:
+        /*! \brief storing the dimension information */
+        index_t shape_[ kMaxShape ];
+        /*!
+         * \brief storing the stride information in x dimension
+         *    this is used to deal with pitch allocation in gpu or sse(align x 
dimension to 64bit) for efficiency
+         */
+        index_t stride_;
+    };
+    // useful construction functions to generate shape
+    /*!
+     * \brief construct a one dimension shape, stride will equal s0
+     * \param s0 size of dimension 0
+     * \return the shape construction
+     */
+    MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){
+        Shape<1> s; s[0] = s0; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a two dimension shape, stride will equal s0
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the shape construction
+     */
+    MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){
+        Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a three dimension shape, stride will equal s0
+     * \param s2 size of dimension 2
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the shape construction
+     */
+    MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){
+        Shape<3> s;
+        s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a four dimension shape, stride will equal s0
+     * \param s3 size of dimension 3
+     * \param s2 size of dimension 2
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the shape construction
+     */
+    MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, 
index_t s0 ){
+        Shape<4> s;
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0;
+        return s;
+    }
+}; // namespace mshadow
+
+namespace mshadow {
+    /*! \brief device name CPU */
+    struct cpu {
+        /*! \brief whether this device is CPU or not */
+        const static bool kDevCPU = true;
+        /*! \brief device flag number, identifies this device */
+        const static int kDevMask = 1<<0;
+    };
+    /*! \brief device name CPU */
+    struct gpu {
+        /*! \brief whether this device is CPU or not */
+        const static bool kDevCPU = false;
+        /*! \brief device flag number, identifies this device */
+        const static int kDevMask = 1<<1;
+    };
+
+    // more compact template
+    /*!
+     * \brief general tensor
+     * \tparam Device which device the tensor is on
+     * \tparam dimension dimension of the tensor
+     */
+    template<typename Device, int dimension>
+    struct Tensor: public expr::ContainerExp< Tensor<Device,dimension> >{
+    public:
+        /*! \brief whether current type lies in cpu */
+        const static bool kDevCPU = Device::kDevCPU;
+        /*! \brief dimension of subtype */
+        const static int  kSubdim = dimension - 1;
+
+    public:
+        /*! \brief pointer to the data */
+        real_t *dptr;
+        /*! \brief shape of the tensor */
+        Shape<dimension> shape;
+    public:
+        /*! \brief default constructor */
+        MSHADOW_XINLINE Tensor(void) {}
+        /*! \brief constructor from shape  */
+        MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape(shape) {}
+        /*! \brief constructor from data pointer and shape  */
+        MSHADOW_XINLINE Tensor(real_t *dptr, const Shape<dimension> &shape): 
dptr((real_t*)dptr), shape(shape) {}
+        /*!
+         * \brief flatten the tensor to 2 dimension, collapse the higher 
dimensions together
+         * \return tensor after flatten
+         */
+        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
+            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
+                                     (dptr), shape.FlatTo2D());
+        }
+        /*!
+         * \brief get a element of dimension - 1
+         * \param idx index
+         * \return the result tensor
+         */
+        MSHADOW_XINLINE Tensor<Device, kSubdim> operator[](index_t idx) const {
+            Shape<kSubdim> s = shape.SubShape();
+            return Tensor<Device, kSubdim>(reinterpret_cast<real_t*> \
+                                           (dptr) + s.MSize() * idx, s);
+        }
+        /*!
+         * \brief slice the tensor in highest dimension [begin,end)
+         * \param begin begin position of slice
+         * \param end end position of slice
+         * \return tensor after slice
+         */
+        MSHADOW_XINLINE Tensor<Device, dimension> Slice(index_t begin, index_t 
end) const {
+            Shape<dimension> s = this->shape;
+            s[ dimension - 1 ] = end - begin;
+            return Tensor<Device, dimension>(reinterpret_cast<real_t*>\
+                                             (dptr) + s.SubShape().MSize() * 
begin, s);
+        }
+    public:
+        /*!\brief functions to fit expression template */
+        inline Tensor<Device,dimension>& operator=( real_t s ){
+            return this->__assign( s );
+        }
+        /*!\brief functions to fit expression template */
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const 
expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        /*!\brief functions to fit expression template */
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const 
expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    };
+
+    /*
+     *  specialized class Tensor1D; this is due to a different implementation in operator[]
+     */
+    template<typename Device>
+    struct Tensor<Device,1>: public expr::ContainerExp< Tensor<Device,1> >{
+    public:
+        real_t *dptr;
+        Shape<1> shape;
+    public:
+        MSHADOW_XINLINE Tensor(void) {}
+        MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {}
+        MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), 
shape(shape) {}
+
+        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
+            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
+                                     (dptr), shape.FlatTo2D());
+        }
+        MSHADOW_XINLINE Tensor<Device, 1> Slice(index_t begin, index_t end) 
const {
+            Shape<1> s;
+            s[0] = s.stride_ = end  - begin;
+            return Tensor<Device, 1>(reinterpret_cast<real_t*> \
+                                     (dptr) + begin, s);
+        }
+        MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; }
+        MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return 
dptr[ idx ]; }
+    public:
+        // functions to fit expression template
+        inline Tensor<Device,1>& operator=( double s ){
+            return this->__assign( s );
+        }
+        template<typename E>
+        inline Tensor<Device,1>& operator=( const 
expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        template<typename E>
+        inline Tensor<Device,1>& operator=( const 
expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    };
+}; // namespace mshadow
+
+// add unroll loops for the shape
+namespace mshadow {
+    // function declarations
+    /*!
+     * \brief initialize tensor engine, used to call initialization functions of dependent libs
+     *        this function should be called before all GPU tensor operations,
+     *        for using tensors in CPU, this call is actually not needed
+     * \param device_id GPU device id to be chosen
+     */
+    inline void InitTensorEngine( int device_id=0 );
+    /*!
+     * \brief Shutdown tensor engine,
+     *        this function should be called after all GPU tensor operations,
+     *        for using tensors in CPU, this call is actually not needed
+     */
+    inline void ShutdownTensorEngine( void );
+
+    /*!
+     * \brief CPU/GPU: allocate space for CTensor, according to the shape in the obj
+     *        this function is responsible to set the stride_ in each obj.shape
+     * \tparam dim specify the dim of tensor
+     * \param obj the tensor object, with shape specified
+     * \param pad whether padding dimension 0, to make last dimension aligned,
+     *            padding may help improve efficiency of matrix multiplications
+     *            if true, will allocate space with stride_ that may not equal shape[0]
+     *            if false, will allocate continuous space
+     */
+    template<int dim>
+    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
+    /*! \brief refer to comment of cpu ver \sa AllocSpace */
+    template<int dim>
+    inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
+
+    /*!
+     * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL
+     * \tparam dim specify the dim of tensor
+     * \param obj the tensor object
+     */
+    template<int dim>
+    inline void FreeSpace(Tensor<cpu,dim> &obj);
+    /*! \brief refer to comment of cpu ver \sa FreeSpace */
+    template<int dim>
+    inline void FreeSpace(Tensor<gpu,dim> &obj);
+
+    /*!
+     * \brief CPU/GPU: short cut to allocate and initialize a Tensor
+     * \tparam Device device of tensor
+     * \tparam dim dimension of tensor
+     * \param shape: shape of tensor
+     * \param initv: initialization value
+     * \param pad : padding option
+     * \sa AllocSpace
+     */
+    template<typename Device, int dim>
+    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, 
bool pad = MSHADOW_ALLOC_PAD);
+
+    /*!
+     * \brief copy data from one tensor to another, with same shape
+     * \tparam dim specify the dim of tensor
+     * \param dst target tensor
+     * \param src source tensor
+     */
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> dst, const Tensor<cpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src );
+
+
+    /*!
+     * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( 
sum_j exp( energy[i][j] ) )
+     * \param dst destination
+     * \param energy input energy
+     */
+    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2> &energy );
+    /*! \brief refer to comment of cpu ver \sa Softmax */
+    inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2> &energy );
+
+}; // namespace mshadow
+
+
+namespace mshadow{
+    // function declarations to support expression, no need to understand them
+    // these functions do not need to be directly used
+
+    /*!
+     * \brief CPU/GPU: map a expression to a tensor, this function calls 
MapPlan
+     * \tparam Saver specify storage method
+     * \tparam dim dim of the tensor, during usage, there is no need to 
specify this parameter
+     * \tparam E specifies the expression type, not need to specify this 
parameter during usage
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \sa namespace mshadow:sv, mshadow::op, mshadow::expr
+     */
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp );
+    /*! \brief refer to comment of cpu ver \sa MapExp */
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp );
+
+    /*!
+     * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest 
dimension (dimension 0)
+     * \tparam Saver specify storage method
+     * \tparam Reducer specify a reducer method
+     * \tparam E specifies the expression type, not need to specify this 
parameter during usage
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \param scale scale the result before save
+     * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr
+     */
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const 
expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+    /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const 
expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+
+
+    /*!
+     * \brief CPU/GPU: map an expression, do reduction to a 1D Tensor, keeping the dimension given by dimkeep
+     * \tparam Saver specify storage method
+     * \tparam Reducer specify a reducer method
+     * \tparam E specifies the expression type, not need to specify this 
parameter during usage
+     * \tparam dimkeep the target dimension to be kept, should be larger than 
0, for 0, use MapReduceKeepLowest
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \param scale scale the result before save
+     * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr
+     */
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int 
etype>
+    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const 
expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+    /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int 
etype>
+    inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const 
expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+
+};// namespace mshadow
+
+// execution implementation of expression evaluations
+#include "tensor_expr_engine-inl.hpp"
+// cpu implementation of functions
+#include "tensor_cpu-inl.hpp"
+// gpu implementation of functions
+#include "tensor_gpu-inl.hpp"
+// extension of expressions
+#include "tensor_expr_ext.h"
+// io 
+#include "tensor_io.h"
+// container
+#include "tensor_container.h"
+// random number generator
+#include "tensor_random.h"
+#endif // TENSOR_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_base.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_base.h b/include/mshadow/tensor_base.h
new file mode 100644
index 0000000..b251cba
--- /dev/null
+++ b/include/mshadow/tensor_base.h
@@ -0,0 +1,298 @@
+#ifndef MSHADOW_TENSOR_BASE_H
+#define MSHADOW_TENSOR_BASE_H
+/*!
+ * \file tensor_base.h
+ * \brief definitions of base types, macros functions
+ *
+ * \author Bing Xu, Tianqi Chen
+ */
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cfloat>
+#include <climits>
+#include <algorithm>
+// macro definitions
+
+/*!\brief if this macro is defined to be 1, mshadow should compile without any other libs */
+#ifndef MSHADOW_STAND_ALONE
+    #define MSHADOW_STAND_ALONE 0
+#endif
+
+/*! \brief whether do padding during allocation */
+#ifndef MSHADOW_ALLOC_PAD
+    #define MSHADOW_ALLOC_PAD true
+#endif
+
+/*! 
+ * \brief x dimension of data must be bigger than pad_size * ratio to be allocated padded memory, otherwise use tight allocation
+ *        for example, if pad_ratio=2, GPU memory alignment size is 32, then we will only allocate padded memory if x dimension > 64
+ *        set it to 0 then we will always allocate padded memory
+ */
+#ifndef MSHADOW_MIN_PAD_RATIO
+    #define MSHADOW_MIN_PAD_RATIO 2
+#endif
+
+#if MSHADOW_STAND_ALONE
+   #define MSHADOW_USE_CBLAS 0
+   #define MSHADOW_USE_MKL   0
+   #define MSHADOW_USE_CUDA  0
+#endif
+
+/*! \brief use CBLAS for BLAS */
+#ifndef MSHADOW_USE_CBLAS
+   #define MSHADOW_USE_CBLAS 0
+#endif
+/*! \brief use MKL for BLAS */
+#ifndef MSHADOW_USE_MKL
+   #define MSHADOW_USE_MKL   1
+#endif
+/*! \brief use CUDA support, must ensure that the cuda include path is 
correct, or directly compile using nvcc */
+#ifndef MSHADOW_USE_CUDA
+  #define MSHADOW_USE_CUDA   1
+#endif
+/*! \brief use single precision float */
+#ifndef MSHADOW_SINGLE_PRECISION
+  #define MSHADOW_SINGLE_PRECISION 1
+#endif
+/*! \brief whether use SSE */
+#ifndef MSHADOW_USE_SSE
+  #define MSHADOW_USE_SSE 1
+#endif
+/*! \brief whether use NVML to get dynamic info */
+#ifndef MSHADOW_USE_NVML
+  #define MSHADOW_USE_NVML 0
+#endif
+// SSE is conflict with cudacc
+#ifdef __CUDACC__
+  #undef MSHADOW_USE_SSE
+  #define MSHADOW_USE_SSE 0
+#endif
+
+#if MSHADOW_USE_CBLAS
+extern "C"{
+    #include <cblas.h>
+}
+#elif MSHADOW_USE_MKL
+  #include <mkl.h>
+  #include <mkl_cblas.h>
+  #include <mkl_vsl.h>
+  #include <mkl_vsl_functions.h>
+#endif
+
+#if MSHADOW_USE_CUDA
+  #include <cublas.h>
+  #include <curand.h>
+#endif
+
+#if MSHADOW_USE_NVML
+  #include <nvml.h>
+#endif
+// --------------------------------
+// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU 
code.
+#ifdef MSHADOW_XINLINE
+  #error "MSHADOW_XINLINE must not be defined"
+#endif
+#ifdef __CUDACC__
+  #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ 
__host__
+#else
+  #define MSHADOW_XINLINE inline __attribute__((always_inline))
+#endif
+/*! \brief cpu force inline */
+#define MSHADOW_CINLINE inline __attribute__((always_inline))
+
+#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) 
|| __cplusplus >= 201103L
+  #define MSHADOW_CONSTEXPR constexpr
+#else
+  #define MSHADOW_CONSTEXPR const
+#endif
+
+/*! \brief namespace for mshadow */
+namespace mshadow {
+    /*! \brief buffer size for each random number generator */
+    const unsigned kRandBufferSize = 1000000;
+    /*! \brief pi  */
+    const float kPi = 3.1415926f;
+
+#if MSHADOW_SINGLE_PRECISION
+    /*! \brief type that will be used for content */
+    typedef float real_t;
+#else
+    typedef double real_t;
+#endif
+    /*! \brief type that will be used for index */
+    typedef unsigned index_t;
+}; // namespace mshadow
+
+namespace mshadow {
+    /*! \brief namespace for operators */
+    namespace op {
+        // binary operator
+        /*! \brief mul operator */
+        struct mul{
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a * b;
+            }
+        };
+        /*! \brief plus operator */
+        struct plus {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a + b;
+            }
+        };
+        /*! \brief minus operator */
+        struct minus {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a - b;
+            }
+        };
+        /*! \brief divide operator */
+        struct div {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a / b;
+            }
+        };
+        /*! \brief get rhs */
+        struct right {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return b;
+            }
+        };
+    }; // namespace op
+
+    /*! \brief namespace for savers */
+    namespace sv {
+        /*! \brief save to saver: = */
+        struct saveto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a  = b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 0.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::right OPType;
+        };
+        /*! \brief save to saver: += */
+        struct plusto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a += b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::plus OPType;
+        };
+        /*! \brief minus to saver: -= */
+        struct minusto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a -= b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::minus OPType;
+        };
+        /*! \brief multiply to saver: *= */
+        struct multo {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a *= b;
+            }
+            /*! \brief corresponding binary operator type */
+            typedef op::mul OPType;
+        };
+        /*! \brief divide to saver: /= */
+        struct divto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a /= b;
+            }
+            /*! \brief corresponding binary operator type */
+            typedef op::div OPType;
+        };
+    }; // namespace sv
+
+
+    namespace op {
+        // unary operator/ function: example
+        // these operators can be defined by user, in the same style as binary 
and unary operator
+        // to use, simply write F<op::identity>( src )
+        /*! \brief identity function that maps a real number to it self */
+        struct identity{
+            /*! \brief map a to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a;
+            }
+        };
+    }; // namespace op
+
+    /*! \brief namespace for potential reducer operations */
+    namespace red {
+        /*! \brief sum reducer */
+        struct sum {
+            /*! \brief do reduction into dst */
+            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  
volatile real_t src ) {
+                dst += src;
+            }
+            /*! \brief calculate gradient of redres with respect to redsrc,  
redres: reduced result, redsrc: one of reduction element */
+            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t 
redsrc ) {
+                return 1.0f;
+            }
+            /*! \brief an initial value of the reducer */
+            MSHADOW_CONSTEXPR static real_t kInitV = 0.0f;
+        };
+        /*! \brief maximum reducer */
+        struct maximum {
+            /*! \brief do reduction into dst */
+            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  
volatile real_t src ) {
+                using namespace std;
+                dst = max( dst, src );
+            }
+            /*! \brief calculate gradient of redres with respect to redsrc,  
redres: reduced result, redsrc: one of reduction element */
+            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t 
redsrc ) {
+                return redres == redsrc ? 1.0f: 0.0f;
+            }
+            /*! \brief an initial value of the reducer */
+#if MSHADOW_SINGLE_PRECISION
+            MSHADOW_CONSTEXPR static real_t kInitV = -FLT_MAX;
+#else
+            MSHADOW_CONSTEXPR static real_t kInitV = -DBL_MAX;
+#endif
+        };
+    };
+
+    /*! \brief namespace for helper utils of the project */
+    namespace utils{
+        /*! \brief print an error message to stderr, then terminate the process via exit(-1) */
+        inline void Error( const char *msg ){
+            fprintf( stderr, "Error:%s\n",msg );
+            exit( -1 );
+        }
+        /*! \brief assert an expression is true; on failure prints a generic "AssertError" and exits */
+        inline void Assert( bool exp ){
+            if( !exp ) Error( "AssertError" );
+        }
+        /*! \brief assert an expression is true; on failure prints the given message and exits */
+        inline void Assert( bool exp, const char *msg ){
+            if( !exp ) Error( msg );
+        }
+        /*! \brief print a warning message to stderr; does not terminate the process */
+        inline void Warning( const char *msg ){
+            fprintf( stderr, "warning:%s\n",msg );
+        }
+    }; // namespace utils
+}; // namespace mshadow
+#endif // TENSOR_BASE_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_container.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_container.h 
b/include/mshadow/tensor_container.h
new file mode 100644
index 0000000..f0699e7
--- /dev/null
+++ b/include/mshadow/tensor_container.h
@@ -0,0 +1,152 @@
+#ifndef MSHADOW_TENSOR_CONTAINER_H
+#define MSHADOW_TENSOR_CONTAINER_H
+/*!
+ * \file tensor_container.h
+ * \brief tensor container that does memory allocation and resize like STL
+ * \author Tianqi Chen
+ */
+#include "tensor.h"
+#include "tensor_io.h"
+
+namespace mshadow{
+    /*!
+     * \brief tensor container that does memory allocation and resize like STL,
+     *        use it to save the lines of FreeSpace in class.
+     *        Do not abuse it, efficiency can come from pre-allocation and no 
re-allocation
+     *
+     * \tparam Device which device the tensor is on
+     * \tparam dimension dimension of the tensor
+     */
+    template<typename Device, int dimension>
+    class TensorContainer: public Tensor<Device,dimension>{
+    public:
+        /*! 
+         * \brief constructor 
+         * \param pad whether use padding alignment in space allocation
+         */
+        TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){
+            this->pad_ = pad;
+            this->dptr = data_.dptr = NULL;
+            this->shape[0] = 0;
+            this->shape.stride_ = 0;
+            this->data_.shape.stride_ = 0;
+            this->data_.shape[1] = 0;
+        }
+        /*! 
+         * \brief constructor 
+         * \param shape intial shape
+         */
+        TensorContainer( const Shape<dimension> &shape ){
+            this->pad_ = MSHADOW_ALLOC_PAD;
+            data_.dptr = NULL;
+            this->AllocByShape( shape );
+        }
+        /*! 
+         * \brief constructor 
+         * \param shape intial shape
+         * \param initv intial value
+         */
+        TensorContainer( const Shape<dimension> &shape, real_t initv ){
+            this->pad_ = MSHADOW_ALLOC_PAD;
+            data_.dptr = NULL;
+            this->AllocByShape( shape );
+            (*this) = initv;
+        }
+        ~TensorContainer( void ){
+            this->FreeSpace();
+        }
+        /*! 
+         * \brief resize the container to given shape, content is NOT preserved
+         * \param shape target shape
+         */
+        inline void Resize( const Shape<dimension> &shape ){
+            Shape<2> s2 = shape.FlatTo2D();            
+            if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > 
data_.shape[1] ){
+                this->AllocByShape( shape );
+            }else{
+                this->shape = shape;
+                if( this->pad_ ){
+                    this->shape.stride_ = data_.shape.stride_;
+                }else{
+                    this->shape.stride_ = this->shape[ 0 ];
+                }
+            }
+        }
+        /*! 
+         * \brief resize the container to given shape, and initialize, content 
is NOT preserved
+         * \param shape target shape
+         * \param initv initialization value
+         */
+        inline void Resize( const Shape<dimension> &shape, real_t initv ){
+            this->Resize( shape );
+            (*this) = initv;
+        }
+        /*! \brief set whether padding is allowed in tensor */
+        inline void set_pad( bool pad ){
+            this->pad_ = pad;
+        }
+        /*! 
+         * \brief save by binary format
+         * \param fo output binary stream
+         * \tparam TStream type of stream, need to support Read, Write, one 
example is utils::IStream.
+         */
+        template<typename TStream>
+        inline void SaveBinary( TStream &fo ) const{
+            mshadow::SaveBinary( fo, *this );
+        }
+        /*! 
+         * \brief load by binary format, a temp Tensor<cpu,dim> storage will 
be allocated
+         * \param fi input binary stream
+         * \tparam TStream type of stream, need to support Read, Write, one 
example is utils::IStream.
+         */
+        template<typename TStream>
+        inline void LoadBinary( TStream &fi ) {
+            Tensor<cpu,dimension> tmp;
+            mshadow::LoadBinary( fi, tmp, false );
+            this->Resize( tmp.shape );
+            Copy( *this, tmp );
+            mshadow::FreeSpace( tmp );
+        }
+    public:
+        // functions to fit exp template
+        inline Tensor<Device,dimension>& operator=( real_t s ){
+            return this->__assign( s );
+        }
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const 
expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const 
expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    private:
+        /*! \brief whether we do padding in the space */
+        bool pad_;
+        /*! \brief the shape of data_ is actually current data space */
+        Tensor<Device, 2> data_;
+    private:
+        inline void FreeSpace (void){
+            if( data_.dptr != NULL ){
+                mshadow::FreeSpace( data_ );
+                data_.dptr = this->dptr = NULL;
+            }
+        }
+        inline void AllocByShape (const Shape<dimension>& shape){
+            if( data_.dptr != NULL ){
+                this->FreeSpace();
+            }
+            data_.shape = shape.FlatTo2D();
+            mshadow::AllocSpace( data_, pad_ );
+            this->dptr  = data_.dptr;
+            this->shape = shape;
+            if( this->pad_ ){
+                this->shape.stride_ = data_.shape.stride_;
+            }else{
+                this->shape.stride_ = shape[0];
+            }
+        }
+    };
+};// namespace mshadow
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_cpu-inl.hpp
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_cpu-inl.hpp 
b/include/mshadow/tensor_cpu-inl.hpp
new file mode 100644
index 0000000..0fa3cfa
--- /dev/null
+++ b/include/mshadow/tensor_cpu-inl.hpp
@@ -0,0 +1,168 @@
+#ifndef MSHADOW_TENSOR_CPU_INL_HPP
+#define MSHADOW_TENSOR_CPU_INL_HPP
+/*!
+ * \file tensor_cpu-inl.hpp
+ * \brief implementation of CPU host code
+ * \author Bing Xu, Tianqi Chen
+ */
+#include <cstring>
+#include "tensor_base.h"
+#include "tensor_sse-inl.hpp"
+
+namespace mshadow {
+    template<int dim>
+    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad ){
+        size_t pitch;
+        if( pad ){
+            obj.dptr = (real_t*)sse2::AlignedMallocPitch
+                ( pitch, obj.shape[0] * sizeof(real_t), 
obj.FlatTo2D().shape[1] );
+            obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) );
+        }else{
+            obj.shape.stride_ = obj.shape[0];
+            obj.dptr = (real_t*)sse2::AlignedMallocPitch
+                ( pitch, obj.shape.Size() * sizeof(real_t), 1 );
+        }
+    }
+
+    template<typename Device, int dim>
+    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, 
bool pad ){
+        Tensor<Device, dim> obj( shape );
+        AllocSpace( obj, pad );
+        MapExp<sv::saveto>( obj, expr::ScalarExp( initv ) );
+        return obj;
+    }
+
+    template<int dim>
+    inline void FreeSpace(Tensor<cpu,dim> &obj){
+        sse2::AlignedFree( obj.dptr );
+        obj.dptr = NULL;
+    }
+
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> _dst, const Tensor<cpu,dim> &_src ){
+        utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
+        Tensor<cpu,2> dst = _dst.FlatTo2D();
+        Tensor<cpu,2> src = _src.FlatTo2D();
+        for (index_t y = 0; y < dst.shape[1]; ++y ) {
+            memcpy( dst[y].dptr, src[y].dptr, sizeof(real_t) * dst.shape[0] );
+        }
+    }
+
+    template<typename Saver, typename E, int dim>
+    inline void MapPlan(Tensor<cpu,dim> _dst, const expr::Plan<E> &plan){
+        Tensor<cpu,2> dst = _dst.FlatTo2D();
+        for (index_t y = 0; y < dst.shape[1]; ++y ) {
+            for (index_t x = 0; x < dst.shape[0]; ++x ) {
+                // trust your compiler! -_- they will optimize it
+                Saver::Save(dst[y][x], plan.Eval( y, x ) );
+            }
+        }
+    }
+
+    // code to handle SSE optimization
+    template<bool pass_check,typename Saver, int dim, typename E, int etype>
+    struct MapExpCPUEngine;
+    template<typename SV, int dim, typename E, int etype>
+    struct MapExpCPUEngine<false,SV,dim,E,etype>{
+        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> 
&exp ){
+            MapPlan<SV>( dst, MakePlan( exp.self() ) );
+        }
+    };
+
+    #if MSHADOW_USE_SSE
+    template<typename SV, int dim, typename E, int etype>
+    struct MapExpCPUEngine<true,SV,dim,E,etype>{
+        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> 
&exp ){
+            using namespace expr;
+            if( SSEAlignCheck<dim,E>::Check( exp.self() ) && SSEAlignCheck< 
dim,Tensor<cpu,dim> >::Check(dst) ){
+                MapSSEPlan<SV>( dst, MakeSSEPlan( exp.self() ) );
+            }else{
+                MapPlan<SV>( dst, MakePlan( exp.self() ) );
+            }
+        }
+    };
+    #endif
+
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,dim,E>::kMapPass 
>::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
+        Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() );
+        utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: 
Shape of Tensors in expression is not consistent with target" );
+        #if MSHADOW_USE_SSE
+        MapExpCPUEngine< SSECheck<E>::kPass,Saver,dim,E,etype >::Map( dst, exp 
);
+        #else
+        MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp );
+        #endif
+    }
+
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const 
expr::Exp<E,etype> &exp, real_t scale ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,1,E>::kRedPass 
>::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
+        Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() 
).FlatTo2D();
+
+        utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not 
match" );
+        utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" );
+        // execution
+        expr::Plan<E> plan = MakePlan( exp.self() );
+        for( index_t x = 0; x < eshape[0]; ++x ){
+            real_t res = plan.Eval( 0, x );
+            for( index_t y = 1; y < eshape[1]; ++y ){
+                Reducer::Reduce( res, plan.Eval( y, x ) );
+            }
+            Saver::Save( dst[x], res*scale );
+        }
+    }
+
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int 
etype>
+    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const 
expr::Exp<E,etype> &exp, real_t scale ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,dimkeep,E>::kRedPass 
>::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
+        typedef Shape< ExpInfo<E>::kDim > EShape;
+        EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() );
+        utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension 
do not match" );
+        // use equivalent form
+        Shape<4> pshape = Shape4( 
eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], 
+                                  eshape.ProdShape(1,dimkeep), eshape[0] );
+
+        // execution
+        expr::Plan<E> plan = MakePlan( exp.self() );
+
+        for( index_t c = 0; c < pshape[2]; ++c ){
+            real_t res = Reducer::kInitV;
+            for( index_t n = 0; n < pshape[3]; ++n ){
+                real_t tres = Reducer::kInitV;
+                for( index_t y = 0; y < pshape[1]; ++y ){
+                    for( index_t x = 0; x < pshape[0]; ++x ){
+                        Reducer::Reduce( tres, plan.Eval( (n*pshape[2] + c) * 
pshape[1] + y, x ) );
+                    }
+                }
+                Reducer::Reduce( res, tres );
+            }
+            Saver::Save( dst[c], res*scale );
+        }
+    }
+
+    /*!
+     * \brief CPU: softmax normalization of a 1D tensor:
+     *        dst[x] = exp( energy[x] - max(energy) ) / sum_x exp( energy[x] - max(energy) )
+     */
+    inline void Softmax( Tensor<cpu,1> dst, const Tensor<cpu,1>& energy ){
+        // BUGFIX: the max-scan loop used a floating-point counter
+        // ( for( real_t x = 1; ... ) ) as the array subscript; use the
+        // integral index_t, consistent with the two loops below
+        real_t mmax = energy[0];
+        for( index_t x = 1; x < dst.shape[0]; ++x )
+            if( mmax < energy[x] ) mmax = energy[x];
+        // subtract the maximum before exponentiation for numerical stability
+        real_t sum = 0.0f;
+        for( index_t x = 0; x < dst.shape[0]; ++x ){
+            dst[x] = std::exp( energy[x] - mmax );
+            sum += dst[x];
+        }
+        for( index_t x = 0; x < dst.shape[0]; ++x ){
+            dst[x] /= sum;
+        }
+    }
+    /*! \brief CPU: apply 1D softmax to every row of a 2D tensor; dst and energy must have the same shape */
+    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2>& energy ){
+        utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" );
+        // shape[1] indexes the rows of the 2D view; each dst[y] / energy[y]
+        // is a 1D tensor normalized independently by the 1D Softmax above
+        for( index_t y = 0; y < dst.shape[1]; ++y ){
+            Softmax( dst[y], energy[y] );
+        }
+    }
+}; // namespace mshadow
+
+#endif // TENSOR_CPU_INL_HPP

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_expr.h b/include/mshadow/tensor_expr.h
new file mode 100644
index 0000000..ac8fde7
--- /dev/null
+++ b/include/mshadow/tensor_expr.h
@@ -0,0 +1,367 @@
+#ifndef MSHADOW_TENSOR_EXPR_H
+#define MSHADOW_TENSOR_EXPR_H
+/*!
+ * \file tensor_expr.h
+ * \brief definitions of abstract expressions and expression templates
+ * \author Tianqi Chen, Bing Xu
+ */
+#include "tensor_base.h"
+
+namespace mshadow{
+    /*!
+     * \brief namespace for abstract expressions and expression templates;
+     *        it has no dependency on tensor.h.
+     *        These data structures take no part in computation;
+     *        they are only used to define operations and represent expressions in a symbolic way
+     */
+    namespace expr{
+
        /*! \brief type of expressions; types are combined with bitwise-or when expressions compose (see MakeExp) */
        namespace type{
            /*! \brief this expression directly corresponds to a data class */
            const int kContainer = 0;
            /*! \brief this only contains element-wise vector operations */
            const int kMapper    = 1;
            /*! \brief other cases, e.g. dot product; value 3 keeps the kMapper bit set when or-combined */
            const int kComplex   = 3;
        };
+
+        /*!
+         * \brief expression engine that actually interprets these expressions
+         *        this is a function template that needed to be implemented 
for specific expressions
+         */
+        template<typename Saver,typename Container>
+        struct ExpEngine{
+            template<typename EType>
+            inline static void Eval( Container& dst, const EType &exp );
+        };
+
+        template<typename Container>
+        class ContainerExp;
+        class ScalarExp;
+
+        /*!
+         * \brief base class for expression
+         * \tparam SubType inheritated class must put their type into this 
parameter
+         * \tparam exp_type expression type, see namespace type
+         */
+        template<typename SubType, int exp_type>
+        struct Exp{
+        public:
+            /*! \return  subtype instance of current class */
+            inline const SubType& self( void ) const{
+                return *static_cast<const SubType*>(this);
+            }
+            /*! \return reference of subtype instance of current class */
+            inline SubType& refself( void ){
+                return *static_cast<SubType*>(this);
+            }
+        };
+
+        /*! \brief scalar expression */
+        struct ScalarExp: public Exp<ScalarExp, type::kMapper>{
+            /*! \brief scalar value */
+            real_t scalar_;
+            /*! \brief constructor */
+            ScalarExp( real_t scalar ):scalar_(scalar){}
+        };
+
+        /*! \brief represent a transpose expression of a container */
+        template<typename EType>
+        struct TransposeExp: public Exp< TransposeExp<EType>, type::kComplex >{
+        public:
+            /*! \brief expression to be transposed */
+            const EType &exp;
+            /*! \brief constructor */
+            TransposeExp( const EType &e ):exp(e){}
+            /*! \brief transpose expression */
+            inline const EType & T( void ) const{
+                return exp;
+            }
+        };
+        
+        /*!
+         * \brief base class of all variables, that can be assigned to values
+         * \tparam Container the actually class of data container, e.g. 
CTensor1D
+         */
+        template<typename Container>
+        class ContainerExp: public Exp< Container, type::kContainer >{
+        public:
+            /*!
+             *\brief transpose of a matrix
+             *\return transpose of current expression
+             */
+            inline const TransposeExp<Container> T( void ) const{
+                return TransposeExp<Container>( this->self() );
+            }
+        public:
+            /*! \brief operator overload */
+            inline Container &operator+=( real_t s ){
+                ExpEngine<sv::plusto,Container>::Eval( this->refself(), 
ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload */
+            inline Container &operator-=( real_t s ){
+                ExpEngine<sv::minusto,Container>::Eval( this->refself(), 
ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload */
+            inline Container &operator*=( real_t s ){
+                ExpEngine<sv::multo,Container>::Eval( this->refself(), 
ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload */
+            inline Container &operator/=( real_t s ){
+                ExpEngine<sv::divto,Container>::Eval( this->refself(), 
ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload */
+            inline Container &__assign( real_t s ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), 
ScalarExp(s) );
+                return this->refself();
+            }
+        public:
+            /*! \brief implementation of operator=, note that we can not 
define container = container */
+            template<typename E>
+            inline Container &__assign( const Exp<E,type::kMapper> &exp ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), 
exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator=, note that we can not 
define container = container */
+            template<typename E>
+            inline Container &__assign( const Exp<E,type::kComplex> &exp ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), 
exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator+= */
+            template<typename E,int etype>
+            inline Container &operator+=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::plusto,Container>::Eval( this->refself(), 
exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator-= */
+            template<typename E,int etype>
+            inline Container &operator-=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::minusto,Container>::Eval( this->refself(), 
exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator*= */
+            template<typename E,int etype>
+            inline Container &operator*=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::multo,Container>::Eval( this->refself(), 
exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator/= */
+            template<typename E,int etype>
+            inline Container &operator/=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::divto,Container>::Eval( this->refself(), 
exp.self() );
+                return this->refself();
+            }
+        };
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] )
+         * \tparam TA type of lhs
+         * \tparam TB type of rhs
+         * \tparam ltrans whether lhs is transposed
+         * \tparam rtrans whether rhs is transposed
+         */
+        template<typename TA,typename TB,bool ltrans,bool rtrans>
+        struct DotExp: public Exp< DotExp<TA,TB,ltrans,rtrans>, type::kComplex 
>{
+            /*! \brief left operand */
+            const TA& lhs_;
+            /*! \brief right operand */
+            const TB& rhs_;
+            /*! \brief scale over result */
+            real_t scale_;
+            /*! \brief constructor */
+            DotExp( const TA &lhs, const TB &rhs, real_t scale )
+                :lhs_(lhs),rhs_(rhs),scale_(scale){}
+        };
+
+        /*! \brief dot operator def */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,false,false> dot( const ContainerExp<TA> &lhs, 
const ContainerExp<TB> &rhs ){
+            return DotExp<TA,TB,false,false>( lhs.self(), rhs.self(), 1.0f );
+        }
+        /*! \brief dot operator def */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,true,false> dot( const TransposeExp<TA> &lhs, 
const ContainerExp<TB> &rhs ){
+            return DotExp<TA,TB,true,false>( lhs.exp, rhs.self(), 1.0f );
+        }
+        /*! \brief dot operator def */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,false,true> dot( const ContainerExp<TA> &lhs, 
const TransposeExp<TB> &rhs ){
+            return DotExp<TA,TB,false,true>( lhs.self(), rhs.exp, 1.0f );
+        }
+        /*! \brief dot operator def */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,true,true> dot( const TransposeExp<TA> &lhs, const 
TransposeExp<TB> &rhs ){
+            return DotExp<TA,TB,true,true>( lhs.exp, rhs.exp, 1.0f );
+        }
+        /*! \brief dot operator def */
+        template<typename TA, typename TB, bool ltrans, bool rtrans >
+        inline DotExp<TA,TB,ltrans,rtrans> operator*( const 
DotExp<TA,TB,ltrans,rtrans> &lhs, real_t rhs ){
+            return DotExp<TA,TB,ltrans,rtrans>( lhs.lhs_, lhs.rhs_, lhs.scale_ 
* rhs );
+        }
+        /*! \brief scale of dot operation */
+        template<typename TA, typename TB, bool ltrans, bool rtrans >
+        inline DotExp<TA,TB,ltrans,rtrans> operator*( real_t lhs, const 
DotExp<TA,TB,ltrans,rtrans> &rhs ){
+            return DotExp<TA,TB,ltrans,rtrans>( rhs.lhs_, rhs.rhs_, rhs.scale_ 
* lhs );
+        }
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief binary map expression lhs [op] rhs
+         * \tparam OP operator
+         * \tparam TA type of lhs
+         * \tparam TB type of rhs
+         * \tparam etype expression type, sa namespace::type
+         */
+        template<typename OP, typename TA, typename TB, int etype >
+        struct BinaryMapExp: public Exp< BinaryMapExp<OP,TA,TB,etype>, etype >{
+            /*! \brief left operand */
+            const TA& lhs_;
+            /*! \brief right operand */
+            const TB& rhs_;
+            /*! \brief constructor */
+            BinaryMapExp( const TA &lhs, const TB &rhs )
+                :lhs_(lhs), rhs_(rhs){}
+        };
+
+        /*! \brief make expression */
+        template<typename OP,typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > MakeExp( const 
Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) >( lhs.self(), 
rhs.self() );
+        }
+
+        /*! 
+         * \brief short hand for MakeExp, usage F<op>(lhs, rhs). create a 
binary operation expression 
+         * \param lhs left operand
+         * \param rhs right operand
+         * \tparam binary operator 
+         * \tparam TA lhs expression
+         * \tparam ta lhs expression type
+         * \tparam TB rhs expression
+         * \tparam tb rhs expression type
+         * \sa mshadow::op
+         */
+        template<typename OP,typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > F( const 
Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+        /*! \brief operator overload for const */
+        template<typename OP,typename TA, int ta>
+        inline BinaryMapExp<OP,TA,ScalarExp, (ta|type::kMapper) > F( const 
Exp<TA,ta> &lhs, const ScalarExp &rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+        /*! \brief operator overload for const */
+        template<typename OP,typename TB, int tb>
+        inline BinaryMapExp<OP,ScalarExp,TB, (tb|type::kMapper) > F( const 
ScalarExp &lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+
+        // operator rules
+        /*! \brief operator overload */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::plus,TA,TB, (ta|tb|type::kMapper) > operator+( 
const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::minus,TA,TB, (ta|tb|type::kMapper) > 
operator-( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::mul,TA,TB, (ta|tb|type::kMapper) > operator*( 
const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::div,TA,TB, (ta|tb|type::kMapper) > operator/( 
const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+        // constant operators
+        /*! \brief operator overload */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::plus, TA, ScalarExp, (ta|type::kMapper) > 
operator+( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::minus, TA, ScalarExp, (ta|type::kMapper) > 
operator-( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::mul, TA, ScalarExp, (ta|type::kMapper) > 
operator*( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::div, TA, ScalarExp, (ta|type::kMapper) > 
operator/( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+        // constant operators 2
+        /*! \brief operator overload */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::plus, ScalarExp, TB, (tb|type::kMapper) > 
operator+( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::minus, ScalarExp, TB, (tb|type::kMapper) > 
operator-( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::mul, ScalarExp, TB, (tb|type::kMapper) > 
operator*( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::div, ScalarExp, TB, (tb|type::kMapper) > 
operator/( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+    };
+
+    namespace expr{
+        /*!
+         * \brief unary map expression op(src)
+         * \tparam OP operator
+         * \tparam TA type of src
+         * \tparam etype expression type, sa namespace::type
+         */
+        template<typename OP, typename TA, int etype >
+        struct UnaryMapExp: public Exp< UnaryMapExp<OP,TA,etype>, etype >{
+            /*! \brief source expression */
+            const TA& src_;
+            /*! \brief constructor */
+            UnaryMapExp( const TA &src ):src_(src){}
+        };
+
+        /*! \brief make expression */
+        template<typename OP,typename TA, int ta>
+        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > MakeExp( const 
Exp<TA,ta> &src ){
+            return UnaryMapExp<OP,TA, (ta|type::kMapper) >( src.self() );
+        }
+
+        /*! 
+         * \brief short hand for MakeExp, usage F<op>(src), create a unary 
operation expression 
+         * \param src source expression
+         * \tparam operator 
+         * \tparam TA source expression
+         * \tparam ta source expression type
+         * \sa mshadow::op
+         */
+        template<typename OP,typename TA, int ta>
+        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > F( const Exp<TA,ta> &src 
){
+            return MakeExp<OP>(src);
+        }
+    };
+};
+#endif

Reply via email to