Repository: incubator-singa
Updated Branches:
  refs/heads/master 394d78d00 -> 600f27ede


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 4f510ed..01d9fe3 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -21,7 +21,9 @@
 #include "./tensor_math.h"
 #include <cfloat>
 #include "singa/core/common.h"
+#include "singa/core/tensor.h"
 #include <math.h>
+#include <vector>
 
 #ifdef USE_CBLAS
 #include <cblas.h>
@@ -29,422 +31,856 @@
 
 namespace singa {
 
+// template <>
+// void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                            Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = fabs(inPtr[i]);
+//   }
+// }
+
 template <>
-void Abs<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = fabs(inPtr[i]);
-  }
+void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out, Context *ctx) {
+  TraverseUnary<float>(in, out, [](float x) {return fabs(x);});
 }
 
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in, const float x,
+//                            Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = inPtr[i] + x;
+//   }
+// }
+
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->block()->data());
+//   vector<int> traversal_info = in->generate_traversal_info();
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = inPtr[traversal_info[in->shape().size()]] + x;
+//     in->traverse_next(traversal_info, i+1);
+//   }
+// }
+
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                           Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] + x;
-  }
+void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+  auto add_lambda = [&x](float a) {
+    return (a+x);
+  };
+  TraverseUnary<float>(in, out, add_lambda);
 }
 
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            Tensor* out, Context *ctx) {
+//   // CHECK_EQ(ctx->stream, nullptr);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = in1Ptr[i] + in2Ptr[i];
+//   }
+// }
+
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
+//   // CHECK_EQ(ctx->stream, nullptr);
+//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+//   //call axpy if both strides are 1?
+//   vector<int> traversal_info_in1 = in1->generate_traversal_info();
+//   vector<int> traversal_info_in2 = in2->generate_traversal_info();
+//   for (size_t i = 0; i < in1->Size(); i++) {
+//     outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] + in2Ptr[traversal_info_in2[in2->shape().size()]];
+//     in1->traverse_next(traversal_info_in1, i+1);
+//     in2->traverse_next(traversal_info_in2, i+1);
+//   }
+// }
+
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                           Block *out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] + in2Ptr[i];
-  }
+  auto add_lambda_binary = [](float a, float b) {
+    return (a+b);
+  };
+  TraverseBinary<float>(in1, in2, out, add_lambda_binary);
+  
 }
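 
 // Note: TraverseUnary and TraverseBinary are not defined in this file; the
 // following is a minimal sketch of what they are assumed to look like, based
 // on the traversal_info/traverse_next loops used elsewhere in this patch
 // (Div, Log, Sqrt). The actual definitions may differ.
 //
 // template <typename DType, typename UnaryOp>
 // void TraverseUnary(const Tensor *in, Tensor *out, UnaryOp op) {
 //   // write the output contiguously while reading the (possibly strided) input
 //   DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
 //   const DType *inPtr = static_cast<const DType *>(in->block()->data());
 //   std::vector<int> traversal_info = in->generate_traversal_info();
 //   for (size_t i = 0; i < in->Size(); i++) {
 //     outPtr[i] = op(inPtr[traversal_info[in->shape().size()]]);
 //     in->traverse_next(traversal_info, i + 1);
 //   }
 // }
 //
 // template <typename DType, typename BinaryOp>
 // void TraverseBinary(const Tensor *in1, const Tensor *in2, Tensor *out, BinaryOp op) {
 //   DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
 //   const DType *in1Ptr = static_cast<const DType *>(in1->block()->data());
 //   const DType *in2Ptr = static_cast<const DType *>(in2->block()->data());
 //   std::vector<int> t1 = in1->generate_traversal_info();
 //   std::vector<int> t2 = in2->generate_traversal_info();
 //   for (size_t i = 0; i < in1->Size(); i++) {
 //     outPtr[i] = op(in1Ptr[t1[in1->shape().size()]], in2Ptr[t2[in2->shape().size()]]);
 //     in1->traverse_next(t1, i + 1);
 //     in2->traverse_next(t2, i + 1);
 //   }
 // }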
 
+// template <>
+// void Clamp<float, lang::Cpp>(const float low,
+//                              const float high, const Tensor* in, Tensor* out,
+//                              Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     if (inPtr[i] > high) {
+//       outPtr[i] = high;
+//     } else if (inPtr[i] < low) {
+//       outPtr[i] = low;
+//     } else {
+//       outPtr[i] = inPtr[i];
+//     }
+//   }
+// }
+
+// template <>
+// void Clamp<float, lang::Cpp>(const Tensor* in, const float low,
+//                              const float high, Tensor* out,
+//                              Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->block()->data());
+//   vector<int> traversal_info = in->generate_traversal_info();
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     int traversed_index = traversal_info[in->shape().size()];
+//     if (inPtr[traversed_index] > high) {
+//       outPtr[i] = high;
+//     } else if (inPtr[traversed_index] < low) {
+//       outPtr[i] = low;
+//     } else {
+//       outPtr[i] = inPtr[traversed_index];
+//     }
+//     in->traverse_next(traversal_info, i+1);
+//   }
+// }
+
 template <>
-void Clamp<float, lang::Cpp>(const size_t num, const float low,
-                             const float high, const Block *in, Block *out,
+void Clamp<float, lang::Cpp>(const float low, const float high,
+                             const Tensor* in, Tensor* out,
                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    if (inPtr[i] > high) {
-      outPtr[i] = high;
-    } else if (inPtr[i] < low) {
-      outPtr[i] = low;
-    } else {
-      outPtr[i] = inPtr[i];
-    }
-  }
+  auto clamp_lambda = [&low, &high](float a) {
+    if(a < low){return low;}
+    else if(a > high){return high;}
+    else {return a;}
+  };
+  TraverseUnary<float>(in, out, clamp_lambda);
 }
 
+
+// template <>
+// void Div<float, lang::Cpp>(const float x, const Tensor* in,
+//                            Tensor* out, Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     CHECK_NE(inPtr[i], 0.f);
+//     outPtr[i] = x / inPtr[i];
+//   }
+// }
+
 template <>
-void Div<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                           Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    CHECK_NE(in2Ptr[i], 0.f);
-    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+void Div<float, lang::Cpp>(const float x, const Tensor* in, Tensor* out,
+                           Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) {
+    CHECK_NE(inPtr[traversal_info[in->shape().size()]], 0.f);
+    outPtr[i] = x / inPtr[traversal_info[in->shape().size()]];
+    in->traverse_next(traversal_info, i+1);
   }
 }
 
+
+// template <>
+// void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     CHECK_NE(in2Ptr[i], 0.f);
+//     outPtr[i] = in1Ptr[i] / in2Ptr[i];
+//   }
+// }
+
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float x, const Block *in,
-                           Block *out, Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    CHECK_NE(inPtr[i], 0.f);
-    outPtr[i] = x / inPtr[i];
+void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+                           Tensor* out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+  const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+  vector<int> traversal_info_in1 = in1->generate_traversal_info();
+  vector<int> traversal_info_in2 = in2->generate_traversal_info();
+  for (size_t i = 0; i < in1->Size(); i++) {
+    CHECK_NE(in2Ptr[traversal_info_in2[in2->shape().size()]], 0.f);
+    outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] / in2Ptr[traversal_info_in2[in2->shape().size()]];
+    in1->traverse_next(traversal_info_in1, i+1);
+    in2->traverse_next(traversal_info_in2, i+1);
   }
 }
 
+
+// template <>
+// void EltwiseMult<float, lang::Cpp>(const Tensor* in,
+//                                    const float x, Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = inPtr[i] * x;
+//   }
+// }
+
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in,
-                                   const float x, Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] * x;
-  }
+void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                                   Context *ctx) {
+  auto eltwisemult_lambda = [&x](float a) {
+    return (a*x);
+  };
+  TraverseUnary<float>(in, out, eltwisemult_lambda);
 }
 
+// template <>
+// void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+//                                    Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = in1Ptr[i] * in2Ptr[i];
+//   }
+// }
+
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in1,
-                                   const Block *in2, Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] * in2Ptr[i];
-  }
+void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                                   Context *ctx) {
+  auto eltwisemult_lambda_binary = [](float a, float b) {
+    return (a*b);
+  };
+  TraverseBinary<float>(in1, in2, out, eltwisemult_lambda_binary);
 }
+
+// template <>
+// void Exp<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                            Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = exp(inPtr[i]);
+//   }
+// }
+
 template <>
-void Exp<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = exp(inPtr[i]);
-  }
+void Exp<float, lang::Cpp>(const Tensor* in, Tensor *out, Context *ctx) {
+  TraverseUnary<float>(in, out, [](float x) {return exp(x);});
 }
 
+// template <>
+// void GE<float, lang::Cpp>(const Tensor* in, const float x,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
-  }
+void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                          Context *ctx) {
+  auto ge_lambda = [&x](float a) {
+    return (a >= x) ? 1.f : 0.f;
+  };
+  TraverseUnary<float>(in, out, ge_lambda);
 }
 
+// template <>
+// void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr1 = static_cast<const float *>(in1->data());
+//   const float *inPtr2 = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr1 = static_cast<const float *>(in1->data());
-  const float *inPtr2 = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
-  }
+void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                          Context *ctx) {
+  auto ge_lambda_binary = [](float a, float b) {
+    return (a >= b) ? 1.f : 0.f;
+  };
+  TraverseBinary<float>(in1, in2, out, ge_lambda_binary);
 }
+
+// template <>
+// void GT<float, lang::Cpp>(const Tensor* in, const float x,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
-  }
+void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                          Context *ctx) {
+  auto gt_lambda = [&x](float a) {
+    return (a > x) ? 1.f : 0.f;
+  };
+  TraverseUnary<float>(in, out, gt_lambda);
 }
+
+// template <>
+// void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr1 = static_cast<const float *>(in1->data());
+//   const float *inPtr2 = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr1 = static_cast<const float *>(in1->data());
-  const float *inPtr2 = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
-  }
+void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                          Context *ctx) {
+  auto gt_lambda_binary = [](float a, float b) {
+    return (a > b) ? 1.f : 0.f;
+  };
+  TraverseBinary<float>(in1, in2, out, gt_lambda_binary);
 }
 
+// template <>
+// void LE<float, lang::Cpp>(const Tensor* in, const float x,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void LE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
-  }
+void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                          Context *ctx) {
+  auto le_lambda = [&x](float a) {
+    return (a <= x) ? 1.f : 0.f;
+  };
+  TraverseUnary<float>(in, out, le_lambda);
 }
+
+// template <>
+// void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr1 = static_cast<const float *>(in1->data());
+//   const float *inPtr2 = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void LE<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr1 = static_cast<const float *>(in1->data());
-  const float *inPtr2 = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
-  }
+void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                          Context *ctx) {
+  auto le_lambda_binary = [](float a, float b) {
+    return (a <= b) ? 1.f : 0.f;
+  };
+  TraverseBinary<float>(in1, in2, out, le_lambda_binary);
 }
+
+// template <>
+// void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                            Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     CHECK_GT(inPtr[i], 0.f);
+//     outPtr[i] = log(inPtr[i]);
+//   }
+// }
+
 template <>
-void Log<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
-    outPtr[i] = log(inPtr[i]);
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) {
+    CHECK_GT(inPtr[traversal_info[in->shape().size()]], 0.f);
+    outPtr[i] = log(inPtr[traversal_info[in->shape().size()]]);
+    in->traverse_next(traversal_info, i+1);
   }
 }
+
+// template <>
+// void LT<float, lang::Cpp>(const Tensor* in, const float x,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void LT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
-  }
+void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                          Context *ctx) {
+  auto lt_lambda = [&x](float a) {
+    return (a < x) ? 1.f : 0.f;
+  };
+  TraverseUnary<float>(in, out, lt_lambda);
 }
+
+// template <>
+// void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr1 = static_cast<const float *>(in1->data());
+//   const float *inPtr2 = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void LT<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr1 = static_cast<const float *>(in1->data());
-  const float *inPtr2 = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
-  }
+void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                          Context *ctx) {
+  auto lt_lambda_binary = [](float a, float b) {
+    return (a < b) ? 1.f : 0.f;
+  };
+  TraverseBinary<float>(in1, in2, out, lt_lambda_binary);
 }
 
+// template <>
+// void Pow<float, lang::Cpp>(const Tensor* in, const float x,
+//                            Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = pow(inPtr[i], x);
+//   }
+// }
+
 template <>
-void Pow<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                           Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = pow(inPtr[i], x);
-  }
+void Pow<float, lang::Cpp>(const Tensor* in, const float x, Tensor *out, Context *ctx) {
+  TraverseUnary<float>(in, out, [x](float y) {return pow(y,x);});
 }
 
+// template <>
+// void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
+//   }
+// }
+
 template <>
-void Pow<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                           Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
-  }
+void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                           Context *ctx) {
+  auto pow_lambda_binary = [](float a, float b) {
+    return pow(a,b);
+  };
+  TraverseBinary<float>(in1, in2, out, pow_lambda_binary);
 }
+
+// template <>
+// void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                             Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+//   }
+// }
+
 template <>
-void ReLU<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
-  }
+void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+                          Context *ctx) {
+  auto relu_lambda = [](float a) {
+    return (a >= 0.f) ? a : 0.f;
+  };
+  TraverseUnary<float>(in, out, relu_lambda);
 }
+
+// template <>
+// void Set<float, lang::Cpp>(const float x, Tensor* out,
+//                            Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
+// }
+
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Set<float, lang::Cpp>(const float x, Tensor* out,
                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
+
+// template <>
+// void Set<int, lang::Cpp>(const int x, Tensor* out,
+//                            Context *ctx) {
+//   int *outPtr = static_cast<int *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
+// }
+
 template <>
-void Set<int, lang::Cpp>(const size_t num, const int x, Block *out,
+void Set<int, lang::Cpp>(const int x, Tensor* out,
                            Context *ctx) {
-  int *outPtr = static_cast<int *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+  int *outPtr = static_cast<int *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
 
+// template <>
+// void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                                Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
+//   }
+// }
+
 template <>
-void Sigmoid<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                               Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
-  }
+void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+                          Context *ctx) {
+  auto sigmoid_lambda = [](float a) {
+    return 1.f / (1.f + exp(-a));
+  };
+  TraverseUnary<float>(in, out, sigmoid_lambda);
 }
 
+// template <>
+// void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                             Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
+//   }
+// }
+
 template <>
-void Sign<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
-  }
+void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+                          Context *ctx) {
+  auto sign_lambda = [](float a) {
+    return (a > 0) - (a < 0);
+  };
+  TraverseUnary<float>(in, out, sign_lambda);
 }
 
+// template <>
+// void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                             Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     CHECK_GE(inPtr[i], 0.f);
+//     outPtr[i] = sqrt(inPtr[i]);
+//   }
+// }
+
 template <>
-void Sqrt<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
                             Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    CHECK_GE(inPtr[i], 0.f);
-    outPtr[i] = sqrt(inPtr[i]);
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) {
+    CHECK_GE(inPtr[traversal_info[in->shape().size()]], 0.f);
+    outPtr[i] = sqrt(inPtr[traversal_info[in->shape().size()]]);
+    in->traverse_next(traversal_info, i+1);
   }
 }
+
 /*
 template <>
-void Square<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Square<float, lang::Cpp>(const Tensor* in, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
+  for (size_t i = 0; i < in->Size(); i++) {
     outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
 */
 
+// template <>
+// void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            Tensor* out, Context *ctx) {
+//   // CHECK_EQ(ctx->stream, nullptr);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = in1Ptr[i] - in2Ptr[i];
+//   }
+// }
+
 template <>
-void Sub<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                           Block *out, Context *ctx) {
+void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+                           Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] - in2Ptr[i];
-  }
+  auto sub_lambda_binary = [](float a, float b) {
+    return (a-b);
+  };
+  TraverseBinary<float>(in1, in2, out, sub_lambda_binary);
 }
 
 // sum all elements of input into out
 // TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Sum<float, lang::Cpp>(const Tensor* in, float *out,
                            Context *ctx) {
   float s = 0.f;
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  for (size_t i = 0; i < in->Size(); i++) {
     s += inPtr[i];
   }
   *out = s;
 }
 
+// template <>
+// void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                             Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = tanh(inPtr[i]);
+//   }
+// }
+
 template <>
-void Tanh<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = tanh(inPtr[i]);
-  }
+void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+                          Context *ctx) {
+  auto tanh_lambda = [](float a) {
+    return tanh(a);
+  };
+  TraverseUnary<float>(in, out, tanh_lambda);
 }
 
 // ===============Random operations==========================================
+// template <>
+// void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
+//                                  Context *ctx) {
+//   std::bernoulli_distribution distribution(p);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+//   }
+// }
+
 template <>
-void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Block *out,
+void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
                                  Context *ctx) {
   std::bernoulli_distribution distribution(p);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) {
     outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
   }
 }
 
+// template <>
+// void Gaussian<float, lang::Cpp>(const float mean,
+//                                 const float std, Tensor* out, Context *ctx) {
+//   std::normal_distribution<float> distribution(mean, std);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+//   }
+// }
+
 template <>
-void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
-                                const float std, Block *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const float mean,
+                                const float std, Tensor* out, Context *ctx) {
   std::normal_distribution<float> distribution(mean, std);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) {
     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
+
+// template <>
+// void Uniform<float, lang::Cpp>(const float low,
+//                                const float high, Tensor* out, Context *ctx) {
+//   std::uniform_real_distribution<float> distribution(low, high);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+//   }
+// }
+
 template <>
-void Uniform<float, lang::Cpp>(const size_t num, const float low,
-                               const float high, Block *out, Context *ctx) {
+void Uniform<float, lang::Cpp>(const float low,
+                               const float high, Tensor* out, Context *ctx) {
   std::uniform_real_distribution<float> distribution(low, high);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) {
     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
 // ====================Blas operations======================================
 
+//yisen todo: this function overwrites block M onto itself
 template <>
-void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
-                            const size_t ncol, const Block *M, const Block *v,
-                            Block *out, Context *ctx) {
-  const float *MPtr = static_cast<const float *>(M->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void DGMM<float, lang::Cpp>(const bool side_right,
+                            const Tensor* M, const Tensor* v,
+                            Tensor* out, Context *ctx) {
+  const float *MPtr = static_cast<const float *>(M->block()->data());
+  const float *vPtr = static_cast<const float *>(v->block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const size_t nrow = M->shape(0);
+  const size_t ncol = M->shape(1);
+  vector<int> traversal_info = M->generate_traversal_info();
+
   if (side_right) {
     for (size_t r = 0; r < nrow; r++) {
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[offset + c] = MPtr[offset + c] * vPtr[c];
+        outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[c];
+        M->traverse_next(traversal_info, offset+c+1);
       }
     }
   } else {
     for (size_t r = 0; r < nrow; r++) {
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[offset + c] = MPtr[offset + c] * vPtr[r];
+        outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[r];
+        M->traverse_next(traversal_info, offset+c+1);
       }
     }
   }
 }
 
+// #ifdef USE_CBLAS
+// template <>
+// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
+//                             Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   *out = cblas_isamax(in->Size(), inPtr, 1);
+// }
+
+// template <>
+// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
+//                             Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   *out = cblas_sasum(in->Size(), inPtr, 1);
+// }
+
+// template <>
+// void Axpy<float, lang::Cpp>(const float alpha,
+//                             const Tensor* in, Tensor* out, Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+// }
+
+// template <>
+// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            float *out, Context *ctx) {
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   *out = cblas_sdot(in->Size(), in1Ptr, 1, in2Ptr, 1);
+// }
+// template <>
+// void Scale<float, lang::Cpp>(const float x, Tensor* out,
+//                              Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   cblas_sscal(in->Size(), x, outPtr, 1);
+// }
+// template <>
+// void Nrm2<float, lang::Cpp>(const Tensor* in, float *out,
+//                             Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   *out = cblas_snrm2(in->Size(), inPtr, 1);
+// }
+
 #ifdef USE_CBLAS
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  *out = cblas_isamax(num, inPtr, 1);
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  *out = cblas_isamax(in->Size(), inPtr, 1); //not using strided traversal
 }
 
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor *in, float *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  *out = cblas_sasum(num, inPtr, 1);
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  *out = cblas_sasum(in->Size(), inPtr, 1); //not using strided traversal
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
-                            const Block *in, Block *out, Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
+void Axpy<float, lang::Cpp>(const float alpha,
+                            const Tensor *in, Tensor *out, Context *ctx) {
+  //check input tensor for strides first
+  if((in->strides())[0] == 1){
+    const float *inPtr = static_cast<const float *>(in->block()->data());
+    float *outPtr = static_cast<float *>(out->block()->mutable_data());
+    cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+  }
+  //yisen todo
+  //else throw error
 }
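 
 // Note: when (in->strides())[0] != 1 the function above currently does nothing
 // (the "else throw error" TODO). A hypothetical fallback, mirroring the
 // traversal loop of the non-CBLAS Axpy later in this file, could look like:
 //
 // static void AxpyStrided(const float alpha, const Tensor *in, Tensor *out) {
 //   float *outPtr = static_cast<float *>(out->block()->mutable_data());
 //   const float *inPtr = static_cast<const float *>(in->block()->data());
 //   std::vector<int> traversal_info = in->generate_traversal_info();
 //   for (size_t i = 0; i < in->Size(); i++) {
 //     outPtr[i] += alpha * inPtr[traversal_info[in->shape().size()]];
 //     in->traverse_next(traversal_info, i + 1);
 //   }
 // }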
 
 template <>
-void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
                            float *out, Context *ctx) {
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
+  //check input tensor for strides first
+  if(((in1->strides())[0] == 1) && ((in2->strides())[0] == 1)){
+    const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+    const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+    *out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+  }
+  //yisen todo
+  //else throw error
 }
+
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Scale<float, lang::Cpp>(const float x, Tensor *out,
                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cblas_sscal(num, x, outPtr, 1);
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  cblas_sscal(out->Size(), x, outPtr, 1); //not using strided traversal
 }
+
 template <>
-void Nrm2<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Nrm2<float, lang::Cpp>(const Tensor *in, float *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  *out = cblas_snrm2(num, inPtr, 1);
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  *out = cblas_snrm2(in->Size(), inPtr, 1); //not using strided traversal
 }
 
+// template <>
+// void GEMV<float, lang::Cpp>(//bool trans,
+//                             const std::vector<int> stridesA,
+//                             const size_t m, const size_t n,
+//                             const float alpha, const Tensor* A, const Tensor* v,
+//                             const float beta, Tensor* out, Context *ctx) {
+//   const float *APtr = static_cast<const float *>(A->data());
+//   const float *vPtr = static_cast<const float *>(v->data());
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   auto trans = (stridesA.back() == 1) ? true : false;
+//   if (!trans) {
+//     cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+//                 beta, outPtr, 1);
+//   } else {
+//     cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
+//                 outPtr, 1);
+//   }
+// }
+
 template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-                            const float alpha, const Block *A, const Block *v,
-                            const float beta, Block *out, Context *ctx) {
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+                            const float beta, Tensor *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->block()->data());
+  const float *vPtr = static_cast<const float *>(v->block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  auto trans = ((A->strides())[0] != 1) ? true : false;
+  const size_t m = A->shape()[0];
+  const size_t n = A->shape()[1];
   if (!trans) {
     cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
                 beta, outPtr, 1);
@@ -454,33 +890,147 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
   }
 }
 
+// template <>
+// void GEMM<float, lang::Cpp>(//const bool transA, const bool transB,
+//                             const std::vector<int> stridesA, const std::vector<int> stridesB,
+//                             const size_t nrowA, const size_t ncolB,
+//                             const size_t ncolA, const float alpha,
+//                             const Tensor* A, const Tensor* B, const float beta,
+//                             Tensor* C, Context *ctx) {
+//   auto transA = (stridesA.back() == 1) ? true : false;
+//   auto transa = transA ? CblasTrans : CblasNoTrans;
+//   auto transB = (stridesB.back() == 1) ? true : false;
+//   auto transb = transB ? CblasTrans : CblasNoTrans;
+//   auto lda = transA ? nrowA : ncolA;
+//   auto ldb = transB ? ncolA : ncolB;
+//   auto ldc = ncolB;
+//   const float *APtr = static_cast<const float *>(A->data());
+//   const float *BPtr = static_cast<const float *>(B->data());
+//   float *CPtr = static_cast<float *>(C->mutable_data());
+//   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
+//    lda, BPtr, ldb, beta, CPtr, ldc);
+// }
+
 template <>
-void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
-                            const size_t nrowA, const size_t ncolB,
-                            const size_t ncolA, const float alpha,
-                            const Block *A, const Block *B, const float beta,
-                            Block *C, Context *ctx) {
+void GEMM<float, lang::Cpp>(const float alpha,
+                            const Tensor *A, const Tensor *B, const float beta,
+                            Tensor *C, Context *ctx) {
+  auto transA = ((A->strides())[0] != 1) ? true : false;
   auto transa = transA ? CblasTrans : CblasNoTrans;
+  auto transB = ((B->strides())[0] != 1) ? true : false;
   auto transb = transB ? CblasTrans : CblasNoTrans;
+  const size_t nrowA = A->shape()[0];
+  const size_t ncolA = A->shape()[1];
+  const size_t ncolB = B->shape()[1];
   auto lda = transA ? nrowA : ncolA;
   auto ldb = transB ? ncolA : ncolB;
   auto ldc = ncolB;
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *BPtr = static_cast<const float *>(B->data());
-  float *CPtr = static_cast<float *>(C->mutable_data());
+  const float *APtr = static_cast<const float *>(A->block()->data());
+  const float *BPtr = static_cast<const float *>(B->block()->data());
+  float *CPtr = static_cast<float *>(C->block()->mutable_data());
   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
-         lda, BPtr, ldb, beta, CPtr, ldc);
+    lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
 #else
 
+// template <>
+// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
+//                             Context *ctx) {
+//   size_t maxPos = 0;
+//   float maxVal = 0;
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     if (i == 0) {
+//       maxVal = inPtr[i];
+//     } else if (inPtr[i] > maxVal) {
+//       maxVal = inPtr[i];
+//       maxPos = i;
+//     }
+//   }
+//   *out = maxPos;
+// }
+// template <>
+// void Amin<float, lang::Cpp>(const Tensor* in, size_t *out,
+//                             Context *ctx) {
+//   size_t minPos = 0;
+//   float minVal = 0;
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     if (i == 0) {
+//       minVal = inPtr[i];
+//     } else if (inPtr[i] > minVal) {
+//       minVal = inPtr[i];
+//       minPos = i;
+//     }
+//   }
+//   *out = minPos;
+// }
+
+// template <>
+// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
+//                             Context *ctx) {
+//   float sum = 0;
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     sum += fabs(inPtr[i]);
+//   }
+// }
+
+// template <>
+// void Axpy<float, lang::Cpp>(const float alpha,
+//                             const Tensor* in, Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] += alpha * inPtr[i];
+//   }
+// }
+
+// template <>
+// void Scale<float, lang::Cpp>(const float x, Tensor* out,
+//                              Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] *= x;
+//   }
+// }
+
+// template <>
+// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            float *out, Context *ctx) {
+//   float sum = 0;
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     sum += in1Ptr[i] * in2Ptr[i];
+//   }
+// }
+
+// template <>
+// void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+//                             const float alpha, const Tensor* A, const Tensor* v,
+//                             const float beta, Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *APtr = static_cast<const float *>(A->data());
+//   const float *vPtr = static_cast<const float *>(v->data());
+//   for (size_t r = 0; r < m; r++) {
+//     float sum = 0;
+//     for (size_t c = 0; c < n; c++) {
+//       size_t idx = trans ? c * m + r : r * n + c;
+//       sum += APtr[idx] * vPtr[c];
+//     }
+//     outPtr[r] = alpha * sum + beta * outPtr[r];
+//   }
+// }
+
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
                             Context *ctx) {
   size_t maxPos = 0;
   float maxVal = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
     if (i == 0) {
       maxVal = inPtr[i];
     } else if (inPtr[i] > maxVal) {
@@ -491,12 +1041,12 @@ void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
   *out = maxPos;
 }
 template <>
-void Amin<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
                             Context *ctx) {
   size_t minPos = 0;
   float minVal = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
     if (i == 0) {
       minVal = inPtr[i];
     } else if (inPtr[i] < minVal) {
@@ -508,52 +1058,67 @@ void Amin<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
 }
 
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor *in, float *out,
                             Context *ctx) {
   float sum = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    sum += fabs(inPtr[i]);
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  for (size_t i = 0; i < in->Size(); i++) {
+    sum += fabs(inPtr[i]); //not using strided traversal
   }
+  *out = sum;
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
-                            const Block *in, Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] += alpha * inPtr[i];
+void Axpy<float, lang::Cpp>(const float alpha,
+                            const Tensor *in, Tensor *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) { 
+    outPtr[i] += alpha * inPtr[traversal_info[in->shape().size()]];
+    in->traverse_next(traversal_info, i+1);
   }
 }
 
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Scale<float, lang::Cpp>(const float x, Tensor *out,
                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] *= x;
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) {
+    outPtr[i] *= x; //not using strided traversal
   }
 }
 
+//yisen todo check purpose of sum in this function
 template <>
-void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
                            float *out, Context *ctx) {
   float sum = 0;
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    sum += in1Ptr[i] * in2Ptr[i];
+  // const float *in1Ptr = static_cast<const float *>(in1->data());
+  // const float *in2Ptr = static_cast<const float *>(in2->data());
+  // for (size_t i = 0; i < in->Size(); i++) {
+  //   sum += in1Ptr[i] * in2Ptr[i]; 
+  // }
+  const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+  const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+  vector<int> traversal_info_in1 = in1->generate_traversal_info();
+  vector<int> traversal_info_in2 = in2->generate_traversal_info();
+  for (size_t i = 0; i < in1->Size(); i++) {
+    sum += in1Ptr[traversal_info_in1[in1->shape().size()]] * in2Ptr[traversal_info_in2[in2->shape().size()]];
+    in1->traverse_next(traversal_info_in1, i+1);
+    in2->traverse_next(traversal_info_in2, i+1);
   }
+  *out = sum;
 }
 
 template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-                            const float alpha, const Block *A, const Block *v,
-                            const float beta, Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+                            const float beta, Tensor *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *APtr = static_cast<const float *>(A->block()->data());
+  const float *vPtr = static_cast<const float *>(v->block()->data());
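+  // infer from the leading stride whether A is stored as a transposed 2-D
+  // view (the exact stride convention is defined by the Tensor class)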
+  bool trans = (A->strides()[0] != 1);
+  const size_t m = A->shape(0);
+  const size_t n = A->shape(1);
   for (size_t r = 0; r < m; r++) {
     float sum = 0;
     for (size_t c = 0; c < n; c++) {
@@ -564,6 +1129,7 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
   }
 }
 
+//yisen todo
 #endif  // USE_CBLAS
 template <>
 void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
@@ -626,16 +1192,35 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(bool int_target,
   }
 }
 
+// template <>
+// void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+//                               const Tensor* in, Tensor* out, Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t r = 0; r < nrow; r++) {
+//     int offset = (int)(r * ncol);
+//     float maxval = inPtr[offset];
+//     for (size_t c = 1; c < ncol; c++)
+//       maxval = (std::max)(maxval, inPtr[offset + c]);
+//     outPtr[r] = maxval;
+//   }
+// }
+
 template <>
-void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Block *in, Block *out, Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const size_t nrow = in->shape()[0];
+  const size_t ncol = in->shape()[1];
+  vector<int> traversal_info = in->generate_traversal_info();
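+  // traverse_next takes the global element index (row offset + column + 1),
+  // keeping the shared traversal state consistent across rows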
+
   for (size_t r = 0; r < nrow; r++) {
-    int offset = (int)(r * ncol);
-    float maxval = inPtr[offset];
-    for (size_t c = 1; c < ncol; c++)
-      maxval = (std::max)(maxval, inPtr[offset + c]);
+    int counter_offset = (r * ncol);
+    float maxval = -FLT_MAX;  // start below any float so all-negative rows are handled
+    for (size_t c = 0; c < ncol; c++){
+      maxval = (std::max)(maxval, inPtr[traversal_info[in->shape().size()]]);
+      in->traverse_next(traversal_info, counter_offset+c+1);
+    }
     outPtr[r] = maxval;
   }
 }
@@ -644,7 +1229,7 @@ void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 /*
 template <>
 void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Block *A, const Block *v, Block *out,
+                              const Tensor* A, const Tensor* v, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *APtr = static_cast<const float *>(A->data());
@@ -659,7 +1244,7 @@ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Block *A, const Block *v, Block *out,
+                              const Tensor* A, const Tensor* v, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *APtr = static_cast<const float *>(A->data());
@@ -672,8 +1257,8 @@ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
   }
 }
 template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Block *in1,
-                             const Block *in2, Block *out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
+                             const Tensor* in2, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
@@ -686,7 +1271,7 @@ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Block *in1,
 }
 template <>
 void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Block *in, Block *out, Context *ctx) {
+                               const Tensor* in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   float *bPtr = new float[ncol];
@@ -707,7 +1292,7 @@ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Block *in, Block *out, Context *ctx) {
+                                  const Tensor* in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t c = 0; c < ncol; c++) {
@@ -723,7 +1308,7 @@ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Block *in, Block *out, Context *ctx) {
+                               const Tensor* in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t r = 0; r < nrow; r++) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index 9264e55..fd25607 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -50,19 +50,19 @@ enum CopyDirection {
 
 // configuration for device memory pool
 message MemPoolConf {
-       optional string type = 1 [default = "cnmem"];
-       // allocation size for each device, default is 256 MB
-       optional uint32 init_size = 2 [default = 256];
+  optional string type = 1 [default = "cnmem"];
+  // allocation size for each device, default is 256 MB
+  optional uint32 init_size = 2 [default = 256];
   // size limit in MB; report error/warning if this limit is reached.
   // 0 for unlimited memory, i.e., use as much memory as the device has
   // not used currently.
-       optional uint32 max_size = 3 [default = 0];
+  optional uint32 max_size = 3 [default = 0];
 
-       // memory manager flag for cnmem
-       // flag = 0: default flag
-       // flag = 1: prevent the manager from growing its memory consumption
-       // flag = 2: prevent the manager from stealing memory
-       optional uint32 flag = 11 [default = 0];
+  // memory manager flag for cnmem
+  // flag = 0: default flag
+  // flag = 1: prevent the manager from growing its memory consumption
+  // flag = 2: prevent the manager from stealing memory
+  optional uint32 flag = 11 [default = 0];
   repeated uint32 device = 12;
 }
 
@@ -70,7 +70,8 @@ message MemPoolConf {
 message TensorProto {
   repeated uint32 shape = 1;
   optional DataType data_type = 2;
-  optional bool transpose = 3;
+  //optional bool transpose = 3;
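+  // strides replace the old transpose flag; e.g. a row-major 2x3 tensor could
+  // be serialized with strides {3, 1} and its transposed view with {1, 3}
+  // (illustrative values; the exact convention is defined by the Tensor class)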
+  repeated int32 strides = 3;
   repeated float float_data = 4 [packed = true];
   repeated double double_data = 5 [packed = true];
   repeated int32 int_data = 6 [packed = true];
