http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cpp.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h index ec7a892..2c5c272 100644 --- a/src/core/tensor/tensor_math_cpp.h +++ b/src/core/tensor/tensor_math_cpp.h @@ -25,12 +25,11 @@ #include <cblas.h> #endif -/// TODO(wangwei) Clean the implementations following the comments in -/// tensor_math.h. namespace singa { -template<> -void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +template <> +void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { @@ -39,180 +38,150 @@ void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context } template <> -void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) { +void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); - for (size_t i = 0; i < num; i++) outPtr[i] = x; + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] = inPtr[i] + x; + } } -// sum all elements of input into out -// TODO(wangwei) optimize using omp template <> -void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) { - float s = 0.f; - const float *inPtr = static_cast<const float *>(in->data()); +void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, + Blob *out, Context *ctx) { + // CHECK_EQ(ctx->stream, nullptr); + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); for (size_t i = 0; i < num; i++) { - s += inPtr[i]; + outPtr[i] = in1Ptr[i] + in2Ptr[i]; } - *out = s; } template <> -void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +void Clamp<float, lang::Cpp>(const size_t num, const float low, + const float high, const Blob *in, Blob *out, + Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); - const float *inPtr = static_cast<const float*>(in->data()); + const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; + if (inPtr[i] > high) { + outPtr[i] = high; + } else if (inPtr[i] < low) { + outPtr[i] = low; + } else { + outPtr[i] = inPtr[i]; + } } } template <> -void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, + Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); + for (size_t i = 0; i < num; i++) { + CHECK_NE(in2Ptr[i], 0.f); + outPtr[i] = in1Ptr[i] / in2Ptr[i]; + } +} + +template <> +void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in, + Blob *out, Context *ctx) { const float *inPtr = static_cast<const float *>(in->data()); + float *outPtr = static_cast<float *>(out->mutable_data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = exp(inPtr[i]); + CHECK_NE(inPtr[i], 0.f); + outPtr[i] = x / inPtr[i]; } } template <> -void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, + const float x, Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - CHECK_GT(inPtr[i], 0.f); - outPtr[i] = log(inPtr[i]); + outPtr[i] = inPtr[i] * x; } } template <> -void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, + const Blob *in2, Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] = in1Ptr[i] * in2Ptr[i]; + } +} +template <> +void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - CHECK_GT(inPtr[i], 0.f); - outPtr[i] = sqrt(inPtr[i]); + outPtr[i] = exp(inPtr[i]); } } template <> -void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = inPtr[i] * inPtr[i]; + outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f; } } template <> -void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = tanh(inPtr[i]); + outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f; } } - template <> -void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f; + outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f; } } - template <> -void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) { +void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = 1.f / (1.f + exp(-inPtr[i])); + CHECK_GT(inPtr[i], 0.f); + outPtr[i] = log(inPtr[i]); } } - template <> -void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, - Blob *out, Context *ctx) { +void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); - float *bPtr = new float[ncol]; - for (size_t r = 0; r < nrow; r++) { - size_t offset = r * ncol; - float denom = 0.f; - for (size_t c = 0; c < ncol; c++) { - bPtr[c] = exp(inPtr[offset + c]); - denom += bPtr[c]; - } - for (size_t c = 0; c < ncol; c++) { - size_t idx = offset + c; - outPtr[idx] = bPtr[c] / denom; - } - } - delete bPtr; -} - -template <> -void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, - Blob *out, Context *ctx) { - float *outPtr = static_cast<float *>(out->mutable_data()); - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t r = 0; r < nrow; r++) { - size_t offset = r * ncol; - outPtr[r] = 0.f; - for (size_t c = 0; c < ncol; c++) { - outPtr[r] += inPtr[offset + c]; - } - } -} - -template <> -void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, Context *ctx) { - float *outPtr = static_cast<float *>(out->mutable_data()); - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t c = 0; c < ncol; c++) { - outPtr[c] = 0.f; - } - for (size_t r = 0; r < nrow; r++) { - size_t offset = r * ncol; - for (size_t c = 0; c < ncol; c++) { - outPtr[c] += inPtr[offset + c]; - } - } -} - -template <> -void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, - Blob *out, Context *ctx) { - float *outPtr = static_cast<float *>(out->mutable_data()); - const float *APtr = static_cast<const float *>(A->data()); - const float *vPtr = static_cast<const float *>(v->data()); - for (size_t r = 0; r < nrow; r++) { - size_t offset = r * ncol; - for (size_t c = 0; c < ncol; c++) { - outPtr[offset + c] = APtr[offset + c] + vPtr[c]; - } - } -} - -template <> -void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, - Blob *out, Context *ctx) { - float *outPtr = static_cast<float *>(out->mutable_data()); - const float *APtr = static_cast<const float *>(A->data()); - const float *vPtr = static_cast<const float *>(v->data()); - for (size_t r = 0; r < nrow; r++) { - size_t offset = r * ncol; - for (size_t c = 0; c < ncol; c++) { - outPtr[offset + c] = APtr[offset + c] + vPtr[r]; - } - } -} - -template <> -void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob *out, Context *ctx) { - float *outPtr = static_cast<float *>(out->mutable_data()); - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { + for (size_t i = 0; i < num; i++) { + outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f; + } +} +template <> +void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { outPtr[i] = pow(inPtr[i], x); } } @@ -220,252 +189,230 @@ void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob template <> void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, Blob *out, Context *ctx) { - float *outPtr= static_cast<float *>(out->mutable_data()); - const float *in1Ptr= static_cast<const float *>(in1->data()); + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *in1Ptr = static_cast<const float *>(in1->data()); const float *in2Ptr = static_cast<const float *>(in2->data()); for (size_t i = 0; i < num; i++) { outPtr[i] = pow(in1Ptr[i], in2Ptr[i]); } } - template <> -void Clamp<float, lang::Cpp>(const size_t num, const float low, const float high, const Blob *in, - Blob *out, Context *ctx) { - float *outPtr = static_cast<float *>(out->mutable_data()); - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - if (inPtr[i] > high) { - outPtr[i] = high; - } - else if (inPtr[i] < low) { - outPtr[i] = low; - } - else { - outPtr[i] = inPtr[i]; - } - } +void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f; + } } - template <> -void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x, - Blob *out, Context *ctx) { +void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + for (size_t i = 0; i < num; i++) outPtr[i] = x; +} +template <> +void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = inPtr[i] + x; + outPtr[i] = 1.f / (1.f + exp(-inPtr[i])); } } template <> -void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, - Blob *out, Context *ctx) { - // CHECK_EQ(ctx->stream, nullptr); - float *outPtr= static_cast<float *>(out->mutable_data()); - const float *in1Ptr = static_cast<const float *>(in1->data()); - const float *in2Ptr = static_cast<const float *>(in2->data()); +void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = in1Ptr[i] + in2Ptr[i]; + outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; } } template <> -void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, - Blob *out, Context *ctx) { - // CHECK_EQ(ctx->stream, nullptr); - float *outPtr= static_cast<float *>(out->mutable_data()); - const float *in1Ptr = static_cast<const float *>(in1->data()); - const float *in2Ptr = static_cast<const float *>(in2->data()); +void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = in1Ptr[i] - in2Ptr[i]; + CHECK_GT(inPtr[i], 0.f); + outPtr[i] = sqrt(inPtr[i]); } } template <> -void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, const float x, - Blob *out, Context *ctx) { - float *outPtr= static_cast<float *>(out->mutable_data()); +void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = inPtr[i] * x; + outPtr[i] = inPtr[i] * inPtr[i]; } } template <> -void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, - Blob *out, Context *ctx) { - float *outPtr= static_cast<float *>(out->mutable_data()); +void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, + Blob *out, Context *ctx) { + // CHECK_EQ(ctx->stream, nullptr); + float *outPtr = static_cast<float *>(out->mutable_data()); const float *in1Ptr = static_cast<const float *>(in1->data()); const float *in2Ptr = static_cast<const float *>(in2->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = in1Ptr[i] * in2Ptr[i]; + outPtr[i] = in1Ptr[i] - in2Ptr[i]; } } +// sum all elements of input into out +// TODO(wangwei) optimize using omp template <> -void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, - Blob *out, Context *ctx) { - float *outPtr= static_cast<float *>(out->mutable_data()); - const float *in1Ptr = static_cast<const float *>(in1->data()); - const float *in2Ptr = static_cast<const float *>(in2->data()); +void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, + Context *ctx) { + float s = 0.f; + const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - CHECK_NE(in2Ptr[i],0.f); - outPtr[i] = in1Ptr[i] / in2Ptr[i]; + s += inPtr[i]; } + *out = s; } template <> -void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in, - Blob *out, Context *ctx) { - float *outPtr= static_cast<float *>(out->mutable_data()); +void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - CHECK_NE(inPtr[i],0.f); - outPtr[i] = x / inPtr[i]; + outPtr[i] = tanh(inPtr[i]); } } +// =========Matrix operations ================================================ + template <> -void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, const Blob *in2, - Blob *out, Context *ctx) { - float *outPtr= static_cast<float *>(out->mutable_data()); - const float *in1Ptr = static_cast<const float *>(in1->data()); - const float *in2Ptr = static_cast<const float *>(in2->data()); - for (size_t r = 0; r < m ; r++) { - size_t offset = r * n; - for (size_t c = 0; c < n; c++) { - outPtr[offset + c] = in1Ptr[r] * in2Ptr[c]; - } - } +void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *A, const Blob *v, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *APtr = static_cast<const float *>(A->data()); + const float *vPtr = static_cast<const float *>(v->data()); + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + for (size_t c = 0; c < ncol; c++) { + outPtr[offset + c] = APtr[offset + c] + vPtr[r]; + } + } } template <> -void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x, - Blob *out, Context *ctx) { +void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *A, const Blob *v, Blob *out, + Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f; + const float *APtr = static_cast<const float *>(A->data()); + const float *vPtr = static_cast<const float *>(v->data()); + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + for (size_t c = 0; c < ncol; c++) { + outPtr[offset + c] = APtr[offset + c] + vPtr[c]; + } } } - template <> -void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x, - Blob *out, Context *ctx) { +void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, + const Blob *in2, Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f; + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); + for (size_t r = 0; r < m; r++) { + size_t offset = r * n; + for (size_t c = 0; c < n; c++) { + outPtr[offset + c] = in1Ptr[r] * in2Ptr[c]; + } } } - template <> -void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x, - Blob *out, Context *ctx) { +void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *in, Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f; + float *bPtr = new float[ncol]; + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + float denom = 0.f; + for (size_t c = 0; c < ncol; c++) { + bPtr[c] = exp(inPtr[offset + c]); + denom += bPtr[c]; + } + for (size_t c = 0; c < ncol; c++) { + size_t idx = offset + c; + outPtr[idx] = bPtr[c] / denom; + } } + delete bPtr; } template <> -void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x, - Blob *out, Context *ctx) { +void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *in, Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f; + for (size_t c = 0; c < ncol; c++) { + outPtr[c] = 0.f; + } + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + for (size_t c = 0; c < ncol; c++) { + outPtr[c] += inPtr[offset + c]; + } } } template <> -void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) { - size_t maxPos = 0; - float maxVal = 0; - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - if (i == 0) { - maxVal = inPtr[i]; - } - else if (inPtr[i] > maxVal) { - maxVal = inPtr[i]; - maxPos = i; - } - } - *out = maxPos; -} - -template <> -void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) { - size_t minPos = 0; - float minVal = 0; +void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *in, Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - if (i == 0) { - minVal = inPtr[i]; - } - else if (inPtr[i] > minVal) { - minVal = inPtr[i]; - minPos = i; - } - } - *out = minPos; + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + outPtr[r] = 0.f; + for (size_t c = 0; c < ncol; c++) { + outPtr[r] += inPtr[offset + c]; + } + } } +// ===============Random operations========================================== template <> -void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) { - float sum = 0; - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - sum += fabs(inPtr[i]); - } +void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, + Context *ctx) { + std::bernoulli_distribution distribution(p); + float *outPtr = static_cast<float *>(out->mutable_data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f; + } } template <> -void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in, - Blob *out, Context *ctx) { +void Gaussian<float, lang::Cpp>(const size_t num, const float mean, + const float std, Blob *out, Context *ctx) { + std::normal_distribution<float> distribution(mean, std); float *outPtr = static_cast<float *>(out->mutable_data()); - const float *inPtr = static_cast<const float *>(in->data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] += alpha * inPtr[i]; - } + for (size_t i = 0; i < num; i++) { + outPtr[i] = static_cast<float>(distribution(ctx->random_generator)); + } } - template <> -void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) { - float *outPtr = static_cast<float *>(out->mutable_data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] *= x; - } +void Uniform<float, lang::Cpp>(const size_t num, const float low, + const float high, Blob *out, Context *ctx) { + std::uniform_real_distribution<float> distribution(low, high); + float *outPtr = static_cast<float *>(out->mutable_data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] = static_cast<float>(distribution(ctx->random_generator)); + } } -//template <> -//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, -// float *out, Context *ctx) { -// float sum = 0; -// const float *in1Ptr = static_cast<const float *>(in1->data()); -// const float *in2Ptr = static_cast<const float *>(in2->data()); -// for (size_t i = 0; i < num; i++) { -// sum += in1Ptr[i] * in2Ptr[i]; -// } -//} - -template <> -void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, const float alpha, - const Blob *A, const Blob *v, const float beta, - Blob *out, Context *ctx) { - float *outPtr = static_cast<float *>(out->mutable_data()); - const float* APtr = static_cast<const float *>(A->data()); - const float* vPtr = static_cast<const float *>(v->data()); - for (size_t r = 0; r < m; r++) { - float sum = 0; - for (size_t c = 0; c < n; c++) { - size_t idx = trans ? c * m + r : r * n + c; - sum += APtr[idx] * vPtr[c]; - } - outPtr[r] = alpha * sum + beta * outPtr[r]; - } -} +// ====================Blas operations====================================== template <> void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow, @@ -491,37 +438,21 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow, } } +#ifdef USE_CBLAS template <> -void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, Context *ctx) { - std::bernoulli_distribution distribution(p); +void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in, + Blob *out, Context *ctx) { + const float *inPtr = static_cast<const float *>(in->data()); float *outPtr = static_cast<float *>(out->mutable_data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f; - } + cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1); } - -template <> -void Uniform<float, lang::Cpp>(const size_t num, const float low, const float high, Blob *out, - Context *ctx) { - std::uniform_real_distribution<float> distribution(low, high); - float *outPtr= static_cast<float *>(out->mutable_data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] = static_cast<float>(distribution(ctx->random_generator)); - } -} - template <> -void Gaussian<float, lang::Cpp>(const size_t num, const float mean, const float std, Blob *out, - Context *ctx) { - std::normal_distribution<float> distribution(mean, std); +void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, + Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); - for (size_t i = 0; i < num; i++) { - outPtr[i] = static_cast<float>(distribution(ctx->random_generator)); - } + cblas_sscal(num, x, outPtr, 1); } - -#ifdef USE_CBLAS template <> void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, float *out, Context *ctx) { @@ -529,6 +460,21 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, const float *in2Ptr = static_cast<const float *>(in2->data()); *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1); } +template <> +void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, + const float alpha, const Blob *A, const Blob *v, + const float beta, Blob *out, Context *ctx) { + const float *APtr = static_cast<const float *>(A->data()); + const float *vPtr = static_cast<const float *>(v->data()); + float *outPtr = static_cast<float *>(out->mutable_data()); + if (!trans) { + cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1, + beta, outPtr, 1); + } else { + cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta, + outPtr, 1); + } +} template <> void GEMM<float, lang::Cpp>(const bool transA, const bool transB, @@ -548,6 +494,98 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB, lda, BPtr, ldb, beta, CPtr, ldc); } +#else + +template <> +void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, + Context *ctx) { + size_t maxPos = 0; + float maxVal = 0; + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + if (i == 0) { + maxVal = inPtr[i]; + } else if (inPtr[i] > maxVal) { + maxVal = inPtr[i]; + maxPos = i; + } + } + *out = maxPos; +} +template <> +void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, + Context *ctx) { + size_t minPos = 0; + float minVal = 0; + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + if (i == 0) { + minVal = inPtr[i]; + } else if (inPtr[i] > minVal) { + minVal = inPtr[i]; + minPos = i; + } + } + *out = minPos; +} + +template <> +void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, + Context *ctx) { + float sum = 0; + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + sum += fabs(inPtr[i]); + } +} + +template <> +void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in, + Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] += alpha * inPtr[i]; + } +} + +template <> +void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] *= x; + } +} + +template <> +void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, + float *out, Context *ctx) { + float sum = 0; + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); + for (size_t i = 0; i < num; i++) { + sum += in1Ptr[i] * in2Ptr[i]; + } +} + +template <> +void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, + const float alpha, const Blob *A, const Blob *v, + const float beta, Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *APtr = static_cast<const float *>(A->data()); + const float *vPtr = static_cast<const float *>(v->data()); + for (size_t r = 0; r < m; r++) { + float sum = 0; + for (size_t c = 0; c < n; c++) { + size_t idx = trans ? c * m + r : r * n + c; + sum += APtr[idx] * vPtr[c]; + } + outPtr[r] = alpha * sum + beta * outPtr[r]; + } +} + #endif // USE_CBLAS } // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cuda.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h index 4a2ba66..f9841a3 100644 --- a/src/core/tensor/tensor_math_cuda.h +++ b/src/core/tensor/tensor_math_cuda.h @@ -26,75 +26,100 @@ #include "singa/core/common.h" namespace singa { - -// TODO(wangwei) Clean implementations following comments in tensor_math_cpp.h. -// TODO(wangwei) optimize using stream +// =================Elementwise operations=================================== template <> -void Add<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs, - Blob *ret, Context *ctx) { - const float *a = static_cast<const float *>(lhs->data()); - const float *b = static_cast<const float *>(rhs->data()); - float *c = static_cast<float *>(ret->mutable_data()); - cuda::add(count, a, b, c); +void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2, + Blob *out, Context *ctx) { + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); + float *outPtr = static_cast<float *>(out->mutable_data()); + cuda::add(num, in1Ptr, in2Ptr, outPtr); } -// TODO(wangwei) optimize using stream +// follow the consistency guide of math API template <> -void Sub<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs, - Blob *ret, Context *ctx) { - const float *a = static_cast<const float *>(lhs->data()); - const float *b = static_cast<const float *>(rhs->data()); - float *c = static_cast<float *>(ret->mutable_data()); - cuda::sub(count, a, b, c); +void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in, + Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + cuda::Div(num, x, inPtr, outPtr, ctx->stream); } template <> -void EltwiseMult<float, lang::Cuda>(int count, const Blob *input, float x, - Blob *ret, Context *ctx) { - float *dptr = static_cast<float *>(ret->mutable_data()); - const float *lptr = static_cast<const float *>(input->data()); - cuda::mult(count, lptr, x, dptr); +void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in, + const float x, Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + cuda::mult(num, inPtr, x, outPtr); } -// TODO(wangwei) optimize using stream template <> -void Square<float, lang::Cuda>(int count, const Blob *input, Blob *ret, - Context *ctx) { - const float *in = static_cast<const float *>(input->data()); - float *out = static_cast<float *>(ret->mutable_data()); - cuda::square(count, in, out); +void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + cuda::GE(num, inPtr, x, outPtr, ctx->stream); } - -// sum all elements of input into ret -// TODO(wangwei) optimize using stream template <> -void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret, - Context *ctx) { - const float *in = static_cast<const float *>(input->data()); - cuda::sum(count, in, ret); +void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + cuda::GT(num, inPtr, x, outPtr, ctx->stream); } - -// follow the consistency guide of math API template <> -void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in, - Blob *out, Context *ctx) { +void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); - cuda::Div(num, alpha, inPtr, outPtr, ctx->stream); + cuda::LE(num, inPtr, x, outPtr, ctx->stream); +} +template <> +void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x, + Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + cuda::LT(num, inPtr, x, outPtr, ctx->stream); } - template <> void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); cuda::Set(num, x, outPtr, ctx->stream); } +// TODO(wangwei) optimize using stream +template <> +void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + cuda::square(num, inPtr, outPtr); +} +// TODO(wangwei) optimize using stream +template <> +void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2, + Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); + cuda::sub(num, in1Ptr, in2Ptr, outPtr); +} +// sum all elements of input into ret +// TODO(wangwei) optimize using stream +template <> +void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out, + Context *ctx) { + const float *inPtr = static_cast<const float *>(in->data()); + cuda::sum(num, inPtr, out); +} + +// =========================Blas operations================================== // NOTE: cublas uses column major order. // http://peterwittek.com/cublas-matrix-c-style.html template <> void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow, const size_t ncol, const Blob *M, const Blob *v, Blob *out, Context *ctx) { - auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream + auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream const float *MPtr = static_cast<const float *>(M->data()); const float *vPtr = static_cast<const float *>(v->data()); float *outPtr = static_cast<float *>(out->mutable_data()); @@ -106,6 +131,22 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow, vPtr, 1, outPtr, ncol)); } } +template <> +void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n, + const float alpha, const Blob *A, const Blob *v, + const float beta, Blob *out, Context *ctx) { + const float *APtr = static_cast<const float *>(A->data()); + const float *vPtr = static_cast<const float *>(v->data()); + float *outPtr = static_cast<float *>(out->mutable_data()); + auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream + if (!trans) + CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr, + 1, &beta, outPtr, 1)); + else + CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, APtr, m, vPtr, + 1, &beta, outPtr, 1)); +} + // http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm template <> void GEMM<float, lang::Cuda>(const bool transA, const bool transB, @@ -121,44 +162,11 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB, const float *APtr = static_cast<const float *>(A->data()); const float *BPtr = static_cast<const float *>(B->data()); float *CPtr = static_cast<float *>(C->mutable_data()); - auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream + auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha, BPtr, ldb, APtr, lda, &beta, CPtr, ldc)); } -template <> -void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x, - Blob* out, Context *ctx) { - float* outPtr = static_cast<float*>(out->mutable_data()); - const float* inPtr = static_cast<const float*>(in->data()); - cuda::GE(num, inPtr, x, outPtr, ctx->stream); -} -template <> -void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x, - Blob* out, Context *ctx) { - float* outPtr = static_cast<float*>(out->mutable_data()); - const float* inPtr = static_cast<const float*>(in->data()); - cuda::GT(num, inPtr, x, outPtr, ctx->stream); -} -template <> -void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x, - Blob* out, Context *ctx) { - float* outPtr = static_cast<float*>(out->mutable_data()); - const float* inPtr = static_cast<const float*>(in->data()); - cuda::LE(num, inPtr, x, outPtr, ctx->stream); -} -template <> -void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x, - Blob* out, Context *ctx) { - float* outPtr = static_cast<float*>(out->mutable_data()); - const float* inPtr = static_cast<const float*>(in->data()); - cuda::LT(num, inPtr, x, outPtr, ctx->stream); -} - - - - - } // namespace singa #endif // USE_CUDA http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/test/singa/test_tensor_math.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc index 823445f..94ca283 100644 --- a/test/singa/test_tensor_math.cc +++ b/test/singa/test_tensor_math.cc @@ -117,12 +117,11 @@ TEST_F(TestTensorMath, MemberTanh) { } TEST_F(TestTensorMath, Sum) { - Tensor p1(Shape{1,2}); - p1 = Sum(e, 0); + Tensor p1 = Sum(e, 0); const float *dptr1 = p1.data<const float *>(); EXPECT_FLOAT_EQ(9.0f,dptr1[0]); EXPECT_FLOAT_EQ(12.0f,dptr1[1]); - + Tensor p2(Shape{3,1}); p2 = Sum(e, 1); const float *dptr2 = p2.data<const float *>(); @@ -143,9 +142,9 @@ TEST_F(TestTensorMath, SoftMax) { EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5); EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5); EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5); - + Tensor p2(Shape{3,2}); - p2 = SoftMax(e,1); + p2 = SoftMax(e,1); const float *dptr2 = p2.data<const float *>(); EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5); EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5); @@ -237,12 +236,12 @@ TEST_F(TestTensorMath, MemberDiv) { TEST_F(TestTensorMath, MemberBernoulli) { Tensor p1(Shape{10000}); - Bernoulli(0.3,&p1); + Bernoulli(0.3f, &p1); const float* dptr1 = p1.data<const float*>(); float sum = 0; for(int i = 0; i < 10000; i++) sum += dptr1[i]; float mean = sum/10000; - EXPECT_NEAR(mean, 0.3, 1e-2); + EXPECT_NEAR(mean, 0.3f, 1e-2); sum = 0; for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean); @@ -267,7 +266,7 @@ TEST_F(TestTensorMath, MemberUniform) { TEST_F(TestTensorMath, MemberGaussian) { Tensor p1(Shape{50000}); - Gaussian(0.0,1.0,&p1); + Gaussian(0.0f,1.0f,&p1); const float* dptr1 = p1.data<const float*>(); float sum = 0; for(int i = 0; i < 50000; i++) sum += dptr1[i];
