This is an automated email from the ASF dual-hosted git repository. arnabp20 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push: new b350c04 [SYSTEMDS-2725] NNZ counting for native blas b350c04 is described below commit b350c04f53fd249cc4c916093404c5d760055327 Author: Mark Dokter <m...@dokter.cc> AuthorDate: Tue Nov 17 00:34:01 2020 +0100 [SYSTEMDS-2725] NNZ counting for native blas This patch moves nnz count to native blas. This should improve performance as the native nnz computation leverages openmp reduction. In addition, this patch adds: * a lot of code cleanup * more fallback to java mat mult --- src/main/cpp/CMakeLists.txt | 10 +- src/main/cpp/build.sh | 4 +- src/main/cpp/{libmatrixmult.h => common.h} | 62 ++++---- src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so | Bin 39416 -> 39384 bytes src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll | Bin 26624 -> 27648 bytes .../cpp/lib/libsystemds_openblas-Linux-x86_64.so | Bin 43176 -> 43144 bytes .../cpp/lib/libsystemds_openblas-Windows-AMD64.dll | Bin 28160 -> 140800 bytes src/main/cpp/libmatrixdnn.cpp | 40 +++--- src/main/cpp/libmatrixdnn.h | 46 +++--- src/main/cpp/libmatrixmult.cpp | 57 ++------ src/main/cpp/libmatrixmult.h | 47 ++---- src/main/cpp/systemds.cpp | 159 ++++++++++----------- src/main/cpp/systemds.h | 22 +-- .../sysds/runtime/matrix/data/LibMatrixMult.java | 15 +- .../sysds/runtime/matrix/data/LibMatrixNative.java | 31 ++-- .../java/org/apache/sysds/utils/NativeHelper.java | 24 ++-- 16 files changed, 224 insertions(+), 293 deletions(-) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 9489f8f..b3de917 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -31,7 +31,7 @@ option(USE_OPEN_BLAS "Whether to use OpenBLAS (Defaults to compiling with Intel option(USE_INTEL_MKL "Whether to use Intel MKL (Defaults to compiling with Intel MKL)" OFF) # Build a shared libraray -set(HEADER_FILES libmatrixdnn.h libmatrixmult.h systemds.h) +set(HEADER_FILES libmatrixdnn.h libmatrixmult.h systemds.h common.h) set(SOURCE_FILES libmatrixdnn.cpp libmatrixmult.cpp systemds.cpp) # Build a shared libraray @@ -55,6 +55,14 @@ if (WIN32) set(CMAKE_SHARED_LIBRARY_PREFIX lib CACHE INTERNAL "") endif() +#ToDo: this won't be necessary once we properly redo mkl versions of conv2d* functions +if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + set(CMAKE_CXX_FLAGS "-Wno-narrowing") +endif() +if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4838") +endif() + set(CMAKE_BUILD_TYPE Release) if (USE_OPEN_BLAS) diff --git a/src/main/cpp/build.sh b/src/main/cpp/build.sh index 4378340..df67aba 100755 --- a/src/main/cpp/build.sh +++ b/src/main/cpp/build.sh @@ -47,13 +47,13 @@ if ! [ -x "$(command -v patchelf)" ]; then fi # configure and compile INTEL MKL -cmake . -B INTEL -DUSE_INTEL_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-DUSE_GNU_THREADING -m64" +cmake . -B INTEL -DUSE_INTEL_MKL=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-DUSE_GNU_THREADING -m64" cmake --build INTEL --target install --config Release patchelf --add-needed libmkl_rt.so lib/libsystemds_mkl-Linux-x86_64.so rm -R INTEL # configure and compile OPENBLAS -cmake . -B OPENBLAS -DUSE_OPEN_BLAS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-m64" +cmake . -B OPENBLAS -DUSE_OPEN_BLAS=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_FLAGS="-m64" cmake --build OPENBLAS --target install --config Release patchelf --add-needed libopenblas.so.0 lib/libsystemds_openblas-Linux-x86_64.so rm -R OPENBLAS diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/common.h similarity index 51% copy from src/main/cpp/libmatrixmult.h copy to src/main/cpp/common.h index a4d33ad..34f9801 100644 --- a/src/main/cpp/libmatrixmult.h +++ b/src/main/cpp/common.h @@ -17,31 +17,18 @@ * under the License. */ -#ifndef _libmatrixmult_h -#define _libmatrixmult_h - -#define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) - -// ***************************************************************** -// We support Intel MKL (recommended) or OpenBLAS. -// These flags are used for conditional compilation with mkl and openblas -// #define USE_INTEL_MKL -// #define USE_GNU_THREADING -// #define USE_OPEN_BLAS -// ***************************************************************** - -//#ifdef __cplusplus -// extern "C" { -//#endif -//#ifdef __cplusplus -//} -//#endif +#pragma once +#ifndef COMMON_H +#define COMMON_H // Since we call cblas_dgemm in openmp for loop, // we call "extension" APIs for setting the number of threads. #ifdef USE_INTEL_MKL #include <mkl.h> + #if INTEL_MKL_VERSION < 20170000 + // Will throw an error at development time in non-standard settings + PLEASE DONOT COMPILE SHARED LIBRARIES WITH OLDER MKL VERSIONS + #endif #include <mkl_service.h> extern "C" void mkl_set_num_threads(int numThreads); #else @@ -49,16 +36,27 @@ extern "C" void mkl_set_num_threads(int numThreads); extern "C" void openblas_set_num_threads(int numThreads); #endif -void setNumThreadsForBLAS(int numThreads); - -// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape -// (m1rlen, m1clen) and (m1clen, m2clen) -void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, - int numThreads); -void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, - int numThreads); - -void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, - bool isLeftTrans, int numThreads); - +template<class FP> +size_t computeNNZ(FP* arr, int limit) { + size_t nnz = 0; +#ifndef USE_INTEL_MKL +#pragma omp parallel for reduction(+: nnz) +#endif + for(int i=0; i<limit; i++) + nnz += (arr[i]!=0) ? 1 : 0; + return nnz; +} + +static int SYSDS_CURRENT_NUM_THREADS = -1; +static void setNumThreadsForBLAS(int numThreads) { + if (SYSDS_CURRENT_NUM_THREADS != numThreads) { +#ifdef USE_OPEN_BLAS + openblas_set_num_threads(numThreads); +#else + mkl_set_num_threads(numThreads); #endif + SYSDS_CURRENT_NUM_THREADS = numThreads; + } +} + +#endif // COMMON_H diff --git a/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so index 800fe13..f6b495a 100644 Binary files a/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_mkl-Linux-x86_64.so differ diff --git a/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll index 2960551..b52adbe 100644 Binary files a/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_mkl-Windows-AMD64.dll differ diff --git a/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so index 6e45343..e953e7f 100644 Binary files a/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemds_openblas-Linux-x86_64.so differ diff --git a/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll b/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll index a39d40c..6fe9676 100644 Binary files a/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll and b/src/main/cpp/lib/libsystemds_openblas-Windows-AMD64.dll differ diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp index 3e3e2f3..da6975c 100644 --- a/src/main/cpp/libmatrixdnn.cpp +++ b/src/main/cpp/libmatrixdnn.cpp @@ -17,30 +17,21 @@ * under the License. */ -#include "libmatrixmult.h" -#include "libmatrixdnn.h" -#include <cstdlib> -#include <iostream> -#include <cstdio> -#include <cmath> +#include "common.h" #include <cstring> +#include <iostream> +#include "libmatrixdnn.h" +#include "libmatrixmult.h" + + #ifdef USE_INTEL_MKL - #include "mkl_dnn.h" +#include "mkl_dnn.h" #else - #include "omp.h" -#endif - -template<class FP> int computeNNZ(FP* arr, int limit) { - int nnz = 0; -#ifndef USE_INTEL_MKL - #pragma omp parallel for reduction(+: nnz) +#include "omp.h" #endif - for(int i=0; i<limit; i++) - nnz += (arr[i]!=0) ? 1 : 0; - return nnz; -} -template<class FP> void biasAdd(FP* bias, FP* output, int K, int PQ) { +template<class FP> +void biasAdd(FP* bias, FP* output, int K, int PQ) { for(int k = 0, index=0; k < K; k++) for(int pq = 0; pq < PQ; pq++, index++) output[index] += bias[k]; @@ -91,7 +82,8 @@ void col2im(double* inputArray, double* outputArray, int N, int C, int H, int W, } } -template<class FP> void im2col(FP* inputArray, FP* outputArray, int N, int C, int H, int W, +template<class FP> +void im2col(FP* inputArray, FP* outputArray, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q) { int CRS = C * R * S; std::size_t size = Q * sizeof(FP); @@ -151,7 +143,7 @@ bool MKL_DNN_ERROR(dnnError_t code) { } #endif -int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, +size_t conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) { int CRS = C*R*S; @@ -244,7 +236,7 @@ int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, return computeNNZ<double>(retPtr, K*CRS); } -int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, +size_t conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) { int CHW = C * H * W; #ifdef USE_INTEL_MKL @@ -370,7 +362,7 @@ void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals delete [] temp1; } -int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, +size_t dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) { int KPQ = K * P * Q; @@ -446,7 +438,7 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do #endif } -int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, +size_t sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) { int KPQ = K * P * Q; diff --git a/src/main/cpp/libmatrixdnn.h b/src/main/cpp/libmatrixdnn.h index 3f457b0..f3fef37 100644 --- a/src/main/cpp/libmatrixdnn.h +++ b/src/main/cpp/libmatrixdnn.h @@ -16,36 +16,34 @@ * specific language governing permissions and limitations * under the License. */ - -#ifndef _libmatrixdnn_h -#define _libmatrixdnn_h -#ifdef USE_INTEL_MKL - #include <mkl.h> - #if INTEL_MKL_VERSION < 20170000 - // Will throw an error at development time in non-standard settings - PLEASE DONOT COMPILE SHARED LIBRARIES WITH OLDER MKL VERSIONS - #endif -#endif +#pragma once +#ifndef LIBMATRIXDNN_H +#define LIBMATRIXDNN_H -int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, - int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); +#include <cstddef> -int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, - int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); +size_t conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, + int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, + int numThreads); -int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, - int N, int C, int H, int W, int K, int R, int S, - int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads); +size_t conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, + int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, + int numThreads); -int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, - int N, int C, int H, int W, int K, int R, int S, - int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads); +size_t dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, + int W, int K, int R, int S,int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, + bool addBias, int numThreads); + +size_t sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, int N, int C, int H, int W, + int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, + bool addBias, int numThreads); void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filter, double* ret, int N, int C, int H, int W, - int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); + int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); -void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* ret, int N, int C, int H, int W, - int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); +void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals, double* rotatedDoutPtr, double* ret, + int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, + int pad_h, int pad_w, int P, int Q, int numThreads); -#endif \ No newline at end of file +#endif // LIBMATRIXDNN_H diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp index c0c909a..0d38511 100644 --- a/src/main/cpp/libmatrixmult.cpp +++ b/src/main/cpp/libmatrixmult.cpp @@ -17,71 +17,42 @@ * under the License. */ +#include "common.h" #include "libmatrixmult.h" -#include "omp.h" -#include <cmath> -#include <cstdlib> -#ifdef USE_OPEN_BLAS -#include <cblas.h> -#else -#include <mkl_service.h> -#endif - -static int SYSDS_CURRENT_NUM_THREADS = -1; -void setNumThreadsForBLAS(int numThreads) { - if (SYSDS_CURRENT_NUM_THREADS != numThreads) { -#ifdef USE_OPEN_BLAS - openblas_set_num_threads(numThreads); -#else - mkl_set_num_threads(numThreads); -#endif - SYSDS_CURRENT_NUM_THREADS = numThreads; - } -} - -void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, - int numThreads) { +void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, int numThreads) { // BLAS routine dispatch according to input dimension sizes (we don't use // cblas_dgemv with CblasColMajor for matrix-vector because it was generally // slower than dgemm) setNumThreadsForBLAS(numThreads); if (m == 1 && n == 1) // VV - retPtr[0] = cblas_ddot(k, m1Ptr, 1, m2Ptr, 1); + retPtr[0] = cblas_ddot(k, m1Ptr, 1, m2Ptr, 1); else if (n == 1) // MV - cblas_dgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, - retPtr, 1); + cblas_dgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, retPtr, 1); else // MM - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, - m2Ptr, n, 0, retPtr, n); + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n); } -void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, - int numThreads) { +void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, int numThreads) { // BLAS routine dispatch according to input dimension sizes (we don't use // cblas_sgemv with CblasColMajor for matrix-vector because it was generally // slower than sgemm) setNumThreadsForBLAS(numThreads); if (m == 1 && n == 1) // VV - retPtr[0] = cblas_sdot(k, m1Ptr, 1, m2Ptr, 1); + retPtr[0] = cblas_sdot(k, m1Ptr, 1, m2Ptr, 1); else if (n == 1) // MV - cblas_sgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, - retPtr, 1); + cblas_sgemv(CblasRowMajor, CblasNoTrans, m, k, 1, m1Ptr, k, m2Ptr, 1, 0, retPtr, 1); else // MM - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, - m2Ptr, n, 0, retPtr, n); + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n); } -void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool leftTrans, - int numThreads) { +void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool leftTrans, int numThreads) { setNumThreadsForBLAS(numThreads); - if ((leftTrans && m1clen == 1) || (!leftTrans && m1rlen == 1)) { - retPtr[0] = cblas_ddot(leftTrans ? m1rlen : m1clen, m1Ptr, 1, m1Ptr, 1); - } else { // general case + if ((leftTrans && m1clen == 1) || (!leftTrans && m1rlen == 1)) + retPtr[0] = cblas_ddot(leftTrans ? m1rlen : m1clen, m1Ptr, 1, m1Ptr, 1); + else { // general case int n = leftTrans ? m1clen : m1rlen; int k = leftTrans ? m1rlen : m1clen; - cblas_dsyrk(CblasRowMajor, CblasUpper, - leftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0, - retPtr, n); + cblas_dsyrk(CblasRowMajor, CblasUpper, leftTrans ? CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0, retPtr, n); } } diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h index a4d33ad..f38155d 100644 --- a/src/main/cpp/libmatrixmult.h +++ b/src/main/cpp/libmatrixmult.h @@ -17,48 +17,19 @@ * under the License. */ -#ifndef _libmatrixmult_h -#define _libmatrixmult_h +#ifndef LIBMATRIXMULT_H +#define LIBMATRIXMULT_H #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #define MIN(x, y) (((x) < (y)) ? (x) : (y)) -// ***************************************************************** -// We support Intel MKL (recommended) or OpenBLAS. -// These flags are used for conditional compilation with mkl and openblas -// #define USE_INTEL_MKL -// #define USE_GNU_THREADING -// #define USE_OPEN_BLAS -// ***************************************************************** +// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape (m1rlen, m1clen) and (m1clen, m2clen) +void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, int numThreads); -//#ifdef __cplusplus -// extern "C" { -//#endif -//#ifdef __cplusplus -//} -//#endif +// Same matrix multiply for single precision +void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, int numThreads); -// Since we call cblas_dgemm in openmp for loop, -// we call "extension" APIs for setting the number of threads. -#ifdef USE_INTEL_MKL -#include <mkl.h> -#include <mkl_service.h> -extern "C" void mkl_set_num_threads(int numThreads); -#else -#include <cblas.h> -extern "C" void openblas_set_num_threads(int numThreads); -#endif +// Matrix transpose +void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, bool isLeftTrans, int numThreads); -void setNumThreadsForBLAS(int numThreads); - -// Multiplies two matrices m1Ptr and m2Ptr in row-major format of shape -// (m1rlen, m1clen) and (m1clen, m2clen) -void dmatmult(double *m1Ptr, double *m2Ptr, double *retPtr, int m, int k, int n, - int numThreads); -void smatmult(float *m1Ptr, float *m2Ptr, float *retPtr, int m, int k, int n, - int numThreads); - -void tsmm(double *m1Ptr, double *retPtr, int m1rlen, int m1clen, - bool isLeftTrans, int numThreads); - -#endif +#endif // LIBMATRIXMULT_H diff --git a/src/main/cpp/systemds.cpp b/src/main/cpp/systemds.cpp index 193e847..47b6e8f 100644 --- a/src/main/cpp/systemds.cpp +++ b/src/main/cpp/systemds.cpp @@ -16,28 +16,11 @@ * specific language governing permissions and limitations * under the License. */ - -#include "systemds.h" -#include "libmatrixmult.h" -#include "libmatrixdnn.h" - -// Linux: -// g++ -o lib/libsystemds_mkl-Linux-x86_64.so *.cpp -I$JAVA_HOME/include -I$MKLROOT/include -I$JAVA_HOME/include/linux -lmkl_rt -lpthread -lm -ldl -DUSE_INTEL_MKL -DUSE_MKL_DNN -L$MKLROOT/lib/intel64 -m64 -O3 -shared -fPIC -// g++ -o lib/libsystemds_openblas-Linux-x86_64.so *.cpp -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -lopenblas -lpthread -lm -ldl -DUSE_OPEN_BLAS -I/opt/OpenBLAS/include/ -L/opt/OpenBLAS/lib/ -fopenmp -O3 -shared -fPIC -// Mac OSX: -// g++ -o libsystemds_mkl-linux-x86_64.dylib *.cpp -I$JAVA_HOME/include -I$MKLROOT/include -I$JAVA_HOME/include/linux -lmkl_rt -lpthread -lm -ldl -DUSE_INTEL_MKL -DUSE_GNU_THREADING -L$MKLROOT/lib/intel64 -m64 -fopenmp -O3 -dynamiclib -fPIC -undefined dynamic_lookup -// g++ -o libsystemds_openblas-linux-x86_64.dylib *.cpp -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -lopenblas -lpthread -lm -ldl -DUSE_OPEN_BLAS -L/opt/OpenBLAS/lib/ -fopenmp -O3 -dynamiclib -fPIC -undefined dynamic_lookup - -// Windows MKL: -// "C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\vcvarsall.bat" amd64 -// "%MKLROOT%"\bin\mklvars.bat intel64 -// set JAVA_HOME=C:\Program Files\Java\jdk1.8.0_25 -// cl *.cpp -I. -I"%MKLROOT%"\include -I"%JAVA_HOME%"\include -I"%JAVA_HOME%"\include\win32 -DUSE_INTEL_MKL -Fesystemds_mkl-windows-x86_64.dll -MD -LD "%MKLROOT%"\lib\intel64_win\mkl_intel_thread_dll.lib "%MKLROOT%"\lib\intel64_win\mkl_core_dll.lib "%MKLROOT%"\lib\intel64_win\mkl_intel_lp64_dll.lib -// Windows OpenBLAS: -// "C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\vcvarsall.bat" amd64 -// set JAVA_HOME=C:\Program Files\Java\jdk1.8.0_25 -// cl *.cpp -I. -I"%OPENBLASROOT%"\include -I"%JAVA_HOME%"\include -I"%JAVA_HOME%"\include\win32 -DUSE_OPEN_BLAS -Fesystemds_openblas-windows-x86_64.dll -MD -LD "%OPENBLASROOT%"\lib\libopenblas.dll.a +#include "common.h" +#include "libmatrixdnn.h" +#include "libmatrixmult.h" +#include "systemds.h" // Results from Matrix-vector/vector-matrix 1M x 1K, dense show that GetDoubleArrayElements creates a copy on OpenJDK. @@ -67,14 +50,12 @@ // ( maxThreads != -1 && ((int)numThreads) == maxThreads ? env->ReleasePrimitiveArrayCritical(input, inputPtr, 0) : env->ReleaseDoubleArrayElements(input, inputPtr, 0) ) // ------------------------------------------------------------------- - -int maxThreads = -1; JNIEXPORT void JNICALL Java_org_apache_sysds_utils_NativeHelper_setMaxNumThreads (JNIEnv *, jclass, jint jmaxThreads) { - maxThreads = (int) jmaxThreads; + setNumThreadsForBLAS(jmaxThreads); } -JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd( +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd( JNIEnv* env, jclass cls, jdoubleArray m1, jdoubleArray m2, jdoubleArray ret, jint m1rlen, jint m1clen, jint m2clen, jint numThreads) { @@ -82,17 +63,19 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd( double* m2Ptr = GET_DOUBLE_ARRAY(env, m2, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL) - return (jboolean) false; + return -1; dmatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads); + size_t nnz = computeNNZ<double>(retPtr, m1rlen * m2clen); RELEASE_INPUT_ARRAY(env, m1, m1Ptr, numThreads); RELEASE_INPUT_ARRAY(env, m2, m2Ptr, numThreads); RELEASE_ARRAY(env, ret, retPtr, numThreads); - return (jboolean) true; + + return static_cast<jlong>(nnz); } -JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd( +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd( JNIEnv* env, jclass cls, jobject m1, jobject m2, jobject ret, jint m1rlen, jint m1clen, jint m2clen, jint numThreads) { @@ -100,38 +83,40 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd( float* m2Ptr = (float*) env->GetDirectBufferAddress(m2); float* retPtr = (float*) env->GetDirectBufferAddress(ret); if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL) - return (jboolean) false; + return -1; smatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads); - return (jboolean) true; + return static_cast<jlong>(computeNNZ<float>(retPtr, m1rlen * m2clen)); } -JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean leftTrans, jint numThreads) { double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(m1Ptr == NULL || retPtr == NULL) - return (jboolean) false; + return -1; tsmm(m1Ptr, retPtr, (int)m1rlen, (int)m1clen, (bool)leftTrans, (int)numThreads); - + size_t nnz = computeNNZ<double>(retPtr, m1rlen * m1clen); + RELEASE_INPUT_ARRAY(env, m1, m1Ptr, numThreads); RELEASE_ARRAY(env, ret, retPtr, numThreads); - return (jboolean) true; + + return static_cast<jlong>(nnz); } JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dSparse - (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray filter, - jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, - jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { + (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray filter, + jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, + jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { int* aixPtr = ((int*)env->GetPrimitiveArrayCritical(aix, NULL)); double* avalsPtr = GET_DOUBLE_ARRAY(env, avals, numThreads); double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); conv2dSparse((int)apos, (int)alen, aixPtr, avalsPtr, filterPtr, retPtr, (int)N, (int)C, (int)H, (int)W, - (int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads); + (int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads); RELEASE_INPUT_ARRAY(env, avals, avalsPtr, numThreads); RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads); @@ -141,16 +126,16 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dSparse } JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterSparseDense - (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray dout, - jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, - jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { + (JNIEnv * env, jclass, jint apos, jint alen, jintArray aix, jdoubleArray avals, jdoubleArray dout, + jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, + jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { int* aixPtr = ((int*)env->GetPrimitiveArrayCritical(aix, NULL)); double* avalsPtr = GET_DOUBLE_ARRAY(env, avals, numThreads); double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); conv2dBackwardFilterSparseDense((int)apos, (int)alen, aixPtr, avalsPtr, doutPtr, retPtr, (int)N, (int)C, (int)H, (int)W, - (int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads); + (int)K, (int)R, (int)S, (int)stride_h, (int)stride_w, (int)pad_h, (int)pad_w, (int)P, (int)Q, (int)numThreads); RELEASE_INPUT_ARRAY(env, avals, avalsPtr, numThreads); RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads); @@ -159,104 +144,106 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwa return (jboolean) true; } -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense( - JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter, - jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, - jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense( + JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter, + jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, + jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads); double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(inputPtr == NULL || filterPtr == NULL || retPtr == NULL) - return (jint) -1; + return -1; - int nnz = dconv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, - (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, - (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, false, (int) numThreads); + size_t nnz = dconv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, + (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, + (int) Q, false, (int) numThreads); RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads); RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads); RELEASE_ARRAY(env, ret, retPtr, numThreads); - return (jint) nnz; + return static_cast<jlong>(nnz); } -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense( - JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter, - jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, - jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense( + JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter, + jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, + jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads); double* biasPtr = GET_DOUBLE_ARRAY(env, bias, numThreads); double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL) - return (jint) -1; + return -1; - int nnz = dconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, - (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, - (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads); + size_t nnz = dconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, + (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, + (int) Q, true, (int) numThreads); RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads); RELEASE_INPUT_ARRAY(env, bias, biasPtr, numThreads); RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads); RELEASE_ARRAY(env, ret, retPtr, numThreads); - return (jint) nnz; + return static_cast<jlong>(nnz); } -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense( - JNIEnv* env, jclass, jobject input, jobject bias, jobject filter, - jobject ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, - jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense( + JNIEnv* env, jclass, jobject input, jobject bias, jobject filter, + jobject ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, + jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { float* inputPtr = (float*) env->GetDirectBufferAddress(input); float* biasPtr = (float*) env->GetDirectBufferAddress(bias); float* filterPtr = (float*) env->GetDirectBufferAddress(filter); float* retPtr = (float*) env->GetDirectBufferAddress(ret); if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL) - return (jint) -1; + return -1; - int nnz = sconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, - (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, - (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads); - - return (jint) nnz; + size_t nnz = sconv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, + (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, + (int) Q, true, (int) numThreads); + + return static_cast<jlong>(nnz); } -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense( - JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout, - jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, - jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense( + JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout, + jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, + jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads); double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(doutPtr == NULL || filterPtr == NULL || retPtr == NULL) - return (jint) -1; + return -1; - int nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, - (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads); + size_t nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, + (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, + (int) P, (int) Q, (int) numThreads); RELEASE_INPUT_ARRAY(env, filter, filterPtr, numThreads); RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads); RELEASE_ARRAY(env, ret, retPtr, numThreads); - return (jint) nnz; + return static_cast<jlong>(nnz); } -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense( - JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout, - jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, - jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense( + JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout, + jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, + jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { double* inputPtr = GET_DOUBLE_ARRAY(env, input, numThreads); double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(doutPtr == NULL || inputPtr == NULL || retPtr == NULL) - return (jint) -1; + return -1; - int nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, - (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads); + size_t nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int)N, (int) C, (int) H, (int) W, (int) K, (int) R, + (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, + (int) Q, (int) numThreads); RELEASE_INPUT_ARRAY(env, input, inputPtr, numThreads); RELEASE_INPUT_ARRAY(env, dout, doutPtr, numThreads); RELEASE_ARRAY(env, ret, retPtr, numThreads); - return (jint) nnz; + return static_cast<jlong>(nnz); } diff --git a/src/main/cpp/systemds.h b/src/main/cpp/systemds.h index d84392f..b4f741f 100644 --- a/src/main/cpp/systemds.h +++ b/src/main/cpp/systemds.h @@ -29,25 +29,25 @@ extern "C" { /* * Class: org_apache_sysds_utils_NativeHelper * Method: dmmdd - * Signature: ([D[D[DIIII)Z + * Signature: ([D[D[DIIII)I */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dmmdd (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint); /* * Class: org_apache_sysds_utils_NativeHelper * Method: smmdd - * Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIII)Z + * Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIII)I */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_smmdd (JNIEnv *, jclass, jobject, jobject, jobject, jint, jint, jint, jint); /* * Class: org_apache_sysds_utils_NativeHelper * Method: tsmm - * Signature: ([D[DIIZI)Z + * Signature: ([D[DIIZI)I */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jint, jint, jboolean, jint); /* @@ -55,7 +55,7 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysds_utils_NativeHelper_tsmm * Method: conv2dDense * Signature: ([D[D[DIIIIIIIIIIIIII)I */ -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* @@ -63,7 +63,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dDense * Method: dconv2dBiasAddDense * Signature: ([D[D[D[DIIIIIIIIIIIIII)I */ -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDense (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* @@ -71,7 +71,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_dconv2dBiasAddDe * Method: sconv2dBiasAddDense * Signature: (Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;Ljava/nio/FloatBuffer;IIIIIIIIIIIIII)I */ -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDense (JNIEnv *, jclass, jobject, jobject, jobject, jobject, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* @@ -79,7 +79,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_sconv2dBiasAddDe * Method: conv2dBackwardFilterDense * Signature: ([D[D[DIIIIIIIIIIIIII)I */ -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFilterDense (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* @@ -87,7 +87,7 @@ JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardFi * Method: conv2dBackwardDataDense * Signature: ([D[D[DIIIIIIIIIIIIII)I */ -JNIEXPORT jint JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense +JNIEXPORT jlong JNICALL Java_org_apache_sysds_utils_NativeHelper_conv2dBackwardDataDense (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index 228aec7..f8d5436 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -28,6 +28,8 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.stream.IntStream; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.commons.math3.util.FastMath; import org.apache.sysds.hops.OptimizerUtils; import org.apache.sysds.lops.MapMultChain.ChainType; @@ -68,7 +70,8 @@ public class LibMatrixMult private static final long PAR_MINFLOP_THRESHOLD2 = 128L*1024; //MIN 2 MFLOP public static final int L2_CACHESIZE = 256 * 1024; //256KB (common size) public static final int L3_CACHESIZE = 16 * 1024 * 1024; //16MB (common size) - + private static final Log LOG = LogFactory.getLog(LibMatrixMult.class.getName()); + private LibMatrixMult() { //prevent instantiation via private constructor } @@ -2490,9 +2493,13 @@ public class LibMatrixMult //call native matrix multiplication (only called for single-threaded and matrix-vector //because this ensures that we can deal with the transpose mV without additional transpose) - if(!NativeHelper.dmmdd(((m==1)?mV:mU).getDenseBlockValues(), - ((m==1)?mU:mV).getDenseBlockValues(), c, (m==1)?n:m, cd, 1, 1) ) - throw new DMLRuntimeException("Error executing native matrix mult."); + long nnz =NativeHelper.dmmdd(((m==1)?mV:mU).getDenseBlockValues(), + ((m==1)?mU:mV).getDenseBlockValues(), c, (m==1)?n:m, cd, 1, 1); + if(nnz < 0) { + //fallback to default java implementation + LOG.warn("matrixMultWSigmoidDenseNative: Native mat mult failed. Falling back to java version."); + matrixMult(((m==1)?mV:mU), ((m==1)?mU:mV), ret, false); + } //compute remaining wsigmoid for all relevant outputs for( int i=0; i<m*n; i++ ) { diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java index 5d33e2e..d64a936 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java @@ -88,25 +88,26 @@ public class LibMatrixNative ret.sparse = false; ret.allocateDenseBlock(); long start = DMLScript.STATISTICS ? System.nanoTime() : 0; - boolean rccode = false; + long nnz; if( isSinglePrecision() ) { FloatBuffer fin1 = toFloatBuffer(m1.getDenseBlockValues(), inBuff, true); FloatBuffer fin2 = toFloatBuffer(m2.getDenseBlockValues(), filterBuff, true); FloatBuffer fout = toFloatBuffer(ret.getDenseBlockValues(), outBuff, false); - rccode = NativeHelper.smmdd(fin1, fin2, fout, + nnz = NativeHelper.smmdd(fin1, fin2, fout, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k); fromFloatBuffer(outBuff.get(), ret.getDenseBlockValues()); } else { - rccode = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(), + nnz = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(), ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k); } - if (rccode) { + + if(nnz > -1) { if(DMLScript.STATISTICS) { Statistics.nativeLibMatrixMultTime += System.nanoTime() - start; Statistics.numNativeLibMatrixMultCalls.increment(); } - ret.recomputeNonZeros(); + ret.setNonZeros(nnz); if(examSparsity) ret.examSparsity(); return; @@ -114,6 +115,8 @@ public class LibMatrixNative //else record failure and fallback to java Statistics.incrementNativeFailuresCounter(); } + //fallback to default java implementation + LOG.warn("matrixMult: Native mat mult failed. Falling back to java version."); if (k == 1) LibMatrixMult.matrixMult(m1, m2, ret, !examSparsity); else @@ -127,18 +130,16 @@ public class LibMatrixNative && (!m1.sparse && m1.getDenseBlock().isContiguous() ) ) { ret.sparse = false; ret.allocateDenseBlock(); - if( NativeHelper.tsmm(m1.getDenseBlockValues(), - ret.getDenseBlockValues(), m1.rlen, m1.clen, leftTrans, k) ) - { - // ToDo: add native tsmm() to stats - long nnz = (ret.clen==1) ? ret.recomputeNonZeros() : - LibMatrixMult.copyUpperToLowerTriangle(ret); + + long nnz = NativeHelper.tsmm(m1.getDenseBlockValues(), ret.getDenseBlockValues(), + m1.rlen, m1.clen, leftTrans, k); + if(nnz > -1) { ret.setNonZeros(nnz); ret.examSparsity(); return; } //fallback to default java implementation - LOG.info("Falling back to java TSMM()"); + LOG.warn("Native TSMM failed. Falling back to java version."); Statistics.incrementNativeFailuresCounter(); } if( k > 1 ) @@ -161,7 +162,7 @@ public class LibMatrixNative if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) { setNumThreads(params); long start = DMLScript.STATISTICS ? System.nanoTime() : 0; - int nnz = 0; + long nnz; if(params.bias == null) { nnz = NativeHelper.conv2dDense(input.getDenseBlockValues(), filter.getDenseBlockValues(), outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W, @@ -237,7 +238,7 @@ public class LibMatrixNative if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) { setNumThreads(params); long start = DMLScript.STATISTICS ? System.nanoTime() : 0; - int nnz = NativeHelper.conv2dBackwardFilterDense(input.getDenseBlockValues(), dout.getDenseBlockValues(), + long nnz = NativeHelper.conv2dBackwardFilterDense(input.getDenseBlockValues(), dout.getDenseBlockValues(), outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W, params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, params.P, params.Q, params.numThreads); @@ -273,7 +274,7 @@ public class LibMatrixNative if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) { setNumThreads(params); long start = DMLScript.STATISTICS ? System.nanoTime() : 0; - int nnz = NativeHelper.conv2dBackwardDataDense(filter.getDenseBlockValues(), dout.getDenseBlockValues(), + long nnz = NativeHelper.conv2dBackwardDataDense(filter.getDenseBlockValues(), dout.getDenseBlockValues(), outputBlock.getDenseBlockValues(), params.N, params.C, params.H, params.W, params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, params.P, params.Q, params.numThreads); diff --git a/src/main/java/org/apache/sysds/utils/NativeHelper.java b/src/main/java/org/apache/sysds/utils/NativeHelper.java index cd9f8f9..83869c2 100644 --- a/src/main/java/org/apache/sysds/utils/NativeHelper.java +++ b/src/main/java/org/apache/sysds/utils/NativeHelper.java @@ -333,14 +333,15 @@ public class NativeHelper { * @return true if successfully loaded BLAS */ public static boolean loadLibraryHelperFromResource(String libFileName) { - OutputStream out = null; try(InputStream in = NativeHelper.class.getResourceAsStream("/lib/"+ libFileName)) { // This logic is added because Java does not allow to load library from a resource file. if(in != null) { File temp = File.createTempFile(libFileName, ""); temp.deleteOnExit(); - out = FileUtils.openOutputStream(temp); + OutputStream out = FileUtils.openOutputStream(temp); IOUtils.copy(in, out); + // not closing the stream here makes dll loading fail on Windows + IOUtilFunctions.closeSilently(out); System.load(temp.getAbsolutePath()); return true; } @@ -350,22 +351,19 @@ public class NativeHelper { catch(IOException e) { LOG.warn("Unable to load library " + libFileName + " from resource:" + e.getMessage()); } - finally { - IOUtilFunctions.closeSilently(out); - } return false; } // TODO: Add pmm, wsloss, mmchain, etc. //double-precision matrix multiply dense-dense - public static native boolean dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, + public static native long dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads); //single-precision matrix multiply dense-dense - public static native boolean smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen, + public static native long smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen, int numThreads); //transpose-self matrix multiply - public static native boolean tsmm(double[] m1, double[] ret, int m1rlen, int m1clen, boolean leftTrans, int numThreads); + public static native long tsmm(double[] m1, double[] ret, int m1rlen, int m1clen, boolean leftTrans, int numThreads); // ---------------------------------------------------------------------------------------------------------------- // LibMatrixDNN operations: @@ -376,25 +374,25 @@ public class NativeHelper { // Returns -1 if failures or returns number of nonzeros // Called by DnnCPInstruction if both input and filter are dense - public static native int conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, + public static native long conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); - public static native int dconv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, + public static native long dconv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); - public static native int sconv2dBiasAddDense(FloatBuffer input, FloatBuffer bias, FloatBuffer filter, FloatBuffer ret, + public static native long sconv2dBiasAddDense(FloatBuffer input, FloatBuffer bias, FloatBuffer filter, FloatBuffer ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); // Called by DnnCPInstruction if both input and filter are dense - public static native int conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, + public static native long conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); // If both filter and dout are dense, then called by DnnCPInstruction // Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is converted to dense if sparse. - public static native int conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, + public static native long conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads);