This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 0084f24dc02c325a6a6eb2fe5c823f97e0db5359 Author: Mark Dokter <[email protected]> AuthorDate: Mon Feb 22 14:00:42 2021 +0100 [SYSTEMDS-2826] Sparse input support for CUDA codegen * Code template handling refactor * A few code snippets from the row template that the diff didn't cleanly separate (so things might not compile/run without the other commit (7bc6379d59a0c19d881fdac8229be64d880d30cc)). Intent was to split it in smaller chunks with moderate effort. --- pom.xml | 11 + src/main/cuda/CMakeLists.txt | 32 +- src/main/cuda/headers/Matrix.h | 299 ++++++++++++ src/main/cuda/headers/reduction.cuh | 172 ++++--- src/main/cuda/headers/spoof_utils.cuh | 40 +- src/main/cuda/kernels/reduction.cu | 306 ++++++------ src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp | 151 +++--- src/main/cuda/spoof-launcher/SpoofCUDAContext.h | 531 +++++++++++++-------- src/main/cuda/spoof-launcher/jni_bridge.cpp | 187 ++++++-- src/main/cuda/spoof-launcher/jni_bridge.h | 18 +- src/main/cuda/spoof/cellwise.cu | 64 ++- .../org/apache/sysds/hops/codegen/cplan/CNode.java | 87 ++-- .../sysds/hops/codegen/cplan/CNodeBinary.java | 28 +- .../apache/sysds/hops/codegen/cplan/CNodeCell.java | 37 +- .../apache/sysds/hops/codegen/cplan/CNodeData.java | 3 +- .../apache/sysds/hops/codegen/cplan/CNodeNary.java | 17 +- .../sysds/hops/codegen/cplan/CNodeUnary.java | 15 +- .../sysds/hops/codegen/cplan/CodeTemplate.java | 58 ++- .../sysds/hops/codegen/cplan/cuda/Binary.java | 31 +- .../sysds/hops/codegen/cplan/cuda/CellWise.java | 77 --- .../sysds/hops/codegen/cplan/cuda/Ternary.java | 40 +- .../sysds/hops/codegen/cplan/cuda/Unary.java | 33 +- .../sysds/hops/codegen/cplan/java/Binary.java | 26 +- .../sysds/hops/codegen/cplan/java/CellWise.java | 79 --- .../Cellwise.java.template} | 31 +- .../sysds/hops/codegen/cplan/java/Ternary.java | 27 +- .../sysds/hops/codegen/cplan/java/Unary.java | 25 +- .../apache/sysds/runtime/codegen/SpoofCUDA.java | 92 ++-- .../controlprogram/context/ExecutionContext.java | 21 +- .../instructions/gpu/SpoofCUDAInstruction.java | 44 +- .../instructions/gpu/context/GPUObject.java | 15 +- 31 files changed, 1596 insertions(+), 1001 deletions(-) diff --git a/pom.xml b/pom.xml index 7212f14..f95eafd 100644 --- a/pom.xml +++ b/pom.xml @@ -111,9 +111,20 @@ <include>spoof_utils.cuh</include> <include>TempStorage.cuh</include> <include>utils.cuh</include> + <include>vector_write.cuh</include> + <include>vector_add.cuh</include> + <include>Matrix.h</include> </includes> <targetPath>cuda/headers</targetPath> </resource> + <resource> + <directory>src/main/java/org/apache/sysds/hops/codegen/cplan/java</directory> + <includes> + <include>Cellwise.java.template</include> + <include>Rowwise.java.template</include> + </includes> + <targetPath>java/spoof</targetPath> + </resource> </resources> <plugins> diff --git a/src/main/cuda/CMakeLists.txt b/src/main/cuda/CMakeLists.txt index 8b74dee..cfa72c4 100644 --- a/src/main/cuda/CMakeLists.txt +++ b/src/main/cuda/CMakeLists.txt @@ -19,7 +19,7 @@ # #------------------------------------------------------------- -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) # default to gcc 8.x while we're still supporting CUDA 10.x only if (UNIX) @@ -49,8 +49,13 @@ target_include_directories(SystemDS PUBLIC "${CMAKE_SOURCE_DIR}/headers") find_package(CUDAToolkit REQUIRED) cmake_policy(SET CMP0104 NEW) + set(CMAKE_CUDA_ARCHITECTURES OFF) #ToDo: more compiler flag settings for Debug/Release compilation +#set(CMAKE_CUDA_ARCHITECTURES 52 60 61 75 
CACHE STRING "CUDA architectures" FORCE) +#set(CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) +#message("CUDA_ARCHITECTURES: ${CUDA_ARCHITECTURES}") + set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr") set_property(TARGET SystemDS PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) @@ -85,15 +90,28 @@ endif() set(SPOOF_HEADERS spoof-launcher/jni_bridge.h - spoof-launcher/SpoofCUDAContext.h) + spoof-launcher/SpoofCUDAContext.h headers/Matrix.h headers/TempStorage.cuh) set(SPOOF_SOURCES spoof-launcher/jni_bridge.cpp spoof-launcher/SpoofCUDAContext.cpp) - -add_library(spoof_cuda SHARED ${SPOOF_HEADERS} ${SPOOF_SOURCES} ) - -target_include_directories(spoof_cuda PRIVATE "${CMAKE_SOURCE_DIR}/ext/jitify") -target_link_libraries(spoof_cuda CUDA::nvrtc CUDA::cuda_driver CUDA::cudart) +set(SPOOF_CUDA_HEADERS + headers/agg_ops.cuh + headers/reduction.cuh + headers/spoof_utils.cuh + headers/utils.cuh headers/operators.cuh headers/Matrix.h headers/vector_write.cuh headers/vector_add.cuh) + #headers/intellisense_cuda_intrinsics.h +set(SPOOF_TEMPLATES + spoof/cellwise.cu + spoof/rowwise.cu) +source_group("SPOOF Templates" FILES ${SPOOF_TEMPLATES}) +set_source_files_properties( ${SPOOF_TEMPLATES} PROPERTIES HEADER_FILE_ONLY ON) +add_library(spoof_cuda SHARED ${SPOOF_HEADERS} ${SPOOF_CUDA_HEADERS} ${SPOOF_SOURCES} ${SPOOF_TEMPLATES}) + +set_property(TARGET reduction PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) +set_property(TARGET spoof_cuda PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) + +target_include_directories(spoof_cuda PRIVATE "${CMAKE_SOURCE_DIR}/ext/jitify" headers) +target_link_libraries(spoof_cuda CUDA::nvrtc CUDA::cuda_driver CUDA::cudart CUDA::cublas) target_compile_features(spoof_cuda PUBLIC cxx_std_11) set_target_properties(spoof_cuda PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(spoof_cuda PROPERTIES OUTPUT_NAME "systemds_spoof_cuda-${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") diff --git a/src/main/cuda/headers/Matrix.h b/src/main/cuda/headers/Matrix.h new file mode 100644 index 0000000..a11fc33 --- /dev/null +++ b/src/main/cuda/headers/Matrix.h @@ -0,0 +1,299 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once +#ifndef SYSTEMDS_MATRIX_H +#define SYSTEMDS_MATRIX_H + +using uint32_t = unsigned int; + +template <typename T> +struct Matrix { + T* data; + uint32_t* row_ptr; + uint32_t* col_idx; + + uint32_t rows; + uint32_t cols; + uint32_t nnz; + typedef T value_type; +}; + +//#ifdef __CUDACC_RTC__ +#ifdef __CUDACC__ + +template<typename T> +uint32_t bin_search(T* values, uint32_t lower, uint32_t upper, T val) { + upper -= 1; + while(lower <= (upper-1)) { + uint32_t idx = lower + upper >> 1; + uint32_t vi = values[idx]; + if (vi < val) + lower = idx + 1; + else { + if (vi <= val) + return idx; + upper = idx - 1; + } + } + return upper + 1; +} + +template<typename T> +class MatrixAccessor { + + Matrix<T>* _mat; + + // Member function pointers + uint32_t (MatrixAccessor::*_len)(); + uint32_t (MatrixAccessor::*_row_len)(uint32_t); + uint32_t (MatrixAccessor::*_pos)(uint32_t); + uint32_t* (MatrixAccessor::*_col_idxs)(uint32_t); + + T& (MatrixAccessor::*_val_i)(uint32_t); + T& (MatrixAccessor::*_val_rc)(uint32_t, uint32_t); + T* (MatrixAccessor::*_vals)(uint32_t); + void (MatrixAccessor::*_set)(uint32_t, uint32_t, T); + +public: + MatrixAccessor() = default; + + __device__ MatrixAccessor(Matrix<T>* mat) { init(mat); } + + __device__ void init(Matrix<T>* mat) { + _mat = mat; + + if (_mat->row_ptr == nullptr) { + _len = &MatrixAccessor::len_dense; + _pos = &MatrixAccessor::pos_dense; + _col_idxs = &MatrixAccessor::cols_dense; + _val_rc = &MatrixAccessor::val_dense_rc; + _vals = &MatrixAccessor::vals_dense; + _row_len = &MatrixAccessor::row_len_dense; + _val_i = &MatrixAccessor::val_dense_i; + } else { + _len = &MatrixAccessor::len_sparse; + _pos = &MatrixAccessor::pos_sparse; + _col_idxs = &MatrixAccessor::cols_sparse; + _val_rc = &MatrixAccessor::val_sparse_rc; + _vals = &MatrixAccessor::vals_sparse; + _row_len = &MatrixAccessor::row_len_sparse; + _val_i = &MatrixAccessor::val_sparse_i; + _set = &MatrixAccessor::set_sparse; + } + } + + __device__ uint32_t& nnz() { return _mat->nnz; } + __device__ uint32_t cols() { return _mat->cols; } + __device__ uint32_t rows() { return _mat->rows; } + + __device__ uint32_t len() { return (this->*_len)(); } + + __device__ uint32_t pos(uint32_t rix) { + return (this->*_pos)(rix); + } + + __device__ T& val(uint32_t r, uint32_t c) { + return (this->*_val_rc)(r, c); + } + + __device__ T& val(uint32_t i) { + return (this->*_val_i)(i); + } + + __device__ T* vals(uint32_t rix) { + return (this->*_vals)(rix); + } + + __device__ T& operator[](uint32_t i) { + return (this->*_val_i)(i); + } + + __device__ uint32_t row_len(uint32_t rix) { + return (this->*_row_len)(rix); + } + + __device__ uint32_t* col_idxs(uint32_t rix) { + return (this->*_col_idxs)(rix); + } + + __device__ void set(uint32_t r, uint32_t c, T v) { + (this->*_set)(r,c,v); + } + + __device__ uint32_t* indexes() { + return _mat->row_ptr; + } +private: + __device__ uint32_t len_dense() { + return _mat->rows * _mat->cols; + } + + __device__ uint32_t pos_dense(uint32_t rix) { + return _mat->cols * rix; + } + + __device__ uint32_t* cols_dense(uint32_t rix) { + printf("ERROR: no column indices array in a dense matrix! 
This will likely crash :-/\n"); + return nullptr; + } + + __device__ T& val_dense_rc(uint32_t r, uint32_t c) { + return _mat->data[_mat->cols * r + c]; + } + + __device__ T& val_dense_i(uint32_t i) { + return _mat->data[i]; + } + + __device__ T* vals_dense(uint32_t rix) { + return &(_mat->data[rix]); + } + + __device__ uint32_t row_len_dense(uint32_t rix) { + return _mat->rows; + } + + //ToDo sparse accessors + __device__ uint32_t len_sparse() { + return _mat->nnz; + } + + __device__ uint32_t pos_sparse(uint32_t rix) { + return _mat->row_ptr[rix]; + } + + __device__ uint32_t* cols_sparse(uint32_t rix) { + return &_mat->col_idx[_mat->row_ptr[rix]]; + } + + __device__ T& val_sparse_rc(uint32_t r, uint32_t c) { +// printf("TBI: val_sparse_rc\n"); +// asm("trap;"); + + return _mat->data[0]; + } + + __device__ T& val_sparse_i(uint32_t i) { + return _mat->data[i]; + } + + __device__ T* vals_sparse(uint32_t rix) { + return &_mat->data[_mat->row_ptr[rix]]; + } + + __device__ uint32_t row_len_sparse(uint32_t rix) { + return _mat->row_ptr[rix+1]-_mat->row_ptr[rix]; + } + + __device__ void set_sparse(uint32_t idx, uint32_t c, T v) { +// uint32_t idx = _mat->cols*r+c; + _mat->data[idx] = v; + _mat->col_idx[idx] = c; +// _mat->row_ptr[r+1] = _mat->row_ptr[r+1] > 0 ? min(idx, _mat->row_ptr[r+1]) : idx; +// if(threadIdx.x == 0) +// atomicMax(&(_mat->row_ptr[r+1]), idx < _mat->nnz-1 ? idx+1 : idx); +// v == 0.0 ? atomicAdd(&(_mat->nnz), -1) : atomicAdd(&(_mat->nnz), -1); + +// if(blockIdx.x == 0 && threadIdx.x > 20 && threadIdx.x < 30) +// printf("nnz=%d idx=%d r=%d c=%d v=%4.3f\n", _mat->nnz, idx, r, c, v); +// _mat->row_ptr[r+1] = _mat->row_ptr[r+1] > 0 ? max(idx, _mat->row_ptr[r+1]) : idx; + } +}; +#endif + + +#ifdef __CUDACC_RTC__ + +//ToDo: move to separate file +template <typename T> +struct Vector { + T* data; + uint32_t length; + + __device__ T* vals(uint32_t idx) { return &data[idx]; } + + __device__ T& operator[](uint32_t idx) { + return data[idx]; + } + + __device__ void print(const char* name, uint32_t end_ = 0, uint32_t start = 0, uint32_t bID = 0, uint32_t tID = 0) { + if(blockIdx.x == bID && threadIdx.x==tID) { + uint32_t end = end_; + if(end > 0) + end = min(end, length); + printf("%s: ", name); + for(auto i = start; i < end; ++i) + print("%4.3f ", data[i]); + } + } +}; + +template <typename T, uint32_t ELEMENTS> +class RingBuffer { + Vector<T> vec[ELEMENTS]; + int32_t pos; + +public: + __device__ void init(uint32_t offset, uint32_t length, T* buffer) { + pos = -1; + for(auto i=0;i<ELEMENTS;++i) { + vec[i].data = &buffer[offset + length * i]; + vec[i].length = length; + } + } + + __device__ Vector<T>& next() { + pos = (pos+1>=ELEMENTS) ? 
0 : pos+1; + __syncthreads(); + return vec[pos]; + } +}; + +template <typename T, int NUM_B> +struct SpoofOp { + MatrixAccessor<T> a; + MatrixAccessor<T> b[NUM_B]; + MatrixAccessor<T> c; + T* scalars; + uint32_t grix; + T* avals; + uint32_t* aix; + uint32_t alen; + + SpoofOp(Matrix<T>* A, Matrix<T>* B, Matrix<T>* C, T* scalars, T* tmp_stor, uint32_t grix) : + scalars(scalars), grix(grix), avals(A->data), aix(A->col_idx) { + a.init(A); + c.init(C); + alen = a.row_len(grix); + + if(B) + for(auto i = 0; i < NUM_B; ++i) + b[i].init(&(B[i])); + } + +// __device__ Vector<T>& getTempStorage(uint32_t len) { +// Vector<T>& vec = temp_rb.next(); +// tvec.length = len; +// return vec; +// } +}; +#endif // __CUDACC_RTC__ + +#endif //SYSTEMDS_MATRIX_H diff --git a/src/main/cuda/headers/reduction.cuh b/src/main/cuda/headers/reduction.cuh index 56845b5..8a024cc 100644 --- a/src/main/cuda/headers/reduction.cuh +++ b/src/main/cuda/headers/reduction.cuh @@ -25,6 +25,7 @@ using uint = unsigned int; #include <cuda_runtime.h> #include "utils.cuh" +#include "Matrix.h" /** * Does a reduce operation over all elements of the array. @@ -50,15 +51,7 @@ using uint = unsigned int; * @param SpoofCellwiseOp initial value for the reduction variable */ template<typename T, typename ReductionOp, typename SpoofCellwiseOp> -__device__ void FULL_AGG( - T *g_idata, ///< input data stored in device memory (of size n) - T *g_odata, ///< output/temporary array stored in device memory (of size n) - uint m, - uint n, - T initialValue, - ReductionOp reduction_op, - SpoofCellwiseOp spoof_op) -{ +__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) { auto sdata = shared_memory_proxy<T>(); // perform first level of reduction, @@ -66,20 +59,19 @@ __device__ void FULL_AGG( uint tid = threadIdx.x; uint i = blockIdx.x * blockDim.x * 2 + threadIdx.x; uint gridSize = blockDim.x * 2 * gridDim.x; - uint N = m * n; - T v = initialValue; + + T v = reduction_op.init(); // we reduce multiple elements per thread. The number is determined by the // number of active thread blocks (via gridDim). 
More blocks will result // in a larger gridSize and therefore fewer elements per thread while (i < N) { - v = reduction_op(v, spoof_op(g_idata[i], i)); + v = reduction_op(v, spoof_op(*(in->vals(i)), i, i / N, i % N)); - if (i + blockDim.x < N) - { + if (i + blockDim.x < N) { //__syncthreads(); //printf("loop fetch i(%d)+blockDim.x(%d)=%d, in=%f\n",i, blockDim.x, i + blockDim.x, g_idata[i + blockDim.x]); - v = reduction_op(v, spoof_op(g_idata[i + blockDim.x], blockDim.x + i)); + v = reduction_op(v, spoof_op(*(in->vals(i+blockDim.x)), blockDim.x + i, (i+blockDim.x) / in->cols(), (i+blockDim.x) % in->cols())); } i += gridSize; @@ -140,12 +132,12 @@ __device__ void FULL_AGG( } } - // write result for this block to global mem - if (tid == 0) { - if(gridDim.x < 10) - printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]); - g_odata[blockIdx.x] = sdata[0]; - } + // write result for this block to global mem + if (tid == 0) { +// if(gridDim.x < 10) +// printf("blockIdx.x=%d reduction result: %3.1f\n", blockIdx.x, sdata[0]); + out->val(0, blockIdx.x) = sdata[0]; + } } /** @@ -168,32 +160,35 @@ __device__ void FULL_AGG( * the value before writing it to its final location in global memory for each * row */ +//template<typename T, typename ReductionOp, typename SpoofCellwiseOp> +//__device__ void ROW_AGG( +// T *g_idata, ///< input data stored in device memory (of size rows*cols) +// T *g_odata, ///< output/temporary array store in device memory (of size +// /// rows*cols) +// uint rows, ///< rows in input and temporary/output arrays +// uint cols, ///< columns in input and temporary/output arrays +// T initialValue, ///< initial value for the reduction variable +// ReductionOp reduction_op, ///< Reduction operation to perform (functor object) +// SpoofCellwiseOp spoof_op) ///< Operation to perform before assigning this template<typename T, typename ReductionOp, typename SpoofCellwiseOp> -__device__ void ROW_AGG( - T *g_idata, ///< input data stored in device memory (of size rows*cols) - T *g_odata, ///< output/temporary array store in device memory (of size - /// rows*cols) - uint rows, ///< rows in input and temporary/output arrays - uint cols, ///< columns in input and temporary/output arrays - T initialValue, ///< initial value for the reduction variable - ReductionOp reduction_op, ///< Reduction operation to perform (functor object) - SpoofCellwiseOp spoof_op) ///< Operation to perform before assigning this -{ +__device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, + SpoofCellwiseOp spoof_op) { auto sdata = shared_memory_proxy<T>(); // one block per row - if (blockIdx.x >= rows) { + if (blockIdx.x >= in->rows()) { return; } uint block = blockIdx.x; uint tid = threadIdx.x; - uint i = tid; - uint block_offset = block * cols; + uint32_t i = tid; + uint block_offset = block * in->cols(); - T v = initialValue; - while (i < cols) { - v = reduction_op(v, spoof_op(g_idata[block_offset + i], i)); +// T v = initialValue; + T v = reduction_op.init(); + while (i < in->cols()) { + v = reduction_op(v, spoof_op(in->val(block_offset + i), i, i / in->cols(), i % in->cols())); i += blockDim.x; } @@ -254,7 +249,7 @@ __device__ void ROW_AGG( // write result for this block to global mem, modify it with assignment op if (tid == 0) - g_odata[block] = sdata[0]; + out->val(block) = sdata[0]; } /** @@ -273,41 +268,98 @@ __device__ void ROW_AGG( * column */ template<typename T, typename ReductionOp, typename SpoofCellwiseOp> -__device__ void 
COL_AGG(T *g_idata, ///< input data stored in device memory (of size rows*cols) - T *g_odata, ///< output/temporary array store in device memory (of size rows*cols) - uint rows, ///< rows in input and temporary/output arrays - uint cols, ///< columns in input and temporary/output arrays - T initialValue, ///< initial value for the reduction variable - ReductionOp reduction_op, ///< Reduction operation to perform (functor object) - SpoofCellwiseOp spoof_op) ///< Operation to perform before aggregation - -{ +__device__ void COL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, + SpoofCellwiseOp spoof_op) { +//__device__ void COL_AGG(T *g_idata, ///< input data stored in device memory (of size rows*cols) +// T *g_odata, ///< output/temporary array store in device memory (of size rows*cols) +// uint rows, ///< rows in input and temporary/output arrays +// uint cols, ///< columns in input and temporary/output arrays +// T initialValue, ///< initial value for the reduction variable +// ReductionOp reduction_op, ///< Reduction operation to perform (functor object) +// SpoofCellwiseOp spoof_op) ///< Operation to perform before aggregation +// +//{ uint global_tid = blockIdx.x * blockDim.x + threadIdx.x; - if (global_tid >= cols) { + if (global_tid >= in->cols()) { return; } uint i = global_tid; - uint grid_size = cols; - T val = initialValue; + uint grid_size = in->cols(); + T val = reduction_op.init(); - while (i < rows * cols) { - val = reduction_op(val, spoof_op(g_idata[i], i)); + while (i < N) { + val = reduction_op(val, spoof_op(in->val(i), i, i / in->cols(), i % in->cols())); i += grid_size; } - g_odata[global_tid] = val; + out->val(global_tid) = val; } template<typename T, typename ReductionOp, typename SpoofCellwiseOp> -__device__ void NO_AGG(T* g_idata, T* g_odata, uint rows, uint cols, - T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) +__device__ void NO_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int first_idx = tid * static_cast<int>(VT); - int last_idx = min(first_idx + static_cast<int>(VT), spoof_op.m * spoof_op.n); + uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t first_idx = gtid * static_cast<uint32_t>(VT); + uint32_t last_idx = min(first_idx + static_cast<uint32_t>(VT), N); #pragma unroll - for(int i = first_idx; i < last_idx; i++) { - g_odata[i] = spoof_op(g_idata[i], i); + for(auto i = first_idx; i < last_idx; i++) { + T result = spoof_op(in->vals(0)[i], i, i / in->cols(), i % in->cols()); + out->vals(0)[i] = result; + //if(i < 4) + // printf("tid=%d in=%4.3f res=%4.3f out=%4.3f r=%d\n", i, in->vals(0)[i], result, out->vals(0)[i], i/in->cols()); + } +} + +template<typename T, typename ReductionOp, typename SpoofCellwiseOp> +__device__ void NO_AGG_SPARSE(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t N, T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) +{ + const uint32_t& rix = blockIdx.x; + uint32_t tid = threadIdx.x; +// uint32_t rix = (gtid * VT) / in->cols(); +// //uint32_t cix = (gtid % in->cols());// *static_cast<uint32_t>(VT); +// uint32_t cix = in->col_idxs(0)[gtid]; + uint32_t row_start = in->pos(rix); + uint32_t row_len = in->row_len(rix); +//// if(cix == 0) { +// //if(row_start == gtid) { +// //if(threadIdx.x == 0) { +// if(rix < in->rows()) { +// if(rix > 895 && rix < 905) +// printf("gtid=%d in->indexes()[rix=%d]=%d rowlen=%d row_start=%d 
cix=%d\n", gtid, rix, in->indexes()[rix], in->row_len(rix), row_start, cix); +//// out->indexes()[gtid] = in->indexes()[gtid]; +// out->indexes()[rix] = in->indexes()[rix]; +// } + + + while(tid < row_len) { + + uint32_t* aix = in->col_idxs(rix); + uint32_t cix = aix[tid]; +// T result = spoof_op(in->val(rix, cix), rix*in->rows()+cix, rix, cix); + T result = spoof_op(in->val(row_start+tid), rix*in->rows()+cix, rix, cix); + out->set(row_start+tid, cix, result); + + //if(rix > 899 && rix < 903 && cix==0) + // printf("rix=%d row_start=%d tid=%d result=%4.3f\n", rix, row_start, tid, result); + + tid+=blockDim.x; + + +//#pragma unroll +// for (auto i = first_idx; i < last_idx; i++) { +//// out->vals(0)[i] = spoof_op(in->vals(0)[i], i); +//// out->col_idxs(0)[i] = gtid % blockDim.x; +// T result = spoof_op(in->vals(0)[i], i); +// out->vals(0)[i] = result; +// //out->col_idxs(0)[i] = i % in->cols(); +// out->col_idxs(0)[i] = in->col_idxs(0)[i]; +// //out->set(i/in->cols(), i%in->cols(), result); +// //out->set(rix, i%in->cols(), result); +// if (i > in->nnz() - 10) +// printf("i=%d in=%4.3f res=%4.3f out=%4.3f r=%d out->index(i=%d)=%d out->col_idxs()[i=%d]=%d first=%d last=%d gtid=%d\n", +// i, in->vals(0)[i], result, out->vals(0)[i], +// i / in->cols(), i, out->indexes()[i], i, out->col_idxs(0)[i], first_idx, last_idx, gtid); +// } } } diff --git a/src/main/cuda/headers/spoof_utils.cuh b/src/main/cuda/headers/spoof_utils.cuh index e28d254..dab9aec 100644 --- a/src/main/cuda/headers/spoof_utils.cuh +++ b/src/main/cuda/headers/spoof_utils.cuh @@ -28,21 +28,31 @@ __constant__ double FLOAT_EPS = 1.49012E-08; // 2 ^ -26 __constant__ double EPSILON = 1E-11; // margin for comparisons ToDo: make consistent use of it __device__ long long toInt64(double a) { - return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + DOUBLE_EPS)); + return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + DOUBLE_EPS)); } __device__ int toInt32(float a) { - return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + FLOAT_EPS)); + return (signbit(a) == 0 ? 1.0 : -1.0) * abs(floor(a + FLOAT_EPS)); } template<typename T> __device__ T getValue(T* data, int rowIndex) { - return data[rowIndex]; + return data[rowIndex]; } template<typename T> -__device__ T getValue(T* data, int n, int rowIndex, int colIndex) { - return data[rowIndex * n + colIndex]; +__device__ T getValue(MatrixAccessor<T> data, int rowIndex) { + return data[rowIndex]; +} + +template<typename T> +__device__ T getValue(T* data, uint32_t n, uint32_t rowIndex, uint32_t colIndex) { + return data[rowIndex * n + colIndex]; +} + +template<typename T> +__device__ T getValue(MatrixAccessor<T>& data, uint32_t n, uint32_t rowIndex, uint32_t colIndex) { + return data[rowIndex * n + colIndex]; } template<typename T> @@ -50,14 +60,14 @@ __device__ T intDiv(T a, T b); template<> __device__ double intDiv(double a, double b) { - double ret = a / b; - return (isnan(ret) || isinf(ret)) ? ret : toInt64(ret); + double ret = a / b; + return (isnan(ret) || isinf(ret)) ? ret : toInt64(ret); } template<> __device__ float intDiv(float a, float b) { - float ret = a / b; - return (isnan(ret) || isinf(ret)) ? ret : toInt32(ret); + float ret = a / b; + return (isnan(ret) || isinf(ret)) ? 
ret : toInt32(ret); } template<typename T> @@ -65,16 +75,16 @@ __device__ T modulus(T a, T b); template<> __device__ double modulus(double a, double b) { - if (fabs(b) < DOUBLE_EPS) - return CUDART_NAN; - return a - intDiv(a, b) * b; + if (fabs(b) < DOUBLE_EPS) + return CUDART_NAN; + return a - intDiv(a, b) * b; } template<> __device__ float modulus(float a, float b) { - if (fabs(b) < FLOAT_EPS) - return CUDART_NAN_F; - return a - intDiv(a, b) * b; + if (fabs(b) < FLOAT_EPS) + return CUDART_NAN_F; + return a - intDiv(a, b) * b; } template<typename T> diff --git a/src/main/cuda/kernels/reduction.cu b/src/main/cuda/kernels/reduction.cu index 04fd098..3a11f77 100644 --- a/src/main/cuda/kernels/reduction.cu +++ b/src/main/cuda/kernels/reduction.cu @@ -20,9 +20,11 @@ #include "utils.cuh" #include "agg_ops.cuh" #include "reduction.cuh" +#include "Matrix.h" using uint = unsigned int; #include <cuda_runtime.h> +#ifdef __CUDACC__ /** * Do a summation over all elements of an array/matrix @@ -31,19 +33,21 @@ using uint = unsigned int; * @param n size of the input and temporary/output arrays */ template<typename T> -__device__ void reduce_sum(T *g_idata, T *g_odata, uint n) { +__device__ void reduce_sum(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t n) { SumOp<T> agg_op; IdentityOp<T> spoof_op; - FULL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op); + FULL_AGG<T, SumOp<T>, IdentityOp<T>>(in, out, n, (T) 0.0, agg_op, spoof_op); } -extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata, uint n) { - reduce_sum(g_idata, g_odata, n); -} - -extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata, uint n) { - reduce_sum(g_idata, g_odata, n); +extern "C" __global__ void reduce_sum_d(Matrix<double>* in, Matrix<double>* out, uint32_t n) { + MatrixAccessor<double> _in(in); + MatrixAccessor<double> _out(out); + reduce_sum(&_in, &_out, n); } +#endif +//extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata, uint n) { +// reduce_sum(g_idata, g_odata, n); +//} /** * Do a summation over all rows of a matrix @@ -52,20 +56,20 @@ extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata, uint n) * @param rows number of rows in input matrix * @param cols number of columns in input matrix */ -template<typename T> -__device__ void reduce_row_sum(T *g_idata, T *g_odata, uint rows, uint cols) { - SumOp<T> agg_op; - IdentityOp<T> spoof_op; - ROW_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, 0.0, agg_op, spoof_op); -} - -extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) { - reduce_row_sum(g_idata, g_odata, rows, cols); -} - -extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) { - reduce_row_sum(g_idata, g_odata, rows, cols); -} +//template<typename T> +//__device__ void reduce_row_sum(T *g_idata, T *g_odata, uint rows, uint cols) { +// SumOp<T> agg_op; +// IdentityOp<T> spoof_op; +// ROW_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, 0.0, agg_op, spoof_op); +//} +// +//extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) { +// reduce_row_sum(g_idata, g_odata, rows, cols); +//} +// +//extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) { +// reduce_row_sum(g_idata, g_odata, rows, cols); +//} /** * Do a summation over all columns of a matrix @@ -74,20 +78,20 @@ extern "C" __global__ void 
reduce_row_sum_f(float *g_idata, float *g_odata, uint * @param rows number of rows in input matrix * @param cols number of columns in input matrix */ -template<typename T> -__device__ void reduce_col_sum(T *g_idata, T *g_odata, uint rows, uint cols) { - SumOp<T> agg_op; - IdentityOp<T> spoof_op; - COL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); -} - -extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) { - reduce_col_sum(g_idata, g_odata, rows, cols); -} - -extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) { - reduce_col_sum(g_idata, g_odata, rows, cols); -} +//template<typename T> +//__device__ void reduce_col_sum(T *g_idata, T *g_odata, uint rows, uint cols) { +// SumOp<T> agg_op; +// IdentityOp<T> spoof_op; +// COL_AGG<T, SumOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); +//} +// +//extern "C" __global__ void reduce_col_sum_d(double *g_idata, double *g_odata, uint rows, uint cols) { +// reduce_col_sum(g_idata, g_odata, rows, cols); +//} +// +//extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata, uint rows, uint cols) { +// reduce_col_sum(g_idata, g_odata, rows, cols); +//} /** @@ -97,19 +101,21 @@ extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata, uint * @param n size of the input and temporary/output arrays */ template<typename T> -__device__ void reduce_max(T *g_idata, T *g_odata, uint n) { +__device__ void reduce_max(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t n) { MaxOp<T> agg_op; IdentityOp<T> spoof_op; - FULL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, -MAX<T>(), agg_op, spoof_op); + FULL_AGG<T, MaxOp<T>, IdentityOp<T>>(in, out, n, -MAX<T>(), agg_op, spoof_op); } -extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata, uint n) { - reduce_max(g_idata, g_odata, n); +extern "C" __global__ void reduce_max_d(Matrix<double>* in, Matrix<double>* out, uint32_t n) { + MatrixAccessor<double> _in(in); + MatrixAccessor<double> _out(out); + reduce_max(&_in, &_out, n); } -extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata, uint n) { - reduce_max(g_idata, g_odata, n); -} +//extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata, uint n) { +// reduce_max(g_idata, g_odata, n); +//} /** * Do a max over all rows of a matrix @@ -118,20 +124,20 @@ extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata, uint n) * @param rows number of rows in input matrix * @param cols number of columns in input matrix */ -template<typename T> -__device__ void reduce_row_max(T *g_idata, T *g_odata, uint rows, uint cols) { - MaxOp<T> agg_op; - IdentityOp<T> spoof_op; - ROW_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op); -} - -extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata, uint rows, uint cols) { - reduce_row_max(g_idata, g_odata, rows, cols); -} - -extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata, uint rows, uint cols) { - reduce_row_max(g_idata, g_odata, rows, cols); -} +//template<typename T> +//__device__ void reduce_row_max(T *g_idata, T *g_odata, uint rows, uint cols) { +// MaxOp<T> agg_op; +// IdentityOp<T> spoof_op; +// ROW_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op); +//} + +//extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata, uint 
rows, uint cols) { +// reduce_row_max(g_idata, g_odata, rows, cols); +//} +// +//extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata, uint rows, uint cols) { +// reduce_row_max(g_idata, g_odata, rows, cols); +//} /** * Do a max over all columns of a matrix @@ -140,20 +146,20 @@ extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata, uint * @param rows number of rows in input matrix * @param cols number of columns in input matrix */ -template<typename T> -__device__ void reduce_col_max(T *g_idata, T *g_odata, uint rows, uint cols) { - MaxOp<T> agg_op; - IdentityOp<T> spoof_op; - COL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op); -} - -extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata, uint rows, uint cols) { - reduce_col_max(g_idata, g_odata, rows, cols); -} - -extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata, uint rows, uint cols) { - reduce_col_max(g_idata, g_odata, rows, cols); -} +//template<typename T> +//__device__ void reduce_col_max(T *g_idata, T *g_odata, uint rows, uint cols) { +// MaxOp<T> agg_op; +// IdentityOp<T> spoof_op; +// COL_AGG<T, MaxOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, -MAX<T>(), agg_op, spoof_op); +//} +// +//extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata, uint rows, uint cols) { +// reduce_col_max(g_idata, g_odata, rows, cols); +//} +// +//extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata, uint rows, uint cols) { +// reduce_col_max(g_idata, g_odata, rows, cols); +//} /** @@ -163,19 +169,21 @@ extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata, uint * @param n size of the input and temporary/output arrays */ template<typename T> -__device__ void reduce_min(T *g_idata, T *g_odata, uint n) { +__device__ void reduce_min(MatrixAccessor<T>* in, MatrixAccessor<T>* out, uint32_t n) { MinOp<T> agg_op; IdentityOp<T> spoof_op; - FULL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, MAX<T>(), agg_op, spoof_op); + FULL_AGG<T, MinOp<T>, IdentityOp<T>>(in, out, n, MAX<T>(), agg_op, spoof_op); } -extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata, uint n) { - reduce_min(g_idata, g_odata, n); +extern "C" __global__ void reduce_min_d(Matrix<double>* in, Matrix<double>* out, uint32_t n) { + MatrixAccessor<double> _in(in); + MatrixAccessor<double> _out(out); + reduce_min(&_in, &_out, n); } -extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata, uint n) { - reduce_min(g_idata, g_odata, n); -} +//extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata, uint n) { +// reduce_min(g_idata, g_odata, n); +//} /** @@ -185,20 +193,20 @@ extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata, uint n) * @param rows number of rows in input matrix * @param cols number of columns in input matrix */ -template<typename T> -__device__ void reduce_row_min(T *g_idata, T *g_odata, uint rows, uint cols) { - MinOp<T> agg_op; - IdentityOp<T> spoof_op; - ROW_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op); -} - -extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata, uint rows, uint cols) { - reduce_row_min(g_idata, g_odata, rows, cols); -} - -extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata, uint rows, uint cols) { - reduce_row_min(g_idata, g_odata, rows, cols); -} +//template<typename T> +//__device__ void 
reduce_row_min(T *g_idata, T *g_odata, uint rows, uint cols) { +// MinOp<T> agg_op; +// IdentityOp<T> spoof_op; +// ROW_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op); +//} +// +//extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata, uint rows, uint cols) { +// reduce_row_min(g_idata, g_odata, rows, cols); +//} +// +//extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata, uint rows, uint cols) { +// reduce_row_min(g_idata, g_odata, rows, cols); +//} /** * Do a min over all columns of a matrix @@ -207,20 +215,20 @@ extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata, uint * @param rows number of rows in input matrix * @param cols number of columns in input matrix */ -template<typename T> -__device__ void reduce_col_min(T *g_idata, T *g_odata, uint rows, uint cols) { - MinOp<T> agg_op; - IdentityOp<T> spoof_op; - COL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op); -} - -extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata, uint rows, uint cols) { - reduce_col_min(g_idata, g_odata, rows, cols); -} - -extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata, uint rows, uint cols) { - reduce_col_min(g_idata, g_odata, rows, cols); -} +//template<typename T> +//__device__ void reduce_col_min(T *g_idata, T *g_odata, uint rows, uint cols) { +// MinOp<T> agg_op; +// IdentityOp<T> spoof_op; +// COL_AGG<T, MinOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, MAX<T>(), agg_op, spoof_op); +//} +// +//extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata, uint rows, uint cols) { +// reduce_col_min(g_idata, g_odata, rows, cols); +//} +// +//extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata, uint rows, uint cols) { +// reduce_col_min(g_idata, g_odata, rows, cols); +//} /** @@ -229,20 +237,20 @@ extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata, uint * @param g_odata output/temporary array stored in device memory (of size n) * @param n size of the input and temporary/output arrays */ -template<typename T> -__device__ void reduce_sum_sq(T *g_idata, T *g_odata, uint n) { - SumSqOp<T> agg_op; - IdentityOp<T> spoof_op; - FULL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op); -} - -extern "C" __global__ void reduce_sum_sq_d(double *g_idata, double *g_odata, uint n) { - reduce_sum_sq(g_idata, g_odata, n); -} - -extern "C" __global__ void reduce_sum_sq_f(float *g_idata, float *g_odata, uint n) { - reduce_sum_sq(g_idata, g_odata, n); -} +//template<typename T> +//__device__ void reduce_sum_sq(T *g_idata, T *g_odata, uint n) { +// SumSqOp<T> agg_op; +// IdentityOp<T> spoof_op; +// FULL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, n, 1, (T) 0.0, agg_op, spoof_op); +//} +// +//extern "C" __global__ void reduce_sum_sq_d(double *g_idata, double *g_odata, uint n) { +// reduce_sum_sq(g_idata, g_odata, n); +//} +// +//extern "C" __global__ void reduce_sum_sq_f(float *g_idata, float *g_odata, uint n) { +// reduce_sum_sq(g_idata, g_odata, n); +//} /** * Do a summation over all squared elements of an array/matrix @@ -251,32 +259,32 @@ extern "C" __global__ void reduce_sum_sq_f(float *g_idata, float *g_odata, uint * @param rows number of rows in input matrix * @param cols number of columns in input matrix */ -template<typename T> -__device__ void reduce_col_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) { - 
SumSqOp<T> agg_op; - IdentityOp<T> spoof_op; - COL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); -} - -extern "C" __global__ void reduce_col_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) { - reduce_col_sum_sq(g_idata, g_odata, rows, cols); -} - -extern "C" __global__ void reduce_col_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) { - reduce_col_sum_sq(g_idata, g_odata, rows, cols); -} - -template<typename T> -__device__ void reduce_row_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) { - SumSqOp<T> agg_op; - IdentityOp<T> spoof_op; - ROW_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); -} - -extern "C" __global__ void reduce_row_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) { - reduce_row_sum_sq(g_idata, g_odata, rows, cols); -} - -extern "C" __global__ void reduce_row_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) { - reduce_row_sum_sq(g_idata, g_odata, rows, cols); -} +//template<typename T> +//__device__ void reduce_col_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) { +// SumSqOp<T> agg_op; +// IdentityOp<T> spoof_op; +// COL_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); +//} +// +//extern "C" __global__ void reduce_col_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) { +// reduce_col_sum_sq(g_idata, g_odata, rows, cols); +//} +// +//extern "C" __global__ void reduce_col_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) { +// reduce_col_sum_sq(g_idata, g_odata, rows, cols); +//} + +//template<typename T> +//__device__ void reduce_row_sum_sq(T* g_idata, T* g_odata, uint rows, uint cols) { +// SumSqOp<T> agg_op; +// IdentityOp<T> spoof_op; +// ROW_AGG<T, SumSqOp<T>, IdentityOp<T>>(g_idata, g_odata, rows, cols, (T)0.0, agg_op, spoof_op); +//} +// +//extern "C" __global__ void reduce_row_sum_sq_d(double* g_idata, double* g_odata, uint rows, uint cols) { +// reduce_row_sum_sq(g_idata, g_odata, rows, cols); +//} +// +//extern "C" __global__ void reduce_row_sum_sq_f(float* g_idata, float* g_odata, uint rows, uint cols) { +// reduce_row_sum_sq(g_idata, g_odata, rows, cols); +//} diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp index 36299c7..d3f0778 100644 --- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp +++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp @@ -45,26 +45,26 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* resourc // SUM CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_d")); ctx->reduction_kernels.insert(std::make_pair("reduce_sum_d", func)); - CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_f")); - ctx->reduction_kernels.insert(std::make_pair("reduce_sum_f", func)); - - // SUM_SQ - CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_sq_d")); - ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_d", func)); - CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_sq_f")); - ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_f", func)); - - // MIN +// CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_f")); +// ctx->reduction_kernels.insert(std::make_pair("reduce_sum_f", func)); +// +// // SUM_SQ +// CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_sq_d")); +// ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_d", func)); 
+// CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_sq_f")); +// ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_f", func)); +// +// // MIN CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_d")); ctx->reduction_kernels.insert(std::make_pair("reduce_min_d", func)); - CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_f")); - ctx->reduction_kernels.insert(std::make_pair("reduce_min_f", func)); - - // MAX +// CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_f")); +// ctx->reduction_kernels.insert(std::make_pair("reduce_min_f", func)); +// +// // MAX CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d")); ctx->reduction_kernels.insert(std::make_pair("reduce_max_d", func)); - CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_f")); - ctx->reduction_kernels.insert(std::make_pair("reduce_max_f", func)); +// CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_f")); +// ctx->reduction_kernels.insert(std::make_pair("reduce_max_f", func)); return reinterpret_cast<size_t>(ctx); } @@ -95,47 +95,80 @@ bool SpoofCUDAContext::compile_cuda(const std::string &src, std::cout << "cuda_path: " << cuda_include_path << std::endl; #endif - SpoofOperator::AggType type = SpoofOperator::AggType::NONE; - SpoofOperator::AggOp op = SpoofOperator::AggOp::NONE; - - auto pos = 0; - if((pos = src.find("CellType")) != std::string::npos) { - if(src.substr(pos, pos+30).find("FULL_AGG") != std::string::npos) - type = SpoofOperator::AggType::FULL_AGG; - else if(src.substr(pos, pos+30).find("ROW_AGG") != std::string::npos) - type = SpoofOperator::AggType::ROW_AGG; - else if(src.substr(pos, pos+30).find("COL_AGG") != std::string::npos) - type = SpoofOperator::AggType::COL_AGG; - else if(src.substr(pos, pos+30).find("NO_AGG") != std::string::npos) - type = SpoofOperator::AggType::NO_AGG; - else { - std::cerr << "error: unknown aggregation type" << std::endl; - return false; - } - - if(type != SpoofOperator::AggType::NO_AGG) { - if((pos = src.find("AggOp")) != std::string::npos) { - if(src.substr(pos, pos+30).find("AggOp.SUM") != std::string::npos) - op = SpoofOperator::AggOp::SUM; - else if(src.substr(pos, pos+30).find("AggOp.SUM_SQ") != std::string::npos) - op = SpoofOperator::AggOp::SUM_SQ; - else if(src.substr(pos, pos+30).find("AggOp.MIN") != std::string::npos) - op = SpoofOperator::AggOp::MIN; - else if(src.substr(pos, pos+30).find("AggOp.MAX") != std::string::npos) - op = SpoofOperator::AggOp::MAX; - else { - std::cerr << "error: unknown aggregation operator" << std::endl; - return false; - } - } - } - } - - std::stringstream s1, s2, s3; - s1 << "-I" << resource_path << "/cuda/headers"; - s2 << "-I" << resource_path << "/cuda/spoof"; - - jitify::Program program = kernel_cache.program(src, 0, {s1.str(), s2.str(), cuda_include_path}); - ops.insert(std::make_pair(name, SpoofOperator({std::move(program), type, op}))); - return true; + SpoofOperator::AggType agg_type= SpoofOperator::AggType::NONE; + SpoofOperator::AggOp agg_op = SpoofOperator::AggOp::NONE; + SpoofOperator::OpType op_type = SpoofOperator::OpType::NONE; + + auto pos = 0; + + if((pos = src.find("SpoofCellwiseOp")) != std::string::npos) + op_type = SpoofOperator::OpType::CW; + else if((pos = src.find("SpoofRowwiseOp")) != std::string::npos) + op_type = SpoofOperator::OpType::RA; + else { + std::cerr << "error: unknown spoof operator" << std::endl; + return false; + } + + bool TB1 = false; + if((pos = src.find("TB1")) != std::string::npos) 
+ if(src.substr(pos, pos+8).find("true") != std::string::npos) + TB1 = true; + + uint32_t numTempVect = 0; + if((pos = src.find("// VectMem: ")) != std::string::npos) + numTempVect = std::stoi(std::string(src.begin() + pos + 12, std::find(src.begin()+pos, src.end(), '\n'))); + + int32_t constDim2 = 0; + if((pos = src.find("// ConstDim2: ")) != std::string::npos) + constDim2 = std::stoi(std::string(src.begin() + pos + 14, std::find(src.begin()+pos, src.end(), '\n'))); + + bool sparse_safe = false; + if ((pos = src.find("// SparseSafe:")) != std::string::npos) + if (src.substr(pos, pos + 15).find("true") != std::string::npos) + sparse_safe = true; + + if(((pos = src.find("CellType")) != std::string::npos) || ((pos = src.find("RowType")) != std::string::npos)){ + if(src.substr(pos, pos+30).find("FULL_AGG") != std::string::npos) + agg_type= SpoofOperator::AggType::FULL_AGG; + else if(src.substr(pos, pos+30).find("ROW_AGG") != std::string::npos) + agg_type= SpoofOperator::AggType::ROW_AGG; + else if(src.substr(pos, pos+30).find("COL_AGG") != std::string::npos) + agg_type= SpoofOperator::AggType::COL_AGG; + else if(src.substr(pos, pos+30).find("NO_AGG") != std::string::npos) + agg_type= SpoofOperator::AggType::NO_AGG; + else if(src.substr(pos, pos+30).find("NO_AGG_CONST") != std::string::npos) + agg_type= SpoofOperator::AggType::NO_AGG_CONST; + else if(src.substr(pos, pos+30).find("COL_AGG_T") != std::string::npos) + agg_type= SpoofOperator::AggType::COL_AGG_T; + else { + std::cerr << "error: unknown aggregation type" << std::endl; + return false; + } + + if((agg_type!= SpoofOperator::AggType::NO_AGG) && (op_type == SpoofOperator::OpType::CW)) { + if((pos = src.find("AggOp")) != std::string::npos) { + if(src.substr(pos, pos+30).find("AggOp.SUM") != std::string::npos) + agg_op = SpoofOperator::AggOp::SUM; + else if(src.substr(pos, pos+30).find("AggOp.SUM_SQ") != std::string::npos) + agg_op = SpoofOperator::AggOp::SUM_SQ; + else if(src.substr(pos, pos+30).find("AggOp.MIN") != std::string::npos) + agg_op = SpoofOperator::AggOp::MIN; + else if(src.substr(pos, pos+30).find("AggOp.MAX") != std::string::npos) + agg_op = SpoofOperator::AggOp::MAX; + else { + std::cerr << "error: unknown aggregation operator" << std::endl; + return false; + } + } + } + } + + std::stringstream s1, s2, s3; + s1 << "-I" << resource_path << "/cuda/headers"; + s2 << "-I" << resource_path << "/cuda/spoof"; + + jitify::Program program = kernel_cache.program(src, 0, {s1.str(), s2.str(), cuda_include_path}); + ops.insert(std::make_pair(name, SpoofOperator({std::move(program), agg_type, agg_op, op_type, name, TB1, constDim2, numTempVect, sparse_safe}))); + return true; } diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h index 36d29ec..fec38bc 100644 --- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h +++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h @@ -21,14 +21,22 @@ #ifndef SPOOFCUDACONTEXT_H #define SPOOFCUDACONTEXT_H +#define NOMINMAX + #include <cmath> #include <cstdint> #include <map> #include <string> +#include <algorithm> -#ifdef __DEBUG - #define JITIFY_PRINT_ALL 1 +#include "Matrix.h" + +#ifdef _DEBUG +#define __DEBUG #endif +// #ifdef __DEBUG + // #define JITIFY_PRINT_ALL 1 +// #endif #include <jitify.hpp> @@ -37,13 +45,19 @@ using jitify::reflection::type_of; struct SpoofOperator { - enum class AggType : int { NO_AGG, ROW_AGG, COL_AGG, FULL_AGG, NONE }; - enum class AggOp : int {SUM, SUM_SQ, MIN, MAX, NONE }; - - jitify::Program program; - AggType 
agg_type; - AggOp agg_op; - + enum class AggType : int { NO_AGG, NO_AGG_CONST, ROW_AGG, COL_AGG, FULL_AGG, COL_AGG_T, NONE }; + enum class AggOp : int {SUM, SUM_SQ, MIN, MAX, NONE }; + enum class OpType : int { CW, RA, MA, OP, NONE }; + + jitify::Program program; + AggType agg_type; + AggOp agg_op; + OpType op_type; + const std::string name; + bool TB1 = false; + int32_t const_dim2; + uint32_t num_temp_vectors; + bool sparse_safe = true; }; class SpoofCUDAContext { @@ -71,199 +85,338 @@ public: bool compile_cuda(const std::string &src, const std::string &name); - template <typename T> - T execute_kernel(const std::string &name, T **in_ptrs, int num_inputs, - T **side_ptrs, int num_sides, T *out_ptr, T *scalars_ptr, - int num_scalars, int m, int n, int grix) { - - T result = 0.0; - size_t dev_buf_size; - T **d_sides = nullptr; - T *d_scalars = nullptr; - T *d_temp_agg_buf; - uint32_t N = m * n; - - auto o = ops.find(name); - if (o != ops.end()) { - SpoofOperator *op = &(o->second); - - if (num_sides > 0) { - dev_buf_size = sizeof(T *) * num_sides; - CHECK_CUDART(cudaMalloc((void **)&d_sides, dev_buf_size)); - CHECK_CUDART(cudaMemcpy(d_sides, side_ptrs, dev_buf_size, cudaMemcpyHostToDevice)); - } - - if (num_scalars > 0) { - dev_buf_size = sizeof(T) * num_scalars; - CHECK_CUDART(cudaMalloc((void **)&d_scalars, dev_buf_size)); - CHECK_CUDART(cudaMemcpy(d_scalars, scalars_ptr, dev_buf_size, cudaMemcpyHostToDevice)); - } - - switch (op->agg_type) { - case SpoofOperator::AggType::FULL_AGG: { - // num ctas - int NB = std::ceil((N + NT * 2 - 1) / (NT * 2)); - dim3 grid(NB, 1, 1); - dim3 block(NT, 1, 1); - unsigned int shared_mem_size = NT * sizeof(T); - - dev_buf_size = sizeof(T) * NB; - CHECK_CUDART(cudaMalloc((void **)&d_temp_agg_buf, dev_buf_size)); + template <typename T> + T execute_kernel(const std::string &name, std::vector<Matrix<T>>& input, std::vector<Matrix<T>>& sides, Matrix<T>* output, + T *scalars_ptr, uint32_t num_scalars, uint32_t grix) { + + T result = 0.0; + size_t dev_buf_size; + Matrix<T>* d_in = nullptr; + Matrix<T>* d_out = nullptr; + Matrix<T>* d_sides = nullptr; + T* b1_transposed = nullptr; + T *d_scalars = nullptr; + + auto o = ops.find(name); + if (o != ops.end()) { + SpoofOperator *op = &(o->second); + + // ToDo: multiple inputs for SpoofOuterProduct template + CHECK_CUDART(cudaMalloc((void **)&d_in, sizeof(Matrix<T>))); + CHECK_CUDART(cudaMemcpy(d_in, reinterpret_cast<void*>(&input[0]), sizeof(Matrix<T>), cudaMemcpyHostToDevice)); + + if(output != nullptr) { + if (op->sparse_safe == true && input.front().row_ptr != nullptr) { #ifdef __DEBUG - // ToDo: connect output to SystemDS logging facilities - std::cout << "launching spoof cellwise kernel " << name << " with " - << NT * NB << " threads in " << NB << " blocks and " - << shared_mem_size - << " bytes of shared memory for full aggregation of " - << N << " elements" - << std::endl; + std::cout << "copying sparse safe row ptrs" << std::endl; #endif - CHECK_CUDA(op->program.kernel(name) - .instantiate(type_of(result)) - .configure(grid, block, shared_mem_size) - .launch(in_ptrs[0], d_sides, d_temp_agg_buf, d_scalars, m, n, grix)); - - if(NB > 1) { - std::string reduction_kernel_name = determine_agg_kernel<T>(op); - - CUfunction reduce_kernel = reduction_kernels.find(reduction_kernel_name)->second; - N = NB; - int iter = 1; - while (NB > 1) { - void* args[3] = { &d_temp_agg_buf, &d_temp_agg_buf, &N}; - - NB = std::ceil((N + NT * 2 - 1) / (NT * 2)); + CHECK_CUDART(cudaMemcpy(output->row_ptr, input.front().row_ptr, 
(input.front().rows+1)*sizeof(uint32_t), cudaMemcpyDeviceToDevice)); + } + + CHECK_CUDART(cudaMalloc((void **) &d_out, sizeof(Matrix<T>))); + //CHECK_CUDART(cudaMemset(out->data, 0, out->rows*out->cols*sizeof(T))); + CHECK_CUDART(cudaMemcpy(d_out, reinterpret_cast<void *>(output), sizeof(Matrix<T>), + cudaMemcpyHostToDevice)); + + } + else { + uint32_t num_blocks = 1; + if (op->op_type == SpoofOperator::OpType::CW) + num_blocks = std::ceil(((input.front().rows * input.front().cols) + NT * 2 - 1) / (NT * 2)); + + CHECK_CUDART(cudaMalloc((void **) &d_out, sizeof(Matrix<T>))); + T* d_out_data = nullptr; + CHECK_CUDART(cudaMalloc((void **) &d_out_data, sizeof(T) * num_blocks)); + Matrix<T> agg_out{d_out_data, 0, 0, num_blocks, 1, num_blocks}; + CHECK_CUDART(cudaMemcpy(d_out, reinterpret_cast<void *>(&agg_out), sizeof(Matrix<T>), + cudaMemcpyHostToDevice)); + } + + if (!sides.empty()) { + if(op->TB1) { #ifdef __DEBUG - std::cout << "agg iter " << iter++ << " launching spoof cellwise kernel " << name << " with " + std::cout << "transposing TB1 for " << op->name << std::endl; +#endif + T* b1 = sides[0].data; + uint32_t m = sides[0].rows; + uint32_t n = sides[0].cols; + + cudaMalloc(reinterpret_cast<void**>(&b1_transposed), sizeof(T) * m * n); + double alpha = 1.0; + double beta = 0.0; + cublasHandle_t handle; + + CHECK_CUBLAS(cublasCreate(&handle)); + CHECK_CUBLAS(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, b1, n, &beta, b1, n, b1_transposed, m)); + sides[0].data = b1_transposed; + sides[0].rows = n; + sides[0].cols = m; + CHECK_CUBLAS(cublasDestroy(handle)); + } + dev_buf_size = sizeof(Matrix<T>) * sides.size(); + CHECK_CUDART(cudaMalloc(reinterpret_cast<void **>(&d_sides), dev_buf_size)); + CHECK_CUDART(cudaMemcpy(d_sides, &sides[0], dev_buf_size, cudaMemcpyHostToDevice)); + } + + if (num_scalars > 0) { + dev_buf_size = sizeof(T) * num_scalars; + CHECK_CUDART(cudaMalloc((void **)&d_scalars, dev_buf_size)); + CHECK_CUDART(cudaMemcpy(d_scalars, scalars_ptr, dev_buf_size, cudaMemcpyHostToDevice)); + } + + switch(op->op_type) { + case SpoofOperator::OpType::CW: + launch_cw_kernel(op, d_in, d_out, d_sides, sides.size(), d_scalars, input.front().rows, + input.front().cols, grix, input[0]); + break; + case SpoofOperator::OpType::RA: + launch_ra_kernel(op, d_in, d_out, d_sides, sides.size(), d_scalars, input.front().rows, + input.front().cols, grix, input[0].row_ptr!=nullptr); + break; + default: + std::cerr << "error: unknown spoof operator" << std::endl; + return result; + } + + if (num_scalars > 0) + CHECK_CUDART(cudaFree(d_scalars)); + + if (sides.size() > 0) + CHECK_CUDART(cudaFree(d_sides)); + + if (op->TB1) + CHECK_CUDART(cudaFree(b1_transposed)); + + if (op->agg_type == SpoofOperator::AggType::FULL_AGG) { +// std::cout << "retrieving scalar result" << std::endl; + + Matrix<T> res_mat; + CHECK_CUDART(cudaMemcpy(&res_mat, d_out, sizeof(Matrix<T>), cudaMemcpyDeviceToHost)); + CHECK_CUDART(cudaMemcpy(&result, res_mat.data, sizeof(T), cudaMemcpyDeviceToHost)); + + CHECK_CUDART(cudaFree(res_mat.data)); + CHECK_CUDART(cudaFree(d_out)); + } + } + else { + std::cerr << "kernel " << name << " not found." << std::endl; + return result; + } + return result; + } + + template<typename T> + std::string determine_agg_kernel(SpoofOperator* op) { + std::string reduction_kernel_name; + std::string reduction_type; + std::string suffix = (typeid(T) == typeid(double) ? 
"_d" : "_f"); + switch (op->agg_type) { + case SpoofOperator::AggType::FULL_AGG: + reduction_type = "_"; + break; + case SpoofOperator::AggType::ROW_AGG: + reduction_type = "_row_"; + break; + case SpoofOperator::AggType::COL_AGG: + reduction_type = "_col_"; + break; + default: + std::cerr << "unknown reduction type" << std::endl; + return ""; + } + + switch (op->agg_op) { + case SpoofOperator::AggOp::MIN: + reduction_kernel_name = "reduce" + reduction_type + "min" + suffix; + break; + case SpoofOperator::AggOp::MAX: + reduction_kernel_name = "reduce" + reduction_type + "max" + suffix; + break; + case SpoofOperator::AggOp::SUM_SQ: + reduction_kernel_name = "reduce" + reduction_type + "sum_sq" + suffix; + break; + case SpoofOperator::AggOp::SUM: + reduction_kernel_name = "reduce" + reduction_type + "sum" + suffix; + break; + default: + std::cerr << "unknown reduction op" << std::endl; + return ""; + } + + return reduction_kernel_name; + } + + template<typename T> + void launch_cw_kernel(SpoofOperator* op, Matrix<T>* d_in, Matrix<T>* d_out, Matrix<T>* d_sides, uint32_t num_sides, + T* d_scalars, uint32_t in_rows, uint32_t in_cols, uint32_t grix, const Matrix<T>& h_in) { + T value_type; + bool sparse = h_in.row_ptr != nullptr; + uint32_t N = in_rows * in_cols; + std::string op_name(op->name + "_DENSE"); + if(sparse) { + op_name = std::string(op->name + "_SPARSE"); + N = h_in.nnz; + } + + switch (op->agg_type) { + case SpoofOperator::AggType::FULL_AGG: { + // num ctas + uint32_t NB = std::ceil((N + NT * 2 - 1) / (NT * 2)); + dim3 grid(NB, 1, 1); + dim3 block(NT, 1, 1); + uint32_t shared_mem_size = NT * sizeof(T); + +//#ifdef __DEBUG + // ToDo: connect output to SystemDS logging facilities + std::cout << "launching spoof cellwise kernel " << op_name << " with " + << NT * NB << " threads in " << NB << " blocks and " + << shared_mem_size + << " bytes of shared memory for full aggregation of " + << N << " elements" + << std::endl; +//#endif + + CHECK_CUDA(op->program.kernel(op_name) + .instantiate(type_of(value_type), std::max(1u, num_sides)) + .configure(grid, block, shared_mem_size) + .launch(d_in, d_sides, d_out, d_scalars, N, grix)); + + if(NB > 1) { + std::string reduction_kernel_name = determine_agg_kernel<T>(op); + + CUfunction reduce_kernel = reduction_kernels.find(reduction_kernel_name)->second; + N = NB; + uint32_t iter = 1; + while (NB > 1) { + void* args[3] = { &d_out, &d_out, &N}; + + NB = std::ceil((N + NT * 2 - 1) / (NT * 2)); +//#ifdef __DEBUG + std::cout << "agg iter " << iter++ << " launching spoof cellwise kernel " << op_name << " with " << NT * NB << " threads in " << NB << " blocks and " << shared_mem_size << " bytes of shared memory for full aggregation of " << N << " elements" << std::endl; -#endif - CHECK_CUDA(cuLaunchKernel(reduce_kernel, - NB, 1, 1, - NT, 1, 1, - shared_mem_size, 0, args, 0)); - N = NB; - } - } - - CHECK_CUDART(cudaMemcpy(&result, d_temp_agg_buf, sizeof(T), cudaMemcpyDeviceToHost)); - CHECK_CUDART(cudaFree(d_temp_agg_buf)); - break; - } - case SpoofOperator::AggType::COL_AGG: { - // num ctas - int NB = std::ceil((N + NT - 1) / NT); - dim3 grid(NB, 1, 1); - dim3 block(NT, 1, 1); - unsigned int shared_mem_size = 0; +//#endif + CHECK_CUDA(cuLaunchKernel(reduce_kernel, + NB, 1, 1, + NT, 1, 1, + shared_mem_size, 0, args, 0)); + N = NB; + } + } + break; + } + case SpoofOperator::AggType::COL_AGG: { + // num ctas + uint32_t NB = std::ceil((N + NT - 1) / NT); + dim3 grid(NB, 1, 1); + dim3 block(NT, 1, 1); + uint32_t shared_mem_size = 0; #ifdef __DEBUG - std::cout 
<< " launching spoof cellwise kernel " << name << " with " - << NT * NB << " threads in " << NB << " blocks for column aggregation of " - << N << " elements" << std::endl; + std::cout << " launching spoof cellwise kernel " << op_name << " with " + << NT * NB << " threads in " << NB << " blocks for column aggregation of " + << N << " elements" << std::endl; #endif - CHECK_CUDA(op->program.kernel(name) - .instantiate(type_of(result)) - .configure(grid, block) - .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix)); - - break; - } - case SpoofOperator::AggType::ROW_AGG: { - // num ctas - int NB = m; - dim3 grid(NB, 1, 1); - dim3 block(NT, 1, 1); - unsigned int shared_mem_size = NT * sizeof(T); + CHECK_CUDA(op->program.kernel(op_name) + .instantiate(type_of(value_type), std::max(1u, num_sides)) + .configure(grid, block, shared_mem_size) + .launch(d_in, d_sides, d_out, d_scalars, N, grix)); + + break; + } + case SpoofOperator::AggType::ROW_AGG: { + // num ctas + uint32_t NB = in_rows; + dim3 grid(NB, 1, 1); + dim3 block(NT, 1, 1); + uint32_t shared_mem_size = NT * sizeof(T); +//#ifdef __DEBUG + std::cout << " launching spoof cellwise kernel " << op_name << " with " + << NT * NB << " threads in " << NB << " blocks and " + << shared_mem_size << " bytes of shared memory for row aggregation of " + << N << " elements" << std::endl; +//#endif + CHECK_CUDA(op->program.kernel(op_name) + .instantiate(type_of(value_type), std::max(1u, num_sides)) + .configure(grid, block, shared_mem_size) + .launch(d_in, d_sides, d_out, d_scalars, N, grix)); + + break; + } + case SpoofOperator::AggType::NO_AGG: + default: { + // num ctas + // ToDo: VT not a template parameter anymore + uint32_t NB = std::ceil((N + NT * VT - 1) / (NT * VT)); + if(sparse) + NB = in_rows; + dim3 grid(NB, 1, 1); + dim3 block(NT, 1, 1); + uint32_t shared_mem_size = 0; #ifdef __DEBUG - std::cout << " launching spoof cellwise kernel " << name << " with " - << NT * NB << " threads in " << NB << " blocks and " - << shared_mem_size << " bytes of shared memory for row aggregation of " - << N << " elements" << std::endl; -#endif - CHECK_CUDA(op->program.kernel(name) - .instantiate(type_of(result)) - .configure(grid, block, shared_mem_size) - .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix)); - - break; - } - case SpoofOperator::AggType::NO_AGG: - default: { - // num ctas - // ToDo: VT not a template parameter anymore - int NB = std::ceil((N + NT * VT - 1) / (NT * VT)); - dim3 grid(NB, 1, 1); - dim3 block(NT, 1, 1); -#ifdef __DEBUG - std::cout << "launching spoof cellwise kernel " << name << " with " << NT * NB - << " threads in " << NB << " blocks without aggregation for " - << N << " elements" - << std::endl; + if(sparse) { + std::cout << "launching sparse spoof cellwise kernel " << op_name << " with " << NT * NB + << " threads in " << NB << " blocks without aggregation for " << N << " elements" + << std::endl; + } + else { + std::cout << "launching spoof cellwise kernel " << op_name << " with " << NT * NB + << " threads in " << NB << " blocks without aggregation for " << N << " elements" + << std::endl; + } #endif - CHECK_CUDA(op->program.kernel(name) - .instantiate(type_of(result)) - .configure(grid, block) - .launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix)); - } - } - - if (num_scalars > 0) - CHECK_CUDART(cudaFree(d_scalars)); - - if (num_sides > 0) - CHECK_CUDART(cudaFree(d_sides)); - } - else { - std::cerr << "kernel " << name << " not found." 
<< std::endl;
-			return result;
-		}
-		return result;
-	}
-
-	template<typename T>
-	std::string determine_agg_kernel(SpoofOperator* op) {
-		std::string reduction_kernel_name;
-		std::string reduction_type;
-		std::string suffix = (typeid(T) == typeid(double) ? "_d" : "_f");
-		switch (op->agg_type) {
-			case SpoofOperator::AggType::FULL_AGG:
-				reduction_type = "_";
-				break;
-			case SpoofOperator::AggType::ROW_AGG:
-				reduction_type = "_row_";
-				break;
-			case SpoofOperator::AggType::COL_AGG:
-				reduction_type = "_col_";
-				break;
-			default:
-				std::cerr << "unknown reduction type" << std::endl;
-				return "";
-		}
-
-		switch (op->agg_op) {
-			case SpoofOperator::AggOp::MIN:
-				reduction_kernel_name = "reduce" + reduction_type + "min" + suffix;
-				break;
-			case SpoofOperator::AggOp::MAX:
-				reduction_kernel_name = "reduce" + reduction_type + "max" + suffix;
-				break;
-			case SpoofOperator::AggOp::SUM_SQ:
-				reduction_kernel_name = "reduce" + reduction_type + "sum_sq" + suffix;
-				break;
-			case SpoofOperator::AggOp::SUM:
-				reduction_kernel_name = "reduce" + reduction_type + "sum" + suffix;
-				break;
-			default:
-				std::cerr << "unknown reduction op" << std::endl;
-				return "";
-		}
-
-		return reduction_kernel_name;
-	}
+//			CHECK_CUDA(op->program.kernel(op_name)
+//				.instantiate(type_of(result))
+//				.configure(grid, block)
+//				.launch(in_ptrs[0], d_sides, out_ptr, d_scalars, m, n, grix));
+
+			CHECK_CUDA(op->program.kernel(op_name)
+				.instantiate(type_of(value_type), std::max(1u, num_sides))
+				.configure(grid, block, shared_mem_size)
+				.launch(d_in, d_sides, d_out, d_scalars, N, grix));
+		}
+		}
+	}
+
+	template<typename T>
+	void launch_ra_kernel(SpoofOperator* op, Matrix<T>* d_in, Matrix<T>* d_out, Matrix<T>* d_sides, uint32_t num_sides,
+			T* d_scalars, uint32_t in_rows, uint32_t in_cols, uint32_t grix, bool sparse) {
+		T value_type;
+		dim3 grid(in_rows, 1, 1);
+		dim3 block(NT, 1, 1);
+		unsigned int shared_mem_size = NT * sizeof(T);
+
+		uint32_t tmp_len = 0;
+		uint32_t temp_buf_size = 0;
+		T* d_temp = nullptr;
+		if(op->num_temp_vectors>0) {
+			tmp_len = std::max(in_cols, op->const_dim2 < 0 ? 0 : static_cast<uint32_t>(op->const_dim2));
+			temp_buf_size = op->num_temp_vectors * tmp_len * in_rows * sizeof(T);
+			CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
+			CHECK_CUDART(cudaMemset(d_temp, 0, temp_buf_size));
+		}
+
+		std::string name(op->name + "_DENSE");
+		if(sparse)
+			name = std::string(op->name + "_SPARSE");
+
+//#ifdef __DEBUG
+		// ToDo: connect output to SystemDS logging facilities
+		std::cout << "launching spoof rowwise kernel " << name << " with " << NT * in_rows << " threads in " << in_rows
+			<< " blocks and " << shared_mem_size << " bytes of shared memory for " << in_cols << " cols processed by "
+			<< NT << " threads per row, adding " << temp_buf_size / 1024 << " KB of temp buffer in global memory."
+			<< std::endl;
+//#endif
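
A note on the launch sites in this file: the chained calls go through Jitify's runtime-compilation API, i.e. look up a kernel in the cached program, instantiate its template parameters via reflection, set the launch geometry, then launch. A minimal standalone sketch of the same pattern, assuming the jitify.hpp header this launcher already builds on (the program source, kernel name, and sizes below are made up for illustration):

	#include <jitify.hpp>

	// First line of a Jitify source string is the program name.
	static const char* demo_program = R"(demo_program
	template<typename T>
	__global__ void demo_kernel(T* data, T value) {
		data[threadIdx.x] = value;
	})";

	void demo_launch(double* d_data) {
		static jitify::JitCache cache;
		jitify::Program program = cache.program(demo_program);
		dim3 grid(1, 1, 1), block(32, 1, 1);
		// instantiate<double> via type reflection, then configure and launch
		CHECK_CUDA(program.kernel("demo_kernel")
			.instantiate(jitify::reflection::type_of(*d_data))
			.configure(grid, block)
			.launch(d_data, 42.0));
	}

The cellwise and rowwise launches above and below follow exactly this instantiate/configure/launch chain, with the generated operator name substituted at runtime.
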
+		CHECK_CUDA(op->program.kernel(name)
+			.instantiate(type_of(value_type), std::max(1u, num_sides), op->num_temp_vectors, tmp_len)
+			.configure(grid, block, shared_mem_size)
+			.launch(d_in, d_sides, d_out, d_scalars, d_temp, grix));
+
+		if(op->num_temp_vectors>0)
+			CHECK_CUDART(cudaFree(d_temp));
+	}
 };

 #endif // SPOOFCUDACONTEXT_H
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp b/src/main/cuda/spoof-launcher/jni_bridge.cpp
index 6645003..86d0a1f 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.cpp
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -19,6 +19,7 @@
 #include "jni_bridge.h"
 #include "SpoofCUDAContext.h"
+#include "Matrix.h"

 // JNI Methods to get/release arrays
 #define GET_ARRAY(env, input) \
@@ -58,52 +59,144 @@ Java_org_apache_sysds_hops_codegen_SpoofCompiler_compile_1cuda_1kernel(
 JNIEXPORT jdouble JNICALL
 Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1d(
-	JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs,
-	jlongArray side_ptrs, jlong out_ptr, jdoubleArray scalars_, jlong m, jlong n, jlong grix) {
-
-	SpoofCUDAContext *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
-	const char *cstr_name = env->GetStringUTFChars(name, NULL);
-
-	double **inputs = reinterpret_cast<double **>(GET_ARRAY(env, in_ptrs));
-	double **sides = reinterpret_cast<double **>(GET_ARRAY(env, side_ptrs));
-	double *scalars = reinterpret_cast<double *>(GET_ARRAY(env, scalars_));
-
-	double result = ctx_->execute_kernel(
-		cstr_name, inputs, env->GetArrayLength(in_ptrs), sides, env->GetArrayLength(side_ptrs),
-		reinterpret_cast<double*>(out_ptr), scalars, env->GetArrayLength(scalars_), m, n, grix);
-
-	RELEASE_ARRAY(env, in_ptrs, inputs);
-	RELEASE_ARRAY(env, side_ptrs, sides);
-	RELEASE_ARRAY(env, scalars_, scalars);
-
-	// FIXME: that release causes an error
-	//std::cout << "releasing " << name_ << std::endl;
-	env->ReleaseStringUTFChars(name, cstr_name);
-	return result;
+	JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs, jint input_offset, jlongArray side_ptrs,
+	jlongArray out_ptrs, jdoubleArray scalars_, jlong grix, jobject inputs_, jobject out_obj) {
+
+	SpoofCUDAContext *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
+	const char *cstr_name = env->GetStringUTFChars(name, NULL);
+
+	size_t* inputs = reinterpret_cast<size_t*>(GET_ARRAY(env, in_ptrs));
+	size_t* sides = reinterpret_cast<size_t*>(GET_ARRAY(env, side_ptrs));
+	size_t *output = reinterpret_cast<size_t*>(GET_ARRAY(env, out_ptrs));
+	double *scalars = reinterpret_cast<double*>(GET_ARRAY(env, scalars_));
+
+	//ToDo: resolve these JNI class/method handles once during initialization
+	jclass CacheableData = env->FindClass("org/apache/sysds/runtime/controlprogram/caching/CacheableData");
+	if(!CacheableData) {
+		std::cerr << " JNIEnv -> FindClass(CacheableData) failed" << std::endl;
+		return -1.0;
+	}
+	jclass ArrayList = env->FindClass("java/util/ArrayList");
+	if(!ArrayList) {
+		std::cerr << " JNIEnv -> FindClass(ArrayList) failed" << std::endl;
+		return -1.0;
+	}
+	jmethodID mat_obj_num_rows = env->GetMethodID(CacheableData, "getNumRows", "()J");
+	if(!mat_obj_num_rows) {
+		std::cerr << " JNIEnv -> GetMethodID() failed" << std::endl;
+		return -1.0;
+	}
+	jmethodID mat_obj_num_cols = env->GetMethodID(CacheableData, "getNumColumns", "()J");
+	if(!mat_obj_num_cols) {
+		std::cerr << " JNIEnv -> GetMethodID() failed" << std::endl;
+		return -1.0;
+	}
+	jmethodID ArrayList_size = env->GetMethodID(ArrayList, "size", "()I");
+	jmethodID ArrayList_get = env->GetMethodID(ArrayList, "get", "(I)Ljava/lang/Object;");
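
On the ToDo above: the FindClass/GetMethodID lookups are repeated on every kernel invocation even though their results are stable for the lifetime of the loaded class. A minimal sketch of resolving them once, assuming standard JNI semantics (the struct and function names are illustrative, not part of this commit):

	// Hypothetical one-time lookup cache for the JNI handles resolved above.
	struct JNIHandles {
		jclass cacheable_data;
		jmethodID num_rows;
		jmethodID num_cols;
	};

	static JNIHandles resolve_once(JNIEnv* env) {
		JNIHandles h{};
		jclass local = env->FindClass(
			"org/apache/sysds/runtime/controlprogram/caching/CacheableData");
		// FindClass returns a local reference; promote it to a global reference
		// so the cached jclass (and its method IDs) stay valid across calls.
		h.cacheable_data = reinterpret_cast<jclass>(env->NewGlobalRef(local));
		h.num_rows = env->GetMethodID(h.cacheable_data, "getNumRows", "()J");
		h.num_cols = env->GetMethodID(h.cacheable_data, "getNumColumns", "()J");
		return h;
	}

The essential detail is NewGlobalRef: without it a cached jclass would dangle once the local reference frame is popped.
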
"(I)Ljava/lang/Object;"); + + std::vector<Matrix<double>> in; + jint num_inputs = env->CallIntMethod(inputs_, ArrayList_size); +#ifdef __DEBUG + std::cout << "num inputs: " << num_inputs << " offsets: " << input_offset << std::endl; +#endif + + for(auto ptr_idx = 0, input_idx = 0; input_idx < input_offset; ptr_idx+=4, input_idx++) { + jobject input_obj = env->CallObjectMethod(inputs_, ArrayList_get, input_idx); + uint32_t m = static_cast<uint32_t>(env->CallIntMethod(input_obj, mat_obj_num_rows)); + uint32_t n = static_cast<uint32_t>(env->CallIntMethod(input_obj, mat_obj_num_cols)); + + in.push_back(Matrix<double>{reinterpret_cast<double*>(inputs[ptr_idx+3]), reinterpret_cast<uint32_t*>(inputs[ptr_idx+1]), + reinterpret_cast<uint32_t*>(inputs[ptr_idx+2]), m, n, + static_cast<uint32_t>(inputs[ptr_idx])}); +#ifdef __DEBUG + std::cout << "input #" << input_idx << " m=" << m << " n=" << n << std::endl; +#endif + } + + std::vector<Matrix<double>> side_inputs; + for(uint32_t ptr_idx = 0, input_idx = input_offset; input_idx < num_inputs; ptr_idx+=4, input_idx++) { + jobject side_input_obj = env->CallObjectMethod(inputs_, ArrayList_get, input_idx); + uint32_t m = static_cast<uint32_t>(env->CallIntMethod(side_input_obj, mat_obj_num_rows)); + uint32_t n = static_cast<uint32_t>(env->CallIntMethod(side_input_obj, mat_obj_num_cols)); + +// std::cout << "sides["<<i << "]=" << sides[i] << std::endl; +// std::cout << "sides["<<i+1 << "]=" << sides[i+1] << std::endl; +// std::cout << "sides["<<i+2 << "]=" << sides[i+2] << std::endl; +// std::cout << "sides["<<i+3 << "]=" << sides[i+3] << std::endl; + + side_inputs.push_back(Matrix<double>{reinterpret_cast<double*>(sides[ptr_idx+3]), reinterpret_cast<uint32_t*>(sides[ptr_idx+1]), + reinterpret_cast<uint32_t*>(sides[ptr_idx+2]), m, n, + static_cast<uint32_t>(sides[ptr_idx])}); + + uint32_t* row_ptr = reinterpret_cast<uint32_t*>(sides[ptr_idx+1]); + uint32_t* col_idxs = reinterpret_cast<uint32_t*>(sides[ptr_idx+2]); + double* data_ptr = reinterpret_cast<double*>(sides[ptr_idx+3]); +#ifdef __DEBUG + if(row_ptr != nullptr) { + for (auto i = 0; i < 2/*m*/; ++i) { + uint32_t alen = row_ptr[i+1] - row_ptr[i]; + std::cout << "row_start=" << row_ptr[i] << " row_end=" << row_ptr[i+1] << " alen[" << i << "]=" << alen << std::endl << " col_idxs:\n"; + for (auto j = 0; j < alen; ++j) { + std::cout << " " << *col_idxs; + } + std::cout << "\ndata:" << std::endl; + for (auto j = 0; j < alen; ++j) { + std::cout << " " << *data_ptr; + } + std::cout << std::endl; + } + } + std::cout << "side input #" << input_idx << " m=" << m << " n=" << n << std::endl; +#endif + } + + std::unique_ptr<Matrix<double>> out; + if(out_obj != nullptr) { +// std::cout << "out not null" << std::endl; + out = std::make_unique<Matrix<double>>(Matrix<double>{reinterpret_cast<double*>(output[3]), + reinterpret_cast<uint32_t*>(output[1]), + reinterpret_cast<uint32_t*>(output[2]), + static_cast<uint32_t>(env->CallIntMethod(out_obj, mat_obj_num_rows)), + static_cast<uint32_t>(env->CallIntMethod(out_obj, mat_obj_num_cols)), + static_cast<uint32_t>(output[0])}); + } + + double result = ctx_->execute_kernel(cstr_name, in, side_inputs, out.get(), scalars, + env->GetArrayLength(scalars_), grix); + + RELEASE_ARRAY(env, in_ptrs, inputs); + RELEASE_ARRAY(env, side_ptrs, sides); + RELEASE_ARRAY(env, out_ptrs, output); + RELEASE_ARRAY(env, scalars_, scalars); + + // FIXME: that release causes an error + //std::cout << "releasing " << name_ << std::endl; + env->ReleaseStringUTFChars(name, cstr_name); + return result; 
} -JNIEXPORT jfloat JNICALL -Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1f( - JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs, - jlongArray side_ptrs, jlong out_ptr, jfloatArray scalars_, jlong m, jlong n, jlong grix) { - - SpoofCUDAContext *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx); - - const char *cstr_name = env->GetStringUTFChars(name, NULL); - - float **inputs = reinterpret_cast<float**>(GET_ARRAY(env, in_ptrs)); - float **sides = reinterpret_cast<float **>(GET_ARRAY(env, side_ptrs)); - float *scalars = reinterpret_cast<float *>(GET_ARRAY(env, scalars_)); - - float result = ctx_->execute_kernel( - cstr_name, inputs, env->GetArrayLength(in_ptrs), sides, env->GetArrayLength(side_ptrs), - reinterpret_cast<float *>(out_ptr), scalars, env->GetArrayLength(scalars_), m, n, grix); - - RELEASE_ARRAY(env, in_ptrs, inputs); - RELEASE_ARRAY(env, side_ptrs, sides); - RELEASE_ARRAY(env, scalars_, scalars); - - // FIXME: that release causes an error - env->ReleaseStringUTFChars(name, cstr_name); - return result; -} +//JNIEXPORT jfloat JNICALL +//Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1f( +// JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs, +// jlongArray side_ptrs, jlong out_ptr, jfloatArray scalars_, jlong m, jlong n, jlong out_len, jlong grix) { +// +// SpoofCUDAContext *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx); +// +// const char *cstr_name = env->GetStringUTFChars(name, NULL); +// +// float **inputs = reinterpret_cast<float**>(GET_ARRAY(env, in_ptrs)); +// float **sides = reinterpret_cast<float **>(GET_ARRAY(env, side_ptrs)); +// float *scalars = reinterpret_cast<float *>(GET_ARRAY(env, scalars_)); +// +// float result = ctx_->execute_kernel( +// cstr_name, inputs, env->GetArrayLength(in_ptrs), sides, env->GetArrayLength(side_ptrs), +// reinterpret_cast<float *>(out_ptr), scalars, env->GetArrayLength(scalars_), m, n, out_len, grix); +// +// RELEASE_ARRAY(env, in_ptrs, inputs); +// RELEASE_ARRAY(env, side_ptrs, sides); +// RELEASE_ARRAY(env, scalars_, scalars); +// +// // FIXME: that release causes an error +// env->ReleaseStringUTFChars(name, cstr_name); +// return result; +//} diff --git a/src/main/cuda/spoof-launcher/jni_bridge.h b/src/main/cuda/spoof-launcher/jni_bridge.h index a06bb1b..788f93e 100644 --- a/src/main/cuda/spoof-launcher/jni_bridge.h +++ b/src/main/cuda/spoof-launcher/jni_bridge.h @@ -63,16 +63,16 @@ Java_org_apache_sysds_hops_codegen_SpoofCompiler_compile_1cuda_1kernel( */ JNIEXPORT jdouble JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1d( - JNIEnv *, jobject, jlong, jstring, jlongArray, jlongArray, jlong, jdoubleArray, jlong, jlong, jlong); + JNIEnv *, jobject, jlong, jstring, jlongArray, jint, jlongArray, jlongArray, jdoubleArray, jlong, jobject, jobject); -/* - * Class: org_apache_sysds_runtime_instructions_gpu_SpoofCUDAInstruction - * Method: execute_f - * Signature: (...)Z - */ -JNIEXPORT jfloat JNICALL -Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1f( - JNIEnv *, jobject, jlong, jstring, jlongArray, jlongArray, jlong, jfloatArray, jlong, jlong, jlong); +///* +// * Class: org_apache_sysds_runtime_instructions_gpu_SpoofCUDAInstruction +// * Method: execute_f +// * Signature: (...)Z +// */ +//JNIEXPORT jfloat JNICALL +//Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1f( +// JNIEnv *, jobject, jlong, jstring, jlongArray, jlongArray, jlong, jfloatArray, jlong, jlong, jlong, jlong); #ifdef __cplusplus } diff --git a/src/main/cuda/spoof/cellwise.cu 
b/src/main/cuda/spoof/cellwise.cu index 2f76802..f26161b 100644 --- a/src/main/cuda/spoof/cellwise.cu +++ b/src/main/cuda/spoof/cellwise.cu @@ -1,5 +1,4 @@ -%TMP% - +/*%TMP%*/SPOOF_OP_NAME /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -28,27 +27,56 @@ #include "reduction.cuh" #include "spoof_utils.cuh" #include "utils.cuh" +#include "Matrix.h" -template<typename T> +template<typename T, int NUM_B> struct SpoofCellwiseOp { - T**b; T* scalars; - int m, n, grix_; + MatrixAccessor<T> A; + MatrixAccessor<T> b[NUM_B]; + MatrixAccessor<T> c; + T* scalars; + T* avals; + uint32_t* aix; + uint32_t alen; + uint32_t& n; + uint32_t _grix; + + SpoofCellwiseOp(Matrix<T>* _A, Matrix<T>* _B, Matrix<T>* _C, T* scalars, uint32_t grix) : + n(_A->cols), scalars(scalars), _grix(grix) { + A.init(_A); + c.init(_C); + alen = A.row_len(grix); + + if(_B) + for(auto i = 0; i < NUM_B; ++i) + b[i].init(&(_B[i])); + } - SpoofCellwiseOp(T** b, T* scalars, int m, int n, int grix) : - b(b), scalars(scalars), m(m), n(n), grix_(grix) {} + __device__ __forceinline__ T operator()(T a, uint32_t idx, uint32_t rix, uint32_t cix) { +//%NEED_RIX% +//%NEED_CIX% +//%NEED_GRIX% - __device__ __forceinline__ T operator()(T a, int idx) const { - int rix = idx / n; - int cix = idx % n; - int grix = grix_ + rix; %BODY_dense% - return %OUT%; - } + return %OUT%; + } }; -template<typename T> -__global__ void %TMP% (T *a, T** b, T* c, T* scalars, int m, int n, int grix) { - %AGG_OP%<T> agg_op; - SpoofCellwiseOp<T> spoof_op(b, scalars, m, n, grix); - %TYPE%<T, %AGG_OP%<T>, SpoofCellwiseOp<T>>(a, c, m, n, %INITIAL_VALUE%, agg_op, spoof_op); +template<typename T, int NUM_B> +__global__ void /*%TMP%*/SPOOF_OP_NAME_DENSE (Matrix<T>* a, Matrix<T>* b, Matrix<T>* c, T* scalars, uint32_t n, uint32_t grix) { + %AGG_OP%<T> agg_op; + SpoofCellwiseOp<T, NUM_B> spoof_op(a, b, c, scalars, grix); + %TYPE%<T, %AGG_OP%<T>, SpoofCellwiseOp<T, NUM_B>>(&(spoof_op.A), &(spoof_op.c), n, %INITIAL_VALUE%, agg_op, spoof_op); }; + +template<typename T, int NUM_B> +__global__ void /*%TMP%*/SPOOF_OP_NAME_SPARSE (Matrix<T>* a, Matrix<T>* b, Matrix<T>* c, T* scalars, uint32_t n, uint32_t grix) { + %AGG_OP%<T> agg_op; + SpoofCellwiseOp<T, NUM_B> spoof_op(a, b, c, scalars, grix); + %TYPE%_SPARSE<T, %AGG_OP%<T>, SpoofCellwiseOp<T, NUM_B>>(&(spoof_op.A), &(spoof_op.c), n, %INITIAL_VALUE%, agg_op, spoof_op); + +// if(blockIdx.x == 0 && threadIdx.x == 0) { +// for(auto i = 0; i < 30; ++i) +// printf("%4.3f ", spoof_op.c.val(i)); +// } +}; \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java index bbe5454..80192b5 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNode.java @@ -21,6 +21,7 @@ package org.apache.sysds.hops.codegen.cplan; import java.util.ArrayList; +import org.apache.sysds.hops.codegen.SpoofCompiler; import org.apache.sysds.hops.codegen.template.TemplateUtils; import org.apache.sysds.common.Types.DataType; import org.apache.sysds.runtime.controlprogram.parfor.util.IDSequence; @@ -84,13 +85,23 @@ public abstract class CNode public String getVarname(GeneratorAPI api) { return getVarname(); } - public String getVectorLength() { - if( getVarname().startsWith("a") ) - return "len"; - else if( getVarname().startsWith("b") ) - return getVarname()+".clen"; - else if( _dataType==DataType.MATRIX ) - return 
getVarname()+".length"; + public String getVectorLength(GeneratorAPI api) { + if(api == GeneratorAPI.CUDA) { + if( getVarname().startsWith("a") ) + return "a.cols()"; + if(getVarname().startsWith("b")) + return getVarname()+".cols()"; + else + return getVarname()+".length"; + } + else { + if( getVarname().startsWith("a") ) + return "len"; + if(getVarname().startsWith("b")) + return getVarname() + ".clen"; + else if(_dataType == DataType.MATRIX) + return getVarname() + ".length"; + } return ""; } @@ -210,13 +221,17 @@ public abstract class CNode && _literal == cthat._literal; } - protected String replaceUnaryPlaceholders(String tmp, String varj, boolean vectIn) { + protected String replaceUnaryPlaceholders(String tmp, String varj, boolean vectIn, GeneratorAPI api) { //replace sparse and dense inputs tmp = tmp.replace("%IN1v%", varj+"vals"); tmp = tmp.replace("%IN1i%", varj+"ix"); tmp = tmp.replace("%IN1%", - (vectIn && TemplateUtils.isMatrix(_inputs.get(0))) ? varj + ".values(rix)" : - (vectIn && TemplateUtils.isRowVector(_inputs.get(0)) ? varj + ".values(0)" : varj)); + (vectIn && TemplateUtils.isMatrix(_inputs.get(0))) ? + ((api == GeneratorAPI.JAVA) ? varj + ".values(rix)" : varj + ".vals(0)" ) : + (vectIn && TemplateUtils.isRowVector(_inputs.get(0)) ? + ((api == GeneratorAPI.JAVA) ? varj + ".values(0)" : varj + ".val(0)") : + (varj.startsWith("a") || TemplateUtils.isMatrix(_inputs.get(0))) ? + (api == GeneratorAPI.JAVA ? varj : varj + ".vals(0)") : varj)); //replace start position of main input String spos = (_inputs.get(0) instanceof CNodeData @@ -228,7 +243,7 @@ public abstract class CNode //replace length if( _inputs.get(0).getDataType().isMatrix() ) - tmp = tmp.replace("%LEN%", _inputs.get(0).getVectorLength()); + tmp = tmp.replace("%LEN%", _inputs.get(0).getVectorLength(api)); return tmp; } @@ -236,33 +251,45 @@ public abstract class CNode protected CodeTemplate getLanguageTemplateClass(CNode caller, GeneratorAPI api) { switch (api) { case CUDA: - if(caller instanceof CNodeCell) - return new org.apache.sysds.hops.codegen.cplan.cuda.CellWise(); - else if (caller instanceof CNodeUnary) - return new org.apache.sysds.hops.codegen.cplan.cuda.Unary(); - else if (caller instanceof CNodeBinary) + if(caller instanceof CNodeBinary) return new org.apache.sysds.hops.codegen.cplan.cuda.Binary(); - else if (caller instanceof CNodeTernary) + else if(caller instanceof CNodeTernary) return new org.apache.sysds.hops.codegen.cplan.cuda.Ternary(); - else - return null; - case JAVA: - if(caller instanceof CNodeCell) - return new org.apache.sysds.hops.codegen.cplan.java.CellWise(); - else if (caller instanceof CNodeUnary) - return new org.apache.sysds.hops.codegen.cplan.java.Unary(); - else if (caller instanceof CNodeBinary) + else if(caller instanceof CNodeUnary) + return new org.apache.sysds.hops.codegen.cplan.cuda.Unary(); + else return null; + case JAVA: + if(caller instanceof CNodeBinary) return new org.apache.sysds.hops.codegen.cplan.java.Binary(); - else if (caller instanceof CNodeTernary) + else if(caller instanceof CNodeTernary) return new org.apache.sysds.hops.codegen.cplan.java.Ternary(); - - else - return null; + else if(caller instanceof CNodeUnary) + return new org.apache.sysds.hops.codegen.cplan.java.Unary(); + else return null; default: throw new RuntimeException("API not supported by code generator: " + api.toString()); } } - + + protected String getLanguageTemplate(CNode caller, GeneratorAPI api) { + switch (api) { + case CUDA: + if(caller instanceof CNodeCell) + return 
CodeTemplate.getTemplate("/cuda/spoof/cellwise.cu"); + else if(caller instanceof CNodeRow) + return CodeTemplate.getTemplate("/cuda/spoof/rowwise.cu"); + else return null; + case JAVA: + if(caller instanceof CNodeCell) + return CodeTemplate.getTemplate("/java/org/apache/sysds/hops/codegen/cplan/java/Cellwise.java.template"); + else if(caller instanceof CNodeRow) + return CodeTemplate.getTemplate("/java/org/apache/sysds/hops/codegen/cplan/java/Rowwise.java.template"); + else return null; + default: + throw new RuntimeException("API not supported by code generator: " + api.toString()); + } + } + public abstract boolean isSupported(GeneratorAPI api); public void setVarName(String name) { _genVar = name; } diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java index 15a26bc..925e055 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeBinary.java @@ -113,6 +113,9 @@ public class CNodeBinary extends CNode { String [] tmp = this.name().split("_"); return StringUtils.capitalize(tmp[1].toLowerCase()); } + + public boolean isNotSupportedBySpoofCUDA() { + return this == VECT_BIASADD || this == VECT_BIASMULT;} } private final BinType _type; @@ -157,7 +160,6 @@ public class CNodeBinary extends CNode { boolean scalarVector = (_inputs.get(0).getDataType().isScalar() && _inputs.get(1).getDataType().isMatrix()); String var = createVarname(); -// String tmp = _type.getTemplate(api, lang, lsparseLhs, lsparseRhs, scalarVector, scalarInput); String tmp = getLanguageTemplateClass(this, api).getTemplate(_type, lsparseLhs, lsparseRhs, scalarVector, scalarInput); tmp = tmp.replace("%TMP%", var); @@ -169,24 +171,29 @@ public class CNodeBinary extends CNode { //replace sparse and dense inputs tmp = tmp.replace("%IN"+(j+1)+"v%", varj+"vals"); tmp = tmp.replace("%IN"+(j+1)+"i%", varj+"ix"); - tmp = tmp.replace("%IN"+(j+1)+"%", - varj.startsWith("b") ? varj + ".values(rix)" : varj ); + tmp = tmp.replace("%IN"+(j+1)+"%", + varj.startsWith("a") ? (api == GeneratorAPI.JAVA ? varj : + (_inputs.get(j).getDataType() == DataType.MATRIX ? varj + ".vals(0)" : varj)) : +// varj.startsWith("b") ? (api == GeneratorAPI.JAVA ? varj + ".values(rix)" : varj + ".vals(0)") : varj); + varj.startsWith("b") ? (api == GeneratorAPI.JAVA ? varj + ".values(rix)" : + (_type == BinType.VECT_MATRIXMULT ? varj : varj + ".vals(0)")) : + _inputs.get(j).getDataType() == DataType.MATRIX ? (api == GeneratorAPI.JAVA ? varj : varj + ".vals(0)") : varj); //replace start position of main input tmp = tmp.replace("%POS"+(j+1)+"%", (_inputs.get(j) instanceof CNodeData - && _inputs.get(j).getDataType().isMatrix()) ? (!varj.startsWith("b")) ? varj+"i" : - (TemplateUtils.isMatrix(_inputs.get(j)) && _type!=BinType.VECT_MATRIXMULT) ? - varj + ".pos(rix)" : "0" : "0"); + && _inputs.get(j).getDataType().isMatrix()) ? (!varj.startsWith("b")) ? varj+"i" : + (TemplateUtils.isMatrix(_inputs.get(j)) && _type!=BinType.VECT_MATRIXMULT) ? 
+ varj + ".pos(rix)" : "0" : "0"); } //replace length information (e.g., after matrix mult) if( _type == BinType.VECT_OUTERMULT_ADD ) { for( int j=0; j<2; j++ ) - tmp = tmp.replace("%LEN"+(j+1)+"%", _inputs.get(j).getVectorLength()); + tmp = tmp.replace("%LEN"+(j+1)+"%", _inputs.get(j).getVectorLength(api)); } else { //general case CNode mInput = getIntermediateInputVector(); if( mInput != null ) - tmp = tmp.replace("%LEN%", mInput.getVectorLength()); + tmp = tmp.replace("%LEN%", mInput.getVectorLength(api)); } sb.append(tmp); @@ -418,6 +425,11 @@ public class CNodeBinary extends CNode { @Override public boolean isSupported(GeneratorAPI api) { boolean is_supported = (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA); + + // ToDo: support these + if(api == GeneratorAPI.CUDA) + is_supported = !_type.isNotSupportedBySpoofCUDA(); + int i = 0; while(is_supported && i < _inputs.size()) { CNode in = _inputs.get(i++); diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java index 74292b6..f2f0179 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java @@ -119,16 +119,42 @@ public class CNodeCell extends CNodeTpl public String codegen(boolean sparse, GeneratorAPI _api) { api = _api; - String tmp = getLanguageTemplateClass(this, api).getTemplate(_type); + String tmp = getLanguageTemplate(this, api); //generate dense/sparse bodies String tmpDense = _output.codegen(false, api); + // ToDo: workaround to fix name clash of cell and row template + if(api == GeneratorAPI.CUDA) + tmpDense = tmpDense.replace("a.vals(0)", "a"); _output.resetGenerated(); + String varName; if(getVarname() == null) - tmp = tmp.replace("%TMP%", createVarname()); +// tmp = tmp.replace("%TMP%", createVarname()); + varName = createVarname(); else - tmp = tmp.replace("%TMP%", getVarname()); +// tmp = tmp.replace("%TMP%", getVarname()); + varName = getVarname(); + + if(api == GeneratorAPI.JAVA) + tmp = tmp.replace("%TMP%", varName); + else + tmp = tmp.replace("/*%TMP%*/SPOOF_OP_NAME", varName); + + if(tmpDense.contains("grix")) + tmp = tmp.replace("//%NEED_GRIX%", "\t\tuint32_t grix=_grix + rix;"); + else + tmp = tmp.replace("//%NEED_GRIX%", ""); + +// if(tmpDense.contains("rix")) +// tmp = tmp.replace("//%NEED_RIX%", "\t\tuint32_t rix = idx / A.cols();\n"); +// else + tmp = tmp.replace("//%NEED_RIX%", ""); + +// if(tmpDense.contains("cix")) +// tmp = tmp.replace("//%NEED_CIX%", "\t\tuint32_t cix = idx % A.cols();"); +// else + tmp = tmp.replace("//%NEED_CIX%", ""); tmp = tmp.replace("%BODY_dense%", tmpDense); @@ -140,7 +166,10 @@ public class CNodeCell extends CNodeTpl tmp = tmp.replace("%AGG_OP_NAME%", (_aggOp != null) ? "AggOp." 
+ _aggOp.name() : "null"); tmp = tmp.replace("%SPARSE_SAFE%", String.valueOf(isSparseSafe())); tmp = tmp.replace("%SEQ%", String.valueOf(containsSeq())); - + + // maybe empty lines + //tmp = tmp.replaceAll("(?m)^[ \t]*\r?\n", ""); + if(api == GeneratorAPI.CUDA) { // ToDo: initial_value is misused to pass VT (values per thread) to no_agg operator String agg_op = "IdentityOp"; diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java index b91c66f..940f3e2 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeData.java @@ -23,6 +23,7 @@ import org.apache.commons.lang.StringUtils; import org.apache.sysds.hops.Hop; import org.apache.sysds.common.Types.DataType; import org.apache.sysds.hops.codegen.SpoofCompiler; +import org.apache.sysds.runtime.codegen.CodegenUtils; import org.apache.sysds.runtime.util.UtilFunctions; import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI; import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision; @@ -92,7 +93,7 @@ public class CNodeData extends CNode return isSinglePrecision() ? "-CUDART_INF_F" : "-CUDART_INF"; else if ("true".equals(_name) || "false".equals(_name)) return "true".equals(_name) ? "1" : "0"; - else if (StringUtils.isNumeric(_name)) + else if (CodegenUtils.isNumeric(_name)) return isSinglePrecision() ? _name + ".0f" : _name + ".0"; else return _name; diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java index 5500ddb..dcf18ec 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeNary.java @@ -115,7 +115,7 @@ public class CNodeNary extends CNode public String codegen(boolean sparse, GeneratorAPI api) { if( isGenerated() ) return ""; - + StringBuilder sb = new StringBuilder(); //generate children @@ -134,8 +134,8 @@ public class CNodeNary extends CNode String varj1 = _inputs.get(0).getVarname(); String varj2 = _inputs.get(1).getVarname(); tmp = (_type == NaryType.VECT_CONV2DMM) ? - replaceBinaryPlaceholders(tmp, new String[]{varj1,varj2}, false) : - replaceUnaryPlaceholders(tmp, varj1, false); + replaceBinaryPlaceholders(tmp, new String[]{varj1,varj2}, false, api) : + replaceUnaryPlaceholders(tmp, varj1, false, api); sb.append(tmp); @@ -251,7 +251,7 @@ public class CNodeNary extends CNode } - private String replaceBinaryPlaceholders(String tmp, String[] vars, boolean vectIn) { + private String replaceBinaryPlaceholders(String tmp, String[] vars, boolean vectIn, GeneratorAPI api) { //replace sparse and dense inputs for( int j=0; j<2; j++ ) { String varj = vars[j]; @@ -259,8 +259,11 @@ public class CNodeNary extends CNode //replace sparse and dense inputs tmp = tmp.replace("%IN"+(j+1)+"v%", varj+"vals"); tmp = tmp.replace("%IN"+(j+1)+"i%", varj+"ix"); - tmp = tmp.replace("%IN"+(j+1)+"%", - varj.startsWith("b") ? varj + ".values(rix)" : varj ); +// tmp = tmp.replace("%IN"+(j+1)+"%", +// varj.startsWith("b") ? varj + ".values(rix)" : varj ); + tmp = tmp.replace("%IN"+(j+1)+"%", + varj.startsWith("b") ? ((api == GeneratorAPI.JAVA) ? 
varj + ".values(rix)" : + varj + ".vals(0)") : varj); //replace start position of main input tmp = tmp.replace("%POS"+(j+1)+"%", (_inputs.get(j) instanceof CNodeData @@ -270,7 +273,7 @@ public class CNodeNary extends CNode //replace length if( _inputs.get(0).getDataType().isMatrix() ) - tmp = tmp.replace("%LEN%", _inputs.get(0).getVectorLength()); + tmp = tmp.replace("%LEN%", _inputs.get(0).getVectorLength(api)); return tmp; } diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java index 2ff9054..6ec252a 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeUnary.java @@ -76,6 +76,11 @@ public class CNodeUnary extends CNode POW2, MULT2, ABS, ROUND, CEIL, FLOOR, SIGN, SIN, TAN, SPROP}, this); } + + public boolean isNotSupportedBySpoofCUDA() { + return this == VECT_CUMSUM || this == VECT_CUMMIN || this == VECT_CUMMAX|| + this == VECT_SPROP || this == VECT_SIGMOID; + } } private UnaryType _type; @@ -110,12 +115,12 @@ public class CNodeUnary extends CNode && !_inputs.get(0).isLiteral()); String var = createVarname(); String tmp = getLanguageTemplateClass(this, api).getTemplate(_type, lsparse); - tmp = tmp.replace("%TMP%", var); + tmp = tmp.replaceAll("%TMP%", var); //replace sparse and dense inputs String varj = _inputs.get(0).getVarname(); boolean vectIn = varj.startsWith("b") && !_type.isScalarLookup(); - tmp = replaceUnaryPlaceholders(tmp, varj, vectIn); + tmp = replaceUnaryPlaceholders(tmp, varj, vectIn, api); sb.append(tmp); @@ -260,9 +265,15 @@ public class CNodeUnary extends CNode return super.equals(that) && _type == that._type; } + @Override public boolean isSupported(GeneratorAPI api) { boolean is_supported = (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA); + + // ToDo: support these + if(api == GeneratorAPI.CUDA) + is_supported = !_type.isNotSupportedBySpoofCUDA(); + int i = 0; while(is_supported && i < _inputs.size()) { CNode in = _inputs.get(i++); diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java index 8a8a3be..34f0b66 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java @@ -19,18 +19,52 @@ package org.apache.sysds.hops.codegen.cplan; -import org.apache.sysds.runtime.codegen.SpoofCellwise; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.conf.DMLConfig; +import org.apache.sysds.runtime.io.IOUtilFunctions; -public interface CodeTemplate { +import java.io.FileInputStream; +import java.io.IOException; - String getTemplate(); - - String getTemplate(CNodeUnary.UnaryType type, boolean sparse); - - String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, - boolean scalarInput); - - String getTemplate(CNodeTernary.TernaryType type, boolean sparse); - - String getTemplate(SpoofCellwise.CellType ct); +public abstract class CodeTemplate { + + public String getTemplate() { + throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); + } + + public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, + boolean scalarInput) { + throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); + } + + public String 
getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
+		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+	}
+
+	public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
+		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
+	}
+
+	public static String getTemplate(String templateFileName) {
+		try {
+			// Change prefix to the code template file if running from jar. Files were extracted to a temporary
+			// directory in that case. By default we load the template from the source tree.
+			if(CodeTemplate.class.getProtectionDomain().getCodeSource().getLocation().getPath().contains(".jar")) {
+				if(templateFileName.contains(".java")) {
+					templateFileName = templateFileName
+						.replace("/java/org/apache/sysds/hops/codegen/cplan/java/", "/java/spoof/");
+				}
+				return (IOUtilFunctions.toString(new FileInputStream(ConfigurationManager.getDMLConfig()
+					.getTextValue(DMLConfig.LOCAL_TMP_DIR) + templateFileName)));
+			}
+			else
+				return IOUtilFunctions.toString(new FileInputStream(System.getProperty("user.dir") +
+					"/src/main" + templateFileName));
+		}
+		catch(IOException e) {
+			System.out.println(e.getMessage());
+			return null;
+		}
+	}
+}
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
index 0365afd..30b48da 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Binary.java
@@ -20,29 +20,13 @@
 package org.apache.sysds.hops.codegen.cplan.cuda;

 import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
-import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
-import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
 import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
-import org.apache.sysds.runtime.codegen.SpoofCellwise;

 import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;

-public class Binary implements CodeTemplate {
+public class Binary extends CodeTemplate {
+
 	@Override
-	public String getTemplate() {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
-	@Override
-	public String getTemplate(SpoofCellwise.CellType ct) {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
-	@Override
-	public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
 	public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
 		boolean scalarInput) {

@@ -263,11 +247,11 @@
 				//scalar-scalar operations
 				case MULT:
-					return "	T %TMP% = %IN1% * %IN2%;\n";
+					return "	T %TMP% = %IN1% * %IN2%;\n";
 				case DIV:
 					return "	T %TMP% = %IN1% / %IN2%;\n";
 				case PLUS:
-					return "	T %TMP% = %IN1% + %IN2%;\n";
+					return "	T %TMP% = %IN1% + %IN2%;\n";
 				case MINUS:
 					return "	T %TMP% = %IN1% - %IN2%;\n";
 				case MODULUS:
@@ -307,16 +291,11 @@ public class Binary implements CodeTemplate {
 				case BITWAND:
 					return "	T %TMP% = bwAnd(%IN1%, %IN2%);\n";
 				case SEQ_RIX:
-					return "	T %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
+					return "	T %TMP% = %IN1% + grix * %IN2%;\n"; //0-based global rix
 				default:
 					throw new RuntimeException("Invalid binary type: " + this.toString());
 			}
 		}
 	}
-
-	@Override
-	public String
getTemplate(CNodeTernary.TernaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } } diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/CellWise.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/CellWise.java deleted file mode 100644 index beb2398..0000000 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/CellWise.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.hops.codegen.cplan.cuda; - -import java.io.FileInputStream; -import java.io.IOException; - -import org.apache.sysds.conf.ConfigurationManager; -import org.apache.sysds.conf.DMLConfig; -import org.apache.sysds.hops.codegen.cplan.CNodeBinary; -import org.apache.sysds.hops.codegen.cplan.CNodeTernary; -import org.apache.sysds.hops.codegen.cplan.CNodeUnary; -import org.apache.sysds.hops.codegen.cplan.CodeTemplate; -import org.apache.sysds.runtime.codegen.SpoofCellwise; -import org.apache.sysds.runtime.io.IOUtilFunctions; - - -// ToDo: clean code template and load from file -public class CellWise implements CodeTemplate { - - private static final String TEMPLATE_PATH = "/cuda/spoof/cellwise.cu"; - - @Override - public String getTemplate() { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(SpoofCellwise.CellType ct) { - try { - // Change prefix to the code template file if running from jar. File were extracted to a temporary - // directory in that case. By default we load the template from the source tree. 
- if(CellWise.class.getProtectionDomain().getCodeSource().getLocation().getPath().contains(".jar")) - return(IOUtilFunctions.toString(new FileInputStream(ConfigurationManager.getDMLConfig() - .getTextValue(DMLConfig.LOCAL_TMP_DIR) + TEMPLATE_PATH))); - else - return IOUtilFunctions.toString(new FileInputStream(System.getProperty("user.dir") + - "/src/main" + TEMPLATE_PATH)); - } - catch(IOException e) { - System.out.println(e.getMessage()); - return null; - } - } - - @Override - public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } -} diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Ternary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Ternary.java index 355e579..dd06d6c 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Ternary.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Ternary.java @@ -19,15 +19,12 @@ package org.apache.sysds.hops.codegen.cplan.cuda; -import org.apache.sysds.hops.codegen.cplan.CNodeBinary; import org.apache.sysds.hops.codegen.cplan.CNodeTernary; -import org.apache.sysds.hops.codegen.cplan.CNodeUnary; import org.apache.sysds.hops.codegen.cplan.CodeTemplate; -import org.apache.sysds.runtime.codegen.SpoofCellwise; import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision; -public class Ternary implements CodeTemplate { +public class Ternary extends CodeTemplate { @Override public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) { @@ -58,10 +55,11 @@ public class Ternary implements CodeTemplate { case LOOKUP_RC1: return sparse ? " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" : - " T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n"; +// " T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n"; + " T %TMP% = %IN1%.val(rix, %IN3%-1);\n"; case LOOKUP_RVECT1: - return " T[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n"; + return "\t\tVector<T>& %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n"; default: throw new RuntimeException("Invalid ternary type: " + this.toString()); @@ -94,10 +92,12 @@ public class Ternary implements CodeTemplate { case LOOKUP_RC1: return sparse ? 
" T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, %IN3%-1);\n" : - " T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n"; - +// " T %TMP% = getValue(%IN1%, %IN2%, rix, %IN3%-1);\n"; + " T %TMP% = %IN1%.val(rix, %IN3%-1);\n"; + + case LOOKUP_RVECT1: - return " T[] %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1);\n"; + return "\t\tVector<T>& %TMP% = getVector(%IN1%, %IN2%, rix, %IN3%-1, this);\n"; default: throw new RuntimeException("Invalid ternary type: "+this.toString()); @@ -105,26 +105,4 @@ public class Ternary implements CodeTemplate { } } - - @Override - public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, - boolean scalarInput) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate() { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(SpoofCellwise.CellType ct) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - } diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java index 0b5852d..459d1c8 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java @@ -20,15 +20,13 @@ package org.apache.sysds.hops.codegen.cplan.cuda; import org.apache.commons.lang.StringUtils; -import org.apache.sysds.hops.codegen.cplan.CNodeBinary; -import org.apache.sysds.hops.codegen.cplan.CNodeTernary; import org.apache.sysds.hops.codegen.cplan.CNodeUnary; import org.apache.sysds.hops.codegen.cplan.CodeTemplate; -import org.apache.sysds.runtime.codegen.SpoofCellwise; import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision; -public class Unary implements CodeTemplate { +public class Unary extends CodeTemplate { + @Override public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) { if(isSinglePrecision()) { @@ -179,7 +177,8 @@ public class Unary implements CodeTemplate { case LOOKUP_R: return sparse ? " T %TMP% = getValue(%IN1v%, %IN1i%, ai, alen, 0);\n" : - " T %TMP% = getValue(%IN1%, rix);\n"; + " T %TMP% = %IN1%.val(rix);\n"; +// " T %TMP% = getValue(%IN1%, rix);\n"; case LOOKUP_C: return " T %TMP% = getValue(%IN1%, n, 0, cix);\n"; case LOOKUP_RC: @@ -211,11 +210,11 @@ public class Unary implements CodeTemplate { case TANH: return " T %TMP% = tanh(%IN1%);\n"; case SIGN: - return " T %TMP% = signbit(%IN1%) == 0 ? 1.0f : -1.0f;\n"; + return " T %TMP% = signbit(%IN1%) == 0 ? 
1.0 : -1.0;\n"; case SQRT: return " T %TMP% = sqrt(%IN1%);\n"; case LOG: - return " T %TMP% = log(%IN1%);\n"; + return " T %TMP% = log(%IN1%);\n"; case ROUND: return " T %TMP% = round(%IN1%);\n"; case CEIL: @@ -235,24 +234,4 @@ public class Unary implements CodeTemplate { } } - - @Override - public String getTemplate() { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(SpoofCellwise.CellType ct) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } } diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java index 28b970a..1453b44 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Binary.java @@ -20,13 +20,10 @@ package org.apache.sysds.hops.codegen.cplan.java; import org.apache.sysds.hops.codegen.cplan.CNodeBinary.BinType; -import org.apache.sysds.hops.codegen.cplan.CNodeTernary; -import org.apache.sysds.hops.codegen.cplan.CNodeUnary; import org.apache.sysds.hops.codegen.cplan.CodeTemplate; -import org.apache.sysds.runtime.codegen.SpoofCellwise; -public class Binary implements CodeTemplate { - @Override +public class Binary extends CodeTemplate { + public String getTemplate(BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) { @@ -178,23 +175,4 @@ public class Binary implements CodeTemplate { } } - @Override - public String getTemplate() { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(SpoofCellwise.CellType ct) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } } diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java deleted file mode 100644 index 85476a7..0000000 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/CellWise.java +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.hops.codegen.cplan.java; - -import org.apache.sysds.hops.codegen.cplan.CNodeBinary; -import org.apache.sysds.hops.codegen.cplan.CNodeTernary; -import org.apache.sysds.hops.codegen.cplan.CNodeUnary; -import org.apache.sysds.hops.codegen.cplan.CodeTemplate; -import org.apache.sysds.runtime.codegen.SpoofCellwise; - -public class CellWise implements CodeTemplate { - public static final String TEMPLATE = - "package codegen;\n" - + "import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;\n" - + "import org.apache.sysds.runtime.codegen.SpoofCellwise;\n" - + "import org.apache.sysds.runtime.codegen.SpoofCellwise.AggOp;\n" - + "import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;\n" - + "import org.apache.sysds.runtime.codegen.SpoofOperator.SideInput;\n" - + "import org.apache.commons.math3.util.FastMath;\n" - + "\n" - + "public final class %TMP% extends SpoofCellwise {\n" - + " public %TMP%() {\n" - + " super(CellType.%TYPE%, %SPARSE_SAFE%, %SEQ%, %AGG_OP_NAME%);\n" - + " }\n" - + " protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long grix, int rix, int cix) { \n" - + "%BODY_dense%" - + " return %OUT%;\n" - + " }\n" - + "}\n"; - - @Override - public String getTemplate() { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(SpoofCellwise.CellType ct) { - switch(ct) { - case NO_AGG: - case FULL_AGG: - case ROW_AGG: - case COL_AGG: - default: - return TEMPLATE; - } - } - - @Override - public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } - - @Override - public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) { - throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName()); - } -} diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Cellwise.java.template similarity index 57% copy from src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java copy to src/main/java/org/apache/sysds/hops/codegen/cplan/java/Cellwise.java.template index 8a8a3be..84183ae 100644 --- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CodeTemplate.java +++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Cellwise.java.template @@ -17,20 +17,19 @@ * under the License. 
 */
-package org.apache.sysds.hops.codegen.cplan;
-
+package codegen;
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
 import org.apache.sysds.runtime.codegen.SpoofCellwise;
-
-public interface CodeTemplate {
-
-	String getTemplate();
-
-	String getTemplate(CNodeUnary.UnaryType type, boolean sparse);
-
-	String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
-		boolean scalarInput);
-
-	String getTemplate(CNodeTernary.TernaryType type, boolean sparse);
-
-	String getTemplate(SpoofCellwise.CellType ct);
-}
+import org.apache.sysds.runtime.codegen.SpoofCellwise.AggOp;
+import org.apache.sysds.runtime.codegen.SpoofCellwise.CellType;
+import org.apache.sysds.runtime.codegen.SpoofOperator.SideInput;
+import org.apache.commons.math3.util.FastMath;
+
+/* This is a SPOOF code generation template */
+public final class %TMP% extends SpoofCellwise {
+	public %TMP%() { super(CellType.%TYPE%, %SPARSE_SAFE%, %SEQ%, %AGG_OP_NAME%); }
+
+	protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long grix, int rix, int cix) {
+%BODY_dense%		return %OUT%;
+	}
+};
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java
index a499f49..a86d51c 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Ternary.java
@@ -19,13 +19,10 @@
 
 package org.apache.sysds.hops.codegen.cplan.java;
 
-import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
 import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
-import org.apache.sysds.hops.codegen.cplan.CNodeUnary;
 import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
-import org.apache.sysds.runtime.codegen.SpoofCellwise;
 
-public class Ternary implements CodeTemplate {
+public class Ternary extends CodeTemplate {
 
 	@Override
 	public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
@@ -64,26 +61,4 @@ public class Ternary implements CodeTemplate {
 			throw new RuntimeException("Invalid ternary type: "+this.toString());
 		}
 	}
-
-	@Override
-	public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector,
-		boolean scalarInput) {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
-	@Override
-	public String getTemplate() {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
-	@Override
-	public String getTemplate(SpoofCellwise.CellType ct) {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
-	@Override
-	public String getTemplate(CNodeUnary.UnaryType type, boolean sparse) {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java
index 5f6a392..43f1b5d 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/java/Unary.java
@@ -20,13 +20,10 @@
 package org.apache.sysds.hops.codegen.cplan.java;
 
 import org.apache.commons.lang.StringUtils;
-import org.apache.sysds.hops.codegen.cplan.CNodeBinary;
-import org.apache.sysds.hops.codegen.cplan.CNodeTernary;
 import org.apache.sysds.hops.codegen.cplan.CNodeUnary.UnaryType;
 import org.apache.sysds.hops.codegen.cplan.CodeTemplate;
-import org.apache.sysds.runtime.codegen.SpoofCellwise;
 
-public class Unary implements CodeTemplate {
+public class Unary extends CodeTemplate {
 
 	@Override
 	public String getTemplate(UnaryType type, boolean sparse) {
 		switch( type ) {
@@ -129,24 +126,4 @@ public class Unary implements CodeTemplate {
 			throw new RuntimeException("Invalid unary type: "+this.toString());
 		}
 	}
-
-	@Override
-	public String getTemplate(CNodeBinary.BinType type, boolean sparseLhs, boolean sparseRhs, boolean scalarVector, boolean scalarInput) {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
-	@Override
-	public String getTemplate() {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
-	@Override
-	public String getTemplate(SpoofCellwise.CellType ct) {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
-
-	@Override
-	public String getTemplate(CNodeTernary.TernaryType type, boolean sparse) {
-		throw new RuntimeException("Calling wrong getTemplate method on " + getClass().getCanonicalName());
-	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
index e0fc9c4..f3ccf12 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
@@ -30,19 +30,22 @@ import org.apache.sysds.hops.codegen.cplan.CNodeTpl;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
-import static org.apache.sysds.runtime.matrix.data.LibMatrixNative.isSinglePrecision;
-
 public class SpoofCUDA extends SpoofOperator {
 	private static final long serialVersionUID = -2161276866245388359L;
 
 	private final CNodeTpl cnt;
+	private final Class<?> java_op;
 	public final String name;
+	public final String src;
 
-	public SpoofCUDA(CNodeTpl cnode) {
+	public SpoofCUDA(String source, CNodeTpl cnode, Class<?> java_op) {
 		name = "codegen." + cnode.getVarname();
 		cnt = cnode;
+		src = source;
+		this.java_op = java_op;
 	}
 
 	public String getName() {
@@ -71,40 +74,65 @@ public class SpoofCUDA extends SpoofOperator {
 	}
 
 	public double execute(ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects, MatrixObject out_obj,
-		ExecutionContext ec) {
+		ExecutionContext ec, boolean sparseOut) {
 		double ret;
-		long out_ptr = 0;
-
-		if(out_obj != null)
-			out_ptr = ec.getGPUPointerAddress(out_obj);
-
+		long[] out_ptr = {0, 0, 0, 0};
+
+		if(out_obj != null) {
+			if(sparseOut) {
+				out_ptr[0] = ec.getGPUSparsePointerAddress(out_obj).nnz;
+				out_ptr[1] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(out_obj).rowPtr);
+				out_ptr[2] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(out_obj).colInd);
+				out_ptr[3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(out_obj).val);
+			}
+			else
+				out_ptr[3] = ec.getGPUDensePointerAddress(out_obj);
+		}
+
 		int offset = 1;
 		if(cnt instanceof CNodeOuterProduct)
 			offset = 2;
-
+
 		// only dense input preparation for now
-		long[] in_ptrs = new long[offset];
-		for(int i = 0; i < offset; ++i)
-			in_ptrs[i] = ec.getGPUPointerAddress(inputs.get(i));
-
-		long[] side_ptrs = new long[inputs.size() - offset];
-		for(int i = offset; i < inputs.size(); ++i)
-			side_ptrs[i - offset] = ec.getGPUPointerAddress(inputs.get(i));
-
-		if(isSinglePrecision()) {
-			float[] scalars = prepInputScalarsFloat(scalarObjects);
-
-			// ToDo: handle float
-			ret = execute_f(SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA), name.split("\\.")[1],
-				in_ptrs, side_ptrs, out_ptr, scalars, inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), 0);
-
+		long[] in_ptrs = new long[offset * 4];
+		for(int i = 0; i < offset; i++) {
+			if(inputs.get(i).getGPUObject(ec.getGPUContext(0)).isSparse()) {
+				in_ptrs[i * 4] = ec.getGPUSparsePointerAddress(inputs.get(i)).nnz;
+				in_ptrs[i * 4 + 1] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).rowPtr);
+				in_ptrs[i * 4 + 2] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).colInd);
+				in_ptrs[i * 4 + 3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).val);
+			}
+			else
+				in_ptrs[i * 4 + 3] = ec.getGPUDensePointerAddress(inputs.get(i));
+		}
+
+		long[] side_ptrs = new long[(inputs.size() - offset) * 4];
+		for(int i = offset; i < inputs.size(); i++) {
+			int j = (i - offset) * 4;
+			if(inputs.get(i).getGPUObject(ec.getGPUContext(0)).isSparse()) {
+				side_ptrs[j] = ec.getGPUSparsePointerAddress(inputs.get(i)).nnz;
+				side_ptrs[j + 1] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).rowPtr);
+				side_ptrs[j + 2] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).colInd);
+				side_ptrs[j + 3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).val);
+			}
+			else
+				side_ptrs[j + 3] = ec.getGPUDensePointerAddress(inputs.get(i));
 		}
-		else {
-			double[] scalars = prepInputScalars(scalarObjects);
+
+//		if(isSinglePrecision()) {
+//			float[] scalars = prepInputScalarsFloat(scalarObjects);
+//
+//			// ToDo: handle float
+//			ret = execute_f(SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA), name.split("\\.")[1],
+//				in_ptrs, side_ptrs, out_ptr, scalars, inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), out_obj.getNumColumns(), 0);
+//
+//		}
+//		else {
+		double[] scalars = prepInputScalars(scalarObjects);
+
 		ret = execute_d(SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA), name.split("\\.")[1],
-			in_ptrs, side_ptrs, out_ptr, scalars, inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), 0);
-		}
+			in_ptrs, offset, side_ptrs, out_ptr, scalars, 0, inputs, out_obj);
+//		}
 		return ret;
 	}
@@ -115,8 +143,8 @@ public class SpoofCUDA extends SpoofOperator {
 	}
 
 	private native float execute_f(long ctx, String name, long[] in_ptr, long[] side_ptr,
-		long out_ptr, float[] scalars, long m, long n, long grix);
+		long out_ptr, float[] scalars, long m, long n, long out_len, long grix);
 
-	private native double execute_d(long ctx, String name, long[] in_ptr, long[] side_ptr,
-		long out_ptr, double[] scalars, long m, long n, long grix);
+	private native double execute_d(long ctx, String name, long[] in_ptr, int offset, long[] side_ptr, long[] out_ptr,
+		double[] scalars, long grix, ArrayList<MatrixObject> inputs, MatrixObject output);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
index 9587b95..deaa680 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/context/ExecutionContext.java
@@ -43,6 +43,7 @@ import org.apache.sysds.runtime.instructions.cp.Data;
 import org.apache.sysds.runtime.instructions.cp.ListObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObjectFactory;
+import org.apache.sysds.runtime.instructions.gpu.context.CSRPointer;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.Lineage;
@@ -366,7 +367,7 @@ public class ExecutionContext {
 	public Pair<MatrixObject, Boolean> getSparseMatrixOutputForGPUInstruction(String varName, long numRows, long numCols, long nnz) {
 		MatrixObject mo = allocateGPUMatrixObject(varName, numRows, numCols);
 		mo.getDataCharacteristics().setNonZeros(nnz);
-		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
+		boolean allocated = mo.getGPUObject(getGPUContext(0)).acquireDeviceModifySparse();
 		return new Pair<>(mo, allocated);
 	}
@@ -405,15 +406,21 @@ public class ExecutionContext {
 		mo.getGPUObject(getGPUContext(0)).addWriteLock();
 		return mo;
 	}
-
-	public long getGPUPointerAddress(MatrixObject obj) {
-
-		if(obj.getGPUObject(getGPUContext(0)) == null)
+
+	public long getGPUDensePointerAddress(MatrixObject obj) {
+		if(obj.getGPUObject(getGPUContext(0)) == null)
 			return 0;
 		else
-			return obj.getGPUObject(getGPUContext(0)).getPointerAddress();
+			return obj.getGPUObject(getGPUContext(0)).getDensePointerAddress();
 	}
-
+
+	public CSRPointer getGPUSparsePointerAddress(MatrixObject obj) {
+		if(obj.getGPUObject(getGPUContext(0)) == null)
+			throw new RuntimeException("No CSRPointer for MatrixObject " + obj.toString());
+		else
+			return obj.getGPUObject(getGPUContext(0)).getJcudaSparseMatrixPtr();
+	}
+
 	public MatrixObject getMatrixInputForGPUInstruction(String varName, String opcode) {
 		GPUContext gCtx = getGPUContext(0);
 		MatrixObject mo = getMatrixObject(varName);
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
index 20f4333..6f3a1f7 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
@@ -31,6 +31,7 @@ import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
+import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageItem;
 import org.apache.sysds.runtime.lineage.LineageItemUtils;
 import org.apache.sysds.runtime.lineage.LineageTraceable;
@@ -88,21 +89,46 @@ public class SpoofCUDAInstruction extends GPUInstruction implements LineageTrace
 
 		// set the output dimensions to the hop node matrix dimensions
 		if( _out.getDataType() == Types.DataType.MATRIX) {
-			long rows = inputs.get(0).getNumRows();
-			long cols = inputs.get(0).getNumColumns();
+			long out_rows = ec.getMatrixObject(_out.getName()).getNumRows();
+			long out_cols = ec.getMatrixObject(_out.getName()).getNumColumns();
+
 			if(_op.getSpoofTemplateType().contains("CW"))
 				if(((CNodeCell)_op.getCNodeTemplate()).getCellType() == SpoofCellwise.CellType.COL_AGG)
-					rows = 1;
+					out_rows = 1;
 				else if(((CNodeCell)_op.getCNodeTemplate()).getCellType() == SpoofCellwise.CellType.ROW_AGG)
-					cols = 1;
-
-			MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(_out.getName(), rows, cols).getKey();
-			ec.setMetaData(_out.getName(), out_obj.getNumRows(), out_obj.getNumColumns());
-			_op.execute(inputs, scalars, out_obj, ec);
+					out_cols = 1;
+
+			if(_op.getSpoofTemplateType().contains("RA")) {
+				// ToDo: make this workaround proper!!
+				boolean isConstDim2 = false;
+				int pos = _op.src.indexOf("// ConstDim2: ");
+				String strDim2 = _op.src.substring(pos + 14, _op.src.indexOf(System.lineSeparator(), pos));
+				int dim2 = Integer.parseInt(strDim2);
+				if(dim2 > 0)
+					isConstDim2 = true;
+
+				long n = inputs.get(0).getNumColumns();
+				long n2 = isConstDim2 ? dim2 : inputs.get(0).getNumRows();
+				if(_op.src.contains("COL_AGG_B1_T")) {
+					out_rows = n;
+					out_cols = n2;
+				}
+			}
+
+			ec.setMetaData(_out.getName(), out_rows, out_cols);
+			GPUObject g = inputs.get(0).getGPUObject(ec.getGPUContext(0));
+			boolean sparseOut = g.isSparse() && _op.getSpoofTemplateType().contains("CW");
+			MatrixObject out_obj = sparseOut ?
+				(ec.getSparseMatrixOutputForGPUInstruction(_out.getName(), out_rows, out_cols, g.getNnz(getOpcode(), false)).getKey()) :
+				(ec.getDenseMatrixOutputForGPUInstruction(_out.getName(), out_rows, out_cols).getKey());
+
+//			ec.setMetaData(_out.getName(), out_obj.getNumRows(), out_obj.getNumColumns());
+			_op.execute(inputs, scalars, out_obj, ec, sparseOut);
 			ec.releaseMatrixOutputForGPUInstruction(_out.getName());
 		}
 		else if (_out.getDataType() == Types.DataType.SCALAR) {
-			ScalarObject out = new DoubleObject(_op.execute(inputs, scalars, null, ec));
+			ScalarObject out = new DoubleObject(_op.execute(inputs, scalars, null, ec, false));
 			ec.setScalarOutput(_out.getName(), out);
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index b2967bd..f59ab95 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -83,7 +83,7 @@ public class GPUObject {
 	protected boolean writeLock = false;
 
 	/**
-	 * Timestamp, needed by {@link GPUContext#evict(long)}
+	 * Timestamp, needed by {@link GPUContext#evict(long)}
 	 */
 	AtomicLong timestamp = new AtomicLong();
@@ -1006,7 +1006,7 @@ public class GPUObject {
 		return sb.toString();
 	}
 
-	private static long getPointerAddress(Pointer p) {
+	private static long getPointerAddressInternal(Pointer p) {
 		// WORKAROUND until a method like CUdeviceptr#getAddress exists in jCuda
 		class PointerWithAddress extends Pointer {
@@ -1021,8 +1021,11 @@ public class GPUObject {
 		}
 		return new PointerWithAddress(p).getAddress();
 	}
-
-	public long getPointerAddress() {
-		return getPointerAddress(getDensePointer());
+
+	public long getDensePointerAddress() {
+		return getPointerAddressInternal(getDensePointer());
 	}
-}
+
+	public static long getPointerAddress(Pointer p) {
+		return getPointerAddressInternal(p);
+	}
+}
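
Note on the calling convention introduced above: every matrix crossing the JNI boundary now occupies four consecutive long slots, {nnz, rowPtr, colInd, val}, in CSR order; a dense input fills only slot 3 (val) and leaves the others at 0. A minimal sketch of a consumer of that layout (the PackedMatrixArg class is illustrative only, not part of this commit):

    // Unpacks one matrix argument from the flat long[] built by SpoofCUDA.execute().
    // Slot 0: nnz, slot 1: rowPtr, slot 2: colInd, slot 3: val (dense: val only).
    final class PackedMatrixArg {
        final long nnz, rowPtr, colInd, val;
        PackedMatrixArg(long[] ptrs, int matrixIdx) {
            int base = matrixIdx * 4;
            nnz    = ptrs[base];
            rowPtr = ptrs[base + 1];
            colInd = ptrs[base + 2];
            val    = ptrs[base + 3];
        }
        // A dense input leaves the CSR index pointers zeroed, so this test suffices here.
        boolean isSparse() {
            return rowPtr != 0 && colInd != 0;
        }
    }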
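The %TMP%, %TYPE%, %SPARSE_SAFE%, %SEQ%, %AGG_OP_NAME%, %BODY_dense% and %OUT% markers in Cellwise.java.template are plain string placeholders; roughly, the generator instantiates the template by textual substitution along these lines (the helper name and its parameter set are assumptions for illustration, not the actual SystemDS API):

    // Hypothetical sketch of the placeholder substitution for Cellwise.java.template.
    final class TemplateUtil {
        static String instantiate(String template, String className, String cellType,
                boolean sparseSafe, boolean containsSeq, String aggOp, String body, String out) {
            return template
                .replace("%TMP%", className)        // name of the generated operator class
                .replace("%TYPE%", cellType)        // e.g. NO_AGG, ROW_AGG, COL_AGG, FULL_AGG
                .replace("%SPARSE_SAFE%", String.valueOf(sparseSafe))
                .replace("%SEQ%", String.valueOf(containsSeq))
                .replace("%AGG_OP_NAME%", aggOp)    // e.g. AggOp.SUM
                .replace("%BODY_dense%", body)      // generated genexec body
                .replace("%OUT%", out);             // result variable of that body
        }
    }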

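Regarding the row-template (RA) workaround in SpoofCUDAInstruction: it recovers a compile-time constant column count by scanning the generated source for a "// ConstDim2: " marker line. A standalone worked example of that parse (src stands in for _op.src):

    // "// ConstDim2: " is 14 characters long, hence the pos + 14 offset in the instruction code.
    final class ConstDim2Demo {
        public static void main(String[] args) {
            String src = "// ConstDim2: 64" + System.lineSeparator() + "/* generated code */";
            int pos = src.indexOf("// ConstDim2: ");
            String strDim2 = src.substring(pos + 14, src.indexOf(System.lineSeparator(), pos));
            int dim2 = Integer.parseInt(strDim2); // 64; dim2 > 0 marks dim2 as a constant
            System.out.println(dim2);
        }
    }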