This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit a7efd67b1d0b87975fdda7051ff48da4dcead9f0
Author: Mark Dokter <[email protected]>
AuthorDate: Mon Feb 22 16:00:42 2021 +0100

    [SYSTEMDS-2852] Improve SPOOF CUDA compilation
    
    Compilation is now invoked separately per code template, and all necessary 
flags for the operator are passed.
    Also contains a bugfix for the full aggregation kernel template (wrong row 
index passed). All cellwise and rowwise template tests pass except for some 
runtime inaccuracies.
---
 src/main/cuda/headers/reduction.cuh                |  20 +-
 src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp  | 216 ++++++++----------
 src/main/cuda/spoof-launcher/SpoofCUDAContext.h    | 114 +++++-----
 src/main/cuda/spoof-launcher/jni_bridge.cpp        | 247 ++++++++++++++-------
 src/main/cuda/spoof-launcher/jni_bridge.h          |  16 ++
 src/main/cuda/spoof/cellwise.cu                    |   1 +
 src/main/java/org/apache/sysds/common/Types.java   |  37 ++-
 .../apache/sysds/hops/codegen/SpoofCompiler.java   |  18 +-
 .../apache/sysds/hops/codegen/cplan/CNodeCell.java |  11 +
 .../sysds/hops/codegen/cplan/CNodeMultiAgg.java    |  10 +
 .../hops/codegen/cplan/CNodeOuterProduct.java      |  10 +
 .../apache/sysds/hops/codegen/cplan/CNodeRow.java  |  17 +-
 .../apache/sysds/hops/codegen/cplan/CNodeTpl.java  |   2 +
 .../apache/sysds/runtime/codegen/SpoofCUDA.java    |   6 +-
 .../sysds/runtime/codegen/SpoofCellwise.java       |  31 ++-
 .../apache/sysds/runtime/codegen/SpoofRowwise.java |  47 +++-
 .../test/functions/codegen/CellwiseTmplTest.java   |  13 +-
 17 files changed, 497 insertions(+), 319 deletions(-)

diff --git a/src/main/cuda/headers/reduction.cuh 
b/src/main/cuda/headers/reduction.cuh
index 8a024cc..568eb12 100644
--- a/src/main/cuda/headers/reduction.cuh
+++ b/src/main/cuda/headers/reduction.cuh
@@ -66,7 +66,8 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t
        // number of active thread blocks (via gridDim).  More blocks will 
result
        // in a larger gridSize and therefore fewer elements per thread
        while (i < N) {
-               v = reduction_op(v, spoof_op(*(in->vals(i)), i, i / N, i % N));
+//             printf("tid=%d i=%d N=%d, in->cols()=%d rix=%d\n", threadIdx.x, 
i, N, in->cols(), i/in->cols());
+               v = reduction_op(v, spoof_op(*(in->vals(i)), i, i / in->cols(), 
i % in->cols()));
 
                if (i + blockDim.x < N) {
                        //__syncthreads();
@@ -106,7 +107,7 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t
                }
                __syncthreads();
        }
-
+       
        if (tid < 32) {
                // now that we are using warp-synchronous programming (below)
                // we need to declare our shared memory volatile so that the 
compiler
@@ -115,27 +116,40 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t
                if (blockDim.x >= 64) {
                        smem[tid] = v = reduction_op(v, smem[tid + 32]);
                }
+               if(tid<12)
+                       printf("bid=%d tid=%d reduction result: %3.1f\n", 
blockIdx.x, tid, sdata[tid]);
+               
                if (blockDim.x >= 32) {
                        smem[tid] = v = reduction_op(v, smem[tid + 16]);
                }
+//             if(tid==0)
+//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                if (blockDim.x >= 16) {
                        smem[tid] = v = reduction_op(v, smem[tid + 8]);
                }
+//             if(tid==0)
+//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                if (blockDim.x >= 8) {
                        smem[tid] = v = reduction_op(v, smem[tid + 4]);
                }
+//             if(tid==0)
+//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                if (blockDim.x >= 4) {
                        smem[tid] = v = reduction_op(v, smem[tid + 2]);
                }
+//             if(tid==0)
+//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                if (blockDim.x >= 2) {
                        smem[tid] = v = reduction_op(v, smem[tid + 1]);
                }
+//             if(tid==0)
+//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
        }
 
         // write result for this block to global mem
         if (tid == 0) {
 //             if(gridDim.x < 10)
-//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
+                       printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                out->val(0, blockIdx.x) = sdata[0];
         }
 }
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp 
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
index d3f0778..3e023d5 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -24,49 +24,65 @@
 #include <cstdlib>
 #include <sstream>
 
+using clk = std::chrono::high_resolution_clock;
+using sec = std::chrono::duration<double, std::ratio<1>>;
+
 size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* 
resource_path) {
 
-#ifdef __DEBUG
+#ifdef _DEBUG
        std::cout << "initializing cuda device " << device_id << std::endl;
 #endif
-
-  SpoofCUDAContext *ctx = new SpoofCUDAContext(resource_path);
-  // cuda device is handled by jCuda atm
-  //cudaSetDevice(device_id);
-  //cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-  //cudaDeviceSynchronize();
-
-  CHECK_CUDA(cuModuleLoad(&(ctx->reductions), std::string(ctx->resource_path + 
std::string("/cuda/kernels/reduction.ptx")).c_str()));
-
-  CUfunction func;
-
-  // ToDo: implement a more scalable solution for these imports
-
-  // SUM
-  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_d"));
-  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_d", func));
-//  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_f"));
-//  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_f", func));
-//
-//  // SUM_SQ
-//  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_sq_d"));
-//  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_d", func));
-//  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_sq_f"));
-//  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_f", func));
-//
-//  // MIN
-  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_d"));
-  ctx->reduction_kernels.insert(std::make_pair("reduce_min_d", func));
-//  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_f"));
-//  ctx->reduction_kernels.insert(std::make_pair("reduce_min_f", func));
-//
-//  // MAX
-  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d"));
-  ctx->reduction_kernels.insert(std::make_pair("reduce_max_d", func));
-//  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_f"));
-//  ctx->reduction_kernels.insert(std::make_pair("reduce_max_f", func));
-
-  return reinterpret_cast<size_t>(ctx);
+       std::string cuda_include_path;
+       
+       char* cdp = std::getenv("CUDA_PATH");
+       if(cdp != nullptr)
+               cuda_include_path = std::string("-I") + std::string(cdp) + 
"/include";
+       else {
+               std::cout << "Warning: CUDA_PATH environment variable not set. 
Using default include path "
+                                        "/usr/local/cuda/include" << std::endl;
+               cuda_include_path = std::string("-I/usr/local/cuda/include");
+       }
+       
+       std::stringstream s1, s2;
+       s1 << "-I" << resource_path << "/cuda/headers";
+       s2 << "-I" << resource_path << "/cuda/spoof";
+       auto *ctx = new SpoofCUDAContext(resource_path, {s1.str(), s2.str(), 
cuda_include_path});
+       // cuda device is handled by jCuda atm
+       //cudaSetDevice(device_id);
+       //cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+       //cudaDeviceSynchronize();
+       
+       CHECK_CUDA(cuModuleLoad(&(ctx->reductions), 
std::string(ctx->resource_path + 
std::string("/cuda/kernels/reduction.ptx")).c_str()));
+       
+       CUfunction func;
+       
+       // ToDo: implement a more scalable solution for these imports
+       
+       // SUM
+       CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_d"));
+       ctx->reduction_kernels.insert(std::make_pair("reduce_sum_d", func));
+       //  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, 
"reduce_sum_f"));
+       //  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_f", func));
+       //
+       //  // SUM_SQ
+       //  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, 
"reduce_sum_sq_d"));
+       //  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_d", 
func));
+       //  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, 
"reduce_sum_sq_f"));
+       //  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_f", 
func));
+       //
+       //  // MIN
+       CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_d"));
+       ctx->reduction_kernels.insert(std::make_pair("reduce_min_d", func));
+       //  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, 
"reduce_min_f"));
+       //  ctx->reduction_kernels.insert(std::make_pair("reduce_min_f", func));
+       //
+       //  // MAX
+       CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d"));
+       ctx->reduction_kernels.insert(std::make_pair("reduce_max_d", func));
+       //  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, 
"reduce_max_f"));
+       //  ctx->reduction_kernels.insert(std::make_pair("reduce_max_f", func));
+
+       return reinterpret_cast<size_t>(ctx);
 }
 
 void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id) 
{
@@ -76,99 +92,43 @@ void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, 
uint32_t device_id) {
   //cudaDeviceReset();
 }
 
-bool SpoofCUDAContext::compile_cuda(const std::string &src,
-                                    const std::string &name) {
-    std::string cuda_include_path("");
-    char* cdp = std::getenv("CUDA_PATH");
-    if(cdp != nullptr)
-        cuda_include_path = std::string("-I") + std::string(cdp) + "/include";
-    else {
-       std::cout << "Warning: CUDA_PATH environment variable not set. Using 
default include path"
-                       "/usr/local/cuda/include" << std::endl;
-       cuda_include_path = std::string("-I/usr/local/cuda/include");
-    }
-
-#ifdef __DEBUG
-  std::cout << "compiling cuda kernel " << name << std::endl;
-  std::cout << src << std::endl;
-  std::cout << "cwd: " << std::filesystem::current_path() << std::endl;
-  std::cout << "cuda_path: " << cuda_include_path << std::endl;
+int SpoofCUDAContext::compile(const std::string &src, const std::string &name, 
SpoofOperator::OpType op_type,
+               SpoofOperator::AggType agg_type, SpoofOperator::AggOp agg_op, 
SpoofOperator::RowType row_type, bool sparse_safe,
+               int32_t const_dim2, uint32_t num_vectors, bool TB1) {
+#ifdef _DEBUG
+//     std::cout << "---=== START source listing of spoof cuda kernel [ " << 
name << " ]: " << std::endl;
+//    uint32_t line_num = 0;
+//     std::istringstream src_stream(src);
+//    for(std::string line; std::getline(src_stream, line); line_num++)
+//             std::cout << line_num << ": " << line << std::endl;
+//
+//     std::cout << "---=== END source listing of spoof cuda kernel [ " << 
name << " ]." << std::endl;
+       std::cout << "cwd: " << std::filesystem::current_path() << std::endl;
+       std::cout << "include_paths: ";
+       for_each (include_paths.begin(), include_paths.end(), [](const 
std::string& line){ std::cout << line << '\n';});
+       std::cout << std::endl;
 #endif
+// uncomment all related lines for temporary timing output:
+//     auto start = clk::now();
 
-       SpoofOperator::AggType agg_type= SpoofOperator::AggType::NONE;
-       SpoofOperator::AggOp agg_op = SpoofOperator::AggOp::NONE;
-       SpoofOperator::OpType op_type = SpoofOperator::OpType::NONE;
+//     auto compile_start = clk::now();
+       jitify::Program program = kernel_cache.program(src, 0, include_paths);
+//     auto compile_end = clk::now();
+//     auto compile_duration = std::chrono::duration_cast<sec>(compile_end - 
compile_start).count();
+       ops.insert(std::make_pair(name, SpoofOperator({std::move(program),  
op_type, agg_type, agg_op, row_type, name,
+                       const_dim2, num_vectors, TB1, sparse_safe})));
 
-       auto pos = 0;
-       
-       if((pos = src.find("SpoofCellwiseOp")) != std::string::npos)
-               op_type = SpoofOperator::OpType::CW;
-    else if((pos = src.find("SpoofRowwiseOp")) != std::string::npos)
-               op_type = SpoofOperator::OpType::RA;
-       else {
-        std::cerr << "error: unknown spoof operator" << std::endl;
-               return false;
-       }
+//     auto end = clk::now();
 
-       bool TB1 = false;
-       if((pos = src.find("TB1")) != std::string::npos)
-               if(src.substr(pos, pos+8).find("true") != std::string::npos)
-                       TB1 = true;
+//     auto handling_duration = std::chrono::duration_cast<sec>(end - 
start).count() - compile_duration;
 
-       uint32_t numTempVect = 0;
-       if((pos = src.find("// VectMem: ")) != std::string::npos)
-               numTempVect = std::stoi(std::string(src.begin() + pos + 12, 
std::find(src.begin()+pos, src.end(), '\n')));
-
-       int32_t constDim2 = 0;
-       if((pos = src.find("// ConstDim2: ")) != std::string::npos)
-               constDim2 = std::stoi(std::string(src.begin() + pos + 14, 
std::find(src.begin()+pos, src.end(), '\n')));
-
-       bool sparse_safe = false;
-       if ((pos = src.find("// SparseSafe:")) != std::string::npos)
-               if (src.substr(pos, pos + 15).find("true") != std::string::npos)
-                       sparse_safe = true;
-
-       if(((pos = src.find("CellType")) != std::string::npos) || ((pos = 
src.find("RowType")) != std::string::npos)){
-               if(src.substr(pos, pos+30).find("FULL_AGG") != 
std::string::npos)
-                       agg_type= SpoofOperator::AggType::FULL_AGG;
-               else if(src.substr(pos, pos+30).find("ROW_AGG") != 
std::string::npos)
-                       agg_type= SpoofOperator::AggType::ROW_AGG;
-               else if(src.substr(pos, pos+30).find("COL_AGG") != 
std::string::npos)
-                       agg_type= SpoofOperator::AggType::COL_AGG;
-               else if(src.substr(pos, pos+30).find("NO_AGG") != 
std::string::npos)
-                       agg_type= SpoofOperator::AggType::NO_AGG;
-               else if(src.substr(pos, pos+30).find("NO_AGG_CONST") != 
std::string::npos)
-                       agg_type= SpoofOperator::AggType::NO_AGG_CONST;
-               else if(src.substr(pos, pos+30).find("COL_AGG_T") != 
std::string::npos)
-                       agg_type= SpoofOperator::AggType::COL_AGG_T;
-               else {
-                       std::cerr << "error: unknown aggregation type" << 
std::endl;
-                       return false;
-               }
-
-               if((agg_type!= SpoofOperator::AggType::NO_AGG) && (op_type == 
SpoofOperator::OpType::CW)) {
-                       if((pos = src.find("AggOp")) != std::string::npos) {
-                               if(src.substr(pos, pos+30).find("AggOp.SUM") != 
std::string::npos)
-                                       agg_op = SpoofOperator::AggOp::SUM;
-                               else if(src.substr(pos, 
pos+30).find("AggOp.SUM_SQ") != std::string::npos)
-                                       agg_op = SpoofOperator::AggOp::SUM_SQ;
-                               else if(src.substr(pos, 
pos+30).find("AggOp.MIN") != std::string::npos)
-                                       agg_op = SpoofOperator::AggOp::MIN;
-                               else if(src.substr(pos, 
pos+30).find("AggOp.MAX") != std::string::npos)
-                                       agg_op = SpoofOperator::AggOp::MAX;
-                               else {
-                               std::cerr << "error: unknown aggregation 
operator" << std::endl;
-                                       return false;
-                               }
-                       }
-               }
-       }
-
-       std::stringstream s1, s2, s3;
-       s1 << "-I" << resource_path << "/cuda/headers";
-       s2 << "-I" << resource_path << "/cuda/spoof";
+//     compile_total += compile_duration;
+//     handling_total += handling_duration;
 
-       jitify::Program program = kernel_cache.program(src, 0, {s1.str(), 
s2.str(), cuda_include_path});
-       ops.insert(std::make_pair(name, SpoofOperator({std::move(program), 
agg_type, agg_op, op_type, name, TB1, constDim2, numTempVect, sparse_safe})));
-       return true;
+//     std::cout << name << " times [s] handling/compile/totals(h/c)/count: "
+//                     << handling_duration << "/"
+//                     << compile_duration << "/"
+//                     << handling_total << "/"
+//                     << compile_total << "/" << compile_count + 1 << 
std::endl;
+       return compile_count++;
 }
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h 
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index a9d3562..89dd9e0 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -21,7 +21,9 @@
 #ifndef SPOOFCUDACONTEXT_H
 #define SPOOFCUDACONTEXT_H
 
-#define NOMINMAX
+#if defined(_WIN32) || defined(_WIN64)
+       #define NOMINMAX
+#endif
 
 #include <cmath>
 #include <cstdint>
@@ -39,6 +41,7 @@
 #endif
 
 #include <jitify.hpp>
+#include <utility>
 #include <cublas_v2.h>
 
 #include "host_utils.h"
@@ -46,46 +49,57 @@
 using jitify::reflection::type_of;
 
 struct SpoofOperator {
-       enum class AggType : int { NO_AGG, NO_AGG_CONST, ROW_AGG, COL_AGG, 
FULL_AGG, COL_AGG_T, NONE };
-       enum class AggOp : int {SUM, SUM_SQ, MIN, MAX, NONE };
        enum class OpType : int { CW, RA, MA, OP, NONE };
+       enum class AggType : int { NONE, NO_AGG, FULL_AGG, ROW_AGG, COL_AGG };
+       enum class AggOp : int { NONE, SUM, SUM_SQ, MIN, MAX };
+       enum class RowType : int { NONE, FULL_AGG = 4 };
        
        jitify::Program program;
+       OpType op_type;
        AggType agg_type;
        AggOp agg_op;
-       OpType op_type;
+       RowType row_type;
     const std::string name;
-       bool TB1 = false;
        int32_t const_dim2;
        uint32_t num_temp_vectors;
+       bool TB1 = false;
        bool sparse_safe = true;
 };
 
 class SpoofCUDAContext {
-
-  jitify::JitCache kernel_cache;
-  std::map<const std::string, SpoofOperator> ops;
-  CUmodule reductions;
-  std::map<const std::string, CUfunction> reduction_kernels;
-
+       jitify::JitCache kernel_cache;
+       std::map<const std::string, SpoofOperator> ops;
+       CUmodule reductions;
+       std::map<const std::string, CUfunction> reduction_kernels;
+       double handling_total, compile_total;
+       uint32_t compile_count;
+       
+       const std::string resource_path;
+       const std::vector<std::string> include_paths;
+       
 public:
-  // ToDo: make launch config more adaptive
-  // num threads
-  const int NT = 256;
-
-  // values / thread
-  const int VT = 4;
+       // ToDo: make launch config more adaptive
+       // num threads
+       const int NT = 256;
 
-  const std::string resource_path;
+       // values / thread
+       const int VT = 4;
 
-  SpoofCUDAContext(const char* resource_path_) : reductions(nullptr), 
resource_path(resource_path_) {}
+       explicit SpoofCUDAContext(const char* resource_path_, 
std::vector<std::string>  include_paths_) : reductions(nullptr),
+                       resource_path(resource_path_), 
include_paths(std::move(include_paths_)), handling_total(0.0), 
compile_total(0.0),
+                       compile_count(0) {}
 
-  static size_t initialize_cuda(uint32_t device_id, const char* 
resource_path_);
+       static size_t initialize_cuda(uint32_t device_id, const char* 
resource_path_);
 
-  static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
+       static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
 
-  bool compile_cuda(const std::string &src, const std::string &name);
+       int compile(const std::string &src, const std::string &name, 
SpoofOperator::OpType op_type,
+                       SpoofOperator::AggType agg_type = 
SpoofOperator::AggType::NONE,
+                       SpoofOperator::AggOp agg_op = 
SpoofOperator::AggOp::NONE,
+                       SpoofOperator::RowType row_type = 
SpoofOperator::RowType::NONE, bool sparse_safe = true,
+                       int32_t const_dim2 = -1, uint32_t num_vectors = 0, bool 
TB1 = false);
 
+       
        template <typename T>
        T execute_kernel(const std::string &name, std::vector<Matrix<T>>& 
input,  std::vector<Matrix<T>>& sides, Matrix<T>* output,
                                         T *scalars_ptr, uint32_t num_scalars, 
uint32_t grix) {
@@ -107,8 +121,8 @@ public:
                        CHECK_CUDART(cudaMemcpy(d_in, 
reinterpret_cast<void*>(&input[0]), sizeof(Matrix<T>), cudaMemcpyHostToDevice));
 
                        if(output != nullptr) {
-                               if (op->sparse_safe == true && 
input.front().row_ptr != nullptr) {
-#ifdef __DEBUG
+                               if (op->sparse_safe && input.front().row_ptr != 
nullptr) {
+#ifdef _DEBUG
                                        std::cout << "copying sparse safe row 
ptrs" << std::endl;
 #endif
                                        
CHECK_CUDART(cudaMemcpy(output->row_ptr, input.front().row_ptr, 
(input.front().rows+1)*sizeof(uint32_t), cudaMemcpyDeviceToDevice));
@@ -135,7 +149,7 @@ public:
                        
                        if (!sides.empty()) {
                                if(op->TB1) {
-#ifdef __DEBUG
+#ifdef _DEBUG
                                        std::cout << "transposing TB1 for " << 
op->name << std::endl;
 #endif
                                        T* b1 = sides[0].data;
@@ -175,8 +189,7 @@ public:
                                                                 
input.front().cols, grix, input[0].row_ptr!=nullptr);
                        break;
                    default:
-                               std::cerr << "error: unknown spoof operator" << 
std::endl;
-                       return result;
+                               throw std::runtime_error("error: unknown spoof 
operator");
                    }
                        
                        if (num_scalars > 0)
@@ -188,9 +201,7 @@ public:
                        if (op->TB1)
                                CHECK_CUDART(cudaFree(b1_transposed));
                        
-                       if (op->agg_type == SpoofOperator::AggType::FULL_AGG) {
-//                             std::cout << "retrieving scalar result" << 
std::endl;
-                               
+                       if (op->agg_type == SpoofOperator::AggType::FULL_AGG || 
op->row_type == SpoofOperator::RowType::FULL_AGG) {
                                Matrix<T> res_mat;
                                CHECK_CUDART(cudaMemcpy(&res_mat, d_out, 
sizeof(Matrix<T>), cudaMemcpyDeviceToHost));
                                CHECK_CUDART(cudaMemcpy(&result, res_mat.data, 
sizeof(T), cudaMemcpyDeviceToHost));
@@ -200,8 +211,7 @@ public:
                        }
                } 
                else {
-                       std::cerr << "kernel " << name << " not found." << 
std::endl;
-                       return result;
+                       throw std::runtime_error("kernel " + name + " not 
found.");
                }
                return result;
        }
@@ -222,7 +232,7 @@ public:
                  reduction_type = "_col_";
                  break;
          default:
-                 std::cerr << "unknown reduction type" << std::endl;
+                 throw std::runtime_error("unknown reduction type");
                  return "";
          }
        
@@ -240,8 +250,7 @@ public:
                  reduction_kernel_name = "reduce" + reduction_type + "sum" + 
suffix;
                  break;
          default:
-                 std::cerr << "unknown reduction op" << std::endl;
-                 return "";
+                 throw std::runtime_error("unknown reduction op");
          }
        
          return reduction_kernel_name;
@@ -267,7 +276,7 @@ public:
                        dim3 block(NT, 1, 1);
                        uint32_t shared_mem_size = NT * sizeof(T);
 
-//#ifdef __DEBUG
+#ifdef _DEBUG
                                // ToDo: connect output to SystemDS logging 
facilities
                                std::cout << "launching spoof cellwise kernel " 
<< op_name << " with "
                                                  << NT * NB << " threads in " 
<< NB << " blocks and "
@@ -275,7 +284,7 @@ public:
                                                  << " bytes of shared memory 
for full aggregation of "
                                                  << N << " elements"
                                                  << std::endl;
-//#endif
+#endif
                        
                        CHECK_CUDA(op->program.kernel(op_name)
                                                           
.instantiate(type_of(value_type), std::max(1u, num_sides))
@@ -284,7 +293,6 @@ public:
                        
                        if(NB > 1) {
                                std::string reduction_kernel_name = 
determine_agg_kernel<T>(op);
-
                                CUfunction reduce_kernel = 
reduction_kernels.find(reduction_kernel_name)->second;
                                N = NB;
                                uint32_t iter = 1;
@@ -292,18 +300,18 @@ public:
                                        void* args[3] = { &d_out, &d_out, &N};
 
                                        NB = std::ceil((N + NT * 2 - 1) / (NT * 
2));
-//#ifdef __DEBUG
+#ifdef _DEBUG
                                        std::cout << "agg iter " << iter++ << " 
launching spoof cellwise kernel " << op_name << " with "
                     << NT * NB << " threads in " << NB << " blocks and "
                     << shared_mem_size
                     << " bytes of shared memory for full aggregation of "
                     << N << " elements"
                     << std::endl;
-//#endif
+#endif
                                        CHECK_CUDA(cuLaunchKernel(reduce_kernel,
                                                        NB, 1, 1,
                                                        NT, 1, 1,
-                                                       shared_mem_size, 0, 
args, 0));
+                                                       shared_mem_size, 
nullptr, args, nullptr));
                                                        N = NB;
                                }
                        }
@@ -315,7 +323,7 @@ public:
                        dim3 grid(NB, 1, 1);
                        dim3 block(NT, 1, 1);
                        uint32_t shared_mem_size = 0;
-#ifdef __DEBUG
+#ifdef _DEBUG
                        std::cout << " launching spoof cellwise kernel " << 
op_name << " with "
                                        << NT * NB << " threads in " << NB << " 
blocks for column aggregation of "
                                        << N << " elements" << std::endl;
@@ -333,12 +341,12 @@ public:
                        dim3 grid(NB, 1, 1);
                        dim3 block(NT, 1, 1);
                        uint32_t shared_mem_size = NT * sizeof(T);
-//#ifdef __DEBUG
+#ifdef _DEBUG
                        std::cout << " launching spoof cellwise kernel " << 
op_name << " with "
                                        << NT * NB << " threads in " << NB << " 
blocks and "
                                        << shared_mem_size << " bytes of shared 
memory for row aggregation of "
                                        << N << " elements" << std::endl;
-//#endif
+#endif
                        CHECK_CUDA(op->program.kernel(op_name)
                                                           
.instantiate(type_of(value_type), std::max(1u, num_sides))
                                                           .configure(grid, 
block, shared_mem_size)
@@ -357,7 +365,7 @@ public:
                        dim3 block(NT, 1, 1);
                        uint32_t shared_mem_size = 0;
 
-#ifdef __DEBUG
+#ifdef _DEBUG
                        if(sparse) {
                                std::cout << "launching sparse spoof cellwise 
kernel " << op_name << " with " << NT * NB
                                                  << " threads in " << NB << " 
blocks without aggregation for " << N << " elements"
@@ -369,10 +377,6 @@ public:
                                                  << std::endl;
                        }
 #endif
-//                     CHECK_CUDA(op->program.kernel(op_name)
-//                                     .instantiate(type_of(result))
-//                                     .configure(grid, block)
-//                                     .launch(in_ptrs[0], d_sides, out_ptr, 
d_scalars, m, n, grix));
 
                        CHECK_CUDA(op->program.kernel(op_name)
                                                           
.instantiate(type_of(value_type), std::max(1u, num_sides))
@@ -404,12 +408,12 @@ public:
                if(sparse)
                        name = std::string(op->name + "_SPARSE");
 
-//#ifdef __DEBUG
-                       // ToDo: connect output to SystemDS logging facilities
-                       std::cout << "launching spoof rowwise kernel " << name 
<< " with " << NT * in_rows << " threads in " << in_rows
-                               << " blocks and " << shared_mem_size << " bytes 
of shared memory for " << in_cols << " cols processed by "
-                               << NT << " threads per row, adding " << 
temp_buf_size / 1024 << " kb of temp buffer in global memory." <<  std::endl;
-//#endif
+#ifdef _DEBUG
+               // ToDo: connect output to SystemDS logging facilities
+               std::cout << "launching spoof rowwise kernel " << name << " 
with " << NT * in_rows << " threads in " << in_rows
+                       << " blocks and " << shared_mem_size << " bytes of 
shared memory for " << in_cols << " cols processed by "
+                       << NT << " threads per row, adding " << temp_buf_size / 
1024 << " kb of temp buffer in global memory." <<  std::endl;
+#endif
                CHECK_CUDA(op->program.kernel(name)
                                .instantiate(type_of(value_type), std::max(1u, 
num_sides), op->num_temp_vectors, tmp_len)
                                .configure(grid, block, shared_mem_size)
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp 
b/src/main/cuda/spoof-launcher/jni_bridge.cpp
index af6967d..fd95d1b 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.cpp
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -26,10 +26,42 @@
 
 #define RELEASE_ARRAY(env, java, cpp)(env->ReleasePrimitiveArrayCritical(java, 
cpp, 0))
 
+// error output helper
+void printStdException(JNIEnv *env, jstring name, const std::exception& e, 
bool compile = false) {
+       std::string type = compile ? "compiling" : "executing";
+       if(name != nullptr) {
+               const char *cstr_name = env->GetStringUTFChars(name, nullptr);
+               std::cout << "std::exception while " << type << "  SPOOF CUDA 
operator " << cstr_name << ":\n" << e.what() <<
+                                 std::endl;
+               env->ReleaseStringUTFChars(name, cstr_name);
+       }
+       else
+               std::cout << "std::exception while " << type << " SPOOF CUDA 
operator (name=nullptr):\n" << e.what() <<
+                               std::endl;
+}
+
+void printException(JNIEnv* env, jstring name, bool compile = false) {
+       std::string type = compile ? "compiling" : "executing";
+       if(name != nullptr) {
+               const char *cstr_name = env->GetStringUTFChars(name, nullptr);
+               std::cout << "Unknown exception occurred while " << type << " 
SPOOF CUDA operator " << cstr_name << std::endl;
+               env->ReleaseStringUTFChars(name, cstr_name);
+       }
+       else
+               std::cout << "Unknown exception occurred while " << type << " 
SPOOF CUDA operator (name=nullptr)" << std::endl;
+}
+
+void printSource(JNIEnv* env, jstring name, jstring src) {
+       if(src != nullptr) {
+               const char *cstr_src = env->GetStringUTFChars(src, nullptr);
+               std::cout << "Source code:\n" << cstr_src << std::endl;
+               env->ReleaseStringUTFChars(name, cstr_src);
+       }
+}
+
 JNIEXPORT jlong JNICALL
 Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
     JNIEnv *env, jobject jobj, jint device_id, jstring resource_path) {
-
   const char *cstr_rp = env->GetStringUTFChars(resource_path, nullptr);
   size_t ctx = SpoofCUDAContext::initialize_cuda(device_id, cstr_rp);
   env->ReleaseStringUTFChars(resource_path, cstr_rp);
@@ -42,54 +74,99 @@ 
Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
   SpoofCUDAContext::destroy_cuda(reinterpret_cast<SpoofCUDAContext *>(ctx), 
device_id);
 }
 
-JNIEXPORT jboolean JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_compile_1cuda_1kernel(
-    JNIEnv *env, jobject jobj, jlong ctx, jstring name, jstring src) {
-  auto *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
-  const char *cstr_name = env->GetStringUTFChars(name, nullptr);
-  const char *cstr_src = env->GetStringUTFChars(src, nullptr);
-  bool result = ctx_->compile_cuda(cstr_src, cstr_name);
-  env->ReleaseStringUTFChars(src, cstr_src);
-  env->ReleaseStringUTFChars(name, cstr_name);
-  return result;
+JNIEXPORT jint JNICALL 
Java_org_apache_sysds_hops_codegen_cplan_CNodeCell_compile_1nvrtc
+               (JNIEnv *env, jobject jobj, jlong ctx, jstring name, jstring 
src, jint type, jint agg_op, jboolean sparseSafe) {
+       try {
+               auto *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
+               const char *cstr_name = env->GetStringUTFChars(name, nullptr);
+               const char *cstr_src = env->GetStringUTFChars(src, nullptr);
+               
+               int result = ctx_->compile(cstr_src, cstr_name, 
SpoofOperator::OpType::CW,
+                               SpoofOperator::AggType(type), 
SpoofOperator::AggOp(agg_op), SpoofOperator::RowType::NONE, sparseSafe);
+               
+               env->ReleaseStringUTFChars(src, cstr_src);
+               env->ReleaseStringUTFChars(name, cstr_name);
+               return result;
+       }
+       catch (std::exception& e) {
+               printStdException(env, name, e, true);
+       }
+       catch (...) {
+               printException(env, name, true);
+       }
+       printSource(env, name, src);
+       return -1;
+}
+
+/*
+ * Class:     org_apache_sysds_hops_codegen_cplan_CNodeRow
+ * Method:    compile_nvrtc
+ * Signature: (JLjava/lang/String;Ljava/lang/String;IIIZ)I
+ */
+JNIEXPORT jint JNICALL 
Java_org_apache_sysds_hops_codegen_cplan_CNodeRow_compile_1nvrtc
+               (JNIEnv *env, jobject jobj, jlong ctx, jstring name, jstring 
src, jint type, jint const_dim2, jint num_vectors,
+               jboolean TB1) {
+       try {
+               auto *ctx_ = reinterpret_cast<SpoofCUDAContext *>(ctx);
+               const char *cstr_name = env->GetStringUTFChars(name, nullptr);
+               const char *cstr_src = env->GetStringUTFChars(src, nullptr);
+               
+               int result = ctx_->compile(cstr_src, cstr_name, 
SpoofOperator::OpType::RA,
+                               SpoofOperator::AggType::NONE, 
SpoofOperator::AggOp::NONE,
+                               SpoofOperator::RowType(type), true, const_dim2, 
num_vectors, TB1);
+               
+               env->ReleaseStringUTFChars(src, cstr_src);
+               env->ReleaseStringUTFChars(name, cstr_name);
+               return result;
+       }
+       catch (std::exception& e) {
+               printStdException(env, name, e);
+       }
+       catch (...) {
+               printException(env, name);
+       }
+       printSource(env, name, src);
+       return -1;
 }
 
+
 JNIEXPORT jdouble JNICALL
 Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1d(
     JNIEnv *env, jobject jobj, jlong ctx, jstring name, jlongArray in_ptrs, 
jint input_offset, jlongArray side_ptrs,
                jlongArray out_ptrs, jdoubleArray scalars_, jlong grix, jobject 
inputs_, jobject out_obj) {
-
-       auto *ctx_ = reinterpret_cast<SpoofCUDAContext*>(ctx);
-       const char *cstr_name = env->GetStringUTFChars(name, nullptr);
-
-       auto* inputs = reinterpret_cast<size_t*>(GET_ARRAY(env, in_ptrs));
-       auto* sides = reinterpret_cast<size_t*>(GET_ARRAY(env, side_ptrs));
-       auto *output = reinterpret_cast<size_t*>(GET_ARRAY(env, out_ptrs));
-       auto *scalars = reinterpret_cast<double*>(GET_ARRAY(env, scalars_));
-
-       //ToDo: call once while init
-       jclass CacheableData = 
env->FindClass("org/apache/sysds/runtime/controlprogram/caching/CacheableData");
-       if(!CacheableData) {
-               std::cerr << " JNIEnv -> FindClass(CacheableData) failed" << 
std::endl;
-               return -1.0;
-       }
-       jclass ArrayList = env->FindClass("java/util/ArrayList");
-       if(!ArrayList) {
-               std::cerr << " JNIEnv -> FindClass(ArrayList) failed" << 
std::endl;
-               return -1.0;
-       }
-       jmethodID mat_obj_num_rows = env->GetMethodID(CacheableData, 
"getNumRows", "()J");
-       if(!mat_obj_num_rows) {
-               std::cerr << " JNIEnv -> GetMethodID() failed" << std::endl;
-               return -1.0;
-       }
-       jmethodID mat_obj_num_cols = env->GetMethodID(CacheableData, 
"getNumColumns", "()J");
-       if(!mat_obj_num_cols) {
-               std::cerr << " JNIEnv -> GetMethodID() failed" << std::endl;
-               return -1.0;
-       }
-       jmethodID ArrayList_size = env->GetMethodID(ArrayList, "size", "()I");
-       jmethodID ArrayList_get = env->GetMethodID(ArrayList, "get", 
"(I)Ljava/lang/Object;");
+       double result = 0.0;
+       try {
+           auto *ctx_ = reinterpret_cast<SpoofCUDAContext*>(ctx);
+           const char *cstr_name = env->GetStringUTFChars(name, nullptr);
+           
+               auto* inputs = reinterpret_cast<size_t*>(GET_ARRAY(env, 
in_ptrs));
+               auto* sides = reinterpret_cast<size_t*>(GET_ARRAY(env, 
side_ptrs));
+               auto *output = reinterpret_cast<size_t*>(GET_ARRAY(env, 
out_ptrs));
+               auto *scalars = reinterpret_cast<double*>(GET_ARRAY(env, 
scalars_));
+
+               //ToDo: call once while init
+               jclass CacheableData = 
env->FindClass("org/apache/sysds/runtime/controlprogram/caching/CacheableData");
+               if (!CacheableData) {
+                       std::cerr << " JNIEnv -> FindClass(CacheableData) 
failed" << std::endl;
+                       return -1.0;
+               }
+               jclass ArrayList = env->FindClass("java/util/ArrayList");
+               if (!ArrayList) {
+                       std::cerr << " JNIEnv -> FindClass(ArrayList) failed" 
<< std::endl;
+                       return -1.0;
+               }
+               jmethodID mat_obj_num_rows = env->GetMethodID(CacheableData, 
"getNumRows", "()J");
+               if (!mat_obj_num_rows) {
+                       std::cerr << " JNIEnv -> GetMethodID() failed" << 
std::endl;
+                       return -1.0;
+               }
+               jmethodID mat_obj_num_cols = env->GetMethodID(CacheableData, 
"getNumColumns", "()J");
+               if (!mat_obj_num_cols) {
+                       std::cerr << " JNIEnv -> GetMethodID() failed" << 
std::endl;
+                       return -1.0;
+               }
+               jmethodID ArrayList_size = env->GetMethodID(ArrayList, "size", 
"()I");
+               jmethodID ArrayList_get = env->GetMethodID(ArrayList, "get", 
"(I)Ljava/lang/Object;");
 
                std::vector<Matrix<double>> in;
                jint num_inputs = env->CallIntMethod(inputs_, ArrayList_size);
@@ -97,50 +174,58 @@ Java_org_apache_sysds_runtime_codegen_SpoofCUDA_execute_1d(
                std::cout << "num inputs: " << num_inputs << " offsets: " << 
input_offset << std::endl;
 #endif
 
-       for(auto ptr_idx = 0, input_idx = 0; input_idx < input_offset; 
ptr_idx+=4, input_idx++) {
-               jobject input_obj = env->CallObjectMethod(inputs_, 
ArrayList_get, input_idx);
-               auto m = static_cast<uint32_t>(env->CallIntMethod(input_obj, 
mat_obj_num_rows));
-               auto n = static_cast<uint32_t>(env->CallIntMethod(input_obj, 
mat_obj_num_cols));
+        for(auto ptr_idx = 0, input_idx = 0; input_idx < input_offset; 
ptr_idx+=4, input_idx++) {
+            jobject input_obj = env->CallObjectMethod(inputs_, ArrayList_get, 
input_idx);
+            auto m = static_cast<uint32_t>(env->CallIntMethod(input_obj, 
mat_obj_num_rows));
+            auto n = static_cast<uint32_t>(env->CallIntMethod(input_obj, 
mat_obj_num_cols));
 
-               in.push_back(Matrix<double>{reinterpret_cast<double 
*>(inputs[ptr_idx + 3]),
-                                                                       
reinterpret_cast<uint32_t *>(inputs[ptr_idx + 1]),
-                                                                       
reinterpret_cast<uint32_t *>(inputs[ptr_idx + 2]),
-                                                                       m, n, 
static_cast<uint32_t>(inputs[ptr_idx])});
+            in.push_back(Matrix<double>{reinterpret_cast<double 
*>(inputs[ptr_idx + 3]),
+                                        reinterpret_cast<uint32_t 
*>(inputs[ptr_idx + 1]),
+                                        reinterpret_cast<uint32_t 
*>(inputs[ptr_idx + 2]),
+                                        m, n, 
static_cast<uint32_t>(inputs[ptr_idx])});
 #ifdef _DEBUG
                        std::cout << "input #" << input_idx << " m=" << m << " 
n=" << n << std::endl;
 #endif
+               }
+
+        std::vector<Matrix<double>> side_inputs;
+        for(uint32_t ptr_idx = 0, input_idx = input_offset; input_idx < 
num_inputs; ptr_idx+=4, input_idx++) {
+            jobject side_input_obj = env->CallObjectMethod(inputs_, 
ArrayList_get, input_idx);
+            auto m = static_cast<uint32_t>(env->CallIntMethod(side_input_obj, 
mat_obj_num_rows));
+            auto n = static_cast<uint32_t>(env->CallIntMethod(side_input_obj, 
mat_obj_num_cols));
+
+            side_inputs.push_back(Matrix<double>{reinterpret_cast<double 
*>(sides[ptr_idx + 3]),
+                                                 reinterpret_cast<uint32_t 
*>(sides[ptr_idx + 1]),
+                                                 reinterpret_cast<uint32_t 
*>(sides[ptr_idx + 2]),
+                                                 m, n, 
static_cast<uint32_t>(sides[ptr_idx])});
+        }
+
+        std::unique_ptr<Matrix<double>> out;
+        if(out_obj != nullptr) {
+            out = 
std::make_unique<Matrix<double>>(Matrix<double>{reinterpret_cast<double*>(output[3]),
+                                                    
reinterpret_cast<uint32_t*>(output[1]),
+                                                    
reinterpret_cast<uint32_t*>(output[2]),
+                                                    
static_cast<uint32_t>(env->CallIntMethod(out_obj, mat_obj_num_rows)),
+                                                    
static_cast<uint32_t>(env->CallIntMethod(out_obj, mat_obj_num_cols)),
+                                                    
static_cast<uint32_t>(output[0])});
+        }
+
+               result = ctx_->execute_kernel(cstr_name, in, side_inputs, 
out.get(), scalars,
+                                                                               
         env->GetArrayLength(scalars_), grix);
+
+               RELEASE_ARRAY(env, in_ptrs, inputs);
+               RELEASE_ARRAY(env, side_ptrs, sides);
+               RELEASE_ARRAY(env, out_ptrs, output);
+               RELEASE_ARRAY(env, scalars_, scalars);
+               
+               env->ReleaseStringUTFChars(name, cstr_name);
+               return result;
        }
-
-       std::vector<Matrix<double>> side_inputs;
-       for(uint32_t ptr_idx = 0, input_idx = input_offset; input_idx < 
num_inputs; ptr_idx+=4, input_idx++) {
-               jobject side_input_obj = env->CallObjectMethod(inputs_, 
ArrayList_get, input_idx);
-               auto m = 
static_cast<uint32_t>(env->CallIntMethod(side_input_obj, mat_obj_num_rows));
-               auto n = 
static_cast<uint32_t>(env->CallIntMethod(side_input_obj, mat_obj_num_cols));
-
-               side_inputs.push_back(Matrix<double>{reinterpret_cast<double 
*>(sides[ptr_idx + 3]),
-                                                                               
         reinterpret_cast<uint32_t *>(sides[ptr_idx + 1]),
-                                                                               
         reinterpret_cast<uint32_t *>(sides[ptr_idx + 2]),
-                                                                               
         m, n, static_cast<uint32_t>(sides[ptr_idx])});
+       catch (std::exception& e) {
+               printStdException(env, name, e);
        }
-
-       std::unique_ptr<Matrix<double>> out;
-       if(out_obj != nullptr) {
-               out = 
std::make_unique<Matrix<double>>(Matrix<double>{reinterpret_cast<double*>(output[3]),
-                                                                               
                reinterpret_cast<uint32_t*>(output[1]),
-                                                                               
                reinterpret_cast<uint32_t*>(output[2]),
-                                                                               
                static_cast<uint32_t>(env->CallIntMethod(out_obj, 
mat_obj_num_rows)),
-                                                                               
                static_cast<uint32_t>(env->CallIntMethod(out_obj, 
mat_obj_num_cols)),
-                                                                               
                static_cast<uint32_t>(output[0])});
+       catch (...) {
+               printException(env, name);
        }
-
-       double result = ctx_->execute_kernel(cstr_name, in, side_inputs, 
out.get(), scalars, env->GetArrayLength(scalars_),
-                                                                               
        grix);
-
-       RELEASE_ARRAY(env, in_ptrs, inputs);
-       RELEASE_ARRAY(env, side_ptrs, sides);
-       RELEASE_ARRAY(env, out_ptrs, output);
-       RELEASE_ARRAY(env, scalars_, scalars);
-
-       env->ReleaseStringUTFChars(name, cstr_name);
        return result;
 }
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.h 
b/src/main/cuda/spoof-launcher/jni_bridge.h
index 788f93e..48c4882 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.h
+++ b/src/main/cuda/spoof-launcher/jni_bridge.h
@@ -48,6 +48,22 @@ 
Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
     JNIEnv *, jobject, jlong, jint);
 
 /*
+ * Class:     org_apache_sysds_hops_codegen_cplan_CNodeCell
+ * Method:    compile_nvrtc
+ * Signature: (JLjava/lang/String;Ljava/lang/String;IIZ)I
+ */
+JNIEXPORT jint JNICALL 
Java_org_apache_sysds_hops_codegen_cplan_CNodeCell_compile_1nvrtc
+               (JNIEnv *, jobject, jlong, jstring, jstring, jint, jint, 
jboolean);
+
+/*
+ * Class:     org_apache_sysds_hops_codegen_cplan_CNodeRow
+ * Method:    compile_nvrtc
+ * Signature: (JLjava/lang/String;Ljava/lang/String;IIIZ)I
+ */
+JNIEXPORT jint JNICALL 
Java_org_apache_sysds_hops_codegen_cplan_CNodeRow_compile_1nvrtc
+               (JNIEnv *, jobject, jlong, jstring, jstring, jint, jint, jint, 
jboolean);
+
+/*
  * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
  * Method:    compile_cuda_kernel
  * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
diff --git a/src/main/cuda/spoof/cellwise.cu b/src/main/cuda/spoof/cellwise.cu
index f26161b..ca25fe5 100644
--- a/src/main/cuda/spoof/cellwise.cu
+++ b/src/main/cuda/spoof/cellwise.cu
@@ -58,6 +58,7 @@ struct SpoofCellwiseOp {
 //%NEED_GRIX%
 
 %BODY_dense%
+//printf("tid=%d a=%4.1f\n", threadIdx.x, a);
                return %OUT%;
        }
 };
diff --git a/src/main/java/org/apache/sysds/common/Types.java 
b/src/main/java/org/apache/sysds/common/Types.java
index 6c179cb..b9be432 100644
--- a/src/main/java/org/apache/sysds/common/Types.java
+++ b/src/main/java/org/apache/sysds/common/Types.java
@@ -20,10 +20,10 @@
 package org.apache.sysds.common;
 
 import java.util.Arrays;
+import java.util.HashMap;
 
 import org.apache.sysds.runtime.DMLRuntimeException;
 
-
 public class Types
 {
        /**
@@ -174,14 +174,14 @@ public class Types
                }
        }
        
+       // these values need to match with their native counterparts (spoof 
cuda ops)
        public enum AggOp {
-               SUM, SUM_SQ,
-               PROD, SUM_PROD,
-               MIN, MAX,
-               TRACE, MEAN, VAR,
-               MAXINDEX, MININDEX,
-               COUNT_DISTINCT,
-               COUNT_DISTINCT_APPROX;
+               SUM(1), SUM_SQ(2), MIN(3), MAX(4),
+               PROD(5), SUM_PROD(6),
+               TRACE(7), MEAN(8), VAR(9),
+               MAXINDEX(10), MININDEX(11),
+               COUNT_DISTINCT(12),
+               COUNT_DISTINCT_APPROX(13);
                
                @Override
                public String toString() {
@@ -192,6 +192,27 @@ public class Types
                                default:     return name().toLowerCase();
                        }
                }
+               
+               private final int value;
+               private final static HashMap<Integer, AggOp> map = new 
HashMap<>();
+               
+               AggOp(int value) {
+                       this.value = value;
+               }
+               
+               static {
+                       for (AggOp aggOp : AggOp.values()) {
+                               map.put(aggOp.value, aggOp);
+                       }
+               }
+               
+               public static AggOp valueOf(int aggOp) {
+                       return map.get(aggOp);
+               }
+               
+               public int getValue() {
+                       return value;
+               }
        }
        
        // Operations that require 1 operand
diff --git a/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java 
b/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
index 4e8e61e..46a7042 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/SpoofCompiler.java
@@ -238,8 +238,6 @@ public class SpoofCompiler {
                                        isLoaded = 
NativeHelper.loadLibraryHelperFromResource(libName);
 
                                if(isLoaded) {
-
-
                                        long ctx_ptr = 
initialize_cuda_context(0, local_tmp);
                                        if(ctx_ptr != 0) {
                                                
native_contexts.put(GeneratorAPI.CUDA, ctx_ptr);
@@ -289,14 +287,8 @@ public class SpoofCompiler {
                }
        }
 
-       private static boolean compile_cuda(String name, String src) {
-               return 
compile_cuda_kernel(native_contexts.get(GeneratorAPI.CUDA), name, src);
-       }
-
        private static native long initialize_cuda_context(int device_id, 
String resource_path);
 
-       private static native boolean compile_cuda_kernel(long ctx, String 
name, String src);
-
        private static native void destroy_cuda_context(long ctx, int 
device_id);
 
        //plan cache for cplan->compiled source to avoid unnecessary 
codegen/source code compile
@@ -514,18 +506,16 @@ public class SpoofCompiler {
                                Class<?> cla = 
planCache.getPlan(tmp.getValue());
                                
                                if( cla == null ) {
-                                       String src = "";
                                        String src_cuda = "";
-                                       boolean native_compiled_successfully = 
false;
-                                       src = tmp.getValue().codegen(false, 
GeneratorAPI.JAVA);
+                                       String src = 
tmp.getValue().codegen(false, GeneratorAPI.JAVA);
                                        cla = 
CodegenUtils.compileClass("codegen."+ tmp.getValue().getClassname(), src);
 
                                        if(API == GeneratorAPI.CUDA) {
                                                
if(tmp.getValue().isSupported(API)) {
                                                        src_cuda = 
tmp.getValue().codegen(false, GeneratorAPI.CUDA);
-                                                       
native_compiled_successfully = compile_cuda(tmp.getValue().getVarname(), 
src_cuda);
-                                                       
if(native_compiled_successfully)
-                                                               
CodegenUtils.putNativeOpData(new SpoofCUDA(src_cuda, tmp.getValue(), cla));
+                                                       int op_id = 
tmp.getValue().compile(API, src_cuda);
+                                                       if(op_id >= 0)
+                                                               
CodegenUtils.putNativeOpData(new SpoofCUDA(src_cuda, tmp.getValue(), op_id));
                                                        else {
                                                                LOG.warn("CUDA 
compilation failed, falling back to JAVA");
                                                                
tmp.getValue().setGeneratorAPI(GeneratorAPI.JAVA);
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java 
b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index f2f0179..9d1061f 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -22,6 +22,7 @@ package org.apache.sysds.hops.codegen.cplan;
 import java.util.ArrayList;
 
 import org.apache.sysds.common.Types.AggOp;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
 import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
 import org.apache.sysds.runtime.codegen.SpoofCellwise;
@@ -274,4 +275,14 @@ public class CNodeCell extends CNodeTpl
                return (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA) 
&& _output.isSupported(api) &&
                        !(getSpoofAggOp() == SpoofCellwise.AggOp.SUM_SQ);
        }
+       
+       public int compile(GeneratorAPI api, String src) {
+               if(api == GeneratorAPI.CUDA)
+                       return 
compile_nvrtc(SpoofCompiler.native_contexts.get(api), _genVar, src, 
_type.getValue
+                                       (), 
+                               _aggOp != null ? _aggOp.getValue() : 0, 
_sparseSafe);
+               return -1;
+       }
+       
+       private native int compile_nvrtc(long context, String name, String src, 
int type, int agg_op, boolean sparseSafe);
 }
diff --git 
a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java 
b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
index 895d945..c14c2c7 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeMultiAgg.java
@@ -25,6 +25,7 @@ import java.util.Arrays;
 import org.apache.commons.collections.CollectionUtils;
 import org.apache.sysds.hops.Hop;
 import org.apache.sysds.common.Types.AggOp;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
 import org.apache.sysds.runtime.util.UtilFunctions;
 import org.apache.sysds.hops.codegen.SpoofCompiler.GeneratorAPI;
@@ -216,4 +217,13 @@ public class CNodeMultiAgg extends CNodeTpl
                }
                return  is_supported;
        }
+       
+       public int compile(GeneratorAPI api, String src) {
+               if(api == GeneratorAPI.CUDA)
+                       return 
compile_nvrtc(SpoofCompiler.native_contexts.get(api), _genVar, src, 
_sparseSafe); // ToDo: compile MA
+               return -1;
+       }
+       
+       private native int compile_nvrtc(long context, String name, String src, 
boolean sparseSafe);
+       
 }
diff --git 
a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java 
b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
index 6a3a647..955e9e2 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeOuterProduct.java
@@ -21,6 +21,7 @@ package org.apache.sysds.hops.codegen.cplan;
 
 import java.util.ArrayList;
 
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
 import org.apache.sysds.lops.MMTSJ;
 import org.apache.sysds.runtime.codegen.SpoofOuterProduct.OutProdType;
@@ -197,4 +198,13 @@ public class CNodeOuterProduct extends CNodeTpl
                }
                return  is_supported;
        }
+
+       public int compile(GeneratorAPI api, String src) {
+               if(api == GeneratorAPI.CUDA)
+                       return 
compile_nvrtc(SpoofCompiler.native_contexts.get(api), _genVar, src, 
_type.ordinal());
+               return -1;
+       }
+       
+       private native int compile_nvrtc(long context, String name, String src, 
int type);
+
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java 
b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
index 74bde95..211c289 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeRow.java
@@ -24,6 +24,7 @@ import java.util.Arrays;
 import java.util.regex.Matcher;
 import java.util.stream.Collectors;
 
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
 import org.apache.sysds.hops.codegen.cplan.CNodeBinary.BinType;
 import org.apache.sysds.hops.codegen.template.TemplateUtils;
@@ -52,6 +53,7 @@ public class CNodeRow extends CNodeTpl
        private RowType _type = null; //access pattern 
        private long _constDim2 = -1; //constant number of output columns
        private int _numVectors = -1; //number of intermediate vectors
+       private boolean _tb1 = false;
        
        public void setRowType(RowType type) {
                _type = type;
@@ -115,8 +117,8 @@ public class CNodeRow extends CNodeTpl
                //replace colvector information and number of vector 
intermediates
                tmp = tmp.replace("%TYPE%", _type.name());
                tmp = tmp.replace("%CONST_DIM2%", String.valueOf(_constDim2));
-               tmp = tmp.replace("%TB1%", String.valueOf(
-                       TemplateUtils.containsBinary(_output, 
BinType.VECT_MATRIXMULT)));
+               _tb1 = TemplateUtils.containsBinary(_output, 
BinType.VECT_MATRIXMULT); 
+               tmp = tmp.replace("%TB1%", String.valueOf(_tb1));
 
                if(api == GeneratorAPI.CUDA && _numVectors > 0) {
                        tmp = tmp.replace("//%HAS_TEMP_VECT%", ": public 
TempStorageImpl<T, NUM_TMP_VECT, TMP_VECT_LEN>");
@@ -230,4 +232,15 @@ public class CNodeRow extends CNodeTpl
        public boolean isSupported(GeneratorAPI api) {
                return (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA) 
&& _output.isSupported(api);
        }
+
+       public int compile(GeneratorAPI api, String src) {
+               if(api == GeneratorAPI.CUDA)
+                       return 
compile_nvrtc(SpoofCompiler.native_contexts.get(api), _genVar, src, 
_type.getValue(), _constDim2, 
+                               _numVectors, _tb1);
+               return -1;
+       }
+       
+       private native int compile_nvrtc(long context, String name, String src, 
int type, long constDim2, int numVectors, 
+                       boolean TB1);
+
 }
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java 
b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
index 552b951..4f990c0 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeTpl.java
@@ -238,4 +238,6 @@ public abstract class CNodeTpl extends CNode implements 
Cloneable
        public GeneratorAPI getGeneratorAPI() { return api; }
        
        public void setGeneratorAPI(GeneratorAPI _api) { api = _api; }
+       
+       public abstract int compile(GeneratorAPI api, String src);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java 
b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
index f3ccf12..88d91a7 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDA.java
@@ -37,15 +37,15 @@ public class SpoofCUDA extends SpoofOperator {
        private static final long serialVersionUID = -2161276866245388359L;
        
        private final CNodeTpl cnt;
-       private final Class<?> java_op;
        public final String name;
        public final String src;
+       public final int id;
 
-       public SpoofCUDA(String source, CNodeTpl cnode, Class<?> java_op) {
+       public SpoofCUDA(String source, CNodeTpl cnode, int _id) {
                name = "codegen." + cnode.getVarname();
                cnt = cnode;
                src = source;
-               this.java_op = java_op;
+               id = _id;
        }
 
        public String getName() {
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCellwise.java 
b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCellwise.java
index f266f6d..89b502e 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCellwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCellwise.java
@@ -23,6 +23,7 @@ import java.io.Serializable;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
@@ -49,11 +50,33 @@ public abstract class SpoofCellwise extends SpoofOperator 
implements Serializabl
 {
        private static final long serialVersionUID = 3442528770573293590L;
        
+       // these values need to match their native counterparts (spoof 
cuda ops)
        public enum CellType {
-               NO_AGG,
-               FULL_AGG,
-               ROW_AGG,
-               COL_AGG,
+               NO_AGG(1),
+               FULL_AGG(2),
+               ROW_AGG(3),
+               COL_AGG(4);
+               
+               private final int value;
+               private final static HashMap<Integer, CellType> map = new 
HashMap<>();
+               
+               CellType(int value) {
+                       this.value = value;
+               }
+               
+               static {
+                       for (CellType cellType : CellType.values()) {
+                               map.put(cellType.value, cellType);
+                       }
+               }
+               
+               public static CellType valueOf(int cellType) {
+                       return map.get(cellType);
+               }
+               
+               public int getValue() {
+                       return value;
+               }                       
        }
        
        //redefinition of Hop.AggOp for cleaner imports in generate class
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofRowwise.java 
b/src/main/java/org/apache/sysds/runtime/codegen/SpoofRowwise.java
index b49cb8d..bd37aaf 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofRowwise.java
@@ -20,6 +20,7 @@
 package org.apache.sysds.runtime.codegen;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutorService;
@@ -45,18 +46,42 @@ public abstract class SpoofRowwise extends SpoofOperator
 {
        private static final long serialVersionUID = 6242910797139642998L;
        
+       // Enum with explicit integer values
+       // (int-to-enum mapping pattern, see 
https://codingexplained.com/coding/java/enum-to-integer-and-integer-to-enum)
+       // these values need to match their native counterparts (spoof 
cuda ops)
        public enum RowType {
-               NO_AGG,       //no aggregation
-               NO_AGG_B1,    //no aggregation w/ matrix mult B1
-               NO_AGG_CONST, //no aggregation w/ expansion/contraction
-               FULL_AGG,     //full row/col aggregation
-               ROW_AGG,      //row aggregation (e.g., rowSums() or X %*% v)
-               COL_AGG,      //col aggregation (e.g., colSums() or t(y) %*% X)
-               COL_AGG_T,    //transposed col aggregation (e.g., t(X) %*% y)
-               COL_AGG_B1,   //col aggregation w/ matrix mult B1
-               COL_AGG_B1_T, //transposed col aggregation w/ matrix mult B1
-               COL_AGG_B1R,  //col aggregation w/ matrix mult B1 to row vector
-               COL_AGG_CONST;//col aggregation w/ expansion/contraction
+               NO_AGG(1),       //no aggregation
+               NO_AGG_B1(2),    //no aggregation w/ matrix mult B1
+               NO_AGG_CONST(3), //no aggregation w/ expansion/contraction
+               FULL_AGG(4),     //full row/col aggregation
+               ROW_AGG(5),      //row aggregation (e.g., rowSums() or X %*% v)
+               COL_AGG (6),      //col aggregation (e.g., colSums() or t(y) 
%*% X)
+               COL_AGG_T(7),    //transposed col aggregation (e.g., t(X) %*% y)
+               COL_AGG_B1(8),   //col aggregation w/ matrix mult B1
+               COL_AGG_B1_T(9), //transposed col aggregation w/ matrix mult B1
+               COL_AGG_B1R(10),  //col aggregation w/ matrix mult B1 to row 
vector
+               COL_AGG_CONST (11);//col aggregation w/ expansion/contraction
+               
+               private final int value;
+               private final static HashMap<Integer, RowType> map = new 
HashMap<>();
+               
+               RowType(int value) {
+                       this.value = value;
+               }
+               
+               static {
+                       for (RowType rowType : RowType.values()) {
+                               map.put(rowType.value, rowType);
+                       }
+               }
+               
+               public static RowType valueOf(int rowType) {
+                       return map.get(rowType);
+               }
+               
+               public int getValue() {
+                       return value;
+               }
                
                public boolean isColumnAgg() {
                        return this == COL_AGG || this == COL_AGG_T
diff --git 
a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java 
b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
index 5fac4a9..10ea91e 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/codegen/CellwiseTmplTest.java
@@ -171,15 +171,10 @@ public class CellwiseTmplTest extends AutomatedTestBase
        }
        
        @Test
-       public void testCodegenCellwise4() 
-       {
-               testCodegenIntegration( TEST_NAME4, false, ExecType.CP  );
-       }
+       public void testCodegenCellwise4() { testCodegenIntegration( 
TEST_NAME4, false, ExecType.CP ); }
        
        @Test
-       public void testCodegenCellwise5() {
-               testCodegenIntegration( TEST_NAME5, false, ExecType.CP  );
-       }
+       public void testCodegenCellwise5() { testCodegenIntegration( 
TEST_NAME5, false, ExecType.CP  ); }
        
        @Test
        public void testCodegenCellwise6() {
@@ -192,9 +187,7 @@ public class CellwiseTmplTest extends AutomatedTestBase
        }
        
        @Test
-       public void testCodegenCellwise8() {
-               testCodegenIntegration( TEST_NAME8, false, ExecType.CP  );
-       }
+       public void testCodegenCellwise8() { testCodegenIntegration( 
TEST_NAME8, false, ExecType.CP  ); }
        
        @Test
        public void testCodegenCellwise9() {

Reply via email to