This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 0725185  [SYSTEMDS-3024] Improve performance by batching data descriptor transfers
0725185 is described below

commit 072518513bc8454457701132b492e86e2928f44a
Author: Mark Dokter <[email protected]>
AuthorDate: Thu Apr 15 15:50:02 2021 +0200

    [SYSTEMDS-3024] Improve performance by batching data descriptor transfers
    
    The spoof CUDA operators issue several small cudaMemcpy() invocations
    per operator execution. Transferring all descriptor data in one go
    reduces this overhead. In addition, asynchronous copies improve
    performance further and are a first step towards more asynchronicity
    in the GPU operations.
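    
    To illustrate the pattern (a minimal sketch under assumed names and
    sizes; the Desc struct and the toy transfer below are not the actual
    SystemDS code), the change replaces many small synchronous copies
    with one pinned-host staging buffer and a single asynchronous
    transfer:
    
        #include <cuda_runtime.h>
        #include <cstdint>
        #include <cstring>
        #include <vector>
        
        // hypothetical 40-byte matrix descriptor, packed on the host side
        struct Desc { uint64_t nnz; uint32_t rows, cols; void *row_ptr, *col_idx, *data; };
        
        int main() {
            std::vector<Desc> descs(8);                  // descriptors of all operands
            size_t bytes = descs.size() * sizeof(Desc);
        
            void *h_staging, *d_buffer;
            cudaStream_t stream;
            cudaMallocHost(&h_staging, bytes);           // pinned memory enables async H2D copies
            cudaMalloc(&d_buffer, bytes);
            cudaStreamCreate(&stream);
        
            std::memcpy(h_staging, descs.data(), bytes); // pack all descriptors once on the host
            // one batched async copy instead of one cudaMemcpy per descriptor
            cudaMemcpyAsync(d_buffer, h_staging, bytes, cudaMemcpyHostToDevice, stream);
            cudaStreamSynchronize(stream);               // kernels on 'stream' could overlap here instead
        
            cudaStreamDestroy(stream);
            cudaFree(d_buffer);
            cudaFreeHost(h_staging);
            return 0;
        }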
    
    Closes #1567
---
 src/main/cuda/headers/Matrix.h                     |  42 +----
 src/main/cuda/headers/reduction.cuh                |  52 +-----
 src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp  |  19 +-
 src/main/cuda/spoof-launcher/SpoofCUDAContext.h    |  62 ++-----
 src/main/cuda/spoof-launcher/SpoofCellwise.h       |  94 +++++-----
 src/main/cuda/spoof-launcher/SpoofOperator.h       | 128 +++++++++----
 src/main/cuda/spoof-launcher/SpoofRowwise.h        |  38 ++--
 src/main/cuda/spoof-launcher/jni_bridge.cpp        | 177 ++++++++----------
 src/main/cuda/spoof-launcher/jni_bridge.h          |  57 +++---
 src/main/cuda/spoof/cellwise.cu                    |   3 +-
 src/main/cuda/spoof/rowwise.cu                     |   2 +
 .../apache/sysds/hops/codegen/cplan/CNodeCell.java |  15 +-
 .../sysds/hops/codegen/cplan/cuda/Unary.java       |   4 +-
 .../sysds/runtime/codegen/SpoofCUDACellwise.java   | 116 +++++-------
 .../sysds/runtime/codegen/SpoofCUDAOperator.java   | 205 +++++++++------------
 .../sysds/runtime/codegen/SpoofCUDARowwise.java    |  80 +++-----
 .../sysds/runtime/codegen/SpoofOperator.java       |   3 +
 .../instructions/gpu/SpoofCUDAInstruction.java     |  21 +--
 .../instructions/gpu/context/GPUObject.java        |   3 +-
 19 files changed, 505 insertions(+), 616 deletions(-)
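
The new transfer protocol packs a fixed header followed by one 40-byte record
per matrix into the pinned staging buffer (see JNI_MAT_ENTRY_SIZE and
TRANSFERRED_DATA_HEADER_SIZE in SpoofOperator.h below). The sketch below
mirrors that layout as one might declare it; field order and meanings are
inferred from DataBufferWrapper and Matrix(uint8_t*) in this diff, so treat
it as illustrative rather than authoritative:

    #include <cstdint>

    struct TransferHeader {        // TRANSFERRED_DATA_HEADER_SIZE = 32
        uint32_t total_size;       // [ 0] bytes to ship to the device in one copy
        uint32_t op_id;            // [ 4] index into the compiled operator cache
        uint64_t grix;             // [ 8] global row index
        uint32_t num_inputs;       // [16]
        uint32_t num_sides;        // [20]
        uint32_t num_scalars;      // [24]
        uint32_t pad;              // [28] pads the header to 32 bytes
    };
    struct MatEntry {              // JNI_MAT_ENTRY_SIZE = 40
        uint64_t nnz;              // [ 0]
        uint32_t rows;             // [ 8]
        uint32_t cols;             // [12]
        uint64_t row_ptr;          // [16] device pointer (CSR row offsets)
        uint64_t col_idx;          // [24] device pointer (CSR column indices)
        uint64_t data;             // [32] device pointer (values)
    };
    static_assert(sizeof(TransferHeader) == 32, "must match TRANSFERRED_DATA_HEADER_SIZE");
    static_assert(sizeof(MatEntry) == 40, "must match JNI_MAT_ENTRY_SIZE");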

diff --git a/src/main/cuda/headers/Matrix.h b/src/main/cuda/headers/Matrix.h
index 0446983..61ef939 100644
--- a/src/main/cuda/headers/Matrix.h
+++ b/src/main/cuda/headers/Matrix.h
@@ -26,19 +26,19 @@ using int32_t = int;
 
 template <typename T>
 struct Matrix {
-       int32_t nnz;
+       uint64_t nnz;
        uint32_t rows;
        uint32_t cols;
-       
        uint32_t* row_ptr;
        uint32_t* col_idx;
        T* data;
        
        typedef T value_type;
        
-       explicit Matrix(size_t* jvals) : nnz(jvals[0]), rows(jvals[1]), cols(jvals[2]),
-                       row_ptr(reinterpret_cast<uint32_t*>(jvals[3])),
-                       col_idx(reinterpret_cast<uint32_t*>((jvals[4]))), data(reinterpret_cast<T*>(jvals[5])) {}
+       explicit Matrix(uint8_t* jvals) : nnz(*reinterpret_cast<uint64_t*>(&jvals[0])),
+               rows(*reinterpret_cast<uint32_t*>(&jvals[8])), cols(*reinterpret_cast<uint32_t*>(&jvals[12])),
+               row_ptr(*reinterpret_cast<uint32_t**>(&jvals[16])), col_idx(*reinterpret_cast<uint32_t**>(&jvals[24])),
+               data(*reinterpret_cast<T**>(&jvals[32])) {}
 };
 
 #ifdef __CUDACC__
@@ -72,7 +72,7 @@ public:
        
        __device__ void init(Matrix<T>* mat) { _mat = mat; }
        
-       __device__ uint32_t& nnz() { return _mat->nnz; }
+       __device__ uint32_t nnz() { return _mat->row_ptr == nullptr ? _mat->rows * _mat->cols : _mat->nnz; }
        __device__ uint32_t cols() { return _mat->cols; }
        __device__ uint32_t rows() { return _mat->rows; }
        
@@ -133,7 +133,7 @@ private:
        
        //ToDo sparse accessors
        __device__ uint32_t len_sparse() {
-               return _mat->nnz;
+               return _mat->row_ptr[_mat->rows];
        }
        
        __device__ uint32_t pos_sparse(uint32_t rix) {
@@ -227,34 +227,6 @@ public:
        }
 };
 
-template <typename T, int NUM_B>
-struct SpoofOp {
-       MatrixAccessor<T> a;
-       MatrixAccessor<T> b[NUM_B];
-       MatrixAccessor<T> c;
-       T* scalars;
-       uint32_t grix;
-       T* avals;
-       uint32_t* aix;
-       uint32_t alen;
-       
-       SpoofOp(Matrix<T>* A, Matrix<T>* B, Matrix<T>* C, T* scalars, T* 
tmp_stor, uint32_t grix) :
-                       scalars(scalars), grix(grix), avals(A->data), 
aix(A->col_idx) {
-               a.init(A);
-               c.init(C);
-               alen = a.row_len(grix);
-
-               if(B)
-                       for(auto i = 0; i < NUM_B; ++i)
-                               b[i].init(&(B[i]));
-       }
-       
-//     __device__ Vector<T>& getTempStorage(uint32_t len) {
-//             Vector<T>& vec = temp_rb.next();
-//             tvec.length = len;
-//             return vec;
-//     }
-};
 #endif // __CUDACC_RTC__
 
 #endif //SYSTEMDS_MATRIX_H
diff --git a/src/main/cuda/headers/reduction.cuh 
b/src/main/cuda/headers/reduction.cuh
index 3cd0a0b..7707fe2 100644
--- a/src/main/cuda/headers/reduction.cuh
+++ b/src/main/cuda/headers/reduction.cuh
@@ -22,6 +22,7 @@
 #define REDUCTION_CUH
 
 using uint = unsigned int;
+
 #include <cuda_runtime.h>
 
 #include "utils.cuh"
@@ -51,7 +52,9 @@ using uint = unsigned int;
  * @param SpoofCellwiseOp              initial value for the reduction variable
  */
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
-__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, 
uint32_t N, T VT, ReductionOp reduction_op, SpoofCellwiseOp spoof_op) {
+__device__ void FULL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, 
uint32_t N, T VT, ReductionOp reduction_op,
+       SpoofCellwiseOp spoof_op)
+{
        auto sdata = shared_memory_proxy<T>();
 
        // perform first level of reduction,
@@ -66,12 +69,9 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t
        // number of active thread blocks (via gridDim).  More blocks will 
result
        // in a larger gridSize and therefore fewer elements per thread
        while (i < N) {
-//             printf("tid=%d i=%d N=%d, in->cols()=%d rix=%d\n", threadIdx.x, 
i, N, in->cols(), i/in->cols());
                v = reduction_op(v, spoof_op(*(in->vals(i)), i, i / in->cols(), 
i % in->cols()));
 
                if (i + blockDim.x < N) {
-                       //__syncthreads();
-                       //printf("loop fetch i(%d)+blockDim.x(%d)=%d, 
in=%f\n",i, blockDim.x, i + blockDim.x, g_idata[i + blockDim.x]);
                        v = reduction_op(v, spoof_op(*(in->vals(i+blockDim.x)), 
blockDim.x + i, (i+blockDim.x) / in->cols(), (i+blockDim.x) % in->cols()));
                }
 
@@ -116,40 +116,25 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t
                if (blockDim.x >= 64) {
                        smem[tid] = v = reduction_op(v, smem[tid + 32]);
                }
-//             if(tid<12)
-//                     printf("bid=%d tid=%d reduction result: %3.1f\n", 
blockIdx.x, tid, sdata[tid]);
-               
                if (blockDim.x >= 32) {
                        smem[tid] = v = reduction_op(v, smem[tid + 16]);
                }
-//             if(tid==0)
-//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                if (blockDim.x >= 16) {
                        smem[tid] = v = reduction_op(v, smem[tid + 8]);
                }
-//             if(tid==0)
-//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                if (blockDim.x >= 8) {
                        smem[tid] = v = reduction_op(v, smem[tid + 4]);
                }
-//             if(tid==0)
-//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                if (blockDim.x >= 4) {
                        smem[tid] = v = reduction_op(v, smem[tid + 2]);
                }
-//             if(tid==0)
-//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                if (blockDim.x >= 2) {
                        smem[tid] = v = reduction_op(v, smem[tid + 1]);
                }
-//             if(tid==0)
-//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
        }
 
         // write result for this block to global mem
         if (tid == 0) {
-//             if(gridDim.x < 10)
-//                     printf("blockIdx.x=%d reduction result: %3.1f\n", 
blockIdx.x, sdata[0]);
                out->val(0, blockIdx.x) = sdata[0];
         }
 }
@@ -174,19 +159,10 @@ __device__ void FULL_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t
  * the value before writing it to its final location in global memory for each
  * row
  */
-//template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
-//__device__ void ROW_AGG(
-//             T *g_idata, ///< input data stored in device memory (of size 
rows*cols)
-//             T *g_odata,  ///< output/temporary array store in device memory 
(of size
-//             /// rows*cols)
-//             uint rows,  ///< rows in input and temporary/output arrays
-//             uint cols,  ///< columns in input and temporary/output arrays
-//             T initialValue,  ///< initial value for the reduction variable
-//             ReductionOp reduction_op, ///< Reduction operation to perform 
(functor object)
-//             SpoofCellwiseOp spoof_op) ///< Operation to perform before 
assigning this
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
 __device__ void ROW_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, 
uint32_t N, T VT,  ReductionOp reduction_op,
-                                          SpoofCellwiseOp spoof_op) {
+       SpoofCellwiseOp spoof_op)
+{
        auto sdata = shared_memory_proxy<T>();
 
        // one block per row
@@ -199,7 +175,6 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t
        uint32_t i = tid;
        uint block_offset = block * in->cols();
 
-//     T v = initialValue;
        T v = reduction_op.init();
        while (i < in->cols()) {
                v = reduction_op(v, spoof_op(in->val(block_offset + i), i, i / 
in->cols(), i % in->cols()));
@@ -283,16 +258,8 @@ __device__ void ROW_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t
  */
 template<typename T, typename ReductionOp, typename SpoofCellwiseOp>
 __device__ void COL_AGG(MatrixAccessor<T>* in, MatrixAccessor<T>* out, 
uint32_t N, T VT,  ReductionOp reduction_op,
-                                               SpoofCellwiseOp spoof_op) {
-//__device__ void COL_AGG(T *g_idata, ///< input data stored in device memory 
(of size rows*cols)
-//             T *g_odata,  ///< output/temporary array store in device memory 
(of size rows*cols)
-//             uint rows,  ///< rows in input and temporary/output arrays
-//             uint cols,  ///< columns in input and temporary/output arrays
-//             T initialValue,  ///< initial value for the reduction variable
-//             ReductionOp reduction_op, ///< Reduction operation to perform 
(functor object)
-//             SpoofCellwiseOp spoof_op) ///< Operation to perform before 
aggregation
-//
-//{
+       SpoofCellwiseOp spoof_op)
+{
        uint global_tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (global_tid >= in->cols()) {
                return;
@@ -315,13 +282,12 @@ __device__ void NO_AGG(MatrixAccessor<T>* in, 
MatrixAccessor<T>* out, uint32_t N
        uint32_t gtid = blockIdx.x * blockDim.x + threadIdx.x;
        uint32_t first_idx = gtid * static_cast<uint32_t>(VT);
        uint32_t last_idx = min(first_idx + static_cast<uint32_t>(VT), N);
+
        #pragma unroll
        for(auto i = first_idx; i < last_idx; i++) {
                T a = in->hasData() ? in->vals(0)[i] : 0;
                T result = spoof_op(a, i, i / in->cols(), i % in->cols());
                out->vals(0)[i] = result;
-               //if(i < 4)
-               //      printf("tid=%d in=%4.3f res=%4.3f out=%4.3f r=%d\n", i, 
in->vals(0)[i], result, out->vals(0)[i], i/in->cols());
        }
 }
 
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp 
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
index 233a4a1..2ef482d 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -77,10 +77,15 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t 
device_id, const char* resourc
        CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_max_d"));
        
ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
 SpoofOperator::AggOp::MAX), func));
        
+       
CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(ctx->staging_buffer)), 
ctx->default_mem_size));
+       
CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(ctx->device_buffer)), 
ctx->default_mem_size));
+       ctx->current_mem_size = ctx->default_mem_size;
        return reinterpret_cast<size_t>(ctx);
 }
 
 void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, [[maybe_unused]] 
uint32_t device_id) {
+       cudaFreeHost(ctx->staging_buffer);
+       cudaFree(ctx->device_buffer);
        delete ctx;
        // cuda device is handled by jCuda atm
        //cudaDeviceReset();
@@ -116,15 +121,25 @@ size_t 
SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const std::s
 
 template<typename T>
 CUfunction SpoofCUDAContext::getReductionKernel(const 
std::pair<SpoofOperator::AggType, SpoofOperator::AggOp> &key) {
-       return nullptr;
+       return nullptr; // generic case never used
 }
+
 template<>
 CUfunction SpoofCUDAContext::getReductionKernel<float>(const 
std::pair<SpoofOperator::AggType,
                SpoofOperator::AggOp> &key) {
        return reduction_kernels_f[key];
 }
+
 template<>
 CUfunction SpoofCUDAContext::getReductionKernel<double>(const 
std::pair<SpoofOperator::AggType,
                SpoofOperator::AggOp> &key) {
        return reduction_kernels_d[key];
-}
\ No newline at end of file
+}
+
+void SpoofCUDAContext::resize_staging_buffer(size_t size) {
+       cudaFreeHost(staging_buffer);
+       cudaFree(device_buffer);
+       
CHECK_CUDART(cudaMallocHost(reinterpret_cast<void**>(&(staging_buffer)), size));
+       CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&(device_buffer)), 
size));
+       current_mem_size = size;
+}
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h 
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index 696682f..e4b80c5 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -46,62 +46,34 @@ class SpoofCUDAContext {
        std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, 
CUfunction> reduction_kernels_f;
        std::map<std::pair<SpoofOperator::AggType, SpoofOperator::AggOp>, 
CUfunction> reduction_kernels_d;
 
-//     double handling_total, compile_total;
-       
        const std::string resource_path;
        const std::vector<std::string> include_paths;
        
 public:
+       size_t default_mem_size = 1024; // 1 KB for hosting data pointers, scalars and some meta info.
+                                       // This default should not require resizing these buffers in most cases.
+       size_t current_mem_size = 0; // the actual staging buffer size (default unless there was a resize)
+       std::byte* staging_buffer{}; // pinned host mem for async transfers
+       std::byte* device_buffer{};  // this buffer holds the pointers to the data buffers
 
        explicit SpoofCUDAContext(const char* resource_path_, 
std::vector<std::string>  include_paths_) : reductions(nullptr),
-                       resource_path(resource_path_), 
include_paths(std::move(include_paths_))
-                       //,handling_total(0.0), compile_total(0.0)
-                       {}
+                       resource_path(resource_path_), 
include_paths(std::move(include_paths_)) { }
 
        static size_t initialize_cuda(uint32_t device_id, const char* 
resource_path_);
 
        static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
-       
+
        size_t compile(std::unique_ptr<SpoofOperator> op, const std::string 
&src);
-       
+
        template <typename T, typename CALL>
-       int launch(uint32_t opID, std::vector<Matrix<T>>& input, 
std::vector<Matrix<T>>& sides, Matrix<T>& output,
-                       T* scalars, uint32_t grix) {
-               // dp holds in/side/out/scalar pointers for GPU
-               DevMatPtrs<T> dp;
-
-               SpoofOperator* op = compiled_ops[opID].get();
-               
-               CHECK_CUDART(cudaMalloc((void **)&dp.in, sizeof(Matrix<T>) * 
input.size()));
-               CHECK_CUDART(cudaMemcpy(dp.in, 
reinterpret_cast<void*>(&input[0]), sizeof(Matrix<T>) * input.size(),
-                               cudaMemcpyHostToDevice));
-
-               if (!sides.empty()) {
-                       CHECK_CUDART(cudaMalloc(reinterpret_cast<void 
**>(&dp.sides), sizeof(Matrix<T>) * sides.size()));
-                       CHECK_CUDART(cudaMemcpy(dp.sides, &sides[0], 
sizeof(Matrix<T>)  * sides.size(), cudaMemcpyHostToDevice));
-               }
-               
-               if (op->isSparseSafe() && input.front().row_ptr != nullptr) {
-                       CHECK_CUDART(cudaMemcpy(output.row_ptr, 
input.front().row_ptr, (input.front().rows+1)*sizeof(uint32_t),
-                                       cudaMemcpyDeviceToDevice));
-               }
-#ifndef NDEBUG
-               std::cout << "output rows: " << output.rows << " cols: " << 
output.cols << " nnz: " << output.nnz << " format: " <<
-                               (output.row_ptr == nullptr ? "dense" : 
"sparse") << std::endl;
-#endif
-               size_t out_num_elements = output.rows * output.cols;
-               if(output.row_ptr)
-                       if(op->isSparseSafe() && output.nnz > 0)
-                               out_num_elements = output.nnz;
-               CHECK_CUDART(cudaMalloc((void **) &dp.out, sizeof(Matrix<T>)));
-               CHECK_CUDART(cudaMemset(output.data, 0, out_num_elements * 
sizeof(T)));
-               CHECK_CUDART(cudaMemcpy(dp.out, reinterpret_cast<void 
*>(&output), sizeof(Matrix<T>),
-                               cudaMemcpyHostToDevice));
-               
-               dp.scalars = scalars;
-
-               CALL::exec(this, op, input, sides, output, grix, dp);
-               
+       int launch() {
+
+               DataBufferWrapper dbw(staging_buffer, device_buffer);
+               SpoofOperator* op = compiled_ops[dbw.op_id()].get();
+               dbw.toDevice(op->stream);
+
+               CALL::exec(this, op, &dbw);
+
                return 0;
        }
        
@@ -109,6 +81,8 @@ public:
 
        template<typename T>
        CUfunction getReductionKernel(const std::pair<SpoofOperator::AggType, 
SpoofOperator::AggOp>& key);
+
+       void resize_staging_buffer(size_t size);
 };
 
 #endif // SPOOFCUDACONTEXT_H
diff --git a/src/main/cuda/spoof-launcher/SpoofCellwise.h 
b/src/main/cuda/spoof-launcher/SpoofCellwise.h
index fe7e9d6..9077840 100644
--- a/src/main/cuda/spoof-launcher/SpoofCellwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofCellwise.h
@@ -27,8 +27,7 @@
 template<typename T>
 struct SpoofCellwiseFullAgg {
        
-       static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const 
std::string& op_name,
-                       std::vector<Matrix<T>>& sides, uint32_t grix,  
DevMatPtrs<T>& dp) {
+       static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const 
std::string& op_name, DataBufferWrapper* dbw) {
                T value_type;
                
                // num ctas
@@ -46,14 +45,15 @@ struct SpoofCellwiseFullAgg {
                                                  << std::endl;
 #endif
                CHECK_CUDA(op->program.get()->kernel(op_name)
-                  .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-                  .configure(grid, block, shared_mem_size)
-                  .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+                  .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+                  .configure(grid, block, shared_mem_size, op->stream)
+                  .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
                
                if(NB > 1) {
                        N = NB;
                        while (NB > 1) {
-                               void* args[3] = { &dp.out, &dp.out, &N};
+                               Matrix<T>* out = dbw->d_out<T>();
+                               void* args[3] = { &out, &out, &N};
                                
                                NB = std::ceil((N + NT * 2 - 1) / (NT * 2));
 #ifndef NDEBUG
@@ -64,7 +64,7 @@ struct SpoofCellwiseFullAgg {
                     << N << " elements"
                     << std::endl;
 #endif
-                               CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 
1, NT, 1, 1, shared_mem_size, nullptr, args, nullptr));
+                               CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 
1, NT, 1, 1, shared_mem_size, op->stream, args, nullptr));
                                N = NB;
                        }
                }
@@ -74,12 +74,11 @@ struct SpoofCellwiseFullAgg {
 
 template<typename T>
 struct SpoofCellwiseRowAgg {
-       static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const 
std::string &op_name,
-                         std::vector<Matrix<T>> &input, std::vector<Matrix<T>> 
&sides, uint32_t grix, DevMatPtrs<T>& dp) {
+       static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const 
std::string &op_name, DataBufferWrapper* dbw) {
                T value_type;
                
                // num ctas
-               uint32_t NB = input.front().rows;
+               uint32_t NB = dbw->h_in<T>(0)->rows;
                dim3 grid(NB, 1, 1);
                dim3 block(NT, 1, 1);
                uint32_t shared_mem_size = NT * sizeof(T);
@@ -90,9 +89,9 @@ struct SpoofCellwiseRowAgg {
                                        << N << " elements" << std::endl;
 #endif
                CHECK_CUDA(op->program->kernel(op_name)
-                  .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-                  .configure(grid, block, shared_mem_size)
-                  .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+                  .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+                  .configure(grid, block, shared_mem_size, op->stream)
+                  .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
                
        }
 };
@@ -100,8 +99,7 @@ struct SpoofCellwiseRowAgg {
 
 template<typename T>
 struct SpoofCellwiseColAgg {
-       static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const 
std::string& op_name,
-                                        std::vector<Matrix<T>>& sides, 
uint32_t grix, DevMatPtrs<T>& dp) {
+       static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const 
std::string& op_name, DataBufferWrapper* dbw) {
                T value_type;
                
                // num ctas
@@ -116,9 +114,9 @@ struct SpoofCellwiseColAgg {
                                                << N << " elements" << 
std::endl;
 #endif
                CHECK_CUDA(op->program->kernel(op_name)
-                  .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-                  .configure(grid, block, shared_mem_size)
-                  .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+                  .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+                  .configure(grid, block, shared_mem_size, op->stream)
+                  .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
                
        }
 };
@@ -126,72 +124,78 @@ struct SpoofCellwiseColAgg {
 
 template<typename T>
 struct SpoofCellwiseNoAgg {
-       static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const 
std::string &op_name,
-                         std::vector<Matrix<T>> &input, std::vector<Matrix<T>> 
&sides, uint32_t grix, DevMatPtrs<T>& dp) {
+       static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const 
std::string &op_name, DataBufferWrapper* dbw) {
                T value_type;
-               bool sparse_input = input.front().row_ptr != nullptr;
+               bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
                
                // num ctas
-               // ToDo: adaptive VT
+               // ToDo? adaptive VT
                const uint32_t VT = 1;
                uint32_t NB = std::ceil((N + NT * VT - 1) / (NT * VT));
                if(sparse_input)
-                       NB = input.front().rows;
+                       NB = dbw->h_in<T>(0)->rows;
                dim3 grid(NB, 1, 1);
                dim3 block(NT, 1, 1);
                uint32_t shared_mem_size = 0;
 
 #ifndef NDEBUG
+               std::cout << "output rows: " << dbw->h_out<T>()->rows << " 
cols: " << dbw->h_out<T>()->cols << " nnz: " <<
+                       (dbw->h_out<T>()->row_ptr == nullptr ? 
dbw->h_out<T>()->rows * dbw->h_out<T>()->cols :
+                               dbw->h_out<T>()->nnz) << " format: " << 
(dbw->h_out<T>()->row_ptr == nullptr
+                                       ? "dense" : "sparse") << std::endl;
+
                if(sparse_input) {
                                std::cout << "launching sparse spoof cellwise 
kernel " << op_name << " with " << NT * NB
-                                                 << " threads in " << NB << " 
blocks without aggregation for " << N << " elements"
-                                                 << std::endl;
+                                       << " threads in " << NB << " blocks 
without aggregation for " << N << " elements" << std::endl;
                }
                else {
-                       std::cout << "launching spoof cellwise kernel " << 
op_name << " with " << NT * NB
-                                         << " threads in " << NB << " blocks 
without aggregation for " << N << " elements"
-                                         << std::endl;
+                       std::cout << "launching spoof cellwise kernel " << 
op_name << " with " << NT * NB << " threads in " << NB <<
+                               " blocks without aggregation for " << N << " 
elements" << std::endl;
                }
 #endif
-               
                CHECK_CUDA(op->program->kernel(op_name)
-                  .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()))
-                  .configure(grid, block, shared_mem_size)
-                  .launch(dp.in, dp.sides, dp.out, dp.scalars, N, grix));
+                  .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
+                  .configure(grid, block, shared_mem_size, op->stream)
+                  .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
+
+               // copy the row pointers and column indices from input to output if appropriate
+               if (op->isSparseSafe() && dbw->h_in<T>(0)->row_ptr != nullptr) {
+                       // src/dst information (pointer address) is stored in the *host* buffer!
+                       CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->row_ptr, dbw->h_in<T>(0)->row_ptr,
+                               (dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+                       CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->col_idx, dbw->h_in<T>(0)->col_idx,
+                               (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+               }
        }
 };
 
 template<typename T>
 struct SpoofCellwise {
-       static void exec(SpoofCUDAContext* ctx, SpoofOperator* _op, 
std::vector<Matrix<T>>& input,
-                       std::vector<Matrix<T>>& sides, Matrix<T>& output, 
uint32_t grix,
-                       DevMatPtrs<T>& dp)  {
-               
-               T value_type;
+       static void exec(SpoofCUDAContext* ctx, SpoofOperator* _op, 
DataBufferWrapper* dbw) {
                auto* op = dynamic_cast<SpoofCellwiseOp*>(_op);
-               bool sparse_input = input.front().row_ptr != nullptr;
+               bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
                uint32_t NT = 256; // ToDo: num threads
-               uint32_t N = input.front().rows * input.front().cols;
+               uint32_t N = dbw->h_in<T>(0)->rows * dbw->h_in<T>(0)->cols;
                std::string op_name(op->name + "_DENSE");
                if(sparse_input) {
                        op_name = std::string(op->name + "_SPARSE");
-                       if(op->isSparseSafe() && input.front().nnz > 0)
-                               N = input.front().nnz;
+                       if(op->isSparseSafe() && dbw->h_in<T>(0)->nnz > 0)
+                               N = dbw->h_in<T>(0)->nnz;
                }
                
                switch(op->agg_type) {
                        case SpoofOperator::AggType::FULL_AGG:
                                op->agg_kernel = ctx->template 
getReductionKernel<T>(std::make_pair(op->agg_type, op->agg_op));
-                               SpoofCellwiseFullAgg<T>::exec(op, NT, N, 
op_name, sides, grix, dp);
+                               SpoofCellwiseFullAgg<T>::exec(op, NT, N, 
op_name, dbw);
                                break;
                        case SpoofOperator::AggType::ROW_AGG:
-                               SpoofCellwiseRowAgg<T>::exec(op, NT, N, 
op_name, input, sides, grix, dp);
+                               SpoofCellwiseRowAgg<T>::exec(op, NT, N, 
op_name, dbw);
                                break;
                        case SpoofOperator::AggType::COL_AGG:
-                               SpoofCellwiseColAgg<T>::exec(op, NT, N, 
op_name, sides, grix, dp);
+                               SpoofCellwiseColAgg<T>::exec(op, NT, N, 
op_name, dbw);
                                break;
                        case SpoofOperator::AggType::NO_AGG:
-                               SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, 
input, sides, grix, dp);
+                               SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, 
dbw);
                                break;
                        default:
                                throw std::runtime_error("unknown cellwise agg 
type" + std::to_string(static_cast<int>(op->agg_type)));
diff --git a/src/main/cuda/spoof-launcher/SpoofOperator.h 
b/src/main/cuda/spoof-launcher/SpoofOperator.h
index 0ccc633..f256e81 100644
--- a/src/main/cuda/spoof-launcher/SpoofOperator.h
+++ b/src/main/cuda/spoof-launcher/SpoofOperator.h
@@ -28,18 +28,24 @@
 #include "host_utils.h"
 #include "Matrix.h"
 
+// these two constants have equivalents in Java code:
+const uint32_t JNI_MAT_ENTRY_SIZE = 40;
+const uint32_t TRANSFERRED_DATA_HEADER_SIZE = 32;
+
 struct SpoofOperator {
-//     enum class OpType : int { CW, RA, MA, OP, NONE };
        enum class AggType : int { NO_AGG, FULL_AGG, ROW_AGG, COL_AGG };
        enum class AggOp : int { SUM, SUM_SQ, MIN, MAX };
        enum class RowType : int { FULL_AGG = 4 };
-       
-//     OpType op_type;
+
        std::string name;
-//     jitify::Program program;
        std::unique_ptr<jitify::Program> program;
        
        [[nodiscard]] virtual bool isSparseSafe() const = 0;
+
+       cudaStream_t stream{};
+       
+       SpoofOperator() { CHECK_CUDART(cudaStreamCreate(&stream));}
+       virtual ~SpoofOperator() {CHECK_CUDART(cudaStreamDestroy(stream));}
 };
 
 struct SpoofCellwiseOp : public SpoofOperator {
@@ -58,46 +64,90 @@ struct SpoofRowwiseOp : public SpoofOperator {
        int32_t const_dim2;
        RowType row_type;
        
-       SpoofRowwiseOp(RowType rt, bool tb1, uint32_t ntv, int32_t cd2)  : 
row_type(rt), TB1(tb1), num_temp_vectors(ntv),
+       SpoofRowwiseOp(RowType rt, bool tb1, uint32_t ntv, int32_t cd2) : 
row_type(rt), TB1(tb1), num_temp_vectors(ntv),
                        const_dim2(cd2) {}
                        
        [[nodiscard]] bool isSparseSafe() const override { return false; }
 };
 
-template<typename T>
-struct DevMatPtrs {
-       Matrix<T>* ptrs[3] = {0,0,0};
-       
-       Matrix<T>*& in = ptrs[0];
-       Matrix<T>*& sides = ptrs[1];
-       Matrix<T>*& out = ptrs[2];
-       T* scalars{};
-
-       ~DevMatPtrs() {
-#ifndef NDEBUG
-               std::cout << "~DevMatPtrs() before cudaFree:\n";
-               int i = 0;
-               for (auto& p : ptrs) {
-                       std::cout << " p[" << i << "]=" << p;
-                       i++;
-               }
-               std::cout << std::endl;
-#endif
-               for (auto& p : ptrs) {
-                       if (p) {
-                               CHECK_CUDART(cudaFree(p));
-                               p = nullptr;
-                       }
-               }
-#ifndef NDEBUG
-               std::cout << "~DevMatPtrs() after cudaFree:\n";
-               i = 0;
-               for (auto& p : ptrs) {
-                       std::cout << " p[" << i << "]=" << p;
-                       i++;
-               }
-               std::cout << std::endl;
-#endif
+struct DataBufferWrapper {
+       std::byte* staging_buffer;
+       std::byte* device_buffer;
+
+
+       template<typename T>
+       Matrix<T>* in(std::byte* buffer, uint32_t idx) {
+               return 
reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + idx * 
JNI_MAT_ENTRY_SIZE]);
+       }
+
+       template<typename T>
+       Matrix<T>* sides(std::byte* buffer) {
+               return 
reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + 
num_inputs() * JNI_MAT_ENTRY_SIZE]);
+       }
+
+       template<typename T>
+       Matrix<T>* out(std::byte* buffer) {
+               return 
reinterpret_cast<Matrix<T>*>(&buffer[TRANSFERRED_DATA_HEADER_SIZE + 
(num_inputs() + num_sides())
+                       * JNI_MAT_ENTRY_SIZE]);
+       }
+
+       template<typename T>
+       T* scalars(std::byte* buffer, uint32_t idx) {
+               return 
reinterpret_cast<T*>(&(buffer[TRANSFERRED_DATA_HEADER_SIZE + (num_inputs() + 
num_sides() + 1)
+                       * JNI_MAT_ENTRY_SIZE + idx * sizeof(T)]));
+       }
+
+public:
+       explicit DataBufferWrapper(std::byte* staging, std::byte* dev_buf) : 
staging_buffer(staging),
+               device_buffer(dev_buf) { }
+
+       void toDevice(cudaStream_t &stream) const {
+               CHECK_CUDART(cudaMemcpyAsync(device_buffer, staging_buffer, 
*reinterpret_cast<uint32_t*>(&staging_buffer[0]),
+                       cudaMemcpyHostToDevice, stream));
+       }
+
+       template<typename T>
+       Matrix<T>* d_in(uint32_t num) { return in<T>(device_buffer, num); }
+
+       template<typename T>
+       Matrix<T>* h_in(uint32_t num) { return in<T>(staging_buffer, num); }
+
+       template<typename T>
+       Matrix<T>* d_sides() { return sides<T>(device_buffer); }
+
+       template<typename T>
+       Matrix<T>* h_sides() { return sides<T>(staging_buffer); }
+
+       template<typename T>
+       Matrix<T>* d_out() { return out<T>(device_buffer); }
+
+       template<typename T>
+       Matrix<T>* h_out() { return out<T>(staging_buffer); }
+
+       template<typename T>
+       T* d_scalars(uint32_t idx = 0) { return scalars<T>(device_buffer, idx); 
}
+
+       template<typename T>
+       T* h_scalars(uint32_t idx = 0) { return scalars<T>(staging_buffer, 
idx); }
+
+       uint32_t op_id() const {
+               return 
*reinterpret_cast<uint32_t*>(&staging_buffer[sizeof(int)]);
+       }
+
+       uint64_t grix() const {
+               return *reinterpret_cast<uint64_t*>(&staging_buffer[2 * 
sizeof(int)]);
+       }
+
+       [[nodiscard]] uint32_t num_inputs() const {
+               return *reinterpret_cast<uint32_t*>(&staging_buffer[4 * sizeof(int)]);
+       }
+
+       uint32_t num_sides() const {
+               return *reinterpret_cast<uint32_t*>(&staging_buffer[5 * sizeof(int)]);
+       }
+
+       uint32_t num_scalars() const {
+               return *reinterpret_cast<uint32_t*>(&staging_buffer[6 * 
sizeof(int)]);
        }
 };
 
diff --git a/src/main/cuda/spoof-launcher/SpoofRowwise.h 
b/src/main/cuda/spoof-launcher/SpoofRowwise.h
index 1295314..4465ac9 100644
--- a/src/main/cuda/spoof-launcher/SpoofRowwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofRowwise.h
@@ -27,27 +27,34 @@
 template <typename T>
 struct SpoofRowwise {
        
-       static void exec([[maybe_unused]] SpoofCUDAContext* ctx, SpoofOperator* 
_op, std::vector<Matrix<T>>& input,
-                       std::vector<Matrix<T>>& sides, Matrix<T>& output, 
uint32_t grix, DevMatPtrs<T>& dp)  {
+       static void exec([[maybe_unused]] SpoofCUDAContext* ctx, SpoofOperator* 
_op, DataBufferWrapper* dbw)  {
                uint32_t NT=256;
                T value_type;
-               bool sparse_input = input.front().row_ptr != nullptr;
+               bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
                auto* op = dynamic_cast<SpoofRowwiseOp*>(_op);
-               dim3 grid(input.front().rows, 1, 1);
+               dim3 grid(dbw->h_in<T>(0)->rows, 1, 1);
                dim3 block(NT, 1, 1);
                unsigned int shared_mem_size = NT * sizeof(T);
-               
+
+               size_t out_num_elements = dbw->h_out<T>()->rows * 
dbw->h_out<T>()->cols;
+               if(dbw->h_out<T>()->row_ptr)
+                       if(op->isSparseSafe() && dbw->h_out<T>()->nnz > 0)
+                               out_num_elements = dbw->h_out<T>()->nnz;
+               //ToDo: only memset output when there is an output operation 
that *adds* to the buffer
+               CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, 
out_num_elements * sizeof(T), op->stream));
+
+               //ToDo: handle this in JVM
                uint32_t tmp_len = 0;
                uint32_t temp_buf_size = 0;
                T* d_temp = nullptr;
                if(op->num_temp_vectors > 0) {
-                       tmp_len = std::max(input.front().cols, op->const_dim2 < 
0 ? 0 : static_cast<uint32_t>(op->const_dim2));
-                       temp_buf_size = op->num_temp_vectors * tmp_len * 
input.front().rows * sizeof(T);
+                       tmp_len = std::max(dbw->h_in<T>(0)->cols, 
op->const_dim2 < 0 ? 0u : static_cast<uint32_t>(op->const_dim2));
+                       temp_buf_size = op->num_temp_vectors * tmp_len * 
dbw->h_in<T>(0)->rows * sizeof(T);
 #ifndef NDEBUG
                        std::cout << "num_temp_vect: " << op->num_temp_vectors 
<< " temp_buf_size: " << temp_buf_size << " tmp_len: " << tmp_len << std::endl;
 #endif
                        
CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
-                       CHECK_CUDART(cudaMemset(d_temp, 0, temp_buf_size));
+                       CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, 
op->stream));
                }
                
                std::string op_name(op->name + "_DENSE");
@@ -56,21 +63,18 @@ struct SpoofRowwise {
 
 #ifndef NDEBUG
                // ToDo: connect output to SystemDS logging facilities
-               std::cout << "launching spoof rowwise kernel " << op_name << " 
with " << NT * input.front().rows << " threads in "
-                               << input.front().rows << " blocks and " << 
shared_mem_size << " bytes of shared memory for "
-                               << input.front().rows << " cols processed by " 
<< NT << " threads per row, adding "
+               std::cout << "launching spoof rowwise kernel " << op_name << " 
with " << NT * dbw->h_in<T>(0)->rows << " threads in "
+                               << dbw->h_in<T>(0)->rows << " blocks and " << 
shared_mem_size << " bytes of shared memory for "
+                               << dbw->h_in<T>(0)->rows << " cols processed by 
" << NT << " threads per row, adding "
                                << temp_buf_size / 1024 << " kb of temp buffer 
in global memory." <<  std::endl;
 #endif
                CHECK_CUDA(op->program->kernel(op_name)
-                  .instantiate(type_of(value_type), std::max(static_cast<size_t>(1), sides.size()), op->num_temp_vectors, tmp_len)
-                  .configure(grid, block, shared_mem_size)
-                  .launch(dp.in, dp.sides, dp.out, dp.scalars, d_temp, grix));
+                  .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()), op->num_temp_vectors, tmp_len)
+                  .configure(grid, block, shared_mem_size, op->stream)
+                  .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), d_temp, dbw->grix()));
                
                if(op->num_temp_vectors > 0)
                        CHECK_CUDART(cudaFree(d_temp));
-               
-//             if (op->TB1)
-//                     CHECK_CUDART(cudaFree(b1_transposed));
        }
 };
 
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp 
b/src/main/cuda/spoof-launcher/jni_bridge.cpp
index 9a808a6..5134d5e 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.cpp
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -23,69 +23,86 @@
 #include "SpoofRowwise.h"
 
 // JNI Methods to get/release arrays
-#define GET_ARRAY(env, input)((void *)env->GetPrimitiveArrayCritical(input, 
nullptr))
+//#define GET_ARRAY(env, input)((void *)env->GetPrimitiveArrayCritical(input, 
nullptr))
+//#define RELEASE_ARRAY(env, java, 
cpp)(env->ReleasePrimitiveArrayCritical(java, cpp, 0))
 
-#define RELEASE_ARRAY(env, java, cpp)(env->ReleasePrimitiveArrayCritical(java, 
cpp, 0))
+jclass jcuda_pointer_class;
+jclass jcuda_native_pointer_class;
+jfieldID pointer_buffer_field;
+jfieldID native_pointer_field;
 
 // error output helper
-void printException(const std::string& name, const std::exception& e, bool 
compile = false) {
+void printException(const std::string &name, const std::exception &e, bool 
compile = false) {
        std::string type = compile ? "compiling" : "executing";
-       std::cout << "std::exception while " << type << "  SPOOF CUDA operator 
" << name << ":\n" << e.what() << std::endl;
+       std::cerr << "std::exception while " << type << " SPOOF CUDA operator " << name << ":\n" << e.what() << std::endl;
 }
 
 
-// a pod struct to have names for the passed pointers
-template<typename T>
-struct LaunchMetadata {
-       const T& opID;
-       const T& grix;
-       const size_t& num_inputs;
-       const size_t& num_sides;
-       
-       // num entries describing one matrix (6 entries):
-       // {nnz,rows,cols,row_ptr,col_idxs,data}
-       const size_t& entry_size;
-       const T& num_scalars;
-       
-       explicit LaunchMetadata(const size_t* jvals) : opID(jvals[0]), 
grix(jvals[1]), num_inputs(jvals[2]),
-                       num_sides(jvals[3]), entry_size(jvals[4]), 
num_scalars(jvals[5]) {}
-};
-
-
-[[maybe_unused]] JNIEXPORT jlong JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
-       JNIEnv *jenv, [[maybe_unused]] jobject jobj, jint device_id, jstring 
resource_path) {
+[[maybe_unused]] JNIEXPORT jlong JNICALL 
Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context
+       (JNIEnv *jenv, [[maybe_unused]] jobject jobj, jint device_id, jstring 
resource_path)
+{
        const char *cstr_rp = jenv->GetStringUTFChars(resource_path, nullptr);
        size_t ctx = SpoofCUDAContext::initialize_cuda(device_id, cstr_rp);
        jenv->ReleaseStringUTFChars(resource_path, cstr_rp);
+
+       // fetch some jcuda class handles
+       jcuda_pointer_class = jenv->FindClass("jcuda/Pointer");
+       jcuda_native_pointer_class = 
jenv->FindClass("jcuda/NativePointerObject");
+       pointer_buffer_field = jenv->GetFieldID(jcuda_pointer_class, "buffer", 
"Ljava/nio/Buffer;");
+       native_pointer_field = jenv->GetFieldID(jcuda_native_pointer_class, 
"nativePointer", "J");
+
+       // explicit cast to make compiler and linter happy
        return static_cast<jlong>(ctx);
 }
 
 
-[[maybe_unused]] JNIEXPORT void JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
-               [[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jobject jobj, 
jlong ctx, jint device_id) {
+[[maybe_unused]] JNIEXPORT void JNICALL 
Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context
+       ([[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong 
ctx, jint device_id) {
        SpoofCUDAContext::destroy_cuda(reinterpret_cast<SpoofCUDAContext 
*>(ctx), device_id);
 }
 
+[[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_runtime_codegen_SpoofOperator_getNativeStagingBuffer
+       (JNIEnv *jenv, [[maybe_unused]] jclass jobj, jobject ptr, jlong _ctx, 
jint size) {
+       std::string operator_name("SpoofOperator_getNativeStagingBuffer");
+       try {
+               // retrieve data handles from JVM
+               auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
+               if (size > ctx->current_mem_size)
+                       ctx->resize_staging_buffer(size);
+
+               jobject object = jenv->NewDirectByteBuffer(ctx->staging_buffer, 
size);
+               jenv->SetObjectField(ptr, pointer_buffer_field, object);
+               jenv->SetLongField(ptr,native_pointer_field, 
reinterpret_cast<jlong>(ctx->staging_buffer));
+
+               return 0;
+       }
+       catch (std::exception & e) {
+               printException(operator_name, e);
+       }
+       catch (...) {
+               printException(operator_name, std::runtime_error("unknown exception"));
+       }
+       return -1;
+}
 
 template<typename TEMPLATE>
-int compile_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong 
_ctx, jstring name, jstring src, TEMPLATE op) {
+int compile_spoof_operator
+       (JNIEnv *jenv, [[maybe_unused]] jobject jobj, jlong _ctx, jstring name, 
jstring src, TEMPLATE op) {
        std::string operator_name;
        try {
                auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
                const char *cstr_name = jenv->GetStringUTFChars(name, nullptr);
                const char *cstr_src = jenv->GetStringUTFChars(src, nullptr);
                operator_name = cstr_name;
-
                op->name = operator_name;
+
                int status = ctx->compile(std::move(op), cstr_src);
-               
+
                jenv->ReleaseStringUTFChars(src, cstr_src);
                jenv->ReleaseStringUTFChars(name, cstr_name);
                return status;
        }
-       catch (std::exception& e) {
+       catch (std::exception &e) {
                printException(operator_name, e, true);
        }
        catch (...) {
@@ -96,73 +113,41 @@ int compile_spoof_operator(JNIEnv *jenv, [[maybe_unused]] 
jobject jobj, jlong _c
 
 
 [[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_hops_codegen_cplan_CNodeCell_compile_1nvrtc
-               (JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring 
src, jint type, jint agg_op,
-                               jboolean sparseSafe) {
-       
+       (JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint 
type, jint agg_op, jboolean sparseSafe) {
        std::unique_ptr<SpoofCellwiseOp> op = 
std::make_unique<SpoofCellwiseOp>(SpoofOperator::AggType(type),
-                       SpoofOperator::AggOp(agg_op), sparseSafe);
-       
+                       SpoofOperator::AggOp(agg_op), sparseSafe);
+
        return compile_spoof_operator<std::unique_ptr<SpoofCellwiseOp>>(jenv, 
jobj, ctx, name, src, std::move(op));
 }
 
-
 [[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_hops_codegen_cplan_CNodeRow_compile_1nvrtc
-               (JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring 
src, jint type, jint const_dim2,
-                               jint num_vectors, jboolean TB1) {
-       
+       (JNIEnv *jenv, jobject jobj, jlong ctx, jstring name, jstring src, jint 
type, jint const_dim2,
+        jint num_vectors, jboolean TB1) {
        std::unique_ptr<SpoofRowwiseOp> op = 
std::make_unique<SpoofRowwiseOp>(SpoofOperator::RowType(type), TB1,
-                       num_vectors, const_dim2);
+                       num_vectors, const_dim2);
        return compile_spoof_operator<std::unique_ptr<SpoofRowwiseOp>>(jenv, 
jobj, ctx, name, src, std::move(op));
 }
 
 
 template<typename T, typename TEMPLATE>
-int launch_spoof_operator(JNIEnv *jenv, [[maybe_unused]] jclass jobj, jlong 
_ctx, jlongArray _meta, jlongArray in,
-               jlongArray _sides, jlongArray out, jlong _scalars) {
-       std::string operator_name("unknown");
+int launch_spoof_operator([[maybe_unused]] JNIEnv *jenv, [[maybe_unused]] 
jclass jobj, jlong _ctx) {
+       std::string operator_name("launch_spoof_operator jni-bridge");
        try {
                // retrieve data handles from JVM
-               auto *metacast = reinterpret_cast<size_t *>(GET_ARRAY(jenv, 
_meta));
                auto *ctx = reinterpret_cast<SpoofCUDAContext *>(_ctx);
-               auto *inputs = reinterpret_cast<size_t *>(GET_ARRAY(jenv, in));
-               auto *sides = reinterpret_cast<size_t *>(GET_ARRAY(jenv, 
_sides));
-               auto *output = reinterpret_cast<size_t *>(GET_ARRAY(jenv, out));
-//             auto *scalars = reinterpret_cast<T *>(GET_ARRAY(jenv, 
_scalars));
-               auto *scalars = reinterpret_cast<T *>(_scalars);
-               LaunchMetadata<size_t> meta(metacast);
-               
+
+#ifndef NDEBUG
+               uint32_t opID = 
*reinterpret_cast<uint32_t*>(&ctx->staging_buffer[sizeof(uint32_t)]);
                // this implicitly checks if op exists
-               operator_name = ctx->getOperatorName(meta.opID);
-               
-               // wrap/cast inputs
-               std::vector<Matrix<T>> mats_in;
-               for(auto i = 0ul; i < meta.num_inputs; i+=meta.entry_size)
-                       mats_in.emplace_back(&inputs[i]);
-               
-               // wrap/cast sides
-               std::vector<Matrix<T>> mats_sides;
-               for(auto i = 0ul; i < meta.num_sides; i+=meta.entry_size)
-                       mats_sides.emplace_back(&sides[i]);
-               
-               // wrap/cast output
-               Matrix<T> mat_out(output);
-               
-               // wrap/cast scalars
-//             std::unique_ptr<Matrix<T>> mat_scalars = scalars == nullptr ? 0 
: std::make_unique<Matrix<T>>(scalars);
-               
+               operator_name = ctx->getOperatorName(opID);
+               std::cout << "executing op=" << operator_name << " id=" << opID 
<< std::endl;
+#endif
                // transfers resource pointers to GPU and calls op->exec()
-               ctx->launch<T, TEMPLATE>(meta.opID, mats_in, mats_sides, 
mat_out, scalars, meta.grix);
-               
-               // release data handles from JVM
-               RELEASE_ARRAY(jenv, _meta, metacast);
-               RELEASE_ARRAY(jenv, in, inputs);
-               RELEASE_ARRAY(jenv, _sides, sides);
-               RELEASE_ARRAY(jenv, out, output);
-//             RELEASE_ARRAY(jenv, _scalars, scalars);
-               
+               ctx->launch<T, TEMPLATE>();
+
                return 0;
        }
-       catch (std::exception& e) {
+       catch (std::exception &e) {
                printException(operator_name, e);
        }
        catch (...) {
@@ -171,26 +156,26 @@ int launch_spoof_operator(JNIEnv *jenv, [[maybe_unused]] 
jclass jobj, jlong _ctx
        return -1;
 }
 
-[[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
-               (JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, 
jlongArray in, jlongArray sides, jlongArray out,
-                jlong scalars) {
-       return launch_spoof_operator<float, SpoofCellwise<float>>(jenv, jobj, 
ctx, meta, in, sides, out, scalars);
-}
 
 [[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1d
-               (JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, 
jlongArray in, jlongArray sides, jlongArray out,
-                jlong scalars) {
-       return launch_spoof_operator<double, SpoofCellwise<double>>(jenv, jobj, 
ctx, meta, in, sides, out, scalars);
+       (JNIEnv *jenv, jclass jobj, jlong ctx) {
+       return launch_spoof_operator<double, SpoofCellwise<double>>(jenv, jobj, 
ctx);
 }
 
-[[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
-               (JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, 
jlongArray in, jlongArray sides, jlongArray out,
-                jlong scalars) {
-       return launch_spoof_operator<float, SpoofRowwise<float>>(jenv, jobj, 
ctx, meta, in, sides, out, scalars);
+
+[[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
+       (JNIEnv *jenv, jclass jobj, jlong ctx) {
+       return launch_spoof_operator<float, SpoofCellwise<float>>(jenv, jobj, ctx);
 }
 
+
 [[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1d
-               (JNIEnv *jenv, jclass jobj, jlong ctx, jlongArray meta, 
jlongArray in, jlongArray sides, jlongArray out,
-                jlong scalars) {
-       return launch_spoof_operator<double, SpoofRowwise<double>>(jenv, jobj, 
ctx, meta, in, sides, out, scalars);
+       (JNIEnv *jenv, jclass jobj, jlong ctx) {
+       return launch_spoof_operator<double, SpoofRowwise<double>>(jenv, jobj, 
ctx);
 }
+
+
+[[maybe_unused]] JNIEXPORT jint JNICALL 
Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
+       (JNIEnv *jenv, jclass jobj, jlong ctx) {
+       return launch_spoof_operator<float, SpoofRowwise<float>>(jenv, jobj, ctx);
+}
\ No newline at end of file
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.h 
b/src/main/cuda/spoof-launcher/jni_bridge.h
index 1e9ef20..0f25936 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.h
+++ b/src/main/cuda/spoof-launcher/jni_bridge.h
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-/* DO NOT EDIT THIS FILE - it is machine generated */
+/* DO EDIT THIS FILE - it is not machine generated */
 
 #pragma once
 #ifndef JNI_BRIDGE_H
@@ -32,20 +32,26 @@ extern "C" {
 /*
  * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
  * Method:    initialize_cuda_context
- * Signature: (I)J
+ * Signature: (ILjava/lang/String;)J
  */
-[[maybe_unused]] JNIEXPORT jlong JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
-    JNIEnv *, [[maybe_unused]] jobject, jint, jstring);
+[[maybe_unused]] JNIEXPORT jlong JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context
+	(JNIEnv *, [[maybe_unused]] jobject, jint, jstring);
 
 /*
  * Class:     org_apache_sysds_hops_codegen_SpoofCompiler
  * Method:    destroy_cuda_context
  * Signature: (JI)V
  */
-[[maybe_unused]] JNIEXPORT void JNICALL
-Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
-		[[maybe_unused]] JNIEnv *, [[maybe_unused]] jobject, jlong, jint);
+[[maybe_unused]] JNIEXPORT void JNICALL Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context
+	([[maybe_unused]] JNIEnv *, [[maybe_unused]] jobject, jlong, jint);
+
+/*
+ * Class:     org_apache_sysds_runtime_codegen_SpoofOperator
+ * Method:    getNativeStagingBuffer
+ * Signature: (Ljcuda/Pointer;JI)I
+ */
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofOperator_getNativeStagingBuffer
+	(JNIEnv *, jclass, jobject, jlong, jint);
 
 /*
  * Class:     org_apache_sysds_hops_codegen_cplan_CNodeCell
@@ -64,37 +70,36 @@ Java_org_apache_sysds_hops_codegen_SpoofCompiler_destroy_1cuda_1context(
 		(JNIEnv *, [[maybe_unused]] jobject, jlong, jstring, jstring, jint, jint, jint, jboolean);
 
 /*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwiseOperator
- * Method:    execute_f
- * Signature: (J[J[J[J[JJ)I
- */
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
-
-/*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwiseOperator
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwise
  * Method:    execute_d
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1d
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
-
+	(JNIEnv *, jclass, jlong);
 
 /*
- * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDACellwise
  * Method:    execute_f
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
-[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDACellwise_execute_1f
+	(JNIEnv *, jclass, jlong);
 
 /*
  * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
  * Method:    execute_d
- * Signature: (J[J[J[J[JJ)I
+ * Signature: (J)I
  */
 [[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1d
-		(JNIEnv *, jclass, jlong, jlongArray, jlongArray, jlongArray, jlongArray, jlong);
+	(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_apache_sysds_runtime_codegen_SpoofCUDARowwise
+ * Method:    execute_f
+ * Signature: (J)I
+ */
+[[maybe_unused]] JNIEXPORT jint JNICALL Java_org_apache_sysds_runtime_codegen_SpoofCUDARowwise_execute_1f
+	(JNIEnv *, jclass, jlong);
 
 #ifdef __cplusplus
 }
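
For reference, the net effect of these header changes is that every execute
entry point shrinks from signature (J[J[J[J[JJ)I to (J)I: per-call metadata no
longer crosses JNI as arrays but travels through the packed staging buffer. A
minimal Java-side sketch of the slimmed-down native surface (the class name is
illustrative; the method names and shapes mirror the declarations above):

    import jcuda.Pointer;

    // Illustrative only: Java declarations matching the new JNI signatures.
    public class NativeSurfaceSketch {
        // (J)I -- only the native context handle remains per call
        public static native int execute_d(long ctx);
        // (Ljcuda/Pointer;JI)I -- Pointer out-param, context handle, payload size
        public static native int getNativeStagingBuffer(Pointer ptr, long ctx, int size);
    }
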
diff --git a/src/main/cuda/spoof/cellwise.cu b/src/main/cuda/spoof/cellwise.cu
index 951f709..6751b1d 100644
--- a/src/main/cuda/spoof/cellwise.cu
+++ b/src/main/cuda/spoof/cellwise.cu
@@ -42,7 +42,8 @@ struct SpoofCellwiseOp {
                uint32_t _grix;
 
 	SpoofCellwiseOp(Matrix<T>* _A, Matrix<T>* _B, Matrix<T>* _C, T* scalars, uint32_t grix) :
-                       n(_A->cols), scalars(scalars), _grix(grix) {
+               n(_A->cols), scalars(scalars), _grix(grix)
+       {
                A.init(_A);
                c.init(_C);
                alen = A.row_len(grix);
diff --git a/src/main/cuda/spoof/rowwise.cu b/src/main/cuda/spoof/rowwise.cu
index 7d80d54..b31ce0c 100644
--- a/src/main/cuda/spoof/rowwise.cu
+++ b/src/main/cuda/spoof/rowwise.cu
@@ -55,6 +55,8 @@ struct SpoofRowwiseOp //%HAS_TEMP_VECT%
 
 	__device__  __forceinline__ void exec_dense(uint32_t ai, uint32_t ci, uint32_t rix) {
 //%BODY_dense%
+               if (debug_row() && debug_thread())
+                       printf("c[0]=%4.3f\n", c.vals(0)[0]);
        }
 
 	__device__  __forceinline__ void exec_sparse(uint32_t ai, uint32_t ci, uint32_t rix) {
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index 47a7178..f803000 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -156,10 +156,12 @@ public class CNodeCell extends CNodeTpl
                if(tmpDense.contains("grix"))
                        tmp = tmp.replace("//%NEED_GRIX%", "\t\tuint32_t 
grix=_grix + rix;");
                else
-                       tmp = tmp.replace("//%NEED_GRIX%", "");
-               tmp = tmp.replace("//%NEED_RIX%", "");
-               tmp = tmp.replace("//%NEED_CIX%", "");
-               
+                       tmp = tmp.replace("//%NEED_GRIX%\n", "");
+
+               // remove empty lines
+//             if(!api.isJava())
+//                     tmpDense = tmpDense.replaceAll("(?m)^[ \t]*\r?\n", "");
+
                tmp = tmp.replace("%BODY_dense%", tmpDense);
                
                //Return last TMP. Square it for CUDA+SUM_SQ
@@ -171,10 +173,7 @@ public class CNodeCell extends CNodeTpl
                tmp = tmp.replace("%AGG_OP_NAME%", (_aggOp != null) ? "AggOp." 
+ _aggOp.name() : "null");
                tmp = tmp.replace("%SPARSE_SAFE%", 
String.valueOf(isSparseSafe()));
                tmp = tmp.replace("%SEQ%", String.valueOf(containsSeq()));
-               
-               // maybe empty lines
-               //tmp = tmp.replaceAll("(?m)^[ \t]*\r?\n", "");
-               
+
                if(api == GeneratorAPI.CUDA) {
 			// ToDo: initial_value is misused to pass VT (values per thread) to no_agg operator
                        String agg_op = "IdentityOp";
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
index 0e21017..f2405d5 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/cuda/Unary.java
@@ -191,7 +191,7 @@ public class Unary extends CodeTemplate {
 				case MULT2:
 					return "        T %TMP% = %IN1% + %IN1%;\n";
 				case ABS:
-					return "        T %TMP% = fabs(%IN1%);\n";
+					return "\t\tT %TMP% = fabs(%IN1%);\n";
 				case SIN:
 					return "        T %TMP% = sin(%IN1%);\n";
 				case COS:
@@ -217,7 +217,7 @@ public class Unary extends CodeTemplate {
 				case LOG:
 					return "                T %TMP% = log(%IN1%);\n";
 				case ROUND:
-					return "        T %TMP% = round(%IN1%);\n";
+					return "\t\tT %TMP% = round(%IN1%);\n";
 				case CEIL:
 					return "        T %TMP% = ceil(%IN1%);\n";
 				case FLOOR:
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
index 0510941..5a32114 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDACellwise.java
@@ -23,11 +23,11 @@ import jcuda.Pointer;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.cp.DoubleObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
@@ -38,101 +38,75 @@ public class SpoofCUDACellwise extends SpoofCellwise implements SpoofCUDAOperato
 	private static final Log LOG = LogFactory.getLog(SpoofCUDACellwise.class.getName());
        private final int ID;
        private final PrecisionProxy call;
-       private Pointer ptr;
        private final SpoofCellwise fallback_java_op;
+       private final long ctx;
        
 	public SpoofCUDACellwise(CellType type, boolean sparseSafe, boolean containsSeq, AggOp aggOp, int id,
-                       PrecisionProxy ep, SpoofCellwise fallback) {
+               PrecisionProxy ep, SpoofCellwise fallback)
+       {
                super(type, sparseSafe, containsSeq, aggOp);
                ID = id;
                call = ep;
-               ptr = null;
                fallback_java_op = fallback;
+		ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA);
        }
        
        @Override
-	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects) {
+	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects)
+	{
                double[] result = new double[1];
-		// ToDo: this is a temporary "solution" before perf opt
-		int NT=256;
-		long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
-		long num_blocks = ((N + NT * 2 - 1) / (NT * 2));
-		Pointer ptr = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
-		long[] out = {1,1,1, 0, 0, GPUObject.getPointerAddress(ptr)};
-		int offset = 1;
-		if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset),
-			prepareSideInputPointers(ec, inputs, offset, false), out, scalarObjects, 0 ) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
+		Pointer[] ptr = new Pointer[1];
+		packDataForTransfer(ec, inputs, scalarObjects, null, 1, ID, 0, false, ptr);
+		if(NotEmpty(inputs.get(0).getGPUObject(ec.getGPUContext(0)))) {
+			if(call.exec(this) != 0)
+				LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
 		}
-		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, result, getName(), false);
-
+		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr[0], result, getName(), false);
+		ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr[0], DMLScript.EAGER_CUDA_FREE);
                return new DoubleObject(result[0]);
        }
        
        @Override public String getName() {
                return getSpoofType();
        }
-       
-       @Override public void setScalarPtr(Pointer _ptr) {
-               ptr = _ptr;
-       }
-       
-       @Override public Pointer getScalarPtr() {
-               return ptr;
-       }
-       
-       @Override public void releaseScalarGPUMemory(ExecutionContext ec) {
-               if(ptr != null) {
-			ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr, DMLScript.EAGER_CUDA_FREE);
-                       ptr = null;
-               }
-       }
-       
+
        @Override
-	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, ArrayList<ScalarObject> scalarObjects,
-			String outputName) {
-
+	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects, String outputName)
+	{
                long out_rows = ec.getMatrixObject(outputName).getNumRows();
                long out_cols = ec.getMatrixObject(outputName).getNumColumns();
-               MatrixObject a = inputs.get(0);
-               GPUContext gctx = ec.getGPUContext(0);
-               int m = (int) a.getNumRows();
-               int n = (int) a.getNumColumns();
-               double[] scalars = prepInputScalars(scalarObjects);
+
                if(_type == CellType.COL_AGG)
                        out_rows = 1;
                else if(_type == SpoofCellwise.CellType.ROW_AGG)
                        out_cols = 1;
-               
+
+               double[] scalars = prepInputScalars(scalarObjects);
 		boolean sparseSafe = isSparseSafe() || ((inputs.size() < 2) &&
-				genexec( 0, new SideInput[0], scalars, m, n, 0, 0 ) == 0);
-
-//		ec.setMetaData(outputName, out_rows, out_cols);
-		GPUObject g = a.getGPUObject(gctx);
-		boolean sparseOut = _type == CellType.NO_AGG && sparseSafe && g.isSparse();
-
-		long nnz = g.getNnz("spoofCUDA" + getSpoofType(), false);
-		if(sparseOut)
-			LOG.warn("sparse out");
+				genexec( 0, new SideInput[0], scalars, (int) inputs.get(0).getNumRows(),
+					(int) inputs.get(0).getNumColumns(), 0, 0 ) == 0);
+
+		GPUObject in_obj = inputs.get(0).getGPUObject(ec.getGPUContext(0));
+		boolean sparseOut = _type == CellType.NO_AGG && sparseSafe && in_obj.isSparse();
+		long nnz = in_obj.getNnz("spoofCUDA" + getSpoofType(), false);
 		MatrixObject out_obj = sparseOut ?
 				(ec.getSparseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols, (isSparseSafe() && nnz > 0) ?
 						nnz : out_rows * out_cols).getKey()) :
 				(ec.getDenseMatrixOutputForGPUInstruction(outputName, out_rows, out_cols).getKey());
-
-		int offset = 1;
-		if(!inputIsEmpty(a.getGPUObject(gctx)) || !sparseSafe) {
-			if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs, offset, false),
-				prepareOutputPointers(ec, out_obj, sparseOut), scalarObjects, 0) != 0) {
-				LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.(ToDo)\n");
-				// ToDo: java fallback
-			}
+
+		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0, false, null);
+		if(NotEmpty(in_obj) || !sparseSafe) {
+			if(call.exec(this) != 0)
+				LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
+		}
 		return out_obj;
 	}
        
-       private static boolean inputIsEmpty(GPUObject g) {
-		return g.getDensePointer() == null && g.getSparseMatrixCudaPointer() == null;
+	private static boolean NotEmpty(GPUObject g) {
+		// ToDo: check if that check is sufficient
+		return g.getDensePointer() != null || g.getSparseMatrixCudaPointer() != null;
        }
        
        // used to determine sparse safety
@@ -140,15 +114,11 @@ public class SpoofCUDACellwise extends SpoofCellwise implements SpoofCUDAOperato
 	protected double genexec(double a, SideInput[] b, double[] scalars, int m, int n, long gix, int rix, int cix) {
                return fallback_java_op.genexec(a, b, scalars, m, n, 0, 0, 0);
        }
-       
-	public int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_f(ctx, meta, in, sides, out, scalars);
-	}
-
-	public int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_d(ctx, meta, in, sides, out, scalars);
-	}
-
-	public static native int execute_f(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	public static native int execute_d(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+       public int execute_dp(long ctx) { return execute_d(ctx); }
+       public int execute_sp(long ctx) { return execute_d(ctx); }
+       public long getContext() { return ctx; }
+
+       public static native int execute_d(long ctx);
+       public static native int execute_s(long ctx);
 }
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
index 118f97a..02fbf96 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDAOperator.java
@@ -19,7 +19,10 @@
 
 package org.apache.sysds.runtime.codegen;
 
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import jcuda.Pointer;
+
 import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
@@ -27,146 +30,112 @@ import org.apache.sysds.runtime.instructions.cp.ScalarObject;
 import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
-import java.util.ArrayList;
-
+import static jcuda.runtime.cudaError.cudaSuccess;
 import static org.apache.sysds.runtime.matrix.data.LibMatrixCUDA.sizeOfDataType;
 
 public interface SpoofCUDAOperator  {
-       int JNI_MAT_ENTRY_SIZE = 6;
+	// these two constants have equivalents in native code:
+       int JNI_MAT_ENTRY_SIZE = 40;
+       int TRANSFERRED_DATA_HEADER_SIZE = 32;
+
        abstract class PrecisionProxy {
                protected final long ctx;
                
 		public PrecisionProxy() { ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA); }
                
-		public abstract int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix);
-
-		protected Pointer transferScalars(ExecutionContext ec, SpoofCUDAOperator op, int sizeOfDataType,
-				ArrayList<ScalarObject> scalarObjects) {
-			double[] s = SpoofOperator.prepInputScalars(scalarObjects);
-			Pointer ptr = ec.getGPUContext(0).allocate(op.getName(), (long) scalarObjects.size() * sizeOfDataType);
-			LibMatrixCUDA.cudaSupportFunctions.hostToDevice(ec.getGPUContext(0), s, ptr, op.getName());
-			return ptr;
-		}
+		public abstract int exec(SpoofCUDAOperator op);
        }
        
        String getName();
-       
-       void setScalarPtr(Pointer ptr);
-       
-       Pointer getScalarPtr();
-       
-       void releaseScalarGPUMemory(ExecutionContext ec);
-       
-	default long [] prepareInputPointers(ExecutionContext ec, ArrayList<MatrixObject> inputs, int offset) {
-		long [] in = new long[offset * JNI_MAT_ENTRY_SIZE];
-		for(int i = 0; i < offset; i++) {
-			int j = i  * JNI_MAT_ENTRY_SIZE;
-
-			if(inputs.get(i).getGPUObject(ec.getGPUContext(0)).isSparse()) {
-				in[j] = ec.getGPUSparsePointerAddress(inputs.get(i)).nnz;
-				in[j + 1] = inputs.get(i).getNumRows();
-				in[j + 2] = inputs.get(i).getNumColumns();
-				in[j + 3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).rowPtr);
-				in[j + 4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).colInd);
-				in[j + 5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).val);
-			}
-			else {
-				in[j] = inputs.get(i).getNnz();
-				in[j + 1] = inputs.get(i).getNumRows();
-				in[j + 2] = inputs.get(i).getNumColumns();
-				in[j + 5] = ec.getGPUDensePointerAddress(inputs.get(i));
-			}
-		}
-		return in;
+
+	default void writeMatrixDescriptorToBuffer(ByteBuffer dst, int rows, int cols, long row_ptr,
+		long col_idx_ptr, long data_ptr, long nnz)
+       {
+               dst.putLong(nnz);
+               dst.putInt(rows);
+               dst.putInt(cols);
+               dst.putLong(row_ptr);
+               dst.putLong(col_idx_ptr);
+               dst.putLong(data_ptr);
        }
-       
-	default long [] prepareSideInputPointers(ExecutionContext ec, ArrayList<MatrixObject> inputs, int offset, boolean tB1) {
-		long[] sides = new long[(inputs.size() - offset) * JNI_MAT_ENTRY_SIZE];
-		for(int i = offset; i < inputs.size(); i++) {
-			int j = (i - offset)  * JNI_MAT_ENTRY_SIZE;
-			if(inputs.get(i).getGPUObject(ec.getGPUContext(0)).isSparse()) {
-				sides[j] = ec.getGPUSparsePointerAddress(inputs.get(i)).nnz;
-				sides[j + 1] = inputs.get(i).getNumRows();
-				sides[j + 2] = inputs.get(i).getNumColumns();
-				sides[j + 3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).rowPtr);
-				sides[j + 4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).colInd);
-				sides[j + 5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(inputs.get(i)).val);
-			}
-			else {
-				if(tB1 && j == 0) {
-					long rows = inputs.get(i).getNumRows();
-					long cols = inputs.get(i).getNumColumns();
-					Pointer b1 = inputs.get(i).getGPUObject(ec.getGPUContext(0)).getDensePointer();
-					Pointer ptr = ec.getGPUContext(0).allocate(getName(), rows * cols * sizeOfDataType);
-
-//					double[] tmp1 = new double[(int) (rows * cols)];
-//					LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), b1, tmp1, getName(), false);
-//
-//					System.out.println("Mat before transpose: rows=" + rows + " cols=" + cols + "\n");
-//					for(int m = 0; m < rows; m++) {
-//						StringBuilder sb = new StringBuilder();
-//						for(int n = 0; n < cols; n++)
-//							sb.append(" " + tmp1[(int) (cols * m + n)]);
-//						System.out.println(sb.toString());
-//					}
-
-					LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(),
-						b1, ptr, rows, cols);
-
-//					double[] tmp2 = new double[(int) (rows * cols)];
-//					LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, tmp2, getName(), false);
-//
-//					System.out.println("Mat after transpose: rows=" + cols + " cols=" + rows + "\n");
-//					for(int m = 0; m < cols; m++) {
-//						StringBuilder sb = new StringBuilder();
-//						for(int n = 0; n < rows; n++)
-//							sb.append(" " + tmp2[(int) (rows * m + n)]);
-//						System.out.println(sb.toString());
-//					}
-
-					sides[j] = inputs.get(i).getNnz();
-					sides[j + 1] = cols;
-					sides[j + 2] = rows;
-					sides[j + 5] = GPUObject.getPointerAddress(ptr);
-
-				} else {
-					sides[j] = inputs.get(i).getNnz();
-					sides[j + 1] = inputs.get(i).getNumRows();
-					sides[j + 2] = inputs.get(i).getNumColumns();
-					sides[j + 5] = ec.getGPUDensePointerAddress(inputs.get(i));
-				}
+	default void prepareMatrixPointers(ByteBuffer buf, ExecutionContext ec, MatrixObject mo, boolean tB1) {
+		if(mo.getGPUObject(ec.getGPUContext(0)).isSparse()) {
+			writeMatrixDescriptorToBuffer(buf, (int)mo.getNumRows(), (int)mo.getNumColumns(),
+				GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).rowPtr),
+					GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).colInd),
+						GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(mo).val),
+							ec.getGPUSparsePointerAddress(mo).nnz);
+		}
+		else {
+			if(tB1) {
+				int rows = (int)mo.getNumRows();
+				int cols = (int)mo.getNumColumns();
+				Pointer b1 = mo.getGPUObject(ec.getGPUContext(0)).getDensePointer();
+				Pointer ptr = ec.getGPUContext(0).allocate(getName(), (long) rows * cols * sizeOfDataType);
+				LibMatrixCUDA.denseTranspose(ec, ec.getGPUContext(0), getName(), b1, ptr, rows, cols);
+				writeMatrixDescriptorToBuffer(buf, rows, cols, 0, 0, GPUObject.getPointerAddress(ptr), mo.getNnz());
+			} else {
+				writeMatrixDescriptorToBuffer(buf, (int)mo.getNumRows(), (int)mo.getNumColumns(), 0, 0,
+					ec.getGPUDensePointerAddress(mo), mo.getNnz());
 			}
 		}
-		return sides;
        }
-       
-	default long[] prepareOutputPointers(ExecutionContext ec, MatrixObject output, boolean sparseOut) {
-		long[] out = {0,0,0,0,0,0};
 
-		if(sparseOut) {
-			out[0] = ec.getGPUSparsePointerAddress(output).nnz;
-			out[1] = output.getNumRows();
-			out[2] = output.getNumColumns();
-			out[3] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).rowPtr);
-			out[4] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).colInd);
-			out[5] = GPUObject.getPointerAddress(ec.getGPUSparsePointerAddress(output).val);
+	default void packDataForTransfer(ExecutionContext ec, ArrayList<MatrixObject> inputs,
+		ArrayList<ScalarObject> scalarObjects, MatrixObject out_obj, int num_inputs, int ID, long grix, boolean tB1,
+			Pointer[] ptr)
+	{
+		int op_data_size = (inputs.size() + 1) * JNI_MAT_ENTRY_SIZE + scalarObjects.size() * Double.BYTES + TRANSFERRED_DATA_HEADER_SIZE;
+		Pointer staging = new Pointer();
+		if(SpoofOperator.getNativeStagingBuffer(staging, this.getContext(), op_data_size) != cudaSuccess)
+			throw new RuntimeException("Failed to get native staging buffer from spoof operator");
+		ByteBuffer buf = staging.getByteBuffer();
+		buf.putInt(op_data_size);
+		buf.putInt(ID);
+		buf.putInt((int)grix);
+		buf.putInt(num_inputs);
+		buf.putInt(inputs.size() - num_inputs);
+		buf.putInt(out_obj == null ? 0 : 1);
+		buf.putInt(scalarObjects.size());
+		buf.putInt(-1); // padding
+
+		// copy input & side input pointers
+		for(int i=0; i < inputs.size(); i++) {
+			if(i == num_inputs)
+				prepareMatrixPointers(buf, ec, inputs.get(i), tB1);
+			else
+				prepareMatrixPointers(buf, ec, inputs.get(i), false);
 		}
-		else {
-			out[0] = output.getNnz();
-			out[1] = output.getNumRows();
-			out[2] = output.getNumColumns();
-			out[5] = ec.getGPUDensePointerAddress(output);
+
+		// copy output pointers or allocate buffer for reduction
+		if(out_obj == null) {
+			long num_blocks = 1;
+			if(this instanceof SpoofCUDACellwise) {
+				int NT = 256;
+				long N = inputs.get(0).getNumRows() * inputs.get(0).getNumColumns();
+				num_blocks = ((N + NT * 2 - 1) / (NT * 2));
+			}
+			ptr[0] = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType * num_blocks);
+			writeMatrixDescriptorToBuffer(buf, 1, 1, 0, 0, GPUObject.getPointerAddress(ptr[0]), 1);
+		}
+		else {
+			prepareMatrixPointers(buf, ec, out_obj, false);
+		}
+
+		// copy scalar values (no pointers)
+		for(ScalarObject scalarObject : scalarObjects) {
+			buf.putDouble(scalarObject.getDoubleValue());
 		}
-		return out;
        }
-       
-	MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs, 
+	MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
 			ArrayList<ScalarObject> scalarObjects, String outputName);
        
 	ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
                ArrayList<ScalarObject> scalarObjects);
-       
-	int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+       int execute_dp(long ctx);
+       int execute_sp(long ctx);
+       long getContext();
 }
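
To make the two constants above concrete, here is a self-contained sketch of
the packed layout (field order as written by packDataForTransfer and
writeMatrixDescriptorToBuffer above; the class name, sample values, and the
native byte-order assumption are illustrative, not part of the patch):

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    public class StagingLayoutSketch {
        public static void main(String[] args) {
            // Header: 7 payload ints + 1 padding int = TRANSFERRED_DATA_HEADER_SIZE (32)
            ByteBuffer header = ByteBuffer.allocate(32).order(ByteOrder.nativeOrder());
            header.putInt(0);  // op_data_size (total payload size in bytes)
            header.putInt(42); // operator ID
            header.putInt(0);  // grix
            header.putInt(1);  // number of inputs
            header.putInt(0);  // number of side inputs
            header.putInt(1);  // has output (0 or 1)
            header.putInt(0);  // number of scalars
            header.putInt(-1); // padding, keeps descriptors 8-byte aligned
            assert header.position() == 32;

            // One matrix descriptor: long nnz + 2 ints + 3 pointers = JNI_MAT_ENTRY_SIZE (40)
            ByteBuffer entry = ByteBuffer.allocate(40).order(ByteOrder.nativeOrder());
            entry.putLong(0L);          // nnz
            entry.putInt(4);            // rows
            entry.putInt(3);            // cols
            entry.putLong(0L);          // row_ptr (0 for dense)
            entry.putLong(0L);          // col_idx (0 for dense)
            entry.putLong(0xDEADBEEFL); // device data pointer
            assert entry.position() == 40;
        }
    }
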
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
index b5686c7..2632fae 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofCUDARowwise.java
@@ -23,11 +23,11 @@ import jcuda.Pointer;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
+import org.apache.sysds.hops.codegen.SpoofCompiler;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.cp.DoubleObject;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCUDA;
 
 import java.util.ArrayList;
@@ -37,55 +37,40 @@ public class SpoofCUDARowwise extends SpoofRowwise implements SpoofCUDAOperator
 	private static final Log LOG = LogFactory.getLog(SpoofCUDARowwise.class.getName());
        private final int ID;
        private final PrecisionProxy call;
-       private Pointer ptr;
+       private final long ctx;
        
 	public SpoofCUDARowwise(RowType type,  long constDim2, boolean tB1, int reqVectMem, int id,
-               PrecisionProxy ep) {
+               PrecisionProxy ep)
+       {
                super(type, constDim2, tB1, reqVectMem);
                ID = id;
                call = ep;
-               ptr = null;
+		ctx = SpoofCompiler.native_contexts.get(SpoofCompiler.GeneratorAPI.CUDA);
        }
        
        @Override public String getName() {
                return getSpoofType();
        }
-       
-       @Override public void setScalarPtr(Pointer _ptr) {
-               ptr = _ptr;
-       }
-       
-       @Override public Pointer getScalarPtr() {
-               return ptr;
-       }
-       
-       @Override public void releaseScalarGPUMemory(ExecutionContext ec) {
-               if(ptr != null) {
-			ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr, DMLScript.EAGER_CUDA_FREE);
-                       ptr = null;
-               }
-       }
-       
+
        @Override
 	public ScalarObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
-		ArrayList<ScalarObject> scalarObjects) {
+		ArrayList<ScalarObject> scalarObjects)
+	{
 		double[] result = new double[1];
-		Pointer ptr = ec.getGPUContext(0).allocate(getName(), LibMatrixCUDA.sizeOfDataType);
-		long[] out = {1,1,1, 0, 0, GPUObject.getPointerAddress(ptr)};
-		int offset = 1;
-		if(call.exec(ec, this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs, offset, _tB1),
-				out, scalarObjects, 0) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
-		}
-		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr, result, getName(), false);
+		Pointer[] ptr = new Pointer[1];
+		packDataForTransfer(ec, inputs, scalarObjects, null, 1, ID, 0, _tB1, ptr);
+		if(call.exec(this) != 0)
+			LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
+
+		LibMatrixCUDA.cudaSupportFunctions.deviceToHost(ec.getGPUContext(0), ptr[0], result, getName(), false);
+		ec.getGPUContext(0).cudaFreeHelper(getSpoofType(), ptr[0], DMLScript.EAGER_CUDA_FREE);
                return new DoubleObject(result[0]);
        }
        
        @Override
 	public MatrixObject execute(ExecutionContext ec, ArrayList<MatrixObject> inputs,
-		ArrayList<ScalarObject> scalarObjects, String outputName) {
-
+		ArrayList<ScalarObject> scalarObjects, String outputName)
+	{
                int m = (int) inputs.get(0).getNumRows();
                int n = (int) inputs.get(0).getNumColumns();
 		final int n2 = _type.isConstDim2(_constDim2) ? (int)_constDim2 : _type.isRowTypeB1() ||
@@ -93,13 +78,12 @@ public class SpoofCUDARowwise extends SpoofRowwise implements SpoofCUDAOperator
                OutputDimensions out_dims = new OutputDimensions(m, n, n2);
                ec.setMetaData(outputName, out_dims.rows, out_dims.cols);
 		MatrixObject out_obj = ec.getDenseMatrixOutputForGPUInstruction(outputName, out_dims.rows, out_dims.cols).getKey();
-
-		int offset = 1;
-		if(call.exec(ec,this, ID, prepareInputPointers(ec, inputs, offset), prepareSideInputPointers(ec, inputs,
-				offset, _tB1), prepareOutputPointers(ec, out_obj, false), scalarObjects, 0) != 0) {
-			LOG.error("SpoofCUDA " + getSpoofType() + " operator failed to execute. Trying Java fallback.\n");
-			// ToDo: java fallback
-		}
+
+		packDataForTransfer(ec, inputs, scalarObjects, out_obj, 1, ID, 0, _tB1, null);
+
+		if(call.exec(this) != 0)
+			LOG.error("SpoofCUDA " + getSpoofType() + " operator " + ID + " failed to execute!\n");
+
                return out_obj;
        }
        
@@ -110,15 +94,11 @@ public class SpoofCUDARowwise extends SpoofRowwise implements SpoofCUDAOperator
 	// unused
 	@Override protected void genexec(double[] avals, int[] aix, int ai, SideInput[] b, double[] scalars, double[] c,
                int ci, int alen, int n, long grix, int rix) { }
-       
-	public int execute_sp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_f(ctx, meta, in, sides, out, scalars);
-	}
-
-	public int execute_dp(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars) {
-		return execute_d(ctx, meta, in, sides, out, scalars);
-	}
-
-	public static native int execute_f(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
-	public static native int execute_d(long ctx, long[] meta, long[] in, long[] sides, long[] out, long scalars);
+
+       public int execute_dp(long ctx) { return execute_d(ctx); }
+       public int execute_sp(long ctx) { return execute_d(ctx); }
+       public long getContext() { return ctx; }
+
+       public static native int execute_d(long ctx);
+       public static native int execute_s(long ctx);
 }
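
Worth noting: in both rewritten operators execute_sp delegates to execute_d,
so single precision currently reuses the double-precision path, and the
declared native execute_s has no matching execute_1s export in jni_bridge.cpp
(which exports ..._execute_1f instead). A precision-true dispatch would
presumably look like the following sketch (hypothetical, not part of the
patch):

    // Hypothetical sketch; the patch routes both precisions to execute_d.
    public abstract class PrecisionTrueSketch {
        public static native int execute_d(long ctx);
        public static native int execute_s(long ctx); // would need an execute_1s JNI export
        public int execute_dp(long ctx) { return execute_d(ctx); }
        public int execute_sp(long ctx) { return execute_s(ctx); } // float stays float
    }
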
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
index fe8d932..f270091 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/SpoofOperator.java
@@ -23,6 +23,7 @@ import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import jcuda.Pointer;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLRuntimeException;
@@ -321,4 +322,6 @@ public abstract class SpoofOperator implements Serializable
                        currColPos = 0;
                }
        }
+
+	public static native int getNativeStagingBuffer(Pointer ptr, long context, int size);
 }
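
A short usage sketch for the new native method (assumptions: the SystemDS
native library is loaded, ctx is a live context handle, and jcuda's
Pointer.getByteBuffer() exposes the host-side staging memory, exactly as
packDataForTransfer above relies on; the helper class is illustrative):

    import java.nio.ByteBuffer;
    import jcuda.Pointer;
    import org.apache.sysds.runtime.codegen.SpoofOperator;

    public class StagingBufferUsageSketch {
        static ByteBuffer openStaging(long ctx, int payloadBytes) {
            Pointer staging = new Pointer();
            // returns cudaSuccess (0) on success, mirroring the packDataForTransfer call site
            if (SpoofOperator.getNativeStagingBuffer(staging, ctx, payloadBytes) != 0)
                throw new RuntimeException("Failed to get native staging buffer");
            return staging.getByteBuffer(); // host-visible view for packing descriptors
        }
    }
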
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
index 9fd1bcf..b54d6fe 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/SpoofCUDAInstruction.java
@@ -35,7 +35,6 @@ import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.instructions.cp.CPOperand;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
-import org.apache.sysds.runtime.instructions.gpu.context.GPUObject;
 import org.apache.sysds.runtime.lineage.LineageItem;
 import org.apache.sysds.runtime.lineage.LineageItemUtils;
 import org.apache.sysds.utils.GPUStatistics;
@@ -52,21 +51,14 @@ public class SpoofCUDAInstruction extends GPUInstruction {
        public final CPOperand _out;
        
 	public static class SinglePrecision extends SpoofCUDAOperator.PrecisionProxy {
-		public int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix) {
-			op.setScalarPtr(transferScalars(ec, op, Sizeof.FLOAT, scalarObjects));
-			long[] _metadata = { opID, grix, in.length, sides.length, out.length, scalarObjects.size() };
-			return op.execute_sp(ctx, _metadata, in, sides, out, GPUObject.getPointerAddress(op.getScalarPtr()));
+               public int exec(SpoofCUDAOperator op) {
+                       return op.execute_sp(ctx);
                }
        }
        
 	public static class DoublePrecision extends SpoofCUDAOperator.PrecisionProxy {
-		public int exec(ExecutionContext ec, SpoofCUDAOperator op, int opID, long[] in, long[] sides, long[] out,
-				ArrayList<ScalarObject> scalarObjects, long grix) {
-			if(!scalarObjects.isEmpty())
-				op.setScalarPtr(transferScalars(ec, op, Sizeof.DOUBLE, scalarObjects));
-			long[] _metadata = { opID, grix, in.length, sides.length, out.length, scalarObjects.size() };
-			return op.execute_dp(ctx, _metadata, in, sides, out, GPUObject.getPointerAddress(op.getScalarPtr()));
+               public int exec(SpoofCUDAOperator op) {
+                       return op.execute_dp(ctx);
                }
        }
        
@@ -101,7 +93,6 @@ public class SpoofCUDAInstruction extends GPUInstruction {
 		String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
 
 		ArrayList<CPOperand> inlist = new ArrayList<>();
-//		Integer op_id =  CodegenUtils.getCUDAopID(parts[2].split("\\.")[1]);
 		Integer op_id = CodegenUtils.getCUDAopID(parts[2]);
 		Class<?> cla = CodegenUtils.getClass(parts[2]);
 		SpoofOperator fallback_java_op = CodegenUtils.createInstance(cla);
@@ -141,11 +132,9 @@ public class SpoofCUDAInstruction extends GPUInstruction {
 				ScalarObject out = _op.execute(ec, inputs, scalars);
                                ec.setScalarOutput(_out.getName(), out);
                        }
-                       
-                       _op.releaseScalarGPUMemory(ec);
                }
                catch(Exception ex) {
-                       LOG.error("SpoofCUDAInstruction: " + _op.getName() + " 
operator failed to execute. Trying Java fallback.(ToDo)\n");
+                       LOG.error("SpoofCUDAInstruction: " + _op.getName() + " 
operator failed to execute :(\n");
                        
                        throw new DMLRuntimeException(ex);
                }
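
The dispatch collapses accordingly: the precision proxies no longer marshal
metadata arrays or transfer scalars, they only select the entry point. A
self-contained sketch of the new flow (class and interface names are
illustrative; the real ctx comes from SpoofCompiler.native_contexts):

    // Illustrative shape of the simplified dispatch, not part of the patch.
    public class PrecisionDispatchSketch {
        interface Op { int execute_dp(long ctx); int execute_sp(long ctx); }
        static abstract class PrecisionProxy {
            protected final long ctx = 0L; // placeholder for the native context handle
            public abstract int exec(Op op);
        }
        static class DoublePrecision extends PrecisionProxy {
            public int exec(Op op) { return op.execute_dp(ctx); } // data already packed
        }
        static class SinglePrecision extends PrecisionProxy {
            public int exec(Op op) { return op.execute_sp(ctx); }
        }
    }
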
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
index efbf201..633e3fc 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/gpu/context/GPUObject.java
@@ -1168,4 +1168,5 @@ public class GPUObject {
        
        public static long getPointerAddress(Pointer p) {
                return (p == null) ?  0 : getPointerAddressInternal(p);
-       }}
+       }
+}
