[systemds] branch master updated: [SYSTEMDS-2854] SUM_SQ reduction for GPU-codegen fixed; cleanups

markd Wed, 16 Jun 2021 15:16:05 -0700

This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/master by this push:
     new 197a14b  [SYSTEMDS-2854] SUM_SQ reduction for GPU-codegen fixed; 
cleanups
197a14b is described below

commit 197a14bce0c2dca44f508ca8b2105dff16e1c462
Author: Mark Dokter <[email protected]>
AuthorDate: Wed Jun 16 23:42:42 2021 +0200

    [SYSTEMDS-2854] SUM_SQ reduction for GPU-codegen fixed; cleanups
    
    This patch re-enables the reduction operation for GPU, which was 
(unnecessarily) disabled because it did not work correctly with cuda codegen.
    A few cleanups in the touched files went in alongside.
    
    Closes #1315
---
 src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp   | 21 +++++++++------------
 src/main/cuda/spoof-launcher/SpoofCUDAContext.h     |  2 +-
 src/main/cuda/spoof-launcher/jni_bridge.cpp         | 12 ++++++------
 src/main/cuda/spoof/cellwise.cu                     |  7 -------
 src/main/java/org/apache/sysds/hops/AggUnaryOp.java |  2 +-
 .../apache/sysds/hops/codegen/cplan/CNodeCell.java  |  3 +--
 .../sysds/test/functions/aggregate/SumSqTest.java   |  3 ++-
 7 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp 
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
index 6f449e9..233a4a1 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -22,10 +22,10 @@
 #include <filesystem>
 #include <iostream>
 #include <cstdlib>
-#include <sstream>
 
-using clk = std::chrono::high_resolution_clock;
-using sec = std::chrono::duration<double, std::ratio<1>>;
+//#include <sstream>
+//using clk = std::chrono::high_resolution_clock;
+//using sec = std::chrono::duration<double, std::ratio<1>>;
 
 size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char* 
resource_path) {
 
@@ -56,17 +56,14 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t 
device_id, const char* resourc
        
        CUfunction func;
        
-       // SUM
+       // SUM and SUM_SQ have the same behavior for intermediate buffers 
(squaring is done in the initial reduction step,
+       // after that it is just summing up the temporary data)
        CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_f"));
        
ctx->reduction_kernels_f.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
 SpoofOperator::AggOp::SUM), func));
+       
ctx->reduction_kernels_f.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
 SpoofOperator::AggOp::SUM_SQ), func));
        CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_d"));
        
ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
 SpoofOperator::AggOp::SUM), func));
-
-       //  // SUM_SQ
-       //  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, 
"reduce_sum_sq_d"));
-       //  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_d", 
func));
-       //  CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, 
"reduce_sum_sq_f"));
-       //  ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_f", 
func));
+       
ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
 SpoofOperator::AggOp::SUM_SQ), func));
 
        // MIN
        CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_f"));
@@ -83,13 +80,13 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t 
device_id, const char* resourc
        return reinterpret_cast<size_t>(ctx);
 }
 
-void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id) 
{
+void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, [[maybe_unused]] 
uint32_t device_id) {
        delete ctx;
        // cuda device is handled by jCuda atm
        //cudaDeviceReset();
 }
 
-int SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const 
std::string &src) {
+size_t SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const 
std::string &src) {
 #ifndef NDEBUG
 //     std::cout << "---=== START source listing of spoof cuda kernel [ " << 
name << " ]: " << std::endl;
 //    uint32_t line_num = 0;
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h 
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index 7f74337..696682f 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -62,7 +62,7 @@ public:
 
        static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
        
-       int compile(std::unique_ptr<SpoofOperator> op, const std::string &src);
+       size_t compile(std::unique_ptr<SpoofOperator> op, const std::string 
&src);
        
        template <typename T, typename CALL>
        int launch(uint32_t opID, std::vector<Matrix<T>>& input, 
std::vector<Matrix<T>>& sides, Matrix<T>& output,
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp 
b/src/main/cuda/spoof-launcher/jni_bridge.cpp
index 9942b41..9a808a6 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.cpp
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -39,12 +39,12 @@ template<typename T>
 struct LaunchMetadata {
        const T& opID;
        const T& grix;
-       const T& num_inputs;
-       const T& num_sides;
+       const size_t& num_inputs;
+       const size_t& num_sides;
        
        // num entries describing one matrix (6 entries):
        // {nnz,rows,cols,row_ptr,col_idxs,data}
-       const T& entry_size;
+       const size_t& entry_size;
        const T& num_scalars;
        
        explicit LaunchMetadata(const size_t* jvals) : opID(jvals[0]), 
grix(jvals[1]), num_inputs(jvals[2]),
@@ -58,7 +58,7 @@ 
Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
        const char *cstr_rp = jenv->GetStringUTFChars(resource_path, nullptr);
        size_t ctx = SpoofCUDAContext::initialize_cuda(device_id, cstr_rp);
        jenv->ReleaseStringUTFChars(resource_path, cstr_rp);
-       return ctx;
+       return static_cast<jlong>(ctx);
 }
 
 
@@ -136,12 +136,12 @@ int launch_spoof_operator(JNIEnv *jenv, [[maybe_unused]] 
jclass jobj, jlong _ctx
                
                // wrap/cast inputs
                std::vector<Matrix<T>> mats_in;
-               for(auto i = 0; i < meta.num_inputs; i+=meta.entry_size)
+               for(auto i = 0ul; i < meta.num_inputs; i+=meta.entry_size)
                        mats_in.emplace_back(&inputs[i]);
                
                // wrap/cast sides
                std::vector<Matrix<T>> mats_sides;
-               for(auto i = 0; i < meta.num_sides; i+=meta.entry_size)
+               for(auto i = 0ul; i < meta.num_sides; i+=meta.entry_size)
                        mats_sides.emplace_back(&sides[i]);
                
                // wrap/cast output
diff --git a/src/main/cuda/spoof/cellwise.cu b/src/main/cuda/spoof/cellwise.cu
index ca25fe5..d70cc3d 100644
--- a/src/main/cuda/spoof/cellwise.cu
+++ b/src/main/cuda/spoof/cellwise.cu
@@ -56,9 +56,7 @@ struct SpoofCellwiseOp {
 //%NEED_RIX%
 //%NEED_CIX%
 //%NEED_GRIX%
-
 %BODY_dense%
-//printf("tid=%d a=%4.1f\n", threadIdx.x, a);
                return %OUT%;
        }
 };
@@ -75,9 +73,4 @@ __global__ void /*%TMP%*/SPOOF_OP_NAME_SPARSE (Matrix<T>* a, 
Matrix<T>* b, Matri
        %AGG_OP%<T> agg_op;
        SpoofCellwiseOp<T, NUM_B> spoof_op(a, b, c, scalars, grix);
        %TYPE%_SPARSE<T, %AGG_OP%<T>, SpoofCellwiseOp<T, NUM_B>>(&(spoof_op.A), 
&(spoof_op.c), n, %INITIAL_VALUE%, agg_op, spoof_op);
-
-//     if(blockIdx.x == 0 && threadIdx.x == 0) {
-//             for(auto i = 0; i < 30; ++i)
-//                     printf("%4.3f ", spoof_op.c.val(i));
-//     }
 };
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/hops/AggUnaryOp.java 
b/src/main/java/org/apache/sysds/hops/AggUnaryOp.java
index 44d828c..4840f97 100644
--- a/src/main/java/org/apache/sysds/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysds/hops/AggUnaryOp.java
@@ -98,7 +98,7 @@ public class AggUnaryOp extends MultiThreadedHop
                                return false;
                        }
                        else if ((_op == AggOp.SUM    && (_direction == 
Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-//                                      || (_op == AggOp.SUM_SQ && (_direction 
== Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
+                                        || (_op == AggOp.SUM_SQ && (_direction 
== Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
                                         || (_op == AggOp.MAX    && (_direction 
== Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
                                         || (_op == AggOp.MIN    && (_direction 
== Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
                                         || (_op == AggOp.MEAN   && (_direction 
== Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java 
b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index c8cd6e2..2b52160 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -275,8 +275,7 @@ public class CNodeCell extends CNodeTpl
        }
        @Override
        public boolean isSupported(GeneratorAPI api) {
-               return (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA) 
&& _output.isSupported(api) &&
-                       !(getSpoofAggOp() == SpoofCellwise.AggOp.SUM_SQ);
+               return (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA) 
&& _output.isSupported(api);
        }
        
        public int compile(GeneratorAPI api, String src) {
diff --git 
a/src/test/java/org/apache/sysds/test/functions/aggregate/SumSqTest.java 
b/src/test/java/org/apache/sysds/test/functions/aggregate/SumSqTest.java
index e3a44cb..531a8ff 100644
--- a/src/test/java/org/apache/sysds/test/functions/aggregate/SumSqTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/aggregate/SumSqTest.java
@@ -206,7 +206,8 @@ public class SumSqTest extends AutomatedTestBase {
             // On CP and Spark modes, check that the rewrite actually
             // occurred for matrix cases and not for vector cases.
             if (rewrites && (platform == ExecType.SPARK || platform == 
ExecType.CP)) {
-                String prefix = (platform == ExecType.SPARK) ? 
Instruction.SP_INST_PREFIX : "";
+                String prefix = (platform == ExecType.SPARK) ? 
Instruction.SP_INST_PREFIX :
+                    (DMLScript.USE_ACCELERATOR ? "gpu_": "");
                 String opcode = prefix + op;
                 boolean rewriteApplied = 
Statistics.getCPHeavyHitterOpCodes().contains(opcode);
                 if (vector)

[systemds] branch master updated: [SYSTEMDS-2854] SUM_SQ reduction for GPU-codegen fixed; cleanups

Reply via email to