This is an automated email from the ASF dual-hosted git repository.
markd pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 197a14b [SYSTEMDS-2854] SUM_SQ reduction for GPU-codegen fixed;
cleanups
197a14b is described below
commit 197a14bce0c2dca44f508ca8b2105dff16e1c462
Author: Mark Dokter <[email protected]>
AuthorDate: Wed Jun 16 23:42:42 2021 +0200
[SYSTEMDS-2854] SUM_SQ reduction for GPU-codegen fixed; cleanups
This patch re-enables the reduction operation for GPU, which was
(unnecessarily) disabled because it did not work correctly with cuda codegen.
A few cleanups in the touched files went in alongside.
Closes #1315
---
src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp | 21 +++++++++------------
src/main/cuda/spoof-launcher/SpoofCUDAContext.h | 2 +-
src/main/cuda/spoof-launcher/jni_bridge.cpp | 12 ++++++------
src/main/cuda/spoof/cellwise.cu | 7 -------
src/main/java/org/apache/sysds/hops/AggUnaryOp.java | 2 +-
.../apache/sysds/hops/codegen/cplan/CNodeCell.java | 3 +--
.../sysds/test/functions/aggregate/SumSqTest.java | 3 ++-
7 files changed, 20 insertions(+), 30 deletions(-)
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
index 6f449e9..233a4a1 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.cpp
@@ -22,10 +22,10 @@
#include <filesystem>
#include <iostream>
#include <cstdlib>
-#include <sstream>
-using clk = std::chrono::high_resolution_clock;
-using sec = std::chrono::duration<double, std::ratio<1>>;
+//#include <sstream>
+//using clk = std::chrono::high_resolution_clock;
+//using sec = std::chrono::duration<double, std::ratio<1>>;
size_t SpoofCUDAContext::initialize_cuda(uint32_t device_id, const char*
resource_path) {
@@ -56,17 +56,14 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t
device_id, const char* resourc
CUfunction func;
- // SUM
+ // SUM and SUM_SQ have the same behavior for intermediate buffers
(squaring is done in the initial reduction step,
+ // after that it is just summing up the temporary data)
CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_f"));
ctx->reduction_kernels_f.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
SpoofOperator::AggOp::SUM), func));
+
ctx->reduction_kernels_f.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
SpoofOperator::AggOp::SUM_SQ), func));
CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_sum_d"));
ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
SpoofOperator::AggOp::SUM), func));
-
- // // SUM_SQ
- // CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions,
"reduce_sum_sq_d"));
- // ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_d",
func));
- // CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions,
"reduce_sum_sq_f"));
- // ctx->reduction_kernels.insert(std::make_pair("reduce_sum_sq_f",
func));
+
ctx->reduction_kernels_d.insert(std::make_pair(std::make_pair(SpoofOperator::AggType::FULL_AGG,
SpoofOperator::AggOp::SUM_SQ), func));
// MIN
CHECK_CUDA(cuModuleGetFunction(&func, ctx->reductions, "reduce_min_f"));
@@ -83,13 +80,13 @@ size_t SpoofCUDAContext::initialize_cuda(uint32_t
device_id, const char* resourc
return reinterpret_cast<size_t>(ctx);
}
-void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id)
{
+void SpoofCUDAContext::destroy_cuda(SpoofCUDAContext *ctx, [[maybe_unused]]
uint32_t device_id) {
delete ctx;
// cuda device is handled by jCuda atm
//cudaDeviceReset();
}
-int SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const
std::string &src) {
+size_t SpoofCUDAContext::compile(std::unique_ptr<SpoofOperator> op, const
std::string &src) {
#ifndef NDEBUG
// std::cout << "---=== START source listing of spoof cuda kernel [ " <<
name << " ]: " << std::endl;
// uint32_t line_num = 0;
diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index 7f74337..696682f 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -62,7 +62,7 @@ public:
static void destroy_cuda(SpoofCUDAContext *ctx, uint32_t device_id);
- int compile(std::unique_ptr<SpoofOperator> op, const std::string &src);
+ size_t compile(std::unique_ptr<SpoofOperator> op, const std::string
&src);
template <typename T, typename CALL>
int launch(uint32_t opID, std::vector<Matrix<T>>& input,
std::vector<Matrix<T>>& sides, Matrix<T>& output,
diff --git a/src/main/cuda/spoof-launcher/jni_bridge.cpp
b/src/main/cuda/spoof-launcher/jni_bridge.cpp
index 9942b41..9a808a6 100644
--- a/src/main/cuda/spoof-launcher/jni_bridge.cpp
+++ b/src/main/cuda/spoof-launcher/jni_bridge.cpp
@@ -39,12 +39,12 @@ template<typename T>
struct LaunchMetadata {
const T& opID;
const T& grix;
- const T& num_inputs;
- const T& num_sides;
+ const size_t& num_inputs;
+ const size_t& num_sides;
// num entries describing one matrix (6 entries):
// {nnz,rows,cols,row_ptr,col_idxs,data}
- const T& entry_size;
+ const size_t& entry_size;
const T& num_scalars;
explicit LaunchMetadata(const size_t* jvals) : opID(jvals[0]),
grix(jvals[1]), num_inputs(jvals[2]),
@@ -58,7 +58,7 @@
Java_org_apache_sysds_hops_codegen_SpoofCompiler_initialize_1cuda_1context(
const char *cstr_rp = jenv->GetStringUTFChars(resource_path, nullptr);
size_t ctx = SpoofCUDAContext::initialize_cuda(device_id, cstr_rp);
jenv->ReleaseStringUTFChars(resource_path, cstr_rp);
- return ctx;
+ return static_cast<jlong>(ctx);
}
@@ -136,12 +136,12 @@ int launch_spoof_operator(JNIEnv *jenv, [[maybe_unused]]
jclass jobj, jlong _ctx
// wrap/cast inputs
std::vector<Matrix<T>> mats_in;
- for(auto i = 0; i < meta.num_inputs; i+=meta.entry_size)
+ for(auto i = 0ul; i < meta.num_inputs; i+=meta.entry_size)
mats_in.emplace_back(&inputs[i]);
// wrap/cast sides
std::vector<Matrix<T>> mats_sides;
- for(auto i = 0; i < meta.num_sides; i+=meta.entry_size)
+ for(auto i = 0ul; i < meta.num_sides; i+=meta.entry_size)
mats_sides.emplace_back(&sides[i]);
// wrap/cast output
diff --git a/src/main/cuda/spoof/cellwise.cu b/src/main/cuda/spoof/cellwise.cu
index ca25fe5..d70cc3d 100644
--- a/src/main/cuda/spoof/cellwise.cu
+++ b/src/main/cuda/spoof/cellwise.cu
@@ -56,9 +56,7 @@ struct SpoofCellwiseOp {
//%NEED_RIX%
//%NEED_CIX%
//%NEED_GRIX%
-
%BODY_dense%
-//printf("tid=%d a=%4.1f\n", threadIdx.x, a);
return %OUT%;
}
};
@@ -75,9 +73,4 @@ __global__ void /*%TMP%*/SPOOF_OP_NAME_SPARSE (Matrix<T>* a,
Matrix<T>* b, Matri
%AGG_OP%<T> agg_op;
SpoofCellwiseOp<T, NUM_B> spoof_op(a, b, c, scalars, grix);
%TYPE%_SPARSE<T, %AGG_OP%<T>, SpoofCellwiseOp<T, NUM_B>>(&(spoof_op.A),
&(spoof_op.c), n, %INITIAL_VALUE%, agg_op, spoof_op);
-
-// if(blockIdx.x == 0 && threadIdx.x == 0) {
-// for(auto i = 0; i < 30; ++i)
-// printf("%4.3f ", spoof_op.c.val(i));
-// }
};
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/hops/AggUnaryOp.java
b/src/main/java/org/apache/sysds/hops/AggUnaryOp.java
index 44d828c..4840f97 100644
--- a/src/main/java/org/apache/sysds/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysds/hops/AggUnaryOp.java
@@ -98,7 +98,7 @@ public class AggUnaryOp extends MultiThreadedHop
return false;
}
else if ((_op == AggOp.SUM && (_direction ==
Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-// || (_op == AggOp.SUM_SQ && (_direction
== Direction.RowCol || _direction == Direction.Row || _direction ==
Direction.Col))
+ || (_op == AggOp.SUM_SQ && (_direction
== Direction.RowCol || _direction == Direction.Row || _direction ==
Direction.Col))
|| (_op == AggOp.MAX && (_direction
== Direction.RowCol || _direction == Direction.Row || _direction ==
Direction.Col))
|| (_op == AggOp.MIN && (_direction
== Direction.RowCol || _direction == Direction.Row || _direction ==
Direction.Col))
|| (_op == AggOp.MEAN && (_direction
== Direction.RowCol || _direction == Direction.Row || _direction ==
Direction.Col))
diff --git a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
index c8cd6e2..2b52160 100644
--- a/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysds/hops/codegen/cplan/CNodeCell.java
@@ -275,8 +275,7 @@ public class CNodeCell extends CNodeTpl
}
@Override
public boolean isSupported(GeneratorAPI api) {
- return (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA)
&& _output.isSupported(api) &&
- !(getSpoofAggOp() == SpoofCellwise.AggOp.SUM_SQ);
+ return (api == GeneratorAPI.CUDA || api == GeneratorAPI.JAVA)
&& _output.isSupported(api);
}
public int compile(GeneratorAPI api, String src) {
diff --git
a/src/test/java/org/apache/sysds/test/functions/aggregate/SumSqTest.java
b/src/test/java/org/apache/sysds/test/functions/aggregate/SumSqTest.java
index e3a44cb..531a8ff 100644
--- a/src/test/java/org/apache/sysds/test/functions/aggregate/SumSqTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/aggregate/SumSqTest.java
@@ -206,7 +206,8 @@ public class SumSqTest extends AutomatedTestBase {
// On CP and Spark modes, check that the rewrite actually
// occurred for matrix cases and not for vector cases.
if (rewrites && (platform == ExecType.SPARK || platform ==
ExecType.CP)) {
- String prefix = (platform == ExecType.SPARK) ?
Instruction.SP_INST_PREFIX : "";
+ String prefix = (platform == ExecType.SPARK) ?
Instruction.SP_INST_PREFIX :
+ (DMLScript.USE_ACCELERATOR ? "gpu_": "");
String opcode = prefix + op;
boolean rewriteApplied =
Statistics.getCPHeavyHitterOpCodes().contains(opcode);
if (vector)