This is an automated email from the ASF dual-hosted git repository.

markd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 14d095efe5bc98d120bc0dd34270c3f12747b3cc
Author: Mark Dokter <[email protected]>
AuthorDate: Thu Apr 28 14:06:53 2022 +0200

    [SYSTEMDS-3362] CUDA code gen stream synchronization (bugfix)

    The CUDA code generation launcher handles streams per operator at the
    moment. This is wrong since a read before write can happen on a certain
    device allocation.

    Switching to a central stream object for now.

    Closes #1600
---
 src/main/cuda/spoof-launcher/SpoofCUDAContext.h |  9 ++++++--
 src/main/cuda/spoof-launcher/SpoofCellwise.h    | 30 ++++++++++++-------------
 src/main/cuda/spoof-launcher/SpoofOperator.h    |  6 ++---
 src/main/cuda/spoof-launcher/SpoofRowwise.h     |  6 ++---
 4 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
index e4b80c5e40..c902c38382 100644
--- a/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
+++ b/src/main/cuda/spoof-launcher/SpoofCUDAContext.h
@@ -55,9 +55,14 @@ public:
     size_t current_mem_size = 0; // the actual staging buffer size (should be default unless there was a resize)
     std::byte* staging_buffer{}; // pinned host mem for async transfers
     std::byte* device_buffer{}; // this buffer holds the pointers to the data buffers
+    cudaStream_t stream{};
 
     explicit SpoofCUDAContext(const char* resource_path_, std::vector<std::string> include_paths_) : reductions(nullptr),
-            resource_path(resource_path_), include_paths(std::move(include_paths_)) { }
+            resource_path(resource_path_), include_paths(std::move(include_paths_)) {
+        CHECK_CUDART(cudaStreamCreate(&stream));
+    }
+
+    virtual ~SpoofCUDAContext() { CHECK_CUDART(cudaStreamDestroy(stream)); }
 
     static size_t initialize_cuda(uint32_t device_id, const char* resource_path_);
 
@@ -70,7 +75,7 @@ public:
         DataBufferWrapper dbw(staging_buffer, device_buffer);
         SpoofOperator* op = compiled_ops[dbw.op_id()].get();
 
-        dbw.toDevice(op->stream);
+        dbw.toDevice(stream);
 
         CALL::exec(this, op, &dbw);
 
diff --git a/src/main/cuda/spoof-launcher/SpoofCellwise.h b/src/main/cuda/spoof-launcher/SpoofCellwise.h
index 9077840020..68b176b6f2 100644
--- a/src/main/cuda/spoof-launcher/SpoofCellwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofCellwise.h
@@ -27,7 +27,7 @@
 template<typename T>
 struct SpoofCellwiseFullAgg {
 
-    static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw) {
+    static void exec(SpoofCellwiseOp* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
         T value_type;
 
         // num ctas
@@ -46,7 +46,7 @@
 #endif
         CHECK_CUDA(op->program.get()->kernel(op_name)
                 .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
-                .configure(grid, block, shared_mem_size, op->stream)
+                .configure(grid, block, shared_mem_size, ctx->stream)
                 .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 
         if(NB > 1) {
@@ -64,7 +64,7 @@
                     << N << " elements" << std::endl;
 #endif
-            CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 1, NT, 1, 1, shared_mem_size, op->stream, args, nullptr));
+            CHECK_CUDA(cuLaunchKernel(op->agg_kernel,NB, 1, 1, NT, 1, 1, shared_mem_size, ctx->stream, args, nullptr));
 
             N = NB;
         }
     }
@@ -74,7 +74,7 @@
 
 template<typename T>
 struct SpoofCellwiseRowAgg {
-    static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw) {
+    static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
         T value_type;
 
         // num ctas
@@ -90,7 +90,7 @@ struct SpoofCellwiseRowAgg {
 #endif
         CHECK_CUDA(op->program->kernel(op_name)
                 .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
-                .configure(grid, block, shared_mem_size, op->stream)
+                .configure(grid, block, shared_mem_size, ctx->stream)
                 .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
     }
 
@@ -99,7 +99,7 @@
 
 template<typename T>
 struct SpoofCellwiseColAgg {
-    static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw) {
+    static void exec(SpoofOperator* op, uint32_t NT, uint32_t N, const std::string& op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
         T value_type;
 
         // num ctas
@@ -115,7 +115,7 @@ struct SpoofCellwiseColAgg {
 #endif
         CHECK_CUDA(op->program->kernel(op_name)
                 .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
-                .configure(grid, block, shared_mem_size, op->stream)
+                .configure(grid, block, shared_mem_size, ctx->stream)
                 .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
     }
 
@@ -124,7 +124,7 @@
 
 template<typename T>
 struct SpoofCellwiseNoAgg {
-    static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw) {
+    static void exec(SpoofOperator *op, uint32_t NT, uint32_t N, const std::string &op_name, DataBufferWrapper* dbw, SpoofCUDAContext* ctx) {
         T value_type;
         bool sparse_input = dbw->h_in<T>(0)->row_ptr != nullptr;
 
@@ -155,16 +155,16 @@
 #endif
         CHECK_CUDA(op->program->kernel(op_name)
                 .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1u), dbw->num_sides()))
-                .configure(grid, block, shared_mem_size, op->stream)
+                .configure(grid, block, shared_mem_size, ctx->stream)
                 .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), N, dbw->grix()));
 
         // copy over row indices from input to output if appropriate
         if (op->isSparseSafe() && dbw->h_in<T>(0)->row_ptr != nullptr) {
             // src/dst information (pointer address) is stored in *host* buffer!
             CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->row_ptr, dbw->h_in<T>(0)->row_ptr,
-                    (dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+                    (dbw->h_in<T>(0)->rows+1) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, ctx->stream));
             CHECK_CUDART(cudaMemcpyAsync(dbw->h_out<T>()->col_idx, dbw->h_in<T>(0)->col_idx,
-                    (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, op->stream));
+                    (dbw->h_in<T>(0)->nnz) * sizeof(uint32_t), cudaMemcpyDeviceToDevice, ctx->stream));
         }
     }
 };
@@ -186,16 +186,16 @@
         switch(op->agg_type) {
             case SpoofOperator::AggType::FULL_AGG:
                 op->agg_kernel = ctx->template getReductionKernel<T>(std::make_pair(op->agg_type, op->agg_op));
-                SpoofCellwiseFullAgg<T>::exec(op, NT, N, op_name, dbw);
+                SpoofCellwiseFullAgg<T>::exec(op, NT, N, op_name, dbw, ctx);
                 break;
             case SpoofOperator::AggType::ROW_AGG:
-                SpoofCellwiseRowAgg<T>::exec(op, NT, N, op_name, dbw);
+                SpoofCellwiseRowAgg<T>::exec(op, NT, N, op_name, dbw, ctx);
                 break;
             case SpoofOperator::AggType::COL_AGG:
-                SpoofCellwiseColAgg<T>::exec(op, NT, N, op_name, dbw);
+                SpoofCellwiseColAgg<T>::exec(op, NT, N, op_name, dbw, ctx);
                 break;
             case SpoofOperator::AggType::NO_AGG:
-                SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, dbw);
+                SpoofCellwiseNoAgg<T>::exec(op, NT, N, op_name, dbw, ctx);
                 break;
             default:
                 throw std::runtime_error("unknown cellwise agg type" + std::to_string(static_cast<int>(op->agg_type)));

diff --git a/src/main/cuda/spoof-launcher/SpoofOperator.h b/src/main/cuda/spoof-launcher/SpoofOperator.h
index f256e817db..045dcfdb80 100644
--- a/src/main/cuda/spoof-launcher/SpoofOperator.h
+++ b/src/main/cuda/spoof-launcher/SpoofOperator.h
@@ -42,10 +42,8 @@ struct SpoofOperator {
 
     [[nodiscard]] virtual bool isSparseSafe() const = 0;
 
-    cudaStream_t stream{};
-
-    SpoofOperator() { CHECK_CUDART(cudaStreamCreate(&stream));}
-    virtual ~SpoofOperator() {CHECK_CUDART(cudaStreamDestroy(stream));}
+    SpoofOperator() = default;
+    virtual ~SpoofOperator() = default;
 };
 
 struct SpoofCellwiseOp : public SpoofOperator {

diff --git a/src/main/cuda/spoof-launcher/SpoofRowwise.h b/src/main/cuda/spoof-launcher/SpoofRowwise.h
index 01ec5206aa..a9a656fbb7 100644
--- a/src/main/cuda/spoof-launcher/SpoofRowwise.h
+++ b/src/main/cuda/spoof-launcher/SpoofRowwise.h
@@ -39,7 +39,7 @@ struct SpoofRowwise {
         if(op->isSparseSafe() && dbw->h_out<T>()->nnz > 0)
             out_num_elements = dbw->h_out<T>()->nnz;
         //ToDo: only memset output when there is an output operation that *adds* to the buffer
-        CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, out_num_elements * sizeof(T), op->stream));
+        CHECK_CUDART(cudaMemsetAsync(dbw->h_out<T>()->data, 0, out_num_elements * sizeof(T), ctx->stream));
 
         //ToDo: handle this in JVM
         uint32_t tmp_len = 0;
@@ -52,7 +52,7 @@
             std::cout << "num_temp_vect: " << op->num_temp_vectors << " temp_buf_size: " << temp_buf_size << " tmp_len: " << tmp_len << std::endl;
 #endif
             CHECK_CUDART(cudaMalloc(reinterpret_cast<void**>(&d_temp), temp_buf_size));
-            CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, op->stream));
+            CHECK_CUDART(cudaMemsetAsync(d_temp, 0, temp_buf_size, ctx->stream));
         }
 
         std::string op_name(op->name + "_DENSE");
@@ -68,7 +68,7 @@
 #endif
         CHECK_CUDA(op->program->kernel(op_name)
                 .instantiate(type_of(value_type), std::max(static_cast<uint32_t>(1), dbw->num_sides()), op->num_temp_vectors, tmp_len)
-                .configure(grid, block, shared_mem_size, op->stream)
+                .configure(grid, block, shared_mem_size, ctx->stream)
                 .launch(dbw->d_in<T>(0), dbw->d_sides<T>(), dbw->d_out<T>(), dbw->d_scalars<T>(), d_temp, dbw->grix()));
 
         if(op->num_temp_vectors > 0)
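
For context on the bug this commit addresses: work submitted to different CUDA streams carries no ordering guarantee between the streams, so while each SPOOF operator owned its own stream, a kernel reading a device allocation could be scheduled before the kernel or async copy writing that allocation had completed. The following is a minimal, hypothetical sketch (not SystemDS code; the produce/consume kernels and buffer names are made up for illustration) of the read-before-write hazard, and of how issuing both pieces of work on one shared stream, as this commit does, restores the intended order:

// illustration.cu -- standalone hypothetical example, not part of the SystemDS sources
#include <cuda_runtime.h>

__global__ void produce(float* buf, int n) {                   // writes buf
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) buf[i] = static_cast<float>(i);
}

__global__ void consume(const float* buf, float* out, int n) { // reads buf
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = buf[i] * 2.0f;
}

int main() {
    const int n = 1 << 20;
    const int NT = 256, NB = (n + NT - 1) / NT;
    float *buf, *out;
    cudaMalloc(reinterpret_cast<void**>(&buf), n * sizeof(float));
    cudaMalloc(reinterpret_cast<void**>(&out), n * sizeof(float));

    // Buggy pattern (one stream per operator): the two launches go to different
    // streams, so the GPU may run "consume" before "produce" has finished
    // writing buf -- the read-before-write described in the commit message.
    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);
    produce<<<NB, NT, 0, s1>>>(buf, n);
    consume<<<NB, NT, 0, s2>>>(buf, out, n);   // not ordered after produce!
    cudaDeviceSynchronize();

    // Fixed pattern (one central stream): work on a single stream executes in
    // issue order, so the read is guaranteed to happen after the write.
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    produce<<<NB, NT, 0, stream>>>(buf, n);
    consume<<<NB, NT, 0, stream>>>(buf, out, n);
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
    cudaStreamDestroy(stream);
    cudaFree(buf);
    cudaFree(out);
    return 0;
}

A finer-grained alternative would be to keep per-operator streams and order them explicitly with cudaEventRecord/cudaStreamWaitEvent; the "for now" in the commit message suggests the single central stream on SpoofCUDAContext was chosen as the simpler fix.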
