sc/source/core/opencl/formulagroupcl.cxx |  745 +++++++++++++++----------------
 1 file changed, 376 insertions(+), 369 deletions(-)

New commits:
commit 7db6a64e40f172894e08ecec483f214f7f1d7e10
Author:     Stephan Bergmann <sberg...@redhat.com>
AuthorDate: Fri Dec 6 17:04:11 2019 +0100
Commit:     Stephan Bergmann <sberg...@redhat.com>
CommitDate: Fri Dec 6 20:35:21 2019 +0100

    Incomplete OpAverage, OpCount must not be used in dynamic_cast
    
    ...even in template code (which compilers often only analyze late during
    compilation, but which Clang trunk now apparently processes more 
aggressively,
    presumably since <https://github.com/llvm/llvm-project/commit/
    878a24ee244a24c39d1c57e9af2e88c621f7cce9> "Reapply 'Fix crash on switch
    conditions of non-integer types in templates'", emitting errors about 
incomplete
    types)
    
    Change-Id: I851d266007f72cc4063f299412eadacbc6084f70
    Reviewed-on: https://gerrit.libreoffice.org/84657
    Tested-by: Jenkins
    Reviewed-by: Stephan Bergmann <sberg...@redhat.com>

diff --git a/sc/source/core/opencl/formulagroupcl.cxx 
b/sc/source/core/opencl/formulagroupcl.cxx
index a0a0a8e4bc46..f126311d5bf3 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -1032,9 +1032,6 @@ protected:
 /// Handling a Double Vector that is used as a sliding window input
 /// to either a sliding window average or sum-of-products
 /// Generate a sequential loop for reductions
-class OpAverage;
-class OpCount;
-
 template<class Base>
 class DynamicKernelSlidingArgument : public Base
 {
@@ -1345,185 +1342,7 @@ public:
     }
 
     /// Emit the definition for the auxiliary reduction kernel
-    virtual void GenSlidingWindowFunction( std::stringstream& ss )
-    {
-        if (!dynamic_cast<OpAverage*>(mpCodeGen.get()))
-        {
-            std::string name = Base::GetName();
-            ss << "__kernel void " << name;
-            ss << "_reduction(__global double* A, "
-                "__global double *result,int arrayLength,int windowSize){\n";
-            ss << "    double tmp, current_result =" <<
-                mpCodeGen->GetBottom();
-            ss << ";\n";
-            ss << "    int writePos = get_group_id(1);\n";
-            ss << "    int lidx = get_local_id(0);\n";
-            ss << "    __local double shm_buf[256];\n";
-            if (mpDVR->IsStartFixed())
-                ss << "    int offset = 0;\n";
-            else // if (!mpDVR->IsStartFixed())
-                ss << "    int offset = get_group_id(1);\n";
-            if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
-                ss << "    int end = windowSize;\n";
-            else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
-                ss << "    int end = offset + windowSize;\n";
-            else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
-                ss << "    int end = windowSize + get_group_id(1);\n";
-            else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
-                ss << "    int end = windowSize;\n";
-            ss << "    end = min(end, arrayLength);\n";
-
-            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    int loop = arrayLength/512 + 1;\n";
-            ss << "    for (int l=0; l<loop; l++){\n";
-            ss << "    tmp = " << mpCodeGen->GetBottom() << ";\n";
-            ss << "    int loopOffset = l*512;\n";
-            ss << "    if((loopOffset + lidx + offset + 256) < end) {\n";
-            ss << "        tmp = legalize(" << mpCodeGen->Gen2(
-                "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n";
-            ss << "        tmp = legalize(" << mpCodeGen->Gen2(
-                "A[loopOffset + lidx + offset + 256]", "tmp") << ", tmp);\n";
-            ss << "    } else if ((loopOffset + lidx + offset) < end)\n";
-            ss << "        tmp = legalize(" << mpCodeGen->Gen2(
-                "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n";
-            ss << "    shm_buf[lidx] = tmp;\n";
-            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    for (int i = 128; i >0; i/=2) {\n";
-            ss << "        if (lidx < i)\n";
-            ss << "            shm_buf[lidx] = ";
-            // Special case count
-            if (dynamic_cast<OpCount*>(mpCodeGen.get()))
-                ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
-            else
-                ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]") << 
";\n";
-            ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    }\n";
-            ss << "        if (lidx == 0)\n";
-            ss << "            current_result =";
-            if (dynamic_cast<OpCount*>(mpCodeGen.get()))
-                ss << "current_result + shm_buf[0]";
-            else
-                ss << mpCodeGen->Gen2("current_result", "shm_buf[0]");
-            ss << ";\n";
-            ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    }\n";
-            ss << "    if (lidx == 0)\n";
-            ss << "        result[writePos] = current_result;\n";
-            ss << "}\n";
-        }
-        else
-        {
-            std::string name = Base::GetName();
-            /*sum reduction*/
-            ss << "__kernel void " << name << "_sum";
-            ss << "_reduction(__global double* A, "
-                "__global double *result,int arrayLength,int windowSize){\n";
-            ss << "    double tmp, current_result =" <<
-                mpCodeGen->GetBottom();
-            ss << ";\n";
-            ss << "    int writePos = get_group_id(1);\n";
-            ss << "    int lidx = get_local_id(0);\n";
-            ss << "    __local double shm_buf[256];\n";
-            if (mpDVR->IsStartFixed())
-                ss << "    int offset = 0;\n";
-            else // if (!mpDVR->IsStartFixed())
-                ss << "    int offset = get_group_id(1);\n";
-            if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
-                ss << "    int end = windowSize;\n";
-            else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
-                ss << "    int end = offset + windowSize;\n";
-            else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
-                ss << "    int end = windowSize + get_group_id(1);\n";
-            else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
-                ss << "    int end = windowSize;\n";
-            ss << "    end = min(end, arrayLength);\n";
-            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    int loop = arrayLength/512 + 1;\n";
-            ss << "    for (int l=0; l<loop; l++){\n";
-            ss << "    tmp = " << mpCodeGen->GetBottom() << ";\n";
-            ss << "    int loopOffset = l*512;\n";
-            ss << "    if((loopOffset + lidx + offset + 256) < end) {\n";
-            ss << "        tmp = legalize(";
-            ss << "(A[loopOffset + lidx + offset]+ tmp)";
-            ss << ", tmp);\n";
-            ss << "        tmp = legalize((A[loopOffset + lidx + offset + 
256]+ tmp)";
-            ss << ", tmp);\n";
-            ss << "    } else if ((loopOffset + lidx + offset) < end)\n";
-            ss << "        tmp = legalize((A[loopOffset + lidx + offset] + 
tmp)";
-            ss << ", tmp);\n";
-            ss << "    shm_buf[lidx] = tmp;\n";
-            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    for (int i = 128; i >0; i/=2) {\n";
-            ss << "        if (lidx < i)\n";
-            ss << "            shm_buf[lidx] = ";
-            ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
-            ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    }\n";
-            ss << "        if (lidx == 0)\n";
-            ss << "            current_result =";
-            ss << "current_result + shm_buf[0]";
-            ss << ";\n";
-            ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    }\n";
-            ss << "    if (lidx == 0)\n";
-            ss << "        result[writePos] = current_result;\n";
-            ss << "}\n";
-            /*count reduction*/
-            ss << "__kernel void " << name << "_count";
-            ss << "_reduction(__global double* A, "
-                "__global double *result,int arrayLength,int windowSize){\n";
-            ss << "    double tmp, current_result =" <<
-                mpCodeGen->GetBottom();
-            ss << ";\n";
-            ss << "    int writePos = get_group_id(1);\n";
-            ss << "    int lidx = get_local_id(0);\n";
-            ss << "    __local double shm_buf[256];\n";
-            if (mpDVR->IsStartFixed())
-                ss << "    int offset = 0;\n";
-            else // if (!mpDVR->IsStartFixed())
-                ss << "    int offset = get_group_id(1);\n";
-            if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
-                ss << "    int end = windowSize;\n";
-            else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
-                ss << "    int end = offset + windowSize;\n";
-            else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
-                ss << "    int end = windowSize + get_group_id(1);\n";
-            else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
-                ss << "    int end = windowSize;\n";
-            ss << "    end = min(end, arrayLength);\n";
-            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    int loop = arrayLength/512 + 1;\n";
-            ss << "    for (int l=0; l<loop; l++){\n";
-            ss << "    tmp = " << mpCodeGen->GetBottom() << ";\n";
-            ss << "    int loopOffset = l*512;\n";
-            ss << "    if((loopOffset + lidx + offset + 256) < end) {\n";
-            ss << "        tmp = legalize((isnan(A[loopOffset + lidx + 
offset])?tmp:tmp+1.0)";
-            ss << ", tmp);\n";
-            ss << "        tmp = legalize((isnan(A[loopOffset + lidx + 
offset+256])?tmp:tmp+1.0)";
-            ss << ", tmp);\n";
-            ss << "    } else if ((loopOffset + lidx + offset) < end)\n";
-            ss << "        tmp = legalize((isnan(A[loopOffset + lidx + 
offset])?tmp:tmp+1.0)";
-            ss << ", tmp);\n";
-            ss << "    shm_buf[lidx] = tmp;\n";
-            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    for (int i = 128; i >0; i/=2) {\n";
-            ss << "        if (lidx < i)\n";
-            ss << "            shm_buf[lidx] = ";
-            ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
-            ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    }\n";
-            ss << "        if (lidx == 0)\n";
-            ss << "            current_result =";
-            ss << "current_result + shm_buf[0];";
-            ss << ";\n";
-            ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
-            ss << "    }\n";
-            ss << "    if (lidx == 0)\n";
-            ss << "        result[writePos] = current_result;\n";
-            ss << "}\n";
-        }
-
-    }
+    virtual void GenSlidingWindowFunction( std::stringstream& ss );
 
     virtual std::string GenSlidingWindowDeclRef( bool ) const
     {
@@ -1537,194 +1356,9 @@ public:
 
     /// Controls how the elements in the DoubleVectorRef are traversed
     size_t GenReductionLoopHeader(
-        std::stringstream& ss, int nResultSize, bool& needBody )
-    {
-        assert(mpDVR);
-        size_t nCurWindowSize = mpDVR->GetRefRowSize();
-        std::string temp = Base::GetName() + "[gid0]";
-        ss << "tmp = ";
-        // Special case count
-        if (dynamic_cast<OpAverage*>(mpCodeGen.get()))
-        {
-            ss << mpCodeGen->Gen2(temp, "tmp") << ";\n";
-            ss << "nCount = nCount-1;\n";
-            ss << "nCount = nCount +"; /*re-assign nCount from count 
reduction*/
-            ss << Base::GetName() << "[gid0+" << nResultSize << "]" << ";\n";
-        }
-        else if (dynamic_cast<OpCount*>(mpCodeGen.get()))
-            ss << temp << "+ tmp";
-        else
-            ss << mpCodeGen->Gen2(temp, "tmp");
-        ss << ";\n\t";
-        needBody = false;
-        return nCurWindowSize;
-    }
+        std::stringstream& ss, int nResultSize, bool& needBody );
 
-    virtual size_t Marshal( cl_kernel k, int argno, int w, cl_program 
mpProgram )
-    {
-        assert(Base::mpClmem == nullptr);
-
-        openclwrapper::KernelEnv kEnv;
-        openclwrapper::setKernelEnv(&kEnv);
-        cl_int err;
-        size_t nInput = mpDVR->GetArrayLength();
-        size_t nCurWindowSize = mpDVR->GetRefRowSize();
-        // create clmem buffer
-        if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr)
-            throw Unhandled(__FILE__, __LINE__);
-        double* pHostBuffer = const_cast<double*>(
-            mpDVR->GetArrays()[Base::mnIndex].mpNumericArray);
-        size_t szHostBuffer = nInput * sizeof(double);
-        Base::mpClmem = clCreateBuffer(kEnv.mpkContext,
-            cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR,
-            szHostBuffer,
-            pHostBuffer, &err);
-        SAL_INFO("sc.opencl", "Created buffer " << Base::mpClmem << " size " 
<< nInput << "*" << sizeof(double) << "=" << szHostBuffer << " using host 
buffer " << pHostBuffer);
-
-        mpClmem2 = clCreateBuffer(kEnv.mpkContext,
-            CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-            sizeof(double) * w, nullptr, nullptr);
-        if (CL_SUCCESS != err)
-            throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__);
-        SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << 
sizeof(double) << "*" << w << "=" << (sizeof(double)*w));
-
-        // reproduce the reduction function name
-        std::string kernelName;
-        if (!dynamic_cast<OpAverage*>(mpCodeGen.get()))
-            kernelName = Base::GetName() + "_reduction";
-        else
-            kernelName = Base::GetName() + "_sum_reduction";
-        cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), 
&err);
-        if (err != CL_SUCCESS)
-            throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__);
-        SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " 
<< kernelName << " in program " << mpProgram);
-
-        // set kernel arg of reduction kernel
-        // TODO(Wei Wei): use unique name for kernel
-        cl_mem buf = Base::GetCLBuffer();
-        SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": 
cl_mem: " << buf);
-        err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
-            static_cast<void*>(&buf));
-        if (CL_SUCCESS != err)
-            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-
-        SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": 
cl_mem: " << mpClmem2);
-        err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2);
-        if (CL_SUCCESS != err)
-            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-
-        SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": 
cl_int: " << nInput);
-        err = clSetKernelArg(redKernel, 2, sizeof(cl_int), 
static_cast<void*>(&nInput));
-        if (CL_SUCCESS != err)
-            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-
-        SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": 
cl_int: " << nCurWindowSize);
-        err = clSetKernelArg(redKernel, 3, sizeof(cl_int), 
static_cast<void*>(&nCurWindowSize));
-        if (CL_SUCCESS != err)
-            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-
-        // set work group size and execute
-        size_t global_work_size[] = { 256, static_cast<size_t>(w) };
-        size_t const local_work_size[] = { 256, 1 };
-        SAL_INFO("sc.opencl", "Enqueuing kernel " << redKernel);
-        err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr,
-            global_work_size, local_work_size, 0, nullptr, nullptr);
-        if (CL_SUCCESS != err)
-            throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, 
__LINE__);
-        err = clFinish(kEnv.mpkCmdQueue);
-        if (CL_SUCCESS != err)
-            throw OpenCLError("clFinish", err, __FILE__, __LINE__);
-        if (dynamic_cast<OpAverage*>(mpCodeGen.get()))
-        {
-            /*average need more reduction kernel for count computing*/
-            std::unique_ptr<double[]> pAllBuffer(new double[2 * w]);
-            double* resbuf = 
static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue,
-                mpClmem2,
-                CL_TRUE, CL_MAP_READ, 0,
-                sizeof(double) * w, 0, nullptr, nullptr,
-                &err));
-            if (err != CL_SUCCESS)
-                throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, 
__LINE__);
-
-            for (int i = 0; i < w; i++)
-                pAllBuffer[i] = resbuf[i];
-            err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 
0, nullptr, nullptr);
-            if (err != CL_SUCCESS)
-                throw OpenCLError("clEnqueueUnmapMemObject", err, __FILE__, 
__LINE__);
-
-            kernelName = Base::GetName() + "_count_reduction";
-            redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
-            if (err != CL_SUCCESS)
-                throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__);
-            SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with 
name " << kernelName << " in program " << mpProgram);
-
-            // set kernel arg of reduction kernel
-            buf = Base::GetCLBuffer();
-            SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": 
cl_mem: " << buf);
-            err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
-                static_cast<void*>(&buf));
-            if (CL_SUCCESS != err)
-                throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-
-            SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": 
cl_mem: " << mpClmem2);
-            err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2);
-            if (CL_SUCCESS != err)
-                throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-
-            SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": 
cl_int: " << nInput);
-            err = clSetKernelArg(redKernel, 2, sizeof(cl_int), 
static_cast<void*>(&nInput));
-            if (CL_SUCCESS != err)
-                throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-
-            SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": 
cl_int: " << nCurWindowSize);
-            err = clSetKernelArg(redKernel, 3, sizeof(cl_int), 
static_cast<void*>(&nCurWindowSize));
-            if (CL_SUCCESS != err)
-                throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-
-            // set work group size and execute
-            size_t global_work_size1[] = { 256, static_cast<size_t>(w) };
-            size_t const local_work_size1[] = { 256, 1 };
-            SAL_INFO("sc.opencl", "Enqueuing kernel " << redKernel);
-            err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, 
nullptr,
-                global_work_size1, local_work_size1, 0, nullptr, nullptr);
-            if (CL_SUCCESS != err)
-                throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, 
__LINE__);
-            err = clFinish(kEnv.mpkCmdQueue);
-            if (CL_SUCCESS != err)
-                throw OpenCLError("clFinish", err, __FILE__, __LINE__);
-            resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue,
-                mpClmem2,
-                CL_TRUE, CL_MAP_READ, 0,
-                sizeof(double) * w, 0, nullptr, nullptr,
-                &err));
-            if (err != CL_SUCCESS)
-                throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, 
__LINE__);
-            for (int i = 0; i < w; i++)
-                pAllBuffer[i + w] = resbuf[i];
-            err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 
0, nullptr, nullptr);
-            // FIXME: Is it intentional to not throw an OpenCLError even if 
the clEnqueueUnmapMemObject() fails?
-            if (CL_SUCCESS != err)
-                SAL_WARN("sc.opencl", "clEnqueueUnmapMemObject failed: " << 
openclwrapper::errorString(err));
-            if (mpClmem2)
-            {
-                err = clReleaseMemObject(mpClmem2);
-                SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", 
"clReleaseMemObject failed: " << openclwrapper::errorString(err));
-                mpClmem2 = nullptr;
-            }
-            mpClmem2 = clCreateBuffer(kEnv.mpkContext,
-                cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR,
-                w * sizeof(double) * 2, pAllBuffer.get(), &err);
-            if (CL_SUCCESS != err)
-                throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__);
-            SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << 
w << "*" << sizeof(double) << "=" << (w*sizeof(double)) << " copying host 
buffer " << pAllBuffer.get());
-        }
-        // set kernel arg
-        SAL_INFO("sc.opencl", "Kernel " << k << " arg " << argno << ": cl_mem: 
" << mpClmem2);
-        err = clSetKernelArg(k, argno, sizeof(cl_mem), &mpClmem2);
-        if (CL_SUCCESS != err)
-            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
-        return 1;
-    }
+    virtual size_t Marshal( cl_kernel k, int argno, int w, cl_program 
mpProgram );
 
     ~ParallelReductionVectorRef()
     {
@@ -2327,6 +1961,379 @@ public:
     virtual std::string BinFuncName() const override { return "fsop"; }
 };
 
+template<class Base>
+void ParallelReductionVectorRef<Base>::GenSlidingWindowFunction( 
std::stringstream& ss )
+{
+    if (!dynamic_cast<OpAverage*>(mpCodeGen.get()))
+    {
+        std::string name = Base::GetName();
+        ss << "__kernel void " << name;
+        ss << "_reduction(__global double* A, "
+            "__global double *result,int arrayLength,int windowSize){\n";
+        ss << "    double tmp, current_result =" <<
+            mpCodeGen->GetBottom();
+        ss << ";\n";
+        ss << "    int writePos = get_group_id(1);\n";
+        ss << "    int lidx = get_local_id(0);\n";
+        ss << "    __local double shm_buf[256];\n";
+        if (mpDVR->IsStartFixed())
+            ss << "    int offset = 0;\n";
+        else // if (!mpDVR->IsStartFixed())
+            ss << "    int offset = get_group_id(1);\n";
+        if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+            ss << "    int end = windowSize;\n";
+        else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+            ss << "    int end = offset + windowSize;\n";
+        else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+            ss << "    int end = windowSize + get_group_id(1);\n";
+        else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+            ss << "    int end = windowSize;\n";
+        ss << "    end = min(end, arrayLength);\n";
+
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    int loop = arrayLength/512 + 1;\n";
+        ss << "    for (int l=0; l<loop; l++){\n";
+        ss << "    tmp = " << mpCodeGen->GetBottom() << ";\n";
+        ss << "    int loopOffset = l*512;\n";
+        ss << "    if((loopOffset + lidx + offset + 256) < end) {\n";
+        ss << "        tmp = legalize(" << mpCodeGen->Gen2(
+            "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n";
+        ss << "        tmp = legalize(" << mpCodeGen->Gen2(
+            "A[loopOffset + lidx + offset + 256]", "tmp") << ", tmp);\n";
+        ss << "    } else if ((loopOffset + lidx + offset) < end)\n";
+        ss << "        tmp = legalize(" << mpCodeGen->Gen2(
+            "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n";
+        ss << "    shm_buf[lidx] = tmp;\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    for (int i = 128; i >0; i/=2) {\n";
+        ss << "        if (lidx < i)\n";
+        ss << "            shm_buf[lidx] = ";
+        // Special case count
+        if (dynamic_cast<OpCount*>(mpCodeGen.get()))
+            ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
+        else
+            ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]") << 
";\n";
+        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "        if (lidx == 0)\n";
+        ss << "            current_result =";
+        if (dynamic_cast<OpCount*>(mpCodeGen.get()))
+            ss << "current_result + shm_buf[0]";
+        else
+            ss << mpCodeGen->Gen2("current_result", "shm_buf[0]");
+        ss << ";\n";
+        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "    if (lidx == 0)\n";
+        ss << "        result[writePos] = current_result;\n";
+        ss << "}\n";
+    }
+    else
+    {
+        std::string name = Base::GetName();
+        /*sum reduction*/
+        ss << "__kernel void " << name << "_sum";
+        ss << "_reduction(__global double* A, "
+            "__global double *result,int arrayLength,int windowSize){\n";
+        ss << "    double tmp, current_result =" <<
+            mpCodeGen->GetBottom();
+        ss << ";\n";
+        ss << "    int writePos = get_group_id(1);\n";
+        ss << "    int lidx = get_local_id(0);\n";
+        ss << "    __local double shm_buf[256];\n";
+        if (mpDVR->IsStartFixed())
+            ss << "    int offset = 0;\n";
+        else // if (!mpDVR->IsStartFixed())
+            ss << "    int offset = get_group_id(1);\n";
+        if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+            ss << "    int end = windowSize;\n";
+        else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+            ss << "    int end = offset + windowSize;\n";
+        else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+            ss << "    int end = windowSize + get_group_id(1);\n";
+        else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+            ss << "    int end = windowSize;\n";
+        ss << "    end = min(end, arrayLength);\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    int loop = arrayLength/512 + 1;\n";
+        ss << "    for (int l=0; l<loop; l++){\n";
+        ss << "    tmp = " << mpCodeGen->GetBottom() << ";\n";
+        ss << "    int loopOffset = l*512;\n";
+        ss << "    if((loopOffset + lidx + offset + 256) < end) {\n";
+        ss << "        tmp = legalize(";
+        ss << "(A[loopOffset + lidx + offset]+ tmp)";
+        ss << ", tmp);\n";
+        ss << "        tmp = legalize((A[loopOffset + lidx + offset + 256]+ 
tmp)";
+        ss << ", tmp);\n";
+        ss << "    } else if ((loopOffset + lidx + offset) < end)\n";
+        ss << "        tmp = legalize((A[loopOffset + lidx + offset] + tmp)";
+        ss << ", tmp);\n";
+        ss << "    shm_buf[lidx] = tmp;\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    for (int i = 128; i >0; i/=2) {\n";
+        ss << "        if (lidx < i)\n";
+        ss << "            shm_buf[lidx] = ";
+        ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
+        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "        if (lidx == 0)\n";
+        ss << "            current_result =";
+        ss << "current_result + shm_buf[0]";
+        ss << ";\n";
+        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "    if (lidx == 0)\n";
+        ss << "        result[writePos] = current_result;\n";
+        ss << "}\n";
+        /*count reduction*/
+        ss << "__kernel void " << name << "_count";
+        ss << "_reduction(__global double* A, "
+            "__global double *result,int arrayLength,int windowSize){\n";
+        ss << "    double tmp, current_result =" <<
+            mpCodeGen->GetBottom();
+        ss << ";\n";
+        ss << "    int writePos = get_group_id(1);\n";
+        ss << "    int lidx = get_local_id(0);\n";
+        ss << "    __local double shm_buf[256];\n";
+        if (mpDVR->IsStartFixed())
+            ss << "    int offset = 0;\n";
+        else // if (!mpDVR->IsStartFixed())
+            ss << "    int offset = get_group_id(1);\n";
+        if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+            ss << "    int end = windowSize;\n";
+        else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+            ss << "    int end = offset + windowSize;\n";
+        else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
+            ss << "    int end = windowSize + get_group_id(1);\n";
+        else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
+            ss << "    int end = windowSize;\n";
+        ss << "    end = min(end, arrayLength);\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    int loop = arrayLength/512 + 1;\n";
+        ss << "    for (int l=0; l<loop; l++){\n";
+        ss << "    tmp = " << mpCodeGen->GetBottom() << ";\n";
+        ss << "    int loopOffset = l*512;\n";
+        ss << "    if((loopOffset + lidx + offset + 256) < end) {\n";
+        ss << "        tmp = legalize((isnan(A[loopOffset + lidx + 
offset])?tmp:tmp+1.0)";
+        ss << ", tmp);\n";
+        ss << "        tmp = legalize((isnan(A[loopOffset + lidx + 
offset+256])?tmp:tmp+1.0)";
+        ss << ", tmp);\n";
+        ss << "    } else if ((loopOffset + lidx + offset) < end)\n";
+        ss << "        tmp = legalize((isnan(A[loopOffset + lidx + 
offset])?tmp:tmp+1.0)";
+        ss << ", tmp);\n";
+        ss << "    shm_buf[lidx] = tmp;\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    for (int i = 128; i >0; i/=2) {\n";
+        ss << "        if (lidx < i)\n";
+        ss << "            shm_buf[lidx] = ";
+        ss << "shm_buf[lidx] + shm_buf[lidx + i];\n";
+        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "        if (lidx == 0)\n";
+        ss << "            current_result =";
+        ss << "current_result + shm_buf[0];";
+        ss << ";\n";
+        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "    if (lidx == 0)\n";
+        ss << "        result[writePos] = current_result;\n";
+        ss << "}\n";
+    }
+
+}
+
+template<class Base>
+size_t ParallelReductionVectorRef<Base>::GenReductionLoopHeader(
+    std::stringstream& ss, int nResultSize, bool& needBody )
+{
+    assert(mpDVR);
+    size_t nCurWindowSize = mpDVR->GetRefRowSize();
+    std::string temp = Base::GetName() + "[gid0]";
+    ss << "tmp = ";
+    // Special case count
+    if (dynamic_cast<OpAverage*>(mpCodeGen.get()))
+    {
+        ss << mpCodeGen->Gen2(temp, "tmp") << ";\n";
+        ss << "nCount = nCount-1;\n";
+        ss << "nCount = nCount +"; /*re-assign nCount from count reduction*/
+        ss << Base::GetName() << "[gid0+" << nResultSize << "]" << ";\n";
+    }
+    else if (dynamic_cast<OpCount*>(mpCodeGen.get()))
+        ss << temp << "+ tmp";
+    else
+        ss << mpCodeGen->Gen2(temp, "tmp");
+    ss << ";\n\t";
+    needBody = false;
+    return nCurWindowSize;
+}
+
+template<class Base>
+size_t ParallelReductionVectorRef<Base>::Marshal( cl_kernel k, int argno, int 
w, cl_program mpProgram )
+{
+    assert(Base::mpClmem == nullptr);
+
+    openclwrapper::KernelEnv kEnv;
+    openclwrapper::setKernelEnv(&kEnv);
+    cl_int err;
+    size_t nInput = mpDVR->GetArrayLength();
+    size_t nCurWindowSize = mpDVR->GetRefRowSize();
+    // create clmem buffer
+    if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr)
+        throw Unhandled(__FILE__, __LINE__);
+    double* pHostBuffer = const_cast<double*>(
+        mpDVR->GetArrays()[Base::mnIndex].mpNumericArray);
+    size_t szHostBuffer = nInput * sizeof(double);
+    Base::mpClmem = clCreateBuffer(kEnv.mpkContext,
+        cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR,
+        szHostBuffer,
+        pHostBuffer, &err);
+    SAL_INFO("sc.opencl", "Created buffer " << Base::mpClmem << " size " << 
nInput << "*" << sizeof(double) << "=" << szHostBuffer << " using host buffer " 
<< pHostBuffer);
+
+    mpClmem2 = clCreateBuffer(kEnv.mpkContext,
+        CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        sizeof(double) * w, nullptr, nullptr);
+    if (CL_SUCCESS != err)
+        throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__);
+    SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << 
sizeof(double) << "*" << w << "=" << (sizeof(double)*w));
+
+    // reproduce the reduction function name
+    std::string kernelName;
+    if (!dynamic_cast<OpAverage*>(mpCodeGen.get()))
+        kernelName = Base::GetName() + "_reduction";
+    else
+        kernelName = Base::GetName() + "_sum_reduction";
+    cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
+    if (err != CL_SUCCESS)
+        throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__);
+    SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << 
kernelName << " in program " << mpProgram);
+
+    // set kernel arg of reduction kernel
+    // TODO(Wei Wei): use unique name for kernel
+    cl_mem buf = Base::GetCLBuffer();
+    SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: 
" << buf);
+    err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
+        static_cast<void*>(&buf));
+    if (CL_SUCCESS != err)
+        throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+    SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: 
" << mpClmem2);
+    err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2);
+    if (CL_SUCCESS != err)
+        throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+    SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: 
" << nInput);
+    err = clSetKernelArg(redKernel, 2, sizeof(cl_int), 
static_cast<void*>(&nInput));
+    if (CL_SUCCESS != err)
+        throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+    SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: 
" << nCurWindowSize);
+    err = clSetKernelArg(redKernel, 3, sizeof(cl_int), 
static_cast<void*>(&nCurWindowSize));
+    if (CL_SUCCESS != err)
+        throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+    // set work group size and execute
+    size_t global_work_size[] = { 256, static_cast<size_t>(w) };
+    size_t const local_work_size[] = { 256, 1 };
+    SAL_INFO("sc.opencl", "Enqueuing kernel " << redKernel);
+    err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr,
+        global_work_size, local_work_size, 0, nullptr, nullptr);
+    if (CL_SUCCESS != err)
+        throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__);
+    err = clFinish(kEnv.mpkCmdQueue);
+    if (CL_SUCCESS != err)
+        throw OpenCLError("clFinish", err, __FILE__, __LINE__);
+    if (dynamic_cast<OpAverage*>(mpCodeGen.get()))
+    {
+        /*average need more reduction kernel for count computing*/
+        std::unique_ptr<double[]> pAllBuffer(new double[2 * w]);
+        double* resbuf = 
static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue,
+            mpClmem2,
+            CL_TRUE, CL_MAP_READ, 0,
+            sizeof(double) * w, 0, nullptr, nullptr,
+            &err));
+        if (err != CL_SUCCESS)
+            throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__);
+
+        for (int i = 0; i < w; i++)
+            pAllBuffer[i] = resbuf[i];
+        err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, 
nullptr, nullptr);
+        if (err != CL_SUCCESS)
+            throw OpenCLError("clEnqueueUnmapMemObject", err, __FILE__, 
__LINE__);
+
+        kernelName = Base::GetName() + "_count_reduction";
+        redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
+        if (err != CL_SUCCESS)
+            throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__);
+        SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " 
<< kernelName << " in program " << mpProgram);
+
+        // set kernel arg of reduction kernel
+        buf = Base::GetCLBuffer();
+        SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": 
cl_mem: " << buf);
+        err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
+            static_cast<void*>(&buf));
+        if (CL_SUCCESS != err)
+            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+        SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": 
cl_mem: " << mpClmem2);
+        err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2);
+        if (CL_SUCCESS != err)
+            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+        SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": 
cl_int: " << nInput);
+        err = clSetKernelArg(redKernel, 2, sizeof(cl_int), 
static_cast<void*>(&nInput));
+        if (CL_SUCCESS != err)
+            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+        SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": 
cl_int: " << nCurWindowSize);
+        err = clSetKernelArg(redKernel, 3, sizeof(cl_int), 
static_cast<void*>(&nCurWindowSize));
+        if (CL_SUCCESS != err)
+            throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+
+        // set work group size and execute
+        size_t global_work_size1[] = { 256, static_cast<size_t>(w) };
+        size_t const local_work_size1[] = { 256, 1 };
+        SAL_INFO("sc.opencl", "Enqueuing kernel " << redKernel);
+        err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr,
+            global_work_size1, local_work_size1, 0, nullptr, nullptr);
+        if (CL_SUCCESS != err)
+            throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, 
__LINE__);
+        err = clFinish(kEnv.mpkCmdQueue);
+        if (CL_SUCCESS != err)
+            throw OpenCLError("clFinish", err, __FILE__, __LINE__);
+        resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue,
+            mpClmem2,
+            CL_TRUE, CL_MAP_READ, 0,
+            sizeof(double) * w, 0, nullptr, nullptr,
+            &err));
+        if (err != CL_SUCCESS)
+            throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__);
+        for (int i = 0; i < w; i++)
+            pAllBuffer[i + w] = resbuf[i];
+        err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, 
nullptr, nullptr);
+        // FIXME: Is it intentional to not throw an OpenCLError even if the 
clEnqueueUnmapMemObject() fails?
+        if (CL_SUCCESS != err)
+            SAL_WARN("sc.opencl", "clEnqueueUnmapMemObject failed: " << 
openclwrapper::errorString(err));
+        if (mpClmem2)
+        {
+            err = clReleaseMemObject(mpClmem2);
+            SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject 
failed: " << openclwrapper::errorString(err));
+            mpClmem2 = nullptr;
+        }
+        mpClmem2 = clCreateBuffer(kEnv.mpkContext,
+            cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR,
+            w * sizeof(double) * 2, pAllBuffer.get(), &err);
+        if (CL_SUCCESS != err)
+            throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__);
+        SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << w 
<< "*" << sizeof(double) << "=" << (w*sizeof(double)) << " copying host buffer 
" << pAllBuffer.get());
+    }
+    // set kernel arg
+    SAL_INFO("sc.opencl", "Kernel " << k << " arg " << argno << ": cl_mem: " 
<< mpClmem2);
+    err = clSetKernelArg(k, argno, sizeof(cl_mem), &mpClmem2);
+    if (CL_SUCCESS != err)
+        throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__);
+    return 1;
+}
+
 struct SumIfsArgs
 {
     explicit SumIfsArgs(cl_mem x) : mCLMem(x), mConst(0.0) { }
_______________________________________________
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

Reply via email to