Verified. This patch can save 2/3 time cost for clEnqueueCopyBuffer. The test case has been sent out: [Beignet][PATCH 1/2 v2] Prepare to add uperformance test suite [Beignet] [PATCH 2/2 v2] Add unit performance suite
Thanks --Sun, Yi > -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of Lv > Meng > Sent: Thursday, March 20, 2014 3:07 PM > To: [email protected] > Cc: Lv, Meng > Subject: [Beignet] [PATCH] [PATCH_V3] GBE: Improve the clEnqueueCopyBuffer > performance in not-aligned case > > Signed-off-by: Lv Meng <[email protected]> > --- > src/CMakeLists.txt | 3 +- > src/cl_context.h | 1 + > src/cl_mem.c | 79 > ++++++++++++++++++++++---- > src/kernels/cl_internel_copy_buf_dword_copy.cl | 19 +++++++ > 4 files changed, 89 insertions(+), 13 deletions(-) mode change 100644 => > 100755 src/CMakeLists.txt mode change 100644 => 100755 src/cl_context.h > mode change 100644 => 100755 src/cl_mem.c create mode 100755 > src/kernels/cl_internel_copy_buf_dword_copy.cl > > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt old mode 100644 new > mode 100755 index 95ff56f..3c23d3d > --- a/src/CMakeLists.txt > +++ b/src/CMakeLists.txt > @@ -18,7 +18,8 @@ endforeach (KF) > endmacro (MakeKernelBinStr) > > set (KERNEL_STR_FILES) > -set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 > cl_internal_copy_buf_align16) > +set (KERNEL_NAMES cl_internal_copy_buf_align1 > +cl_internal_copy_buf_align4 > +cl_internal_copy_buf_align16 cl_internel_copy_buf_dword_copy) > MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" > "${KERNEL_NAMES}") > > set(OPENCL_SRC > diff --git a/src/cl_context.h b/src/cl_context.h old mode 100644 new mode > 100755 index 29bcb9f..7326458 > --- a/src/cl_context.h > +++ b/src/cl_context.h > @@ -43,6 +43,7 @@ enum _cl_internal_ker_type { > CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0, > CL_ENQUEUE_COPY_BUFFER_ALIGN4, > CL_ENQUEUE_COPY_BUFFER_ALIGN16, > + CL_ENQUEUE_COPY_BUFFER_DWORD_COPY, > CL_ENQUEUE_COPY_BUFFER_RECT, > CL_ENQUEUE_COPY_IMAGE_0, //copy image 2d to image > 2d > CL_ENQUEUE_COPY_IMAGE_1, //copy image 3d to image > 2d > diff --git a/src/cl_mem.c b/src/cl_mem.c old mode 100644 new mode 100755 > index 9e0d334..0fd2959 > --- a/src/cl_mem.c > +++ b/src/cl_mem.c > @@ -749,6 +749,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem > src_buf, cl_mem dst_buf, > size_t global_off[] = {0,0,0}; > size_t global_sz[] = {1,1,1}; > size_t local_sz[] = {1,1,1}; > + int baligned = 1; > > /* We use one kernel to copy the data. The kernel is lazily created. */ > assert(src_buf->ctx == dst_buf->ctx); @@ -759,6 +760,7 @@ > cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, > > ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_BUFFER_ALIGN1, > cl_internal_copy_buf_align1_str, > (size_t)cl_internal_copy_buf_align1_str_size, NULL); > + baligned = 0; > } else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) { > extern char cl_internal_copy_buf_align4_str[]; > extern int cl_internal_copy_buf_align4_str_size; > @@ -782,20 +784,73 @@ cl_mem_copy(cl_command_queue queue, cl_mem > src_buf, cl_mem dst_buf, > if (!ker) > return CL_OUT_OF_RESOURCES; > > - if (cb < LOCAL_SZ_0) { > - local_sz[0] = 1; > + if(baligned) { > + if (cb < LOCAL_SZ_0) { > + local_sz[0] = 1; > + } else { > + local_sz[0] = LOCAL_SZ_0; > + } > + global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0; > + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); > + cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset); > + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); > + cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset); > + cl_kernel_set_arg(ker, 4, sizeof(int), &cb); > + ret = cl_command_queue_ND_range(queue, ker, 1, global_off, > + global_sz, local_sz); > } else { > - local_sz[0] = LOCAL_SZ_0; > + extern char cl_internel_copy_buf_dword_copy_str[]; > + extern int cl_internel_copy_buf_dword_copy_str_size; > + cl_kernel dword_ker = > cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_BUFFER_DWORD_COPY, > + cl_internel_copy_buf_dword_copy_str, > (size_t)cl_internel_copy_buf_dword_copy_str_size, NULL); > + if (!dword_ker) > + return CL_OUT_OF_RESOURCES; > + int upbyte = dst_offset%4; > + if(upbyte) > + upbyte = 4-upbyte; > + int alignbyte = cb - upbyte; > + int aligndword = alignbyte/4; > + int downbyte = alignbyte%4; > + int dstalignoffset = dst_offset/4; > + if(upbyte){ > + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); > + cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset); > + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); > + cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset); > + cl_kernel_set_arg(ker, 4, sizeof(int), &upbyte); > + global_sz[0] = LOCAL_SZ_0; > + local_sz[0] = LOCAL_SZ_0; > + ret = cl_command_queue_ND_range(queue, ker, 1, global_off, > global_sz, local_sz); > + if(aligndword) > + cl_command_queue_flush(queue); > + dst_offset += upbyte; > + src_offset += upbyte; > + dstalignoffset += 1; > + } > + if(aligndword){ > + cl_kernel_set_arg(dword_ker, 0, sizeof(cl_mem), &src_buf); > + cl_kernel_set_arg(dword_ker, 1, sizeof(int), &src_offset); > + cl_kernel_set_arg(dword_ker, 2, sizeof(cl_mem), &dst_buf); > + cl_kernel_set_arg(dword_ker, 3, sizeof(int), &dstalignoffset); > + cl_kernel_set_arg(dword_ker, 4, sizeof(int), &aligndword); > + global_sz[0] = ((aligndword + LOCAL_SZ_0 - > 1)/LOCAL_SZ_0)*LOCAL_SZ_0; > + local_sz[0] = LOCAL_SZ_0; > + ret = cl_command_queue_ND_range(queue, dword_ker, 1, global_off, > global_sz, local_sz); > + if(downbyte) > + cl_command_queue_flush(queue); > + src_offset += aligndword*4; > + dst_offset += aligndword*4; > + } > + if(downbyte){ > + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); > + cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset); > + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); > + cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset); > + cl_kernel_set_arg(ker, 4, sizeof(int), &downbyte); > + global_sz[0] = LOCAL_SZ_0; > + local_sz[0] = LOCAL_SZ_0; > + ret = cl_command_queue_ND_range(queue, ker, 1, global_off, > global_sz, local_sz); > + } > } > - global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0; > - > - cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); > - cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset); > - cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); > - cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset); > - cl_kernel_set_arg(ker, 4, sizeof(int), &cb); > - > - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, > local_sz); > > return ret; > } > diff --git a/src/kernels/cl_internel_copy_buf_dword_copy.cl > b/src/kernels/cl_internel_copy_buf_dword_copy.cl > new file mode 100755 > index 0000000..55a76d0 > --- /dev/null > +++ b/src/kernels/cl_internel_copy_buf_dword_copy.cl > @@ -0,0 +1,19 @@ > +kernel void dword_copy(__global unsigned int*src,int srcoffset,__global > unsigned int*dst,int dstalignoffset,int size){ > + unsigned int outdata = 0; > + unsigned char lsm[8]; > + unsigned int* li = lsm; > + int lsmoffset = srcoffset%4; > + __global unsigned int *src_algin = src+(srcoffset/4); > + __global unsigned int *dst_align = dst+dstalignoffset; > + int gid = get_global_id(0); > + if(gid<size){ > + *li = src_algin[gid]; > + if(lsmoffset){ > + *(li+1) = src_algin[gid+1]; > + outdata = > (lsm[lsmoffset])|(lsm[lsmoffset+1]<<8)|(lsm[lsmoffset+2]<<16)|(lsm[lsmoffset > +3]<<24); > + } > + else > + outdata = *li; > + dst_align[gid] = outdata; > + } > +} > \ No newline at end of file > -- > 1.8.3.2 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
