From: Pan Xiuli <[email protected]>

Enqueue multiple times if the work group size is not uniform: at most 2 times
for 1D, 4 times for 2D and 8 times for 3D. Use the work-dim offset of the
walker in the batch buffer to keep the work groups in series.

TODO: handle events for the flush between multiple enqueues

Signed-off-by: Pan Xiuli <[email protected]>
---
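Note (illustration only, not part of the diff below): the splitting performed by the
new cl_command_queue_ND_range_wrap() can be pictured with the following standalone
sketch. The type sub_range_t and the helper split_ndrange() are invented for this
example; the real code hands each surviving piece straight to
cl_command_queue_ND_range_gen7() instead of collecting it into an array.

/* Sketch only: decompose a possibly non-uniform NDRange into at most
 * 2^work_dim uniform sub-ranges, one per combination of "divisible part"
 * and "remainder part" along each dimension. */
#include <stdio.h>
#include <stddef.h>

typedef struct {
  size_t group_off[3]; /* work-group offset handed to the walker        */
  size_t global_sz[3]; /* global size of this sub-enqueue               */
  size_t local_sz[3];  /* local size actually used for this sub-enqueue */
} sub_range_t;

/* Returns the number of sub-ranges written to out[] (at most 8). */
static int split_ndrange(unsigned work_dim, const size_t *global_sz,
                         const size_t *local_sz, sub_range_t *out)
{
  size_t g[3] = {1, 1, 1}, l[3] = {1, 1, 1}, div[3], rem[3];
  unsigned d;
  int i, j, k, n = 0;

  for (d = 0; d < work_dim; d++) { g[d] = global_sz[d]; l[d] = local_sz[d]; }
  for (d = 0; d < 3; d++) {
    div[d] = g[d] / l[d] * l[d]; /* part covered by full-size work groups */
    rem[d] = g[d] % l[d];        /* trailing, smaller work group          */
  }

  /* Up to two pieces per dimension: the evenly divisible part and the rest. */
  for (i = 0; i < (work_dim > 2 ? 2 : 1); i++)
    for (j = 0; j < (work_dim > 1 ? 2 : 1); j++)
      for (k = 0; k < 2; k++) {
        sub_range_t s = {
          { k ? div[0] / l[0] : 0, j ? div[1] / l[1] : 0, i ? div[2] / l[2] : 0 },
          { k ? rem[0] : div[0],   j ? rem[1] : div[1],   i ? rem[2] : div[2]   },
          { k ? rem[0] : l[0],     j ? rem[1] : l[1],     i ? rem[2] : l[2]     }
        };
        /* A zero local size means that dimension divides evenly, so there
         * is no remainder piece to enqueue. */
        if (s.local_sz[0] == 0 || s.local_sz[1] == 0 || s.local_sz[2] == 0)
          continue;
        out[n++] = s;
      }
  return n;
}

int main(void)
{
  /* Example: global 100x7 with local 16x4 is non-uniform in both dimensions. */
  const size_t gsz[2] = {100, 7}, lsz[2] = {16, 4};
  sub_range_t sub[8];
  int n = split_ndrange(2, gsz, lsz, sub), i;

  for (i = 0; i < n; i++)
    printf("group_off=(%zu,%zu) global=(%zux%zu) local=(%zux%zu)\n",
           sub[i].group_off[0], sub[i].group_off[1],
           sub[i].global_sz[0], sub[i].global_sz[1],
           sub[i].local_sz[0], sub[i].local_sz[1]);
  return 0;
}

This prints four sub-enqueues for the 2D example: the 96x4 body, a 4x4 strip on the
right, a 96x3 strip at the bottom and a 4x3 corner. The group offsets (0,0), (6,0),
(0,1), (6,1) correspond to the global_dim_off the patch feeds to the walker so that
the work groups of the pieces stay in series, as described above.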
 src/cl_api.c                |  8 ------
 src/cl_command_queue.c      | 60 +++++++++++++++++++++++++++++++++++++++++++--
 src/cl_command_queue_gen7.c | 19 ++++++++------
 src/cl_driver.h             |  1 +
 src/intel/intel_gpgpu.c     | 14 ++++++-----
 5 files changed, 78 insertions(+), 24 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index 298a9ab..f45bd25 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -3394,14 +3394,6 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
     }
   }
 
-  /* Local sizes must be non-null and divide global sizes */
-  if (local_work_size != NULL)
-    for (i = 0; i < work_dim; ++i)
-      if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
-        err = CL_INVALID_WORK_GROUP_SIZE;
-        goto error;
-      }
-
   /* Queue and kernel must share the same context */
   assert(kernel->program);
   if (command_queue->ctx != kernel->program->ctx) {
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 24094c8..6572c47 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -209,7 +209,7 @@ cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, uint32_t ma
   return CL_SUCCESS;
 }
 
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *, const size_t *, const size_t *, const size_t *);
 
 static cl_int
 cl_kernel_check_args(cl_kernel k)
@@ -222,6 +222,61 @@ cl_kernel_check_args(cl_kernel k)
 }
 
 LOCAL cl_int
+cl_command_queue_ND_range_wrap(cl_command_queue queue,
+                               cl_kernel ker,
+                               const uint32_t work_dim,
+                               const size_t *global_wk_off,
+                               const size_t *global_wk_sz,
+                               const size_t *local_wk_sz)
+{
+  /* Used for non-uniform work group sizes */
+  cl_int err = CL_SUCCESS;
+  int i, j, k, count = 0;
+  const size_t global_wk_sz_div[3] = {
+    global_wk_sz[0] / local_wk_sz[0] * local_wk_sz[0],
+    global_wk_sz[1] / local_wk_sz[1] * local_wk_sz[1],
+    global_wk_sz[2] / local_wk_sz[2] * local_wk_sz[2]
+  };
+
+  const size_t global_wk_sz_rem[3] = {
+    global_wk_sz[0] % local_wk_sz[0],
+    global_wk_sz[1] % local_wk_sz[1],
+    global_wk_sz[2] % local_wk_sz[2]
+  };
+
+  const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+  /* Go through the at most 8 cases and enqueue if there are work items left */
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      for (k = 0; k < 2; k++) {
+        size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]};
+        size_t global_dim_off[3] = {
+          k * global_wk_sz_div[0] / local_wk_sz[0],
+          j * global_wk_sz_div[1] / local_wk_sz[1],
+          i * global_wk_sz_div[2] / local_wk_sz[2]
+        };
+        size_t local_wk_sz_use[3] = {
+          k ? global_wk_sz_rem[0] : local_wk_sz[0],
+          j ? global_wk_sz_rem[1] : local_wk_sz[1],
+          i ? global_wk_sz_rem[2] : local_wk_sz[2]
+        };
+        if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0) continue;
+        TRY (cl_command_queue_ND_range_gen7, queue, ker, work_dim, global_wk_off, global_dim_off, global_wk_sz, global_wk_sz_use, local_wk_sz, local_wk_sz_use);
+        /* TODO: need to handle events for multiple enqueues; for now this is a workaround for the uniform group size case */
+        if (!(global_wk_sz_rem[0] == 0 && global_wk_sz_rem[1] == 0 && global_wk_sz_rem[2] == 0))
+          err = cl_command_queue_flush(queue);
+      }
+      if (work_dim < 2)
+        break;
+    }
+    if (work_dim < 3)
+      break;
+  }
+error:
+  return err;
+}
+
+LOCAL cl_int
 cl_command_queue_ND_range(cl_command_queue queue,
                           cl_kernel k,
                           const uint32_t work_dim,
@@ -238,7 +293,8 @@ cl_command_queue_ND_range(cl_command_queue queue,
   TRY (cl_kernel_check_args, k);
 
   if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
-    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+    //TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+    TRY (cl_command_queue_ND_range_wrap, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 6bfacbf..b00e383 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -240,9 +240,9 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
   UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
-  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
+  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0] / enqueued_local_wk_sz[0] + (global_wk_sz[0] % enqueued_local_wk_sz[0] ? 1 : 0));
+  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1] / enqueued_local_wk_sz[1] + (global_wk_sz[1] % enqueued_local_wk_sz[1] ? 1 : 0));
+  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2] / enqueued_local_wk_sz[2] + (global_wk_sz[2] % enqueued_local_wk_sz[2] ? 1 : 0));
   UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
@@ -338,8 +338,11 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                cl_kernel ker,
                                const uint32_t work_dim,
                                const size_t *global_wk_off,
+                               const size_t *global_dim_off,
                                const size_t *global_wk_sz,
-                               const size_t *local_wk_sz)
+                               const size_t *global_wk_sz_use,
+                               const size_t *local_wk_sz,
+                               const size_t *local_wk_sz_use)
 {
   GET_QUEUE_THREAD_GPGPU(queue);
   cl_context ctx = queue->ctx;
@@ -365,7 +368,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   kernel.use_slm = interp_kernel_use_slm(ker->opaque);
 
   /* Compute the number of HW threads we need */
-  if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) {
+  if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz_use, 3, &local_sz) != CL_SUCCESS)) {
     fprintf(stderr, "Beignet: Work group size exceed Kerne's work group size.\n");
     return err;
   }
@@ -378,7 +381,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   }
   /* Curbe step 1: fill the constant urb buffer data shared by all threads */
   if (ker->curbe) {
-    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz,local_wk_sz ,local_wk_sz, thread_n);
+    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz_use, local_wk_sz, thread_n);
     if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
       fprintf(stderr, "Beignet: Out of shared local memory %d.\n", kernel.slm_sz);
       return CL_OUT_OF_RESOURCES;
@@ -428,7 +431,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
     for (i = 0; i < thread_n; ++i) {
       memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
     }
-    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
+    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz_use, simd_sz, cst_sz, thread_n);
     if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
       goto error;
   }
@@ -441,7 +444,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   cl_gpgpu_batch_start(gpgpu);
 
   /* Issue the GPGPU_WALKER command */
-  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_dim_off, global_wk_sz_use, local_wk_sz_use);
 
   /* Close the batch buffer and submit it */
   cl_gpgpu_batch_end(gpgpu, 0);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 39c5f30..25323ac 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -285,6 +285,7 @@ typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
                                   uint32_t simd_sz,
                                   uint32_t thread_n,
                                   const size_t global_wk_off[3],
+                                  const size_t global_dim_off[3],
                                   const size_t global_wk_sz[3],
                                   const size_t local_wk_sz[3]);
 extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index ffdd122..727c0fb 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1840,6 +1840,7 @@ intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu,
                         uint32_t simd_sz,
                         uint32_t thread_n,
                         const size_t global_wk_off[3],
+                        const size_t global_dim_off[3],
                         const size_t global_wk_sz[3],
                         const size_t local_wk_sz[3])
 {
@@ -1889,6 +1890,7 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
                         uint32_t simd_sz,
                         uint32_t thread_n,
                         const size_t global_wk_off[3],
+                        const size_t global_dim_off[3],
                         const size_t global_wk_sz[3],
                         const size_t local_wk_sz[3])
 {
@@ -1916,14 +1918,14 @@ intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
     OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
   else
     OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */
+  OUT_BATCH(gpgpu->batch, global_dim_off[0]);
   OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[0]+global_dim_off[0]);
+  OUT_BATCH(gpgpu->batch, global_dim_off[1]);
   OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
-  OUT_BATCH(gpgpu->batch, 0);
-  OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[1]+global_dim_off[1]);
+  OUT_BATCH(gpgpu->batch, global_dim_off[2]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[2]+global_dim_off[2]);
   OUT_BATCH(gpgpu->batch, right_mask);
   OUT_BATCH(gpgpu->batch, ~0x0); /* we always set height as 1, so set bottom mask as all 1*/
   ADVANCE_BATCH(gpgpu->batch);
--
2.5.0

_______________________________________________
Beignet mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/beignet
