Re: [Beignet] [PATCH] Make in-order command queues actually be in-order

2018-07-21 Thread Rebecca N. Palmer
A demonstration that "in-order" queues currently aren't:

//g++ -o queue_order_test queue_order_test.c -lOpenCL
//Depends: beignet-opencl-icd ocl-icd-opencl-dev
#include <CL/cl.h>
#include <stdio.h>
int main()
{
  cl_int status;
  cl_device_id device;
clGetDeviceIDs(NULL,CL_DEVICE_TYPE_ALL,1,,NULL);
char device_name[101];
device_name[100]=0;
clGetDeviceInfo(device,CL_DEVICE_NAME,100,device_name,NULL);
printf("Using device %s",device_name);
cl_context ctx;
  cl_command_queue queue;
  cl_program program1,program2;
  cl_kernel kernel1,kernel2;
  cl_mem buffer;
  cl_event uevent1,uevent2,kernels_finished[2];
  size_t n = 3;
  cl_int test_data[3] = {3, 7, 5};
  const char* kernel1_source = "__kernel void test1(__global int *buf) {"
  "printf(\"kern1 \");"
  "  buf[get_global_id(0)] = 2* buf[get_global_id(0)];"
  "}";
  const char* kernel2_source = "__kernel void test2(__global int *buf) {"
  "printf(\"kern2 \");"
  "  buf[get_global_id(0)] = 9+ buf[get_global_id(0)];"
  "}";
  //Expected result: 15 23 19 if 1 runs first (in-order queue), 24 32 28 if 2 
runs first (out-of-order queue)
  ctx = clCreateContext(NULL, 1, , NULL, NULL, );
  if(!ctx)
return 1;

//cl_queue_properties 
qsettings[3]={CL_QUEUE_PROPERTIES,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,0};
cl_queue_properties qsettings[3]={CL_QUEUE_PROPERTIES,0,0};
queue = clCreateCommandQueueWithProperties(ctx, device, qsettings, );
//queue = clCreateCommandQueueWithProperties(ctx, device, 0, );
cl_command_queue_properties qp;
clGetCommandQueueInfo(queue,CL_QUEUE_PROPERTIES,sizeof(qp),,NULL);
printf(" queue properties %i\n",qp);
program1 = clCreateProgramWithSource(ctx, 1, _source, NULL, );
clBuildProgram(program1, 1, , "", NULL, NULL);
kernel1 = clCreateKernel(program1, "test1", );
program2 = clCreateProgramWithSource(ctx, 1, _source, NULL, );
clBuildProgram(program2, 1, , "", NULL, NULL);
kernel2 = clCreateKernel(program2, "test2", );
buffer = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n*4, test_data, );
uevent1=clCreateUserEvent(ctx,);
uevent2=clCreateUserEvent(ctx,);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), );
clSetKernelArg(kernel2, 0, sizeof(cl_mem), );
clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, , , 1,, 
_finished[0]);
clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, , , 0,NULL, 
_finished[1]);//without uevent2, bypasses queue
//clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, , , 1,, 
_finished[1]);
clSetUserEventStatus(uevent2,CL_COMPLETE);
printf("\nsetting event %p (others %p %p) - enter a 
number\n",uevent1,kernels_finished[0],kernels_finished[1]);
int j;scanf("%i",);
clSetUserEventStatus(uevent1,CL_COMPLETE);
clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, n*4, test_data, 2, 
kernels_finished, NULL);
printf("\nresult: %i %i %i\n",test_data[0],test_data[1],test_data[2]);
}

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Make in-order command queues actually be in-order

2018-07-21 Thread Rebecca N. Palmer
When beignet added out-of-order execution support (7fd45f15),
it made *all* command queues out-of-order, even if they were
created as (and are reported by clGetCommandQueueInfo as) in-order.

Signed-off-by: Rebecca N. Palmer 
---
Not sure whether this one is actually worth it: it's clearly
against the spec, but I'm not aware of it causing any
real-world bugs.  (I noticed it while investigating
an issue that turned out to be unrelated.)  Users who expect a
queue to be in-order are probably not using events, and that
makes a beignet queue effectively in-order.

(This is *not* true of out-of-order queues in some other ICDs,
e.g. pocl: it is true in Beignet because our flush (in particular
the implicit one before a blocking copy) is also an ordering
barrier, but the spec doesn't require that.  If you choose not to
take this, it might be a good idea to add a comment to
cl_command_queue_wait_flush documenting that.)

--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -283,7 +283,7 @@ clEnqueueSVMFree (cl_command_queue comma
 data->size  = num_svm_pointers;
 data->ptr   = user_data;
 
-if (e_status == CL_COMPLETE) {
+if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
   // Sync mode, no need to queue event.
   err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
   if (err != CL_SUCCESS) {
@@ -429,7 +429,7 @@ cl_int clEnqueueSVMMemcpy (cl_command_qu
 data->const_ptr= src_ptr;
 data->size = size;
 
-if (e_status == CL_COMPLETE) {
+if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
   // Sync mode, no need to queue event.
   err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
   if (err != CL_SUCCESS) {
@@ -441,6 +441,9 @@ cl_int clEnqueueSVMMemcpy (cl_command_qu
 break;
   }
   cl_command_queue_enqueue_event(command_queue, e);
+  if (blocking_copy) {
+cl_event_wait_for_events_list(1, &e);
+  }
 }
   } while(0);
 
@@ -518,7 +521,7 @@ cl_int clEnqueueSVMMemFill (cl_command_q
 data->pattern_size = pattern_size;
 data->size = size;
 
-if (e_status == CL_COMPLETE) {
+if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
   // Sync mode, no need to queue event.
   err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
   if (err != CL_SUCCESS) {
--- a/src/cl_api_kernel.c
+++ b/src/cl_api_kernel.c
@@ -223,6 +223,7 @@ clEnqueueNDRangeKernel(cl_command_queue
 count *= global_wk_sz_rem[2] ? 2 : 1;
 
 const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+cl_bool allow_immediate_submit = cl_command_queue_allow_bypass_submit(command_queue);
 /* Go through the at most 8 cases and euque if there is work items left */
 for (i = 0; i < 2; i++) {
   for (j = 0; j < 2; j++) {
@@ -263,7 +264,7 @@ clEnqueueNDRangeKernel(cl_command_queue
 break;
   }
 
-  err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
+  err = cl_event_exec(e, ((allow_immediate_submit && event_status == CL_COMPLETE) ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
   if (err != CL_SUCCESS) {
 break;
   }
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -309,7 +309,7 @@ clEnqueueMapBuffer(cl_command_queue comm
 if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
   data->write_map = 1;
 
-if (e_status == CL_COMPLETE) {
+if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
   // Sync mode, no need to queue event.
   err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
   if (err != CL_SUCCESS) {
@@ -322,6 +322,9 @@ clEnqueueMapBuffer(cl_command_queue comm
   }
 
   cl_command_queue_enqueue_event(command_queue, e);
+  if (blocking_map) {
+cl_event_wait_for_events_list(1, &e);
+  }
 }
 
 ptr = data->ptr;
@@ -469,7 +472,7 @@ clEnqueueUnmapMemObject(cl_command_queue
 data->mem_obj = memobj;
 data->ptr = mapped_ptr;
 
-if (e_status == CL_COMPLETE) { // No need to wait
+if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { // No need to wait
   err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
   if (err != CL_SUCCESS) {
 break;
@@ -571,7 +574,7 @@ clEnqueueReadBuffer(cl_command_queue com
 data->offset = offset;
 data->size = size;
 
-if (e_status == CL_COMPLETE) {
+if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
   // Sync mode, no need to queue event.
   err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
   if (err != CL_SUCCESS) {
@@ -583,6 +586,9 @@ clEnqueueReadBuffer(cl_command_queue com
 break;
   }
   cl_command_queue_enqueue_event(command_queue, e);
+  if (blocking_read) {
+cl_event_wait_for_events_list(1, &e);
+  }
 }
   } while (0);
 
@@