[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-10-04 Thread Yaxun Liu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
yaxunl marked an inline comment as done.
Closed by commit rL314932: [OpenCL] Clean up and add missing fields for block 
struct (authored by yaxunl).

Changed prior to commit:
  https://reviews.llvm.org/D37822?vs=116877=117732#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D37822

Files:
  cfe/trunk/lib/CodeGen/CGBlocks.cpp
  cfe/trunk/lib/CodeGen/CGOpenCLRuntime.cpp
  cfe/trunk/lib/CodeGen/CGOpenCLRuntime.h
  cfe/trunk/lib/CodeGen/TargetInfo.h
  cfe/trunk/test/CodeGen/blocks-opencl.cl
  cfe/trunk/test/CodeGenOpenCL/blocks.cl
  cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -7,7 +7,7 @@
 typedef struct {int a;} ndrange_t;
 
 // N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
 const bl_t block_G = (bl_t) ^ (local void *a) {};
 
 kernel void device_side_enqueue(global int *a, global int *b, int i) {
@@ -27,9 +27,10 @@
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block to void ()*
+  // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
+  // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
-  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
+  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange,
  ^(void) {
a[i] = b[i];
@@ -39,7 +40,7 @@
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)*
   // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
+  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
   // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange, 2, _wait_list, _event,
@@ -52,11 +53,11 @@
   // B32: %[[TMP:.*]] = alloca [1 x i32]
   // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
   // B32: store i32 256, i32* %[[TMP1]], align 4
-  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 

[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-10-04 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked an inline comment as done.
yaxunl added inline comments.



Comment at: test/CodeGenOpenCL/blocks.cl:30
+  // COMMON: %[[block_captured:.*]] = getelementptr inbounds <{ i32, i32, i8 
addrspace(4)*, i32 }>, <{ i32, i32, i8 addrspace(4)*, i32 }>* %[[block]], i32 
0, i32 3
+  // COMMON: %[[r0:.*]] = load i32, i32* %i
+  // COMMON: store i32 %[[r0]], i32* %[[block_captured]],

Anastasia wrote:
> It might be better to give those r0-r7 some names for readability if possible!
Will fix it when committing.


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-10-04 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia accepted this revision.
Anastasia added a comment.
This revision is now accepted and ready to land.

LGTM! Thanks!




Comment at: test/CodeGenOpenCL/blocks.cl:30
+  // COMMON: %[[block_captured:.*]] = getelementptr inbounds <{ i32, i32, i8 
addrspace(4)*, i32 }>, <{ i32, i32, i8 addrspace(4)*, i32 }>* %[[block]], i32 
0, i32 3
+  // COMMON: %[[r0:.*]] = load i32, i32* %i
+  // COMMON: store i32 %[[r0]], i32* %[[block_captured]],

It might be better to give those r0-r7 some names for readability if possible!


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-27 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 116877.
yaxunl marked 4 inline comments as done.
yaxunl added a comment.

Rebased to ToT and revised by Anastasia's comments.


https://reviews.llvm.org/D37822

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  lib/CodeGen/TargetInfo.h
  test/CodeGen/blocks-opencl.cl
  test/CodeGenOpenCL/blocks.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -7,7 +7,7 @@
 typedef struct {int a;} ndrange_t;
 
 // N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
 const bl_t block_G = (bl_t) ^ (local void *a) {};
 
 kernel void device_side_enqueue(global int *a, global int *b, int i) {
@@ -27,9 +27,10 @@
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block to void ()*
+  // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
+  // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
-  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
+  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange,
  ^(void) {
a[i] = b[i];
@@ -39,7 +40,7 @@
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)*
   // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
+  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
   // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange, 2, _wait_list, _event,
@@ -52,11 +53,11 @@
   // B32: %[[TMP:.*]] = alloca [1 x i32]
   // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
   // B32: store i32 256, i32* %[[TMP1]], align 4
-  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
   // B64: %[[TMP:.*]] = alloca [1 x i64]
   // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
   // B64: store i64 256, i64* %[[TMP1]], align 8
-  // B64: call i32 

[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-27 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 4 inline comments as done.
yaxunl added inline comments.



Comment at: lib/CodeGen/CGBlocks.cpp:311
+// The header is basically 'struct { int; int; generic void *;
+// custom_fields; }'. Assert that that struct is packed.
+auto GenPtrAlign = CharUnits::fromQuantity(

Anastasia wrote:
> remove one "that".
will do.



Comment at: lib/CodeGen/CGBlocks.cpp:312
+// custom_fields; }'. Assert that that struct is packed.
+auto GenPtrAlign = CharUnits::fromQuantity(
+CGM.getTarget().getPointerAlign(LangAS::opencl_generic) / 8);

Anastasia wrote:
> I think the alignment might not be computed correctly now if there will be 
> custom fields that might have a bigger size than a pointer? Also what happens 
> if we have captures as well?
Will fix.

The captures will be accounted for by computeBlockInfo and BlockSize and 
BlockAlign will be updated.



Comment at: lib/CodeGen/CGBlocks.cpp:850
+   CGM.getDataLayout().getTypeAllocSize(I->getType())),
+   "block.custom");
+  }

Anastasia wrote:
> do we need to add numeration to each item name?
yes. will add it.



Comment at: lib/CodeGen/CGBlocks.cpp:1250
   // Function
   fields.add(blockFn);
 

Anastasia wrote:
> If we reorder fields and put this on top we can merge the if statements above 
> and below this point.
By convention the size of the whole struct is the first field so that the 
library function reads the first integer and knows how many bytes to copy.


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-25 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added inline comments.



Comment at: lib/CodeGen/CGBlocks.cpp:311
+// The header is basically 'struct { int; int; generic void *;
+// custom_fields; }'. Assert that that struct is packed.
+auto GenPtrAlign = CharUnits::fromQuantity(

remove one "that".



Comment at: lib/CodeGen/CGBlocks.cpp:312
+// custom_fields; }'. Assert that that struct is packed.
+auto GenPtrAlign = CharUnits::fromQuantity(
+CGM.getTarget().getPointerAlign(LangAS::opencl_generic) / 8);

I think the alignment might not be computed correctly now if there will be 
custom fields that might have a bigger size than a pointer? Also what happens 
if we have captures as well?



Comment at: lib/CodeGen/CGBlocks.cpp:850
+   CGM.getDataLayout().getTypeAllocSize(I->getType())),
+   "block.custom");
+  }

do we need to add numeration to each item name?



Comment at: lib/CodeGen/CGBlocks.cpp:1250
   // Function
   fields.add(blockFn);
 

If we reorder fields and put this on top we can merge the if statements above 
and below this point.


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-22 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 116363.
yaxunl edited the summary of this revision.
yaxunl added a comment.

Add custom fields to block and target hooks to fill them.


https://reviews.llvm.org/D37822

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  lib/CodeGen/TargetInfo.h
  test/CodeGen/blocks-opencl.cl
  test/CodeGenOpenCL/blocks.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -7,7 +7,7 @@
 typedef struct {int a;} ndrange_t;
 
 // N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
 const bl_t block_G = (bl_t) ^ (local void *a) {};
 
 kernel void device_side_enqueue(global int *a, global int *b, int i) {
@@ -27,9 +27,10 @@
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block to void ()*
+  // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
+  // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
-  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
+  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange,
  ^(void) {
a[i] = b[i];
@@ -39,7 +40,7 @@
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)*
   // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
+  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
   // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange, 2, _wait_list, _event,
@@ -52,11 +53,11 @@
   // B32: %[[TMP:.*]] = alloca [1 x i32]
   // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
   // B32: store i32 256, i32* %[[TMP1]], align 4
-  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
   // B64: %[[TMP:.*]] = alloca [1 x i64]
   // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
   // B64: store i64 256, i64* %[[TMP1]], align 8
-  // B64: call i32 

[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In https://reviews.llvm.org/D37822#877903, @Anastasia wrote:

> In https://reviews.llvm.org/D37822#877572, @yaxunl wrote:
>
> > In https://reviews.llvm.org/D37822#873876, @Anastasia wrote:
> >
> > > In https://reviews.llvm.org/D37822#872446, @yaxunl wrote:
> > >
> > > > In https://reviews.llvm.org/D37822#872291, @Anastasia wrote:
> > > >
> > > > > Could you please explain a bit more why the alignment have to be put 
> > > > > explicitly in the struct? I am just not very convinced this is 
> > > > > general enough.
> > > >
> > > >
> > > > The captured variables are fields of the block literal struct. Due to 
> > > > alignment requirement of these fields, there is alignment requirement of
> > > >  the block literal struct. The ISA of the block invoke function is 
> > > > generated with the assumption of these alignments. If the block literal 
> > > > is
> > > >  allocated at a memory address not satisfying the alignment 
> > > > requirement, the kernel behavior is undefined.
> > > >
> > > > Generally, __enqueue_kernel library function needs to prepare the 
> > > > kernel argument before launching the kernel. It usually does this by 
> > > > copying
> > > >  the block literal to some buffer then pass the address of the buffer 
> > > > to the kernel. Then the address of the buffer has to satisfy the 
> > > > alignment
> > > >  requirement.
> > > >
> > > > If this block literal struct is not general enough, how about add 
> > > > another field as target reserved size, and leave the remaining space of 
> > > > header for
> > > >  target specific use. And add a target hook to allow target fill the 
> > > > reserved space, e.g.
> > > >
> > > >   struct __opencl_block_literal {
> > > > int total_size;
> > > > int align;
> > > > __generic void *invoke;
> > > > int target_reserved_size; /* round up to 4 bytes */
> > > > int target_reserved[];
> > > > /* captures */
> > > >   };
> > > >
> > >
> > >
> > > I like the idea of the target reserved part actually. But not sure how it 
> > > could be used without adding any target specific methods?
> >
> >
> > If we decide to add target reserved fields, I can add target hooks to fill 
> > these fields. However I would suggest to leave this for future since I 
> > don't see there is need for other fields for now.
>
>
> I could imagine it can be usefull for some vendor implementations.




> 
> 
>>> However, I am still not clear why the alignment of this struct has to be 
>>> different from any other struct Clang produces. Normally the alignment of 
>>> objects have to be known during IR generation to put them correctly in the 
>>> attributes of generated alloca, store and loads. But as a field inside 
>>> struct I don't know how it can be useful. I would imagine `enqueue_kernel` 
>>> would just operate on the block as if it would be an arbitrary buffer of 
>>> data. Also would size of the struct not account for any padding to make 
>>> sure the alignment can be deduced based on it correctly?
>> 
>> enqueue_kernel needs to pass the block struct to the kernel. Let's assume it 
>> does this by copying the block struct to a buffer. If enqueue_kernel does 
>> not know the alignment of the struct, it can only put it at an arbitrary 
>> address in the buffer. Then the kernel has to copy the struct to an aligned 
>> private memory and load the fields. However, if the enqueued_kernel knows 
>> the alignment of the struct, it can put it at an address satisfying the 
>> alignment. Then the kernel can load the fields directly from the buffer, 
>> skips the step of copying to an aligned private memory. Therefore, alignment 
>> of the block struct is usually a useful information for enqueue_kernel. I 
>> think that's why in the SPIRV spec OpEnqueueKernel requires an alignment 
>> operand for the block context.
> 
> Ok, I just think in C if you use `malloc` to obtain a pointer to some memory 
> location it doesn't take any alignment information. Then you can use the 
> pointer to copy any data including the struct into the location its pointed 
> to. And the pointer can be used later on correctly. I think the alignment is 
> deduced in this case from the type or the size of an object. Do you know 
> where the alignment information is used for SPIRV call? Also how is the block 
> represented in SPIRV?

If you just use malloc and put your struct in it, there is no guarantee that 
your struct is aligned at the required alignment, then your kernel cannot load 
a field directly from that memory. For example, if your first field is an int 
and the instruction can only load an int from an addr aligned at 4, and your 
malloc'ed addr is aligned at 1, then you cannot load that int directly. 
Instead, you need to copy the 4 bytes to an addr aligned at 4, then use that 
instruction to load it. If you use posix_memalign to get an aligned buffer, 
then your kernel can generate more efficient code.

OpEnqueueKernel instruction in SPIR-V is for representing 

[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 116277.
yaxunl marked 6 inline comments as done.
yaxunl added a comment.

Revise by Anastasia's comments.


https://reviews.llvm.org/D37822

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  test/CodeGen/blocks-opencl.cl
  test/CodeGenOpenCL/blocks.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -7,7 +7,7 @@
 typedef struct {int a;} ndrange_t;
 
 // N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
 const bl_t block_G = (bl_t) ^ (local void *a) {};
 
 kernel void device_side_enqueue(global int *a, global int *b, int i) {
@@ -27,9 +27,10 @@
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block to void ()*
+  // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
+  // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
-  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
+  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange,
  ^(void) {
a[i] = b[i];
@@ -39,7 +40,7 @@
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)*
   // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
+  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
   // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange, 2, _wait_list, _event,
@@ -52,11 +53,11 @@
   // B32: %[[TMP:.*]] = alloca [1 x i32]
   // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
   // B32: store i32 256, i32* %[[TMP1]], align 4
-  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
   // B64: %[[TMP:.*]] = alloca [1 x i64]
   // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
   // B64: store i64 256, i64* %[[TMP1]], align 8
-  // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], 

[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-21 Thread Brian Sumner via Phabricator via cfe-commits
b-sumner added a comment.

In https://reviews.llvm.org/D37822#877903, @Anastasia wrote:

> In https://reviews.llvm.org/D37822#877572, @yaxunl wrote:
>
> > In https://reviews.llvm.org/D37822#873876, @Anastasia wrote:
> >
> > > In https://reviews.llvm.org/D37822#872446, @yaxunl wrote:
> > >
> > > > In https://reviews.llvm.org/D37822#872291, @Anastasia wrote:
> > > >
> > > > > Could you please explain a bit more why the alignment have to be put 
> > > > > explicitly in the struct? I am just not very convinced this is 
> > > > > general enough.
> > > >
> > > >
> > > > The captured variables are fields of the block literal struct. Due to 
> > > > alignment requirement of these fields, there is alignment requirement of
> > > >  the block literal struct. The ISA of the block invoke function is 
> > > > generated with the assumption of these alignments. If the block literal 
> > > > is
> > > >  allocated at a memory address not satisfying the alignment 
> > > > requirement, the kernel behavior is undefined.
> > > >
> > > > Generally, __enqueue_kernel library function needs to prepare the 
> > > > kernel argument before launching the kernel. It usually does this by 
> > > > copying
> > > >  the block literal to some buffer then pass the address of the buffer 
> > > > to the kernel. Then the address of the buffer has to satisfy the 
> > > > alignment
> > > >  requirement.
> > > >
> > > > If this block literal struct is not general enough, how about add 
> > > > another field as target reserved size, and leave the remaining space of 
> > > > header for
> > > >  target specific use. And add a target hook to allow target fill the 
> > > > reserved space, e.g.
> > > >
> > > >   struct __opencl_block_literal {
> > > > int total_size;
> > > > int align;
> > > > __generic void *invoke;
> > > > int target_reserved_size; /* round up to 4 bytes */
> > > > int target_reserved[];
> > > > /* captures */
> > > >   };
> > > >
> > >
> > >
> > > I like the idea of the target reserved part actually. But not sure how it 
> > > could be used without adding any target specific methods?
> >
> >
> > If we decide to add target reserved fields, I can add target hooks to fill 
> > these fields. However I would suggest to leave this for future since I 
> > don't see there is need for other fields for now.
>
>
> I could imagine it can be usefull for some vendor implementations.
>
> >> However, I am still not clear why the alignment of this struct has to be 
> >> different from any other struct Clang produces. Normally the alignment of 
> >> objects have to be known during IR generation to put them correctly in the 
> >> attributes of generated alloca, store and loads. But as a field inside 
> >> struct I don't know how it can be useful. I would imagine `enqueue_kernel` 
> >> would just operate on the block as if it would be an arbitrary buffer of 
> >> data. Also would size of the struct not account for any padding to make 
> >> sure the alignment can be deduced based on it correctly?
> > 
> > enqueue_kernel needs to pass the block struct to the kernel. Let's assume 
> > it does this by copying the block struct to a buffer. If enqueue_kernel 
> > does not know the alignment of the struct, it can only put it at an 
> > arbitrary address in the buffer. Then the kernel has to copy the struct to 
> > an aligned private memory and load the fields. However, if the 
> > enqueued_kernel knows the alignment of the struct, it can put it at an 
> > address satisfying the alignment. Then the kernel can load the fields 
> > directly from the buffer, skips the step of copying to an aligned private 
> > memory. Therefore, alignment of the block struct is usually a useful 
> > information for enqueue_kernel. I think that's why in the SPIRV spec 
> > OpEnqueueKernel requires an alignment operand for the block context.
>
> Ok, I just think in C if you use `malloc` to obtain a pointer to some memory 
> location it doesn't take any alignment information. Then you can use the 
> pointer to copy any data including the struct into the location its pointed 
> to. And the pointer can be used later on correctly. I think the alignment is 
> deduced in this case from the type or the size of an object. Do you know 
> where the alignment information is used for SPIRV call? Also how is the block 
> represented in SPIRV?


Actually malloc alignment is not sufficient more many uses such as CPU 
supported vectors, e.g. AVX512 or passed to create buffer with 
use-host-pointer.  In such cases you need posix_memalign or some similar API.  
Having the alignment means it is available if needed.  If an implementation 
doesn't need it, there is no harm is there?


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-21 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added a comment.

In https://reviews.llvm.org/D37822#877572, @yaxunl wrote:

> In https://reviews.llvm.org/D37822#873876, @Anastasia wrote:
>
> > In https://reviews.llvm.org/D37822#872446, @yaxunl wrote:
> >
> > > In https://reviews.llvm.org/D37822#872291, @Anastasia wrote:
> > >
> > > > Could you please explain a bit more why the alignment have to be put 
> > > > explicitly in the struct? I am just not very convinced this is general 
> > > > enough.
> > >
> > >
> > > The captured variables are fields of the block literal struct. Due to 
> > > alignment requirement of these fields, there is alignment requirement of
> > >  the block literal struct. The ISA of the block invoke function is 
> > > generated with the assumption of these alignments. If the block literal is
> > >  allocated at a memory address not satisfying the alignment requirement, 
> > > the kernel behavior is undefined.
> > >
> > > Generally, __enqueue_kernel library function needs to prepare the kernel 
> > > argument before launching the kernel. It usually does this by copying
> > >  the block literal to some buffer then pass the address of the buffer to 
> > > the kernel. Then the address of the buffer has to satisfy the alignment
> > >  requirement.
> > >
> > > If this block literal struct is not general enough, how about add another 
> > > field as target reserved size, and leave the remaining space of header for
> > >  target specific use. And add a target hook to allow target fill the 
> > > reserved space, e.g.
> > >
> > >   struct __opencl_block_literal {
> > > int total_size;
> > > int align;
> > > __generic void *invoke;
> > > int target_reserved_size; /* round up to 4 bytes */
> > > int target_reserved[];
> > > /* captures */
> > >   };
> > >
> >
> >
> > I like the idea of the target reserved part actually. But not sure how it 
> > could be used without adding any target specific methods?
>
>
> If we decide to add target reserved fields, I can add target hooks to fill 
> these fields. However I would suggest to leave this for future since I don't 
> see there is need for other fields for now.


I could imagine it can be usefull for some vendor implementations.

>> However, I am still not clear why the alignment of this struct has to be 
>> different from any other struct Clang produces. Normally the alignment of 
>> objects have to be known during IR generation to put them correctly in the 
>> attributes of generated alloca, store and loads. But as a field inside 
>> struct I don't know how it can be useful. I would imagine `enqueue_kernel` 
>> would just operate on the block as if it would be an arbitrary buffer of 
>> data. Also would size of the struct not account for any padding to make sure 
>> the alignment can be deduced based on it correctly?
> 
> enqueue_kernel needs to pass the block struct to the kernel. Let's assume it 
> does this by copying the block struct to a buffer. If enqueue_kernel does not 
> know the alignment of the struct, it can only put it at an arbitrary address 
> in the buffer. Then the kernel has to copy the struct to an aligned private 
> memory and load the fields. However, if the enqueued_kernel knows the 
> alignment of the struct, it can put it at an address satisfying the 
> alignment. Then the kernel can load the fields directly from the buffer, 
> skips the step of copying to an aligned private memory. Therefore, alignment 
> of the block struct is usually a useful information for enqueue_kernel. I 
> think that's why in the SPIRV spec OpEnqueueKernel requires an alignment 
> operand for the block context.

Ok, I just think in C if you use `malloc` to obtain a pointer to some memory 
location it doesn't take any alignment information. Then you can use the 
pointer to copy any data including the struct into the location its pointed to. 
And the pointer can be used later on correctly. I think the alignment is 
deduced in this case from the type or the size of an object. Do you know where 
the alignment information is used for SPIRV call? Also how is the block 
represented in SPIRV?


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-21 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 4 inline comments as done.
yaxunl added a comment.

In https://reviews.llvm.org/D37822#873876, @Anastasia wrote:

> In https://reviews.llvm.org/D37822#872446, @yaxunl wrote:
>
> > In https://reviews.llvm.org/D37822#872291, @Anastasia wrote:
> >
> > > Could you please explain a bit more why the alignment have to be put 
> > > explicitly in the struct? I am just not very convinced this is general 
> > > enough.
> >
> >
> > The captured variables are fields of the block literal struct. Due to 
> > alignment requirement of these fields, there is alignment requirement of
> >  the block literal struct. The ISA of the block invoke function is 
> > generated with the assumption of these alignments. If the block literal is
> >  allocated at a memory address not satisfying the alignment requirement, 
> > the kernel behavior is undefined.
> >
> > Generally, __enqueue_kernel library function needs to prepare the kernel 
> > argument before launching the kernel. It usually does this by copying
> >  the block literal to some buffer then pass the address of the buffer to 
> > the kernel. Then the address of the buffer has to satisfy the alignment
> >  requirement.
> >
> > If this block literal struct is not general enough, how about add another 
> > field as target reserved size, and leave the remaining space of header for
> >  target specific use. And add a target hook to allow target fill the 
> > reserved space, e.g.
> >
> >   struct __opencl_block_literal {
> > int total_size;
> > int align;
> > __generic void *invoke;
> > int target_reserved_size; /* round up to 4 bytes */
> > int target_reserved[];
> > /* captures */
> >   };
> >
>
>
> I like the idea of the target reserved part actually. But not sure how it 
> could be used without adding any target specific methods?


If we decide to add target reserved fields, I can add target hooks to fill 
these fields. However I would suggest to leave this for future since I don't 
see there is need for other fields for now.

> However, I am still not clear why the alignment of this struct has to be 
> different from any other struct Clang produces. Normally the alignment of 
> objects have to be known during IR generation to put them correctly in the 
> attributes of generated alloca, store and loads. But as a field inside struct 
> I don't know how it can be useful. I would imagine `enqueue_kernel` would 
> just operate on the block as if it would be an arbitrary buffer of data. Also 
> would size of the struct not account for any padding to make sure the 
> alignment can be deduced based on it correctly?

enqueue_kernel needs to pass the block struct to the kernel. Let's assume it 
does this by copying the block struct to a buffer. If enqueue_kernel does not 
know the alignment of the struct, it can only put it at an arbitrary address in 
the buffer. Then the kernel has to copy the struct to an aligned private memory 
and load the fields. However, if the enqueued_kernel knows the alignment of the 
struct, it can put it at an address satisfying the alignment. Then the kernel 
can load the fields directly from the buffer, skips the step of copying to an 
aligned private memory. Therefore, alignment of the block struct is usually a 
useful information for enqueue_kernel. I think that's why in the SPIRV spec 
OpEnqueueKernel requires an alignment operand for the block context.




Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:108
+llvm::PointerType *CGOpenCLRuntime::getGenericVoidPointerType() {
+  return llvm::IntegerType::getInt8PtrTy(
+  CGM.getLLVMContext(),

Anastasia wrote:
> Should we put an assert of LangOpts.OpenCL?
Will do.



Comment at: test/CodeGen/blocks-opencl.cl:1
 // RUN: %clang_cc1 -O0 %s -ffake-address-space-map -emit-llvm -o - -fblocks 
-triple x86_64-unknown-unknown | FileCheck %s
+// RUN: %clang_cc1 -O0 %s -emit-llvm -o - -fblocks -triple amdgcn | FileCheck 
%s

Anastasia wrote:
> Btw, do you think we need this test any more? And if yes, could this be moved 
> to CodeGenOpenCL?
I think this one can be removed since what it tests is covered by 
CodeGenOpenCL/blocks.cl.


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-18 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added a comment.

In https://reviews.llvm.org/D37822#872446, @yaxunl wrote:

> In https://reviews.llvm.org/D37822#872291, @Anastasia wrote:
>
> > Could you please explain a bit more why the alignment have to be put 
> > explicitly in the struct? I am just not very convinced this is general 
> > enough.
>
>
> The captured variables are fields of the block literal struct. Due to 
> alignment requirement of these fields, there is alignment requirement of
>  the block literal struct. The ISA of the block invoke function is generated 
> with the assumption of these alignments. If the block literal is
>  allocated at a memory address not satisfying the alignment requirement, the 
> kernel behavior is undefined.
>
> Generally, __enqueue_kernel library function needs to prepare the kernel 
> argument before launching the kernel. It usually does this by copying
>  the block literal to some buffer then pass the address of the buffer to the 
> kernel. Then the address of the buffer has to satisfy the alignment
>  requirement.
>
> If this block literal struct is not general enough, how about add another 
> field as target reserved size, and leave the remaining space of header for
>  target specific use. And add a target hook to allow target fill the reserved 
> space, e.g.
>
>   struct __opencl_block_literal {
> int total_size;
> int align;
> __generic void *invoke;
> int target_reserved_size; /* round up to 4 bytes */
> int target_reserved[];
> /* captures */
>   };
>


I like the idea of the target reserved part actually. But not sure how it could 
be used without adding any target specific methods?

However, I am still not clear why the alignment of this struct has to be 
different from any other struct Clang produces. Normally the alignment of 
objects have to be known during IR generation to put them correctly in the 
attributes of generated alloca, store and loads. But as a field inside struct I 
don't know how it can be useful. I would imagine `enqueue_kernel` would just 
operate on the block as if it would be an arbitrary buffer of data. Also would 
size of the struct not account for any padding to make sure the alignment can 
be deduced based on it correctly?




Comment at: lib/CodeGen/CGOpenCLRuntime.cpp:108
+llvm::PointerType *CGOpenCLRuntime::getGenericVoidPointerType() {
+  return llvm::IntegerType::getInt8PtrTy(
+  CGM.getLLVMContext(),

Should we put an assert of LangOpts.OpenCL?



Comment at: test/CodeGen/blocks-opencl.cl:1
 // RUN: %clang_cc1 -O0 %s -ffake-address-space-map -emit-llvm -o - -fblocks 
-triple x86_64-unknown-unknown | FileCheck %s
+// RUN: %clang_cc1 -O0 %s -emit-llvm -o - -fblocks -triple amdgcn | FileCheck 
%s

Btw, do you think we need this test any more? And if yes, could this be moved 
to CodeGenOpenCL?


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-15 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 2 inline comments as done.
yaxunl added a comment.

In https://reviews.llvm.org/D37822#872291, @Anastasia wrote:

> Could you please explain a bit more why the alignment have to be put 
> explicitly in the struct? I am just not very convinced this is general enough.


The captured variables are fields of the block literal struct. Due to alignment 
requirement of these fields, there is alignment requirement of
the block literal struct. The ISA of the block invoke function is generated 
with the assumption of these alignments. If the block literal is
allocated at a memory address not satisfying the alignment requirement, the 
kernel behavior is undefined.

Generally, __enqueue_kernel library function needs to prepare the kernel 
argument before launching the kernel. It usually does this by copying
the block literal to some buffer then pass the address of the buffer to the 
kernel. Then the address of the buffer has to satisfy the alignment
requirement.

If this block literal struct is not general enough, how about add another field 
as target reserved size, and leave the remaining space of header for
target specific use. And add a target hook to allow target fill the reserved 
space, e.g.

  struct __opencl_block_literal {
int total_size;
int align;
__generic void *invoke;
int target_reserved_size; /* round up to 4 bytes */
int target_reserved[];
/* captures */
  };




Comment at: lib/CodeGen/CGBlocks.cpp:314
 
   assert(elementTypes.empty());
-  elementTypes.push_back(CGM.VoidPtrTy);

Anastasia wrote:
> Why removing this?
It is not removed. It is moved to line 307.



Comment at: test/CodeGenOpenCL/blocks.cl:17
   int i;
-// Checking for null instead of @_NSConcreteStackBlock symbol
-// COMMON: store i8* null, i8** %block.isa
+  // COMMON-NOT: store i8* null, i8** %block.isa
+  // COMMON: %[[block_size:.*]] = getelementptr inbounds <{ i32, i32, i8 
addrspace(4)*, i32 }>, <{ i32, i32, i8 addrspace(4)*, i32 }>* %block, i32 0, 
i32 0

Anastasia wrote:
> We don't need to check other fields too?
will add checks.


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-15 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia added a comment.

Could you please explain a bit more why the alignment have to be put explicitly 
in the struct? I am just not very convinced this is general enough.




Comment at: lib/CodeGen/CGBlocks.cpp:314
 
   assert(elementTypes.empty());
-  elementTypes.push_back(CGM.VoidPtrTy);

Why removing this?



Comment at: test/CodeGenOpenCL/blocks.cl:17
   int i;
-// Checking for null instead of @_NSConcreteStackBlock symbol
-// COMMON: store i8* null, i8** %block.isa
+  // COMMON-NOT: store i8* null, i8** %block.isa
+  // COMMON: %[[block_size:.*]] = getelementptr inbounds <{ i32, i32, i8 
addrspace(4)*, i32 }>, <{ i32, i32, i8 addrspace(4)*, i32 }>* %block, i32 0, 
i32 0

We don't need to check other fields too?


https://reviews.llvm.org/D37822



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-14 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 115222.
yaxunl added a comment.

Fix bug about calling blocks.


https://reviews.llvm.org/D37822

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  test/CodeGen/blocks-opencl.cl
  test/CodeGenOpenCL/blocks.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -7,7 +7,7 @@
 typedef struct {int a;} ndrange_t;
 
 // N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
 const bl_t block_G = (bl_t) ^ (local void *a) {};
 
 kernel void device_side_enqueue(global int *a, global int *b, int i) {
@@ -27,9 +27,10 @@
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block to void ()*
+  // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
+  // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
-  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
+  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange,
  ^(void) {
a[i] = b[i];
@@ -39,7 +40,7 @@
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)*
   // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
+  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
   // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange, 2, _wait_list, _event,
@@ -52,11 +53,11 @@
   // B32: %[[TMP:.*]] = alloca [1 x i32]
   // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
   // B32: store i32 256, i32* %[[TMP1]], align 4
-  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
   // B64: %[[TMP:.*]] = alloca [1 x i64]
   // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
   // B64: store i64 256, i64* %[[TMP1]], align 8
-  // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{(.[0-9]+)?}}, i8 

[PATCH] D37822: [OpenCL] Clean up and add missing fields for block struct

2017-09-13 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl created this revision.

Currently block is translated to a structure equivalent to

  struct Block {
void *isa;
int flags;
int reserved;
void *invoke;
void *descriptor;
  };

Except `invoke`, which is the pointer to the block invoke function,
all other fields are useless for OpenCL, which clutter the IR and
also waste memory since the block struct is passed to the block
invoke function as argument.

On the other hand, the size and alignment of the block struct is
not stored in the struct, which causes difficulty to implement
__enqueue_kernel as library function, since the library function
needs to know the size and alignment of the argument which needs
to be passed to the kernel.

This patch removes the useless fields from the block struct and adds
size and align fields. The equivalent block struct will become

  struct Block {
int size;
int align;
generic void *invoke;
  };

It also changes the pointer to the invoke function to be
a generic pointer since the address space of a function
may not be private on certain targets.


https://reviews.llvm.org/D37822

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  test/CodeGen/blocks-opencl.cl
  test/CodeGenOpenCL/blocks.cl
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -7,7 +7,7 @@
 typedef struct {int a;} ndrange_t;
 
 // N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor addrspace(2)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: @block_G =  addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
 const bl_t block_G = (bl_t) ^ (local void *a) {};
 
 kernel void device_side_enqueue(global int *a, global int *b, int i) {
@@ -27,9 +27,10 @@
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block to void ()*
+  // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
+  // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
-  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{(.[0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
+  // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange,
  ^(void) {
a[i] = b[i];
@@ -39,7 +40,7 @@
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)*
   // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
-  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor addrspace(2)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
+  // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
   // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
   // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]])
   enqueue_kernel(default_queue, flags, ndrange, 2, _wait_list, _event,
@@ -52,11 +53,11 @@
   // B32: %[[TMP:.*]] = alloca [1 x i32]
   // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
   // B32: store i32 256, i32* %[[TMP1]], align 4
-  // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}*