[PATCH] D43240: [OpenCL] Fix __enqueue_block for block with captures

2018-02-15 Thread Yaxun Liu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL325264: [OpenCL] Fix __enqueue_block for block with captures 
(authored by yaxunl, committed by ).
Herald added subscribers: llvm-commits, nhaehnle.

Changed prior to commit:
  https://reviews.llvm.org/D43240?vs=134041=134439#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D43240

Files:
  cfe/trunk/lib/CodeGen/CGBlocks.cpp
  cfe/trunk/lib/CodeGen/CGOpenCLRuntime.cpp
  cfe/trunk/lib/CodeGen/CGOpenCLRuntime.h
  cfe/trunk/lib/CodeGen/CodeGenFunction.h
  cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
  cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl

Index: cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -29,6 +29,10 @@
 // COMMON: define internal spir_func void [[INV_G]](i8 addrspace(4)* %{{.*}}, i8 addrspace(3)* %{{.*}})
 const bl_t block_G = (bl_t) ^ (local void *a) {};
 
+void callee(int id, __global int *out) {
+  out[id] = id;
+}
+
 // COMMON-LABEL: define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %{{.*}}, i32 addrspace(1)* %b, i32 %i)
 kernel void device_side_enqueue(global int *a, global int *b, int i) {
   // COMMON: %default_queue = alloca %opencl.queue_t*
@@ -282,6 +286,21 @@
   // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
   block_A();
 
+  void (^block_C)(void) = ^{
+callee(i, a);
+  };
+
+  // Emits block literal on stack and block kernel [[INVLK3]].
+  // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL3:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke
+  // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
+  // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* {{.*}} to i8 addrspace(4)*
+  // COMMON-LABEL: call i32 @__enqueue_kernel_basic(
+  // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}},
+  // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK3:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
+  // COMMON-SAME: i8 addrspace(4)* [[BL_I8]])
+  enqueue_kernel(default_queue, flags, ndrange, block_C);
+
   // Emits global block literal [[BLG9]] and block kernel [[INVGK9]]. [[INVGK9]] calls [[INV9]].
   // COMMON: call i32 @__get_kernel_work_group_size_impl(
   // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK9:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
@@ -333,6 +352,7 @@
 // COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)*{{.*}})
 // COMMON: define internal spir_func void [[INVG9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)* %{{.*}})
 // COMMON: define internal spir_kernel void [[INVGK8]](i8 addrspace(4)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVLK3]](i8 addrspace(4)*{{.*}})
 // COMMON: define internal spir_kernel void [[INVGK9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
 // COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
 // COMMON: define internal spir_kernel void [[INVGK10]](i8 addrspace(4)*{{.*}})
Index: cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
===
--- cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -2,6 +2,10 @@
 
 typedef struct {int a;} ndrange_t;
 
+void callee(long id, global long *out) {
+  out[id] = id;
+}
+
 // CHECK-LABEL: define amdgpu_kernel void @test
 kernel void test(global char *a, char b, global long *c, long d) {
   queue_t default_queue;
@@ -24,6 +28,12 @@
  c[0] = d;
  ((local int*)lp)[0] = 1;
  }, 100);
+
+  void (^block)(void) = ^{
+callee(d, c);
+  };
+
+  enqueue_kernel(default_queue, flags, ndrange, block);
 }
 
 // CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8*, i8 addrspace(1)*, i8 }>)
@@ -42,4 +52,7 @@
 // CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_3_kernel(<{ i32, i32, i8*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, i8 addrspace(3)*)
 // CHECK-SAME: #[[ATTR]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}}
 
+// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_4_kernel(<{ i32, i32, i8*, i64, i64 addrspace(1)* }>)
+// CHECK-SAME: #[[ATTR]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} 

[PATCH] D43240: [OpenCL] Fix __enqueue_block for block with captures

2018-02-15 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In https://reviews.llvm.org/D43240#1008560, @Anastasia wrote:

> LGTM! Thanks for looking at this. Just one thing, I was wondering whether it 
> would be cleaner way if we extend 
> test/CodeGenOpenCL/cl20-device-side-enqueue.cl instead of adding a new one 
> here? Because this is the test that is meant to exercise all DSE codegen 
> bits. Perhaps we can modify one block in that test to have the same format as 
> here (i.e. using captures), since  now we test the same block there most of 
> the time. However, we don't test any of kernel wrapper 
> `*_block_invoke_kernel` there. Not sure why...


We do check block wrappers in cl20-device-side-enqueue.cl, which is done at the 
end of the test.

I did not add this test to cl20-device-side-enqueue.cl because 
cl20-device-side-enqueue.cl is already very complicated. I can add this test to 
cl20-device-side-enqueue.cl when committing.


https://reviews.llvm.org/D43240



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43240: [OpenCL] Fix __enqueue_block for block with captures

2018-02-15 Thread Anastasia Stulova via Phabricator via cfe-commits
Anastasia accepted this revision.
Anastasia added a comment.
This revision is now accepted and ready to land.

LGTM! Thanks for looking at this. Just one thing, I was wondering whether it 
would be cleaner way if we extend 
test/CodeGenOpenCL/cl20-device-side-enqueue.cl instead of adding a new one 
here? Because this is the test that is meant to exercise all DSE codegen bits. 
Perhaps we can modify one block in that test to have the same format as here 
(i.e. using captures), since  now we test the same block there most of the 
time. However, we don't test any of kernel wrapper `*_block_invoke_kernel` 
there. Not sure why...


https://reviews.llvm.org/D43240



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43240: [OpenCL] Fix __enqueue_block for block with captures

2018-02-13 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl created this revision.
yaxunl added reviewers: Anastasia, bader.

The following test case causes issue with codegen of __enqueue_block

  void (^block)(void) = ^{ callee(id, out); };
  
  enqueue_kernel(queue, 0, ndrange, block); 

Clang first does codegen for block expression in the first line and deletes its 
block info.
Clang then tries to do codegen for the same block expression again for the 
second line,
and fails because the block info is gone.

The fix is to do normal codegen for both lines. Introduce an API to OpenCL 
runtime to
record llvm block invoke function and llvm block literal emitted for each AST 
block
expression, and use the recorded information for generating the wrapper kernel.

The EmitBlockLiteral APIs are cleaned up to minimize changes to the normal 
codegen
of blocks.

Another minor issue is that some clean up AST expression is generated for block
with captures, which can be stripped by IgnoreImplicit.


https://reviews.llvm.org/D43240

Files:
  lib/CodeGen/CGBlocks.cpp
  lib/CodeGen/CGOpenCLRuntime.cpp
  lib/CodeGen/CGOpenCLRuntime.h
  lib/CodeGen/CodeGenFunction.h
  test/CodeGenOpenCL/enqueue-block-with-captures.cl

Index: test/CodeGenOpenCL/enqueue-block-with-captures.cl
===
--- /dev/null
+++ test/CodeGenOpenCL/enqueue-block-with-captures.cl
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -O0 -o - -triple spir-unknown-unknown | FileCheck %s
+
+typedef struct {int a;} ndrange_t;
+
+void callee(int id, __global int* out) {
+  out[id] = id;
+}
+
+kernel void test(int id, __global int* out) {
+
+  void (^block)(void) = ^{ callee(id, out); };
+
+  queue_t queue;
+  ndrange_t ndrange;
+  // CHECK: call i32 @__enqueue_kernel_basic
+  enqueue_kernel(queue, 0, ndrange, block);
+}
+
+// CHECK: define internal spir_kernel void @__test_block_invoke_kernel(i8 addrspace(4)*)
+// CHECK:  call void @__test_block_invoke(i8 addrspace(4)* %0)
Index: lib/CodeGen/CodeGenFunction.h
===
--- lib/CodeGen/CodeGenFunction.h
+++ lib/CodeGen/CodeGenFunction.h
@@ -1583,10 +1583,7 @@
   /// \return an LLVM value which is a pointer to a struct which contains
   /// information about the block, including the block invoke function, the
   /// captured variables, etc.
-  /// \param InvokeF will contain the block invoke function if it is not
-  /// nullptr.
-  llvm::Value *EmitBlockLiteral(const BlockExpr *,
-llvm::Function **InvokeF = nullptr);
+  llvm::Value *EmitBlockLiteral(const BlockExpr *);
   static void destroyBlockInfos(CGBlockInfo *info);
 
   llvm::Function *GenerateBlockFunction(GlobalDecl GD,
@@ -3010,11 +3007,8 @@
   LValue EmitOMPSharedLValue(const Expr *E);
 
 private:
-  /// Helpers for blocks. Returns invoke function by \p InvokeF if it is not
-  /// nullptr. It should be called without \p InvokeF if the caller does not
-  /// need invoke function to be returned.
-  llvm::Value *EmitBlockLiteral(const CGBlockInfo ,
-llvm::Function **InvokeF = nullptr);
+  /// Helpers for blocks.
+  llvm::Value *EmitBlockLiteral(const CGBlockInfo );
 
   /// struct with the values to be passed to the OpenMP loop-related functions
   struct OMPLoopArguments {
Index: lib/CodeGen/CGOpenCLRuntime.h
===
--- lib/CodeGen/CGOpenCLRuntime.h
+++ lib/CodeGen/CGOpenCLRuntime.h
@@ -23,6 +23,7 @@
 
 namespace clang {
 
+class BlockExpr;
 class Expr;
 class VarDecl;
 
@@ -39,8 +40,9 @@
 
   /// Structure for enqueued block information.
   struct EnqueuedBlockInfo {
-llvm::Function *Kernel; /// Enqueued block kernel.
-llvm::Value *BlockArg;  /// The first argument to enqueued block kernel.
+llvm::Function *InvokeFunc; /// Block invoke function.
+llvm::Function *Kernel; /// Enqueued block kernel.
+llvm::Value *BlockArg;  /// The first argument to enqueued block kernel.
   };
   /// Maps block expression to block information.
   llvm::DenseMap EnqueuedBlockMap;
@@ -76,6 +78,15 @@
   /// \return enqueued block information for enqueued block.
   EnqueuedBlockInfo emitOpenCLEnqueuedBlock(CodeGenFunction ,
 const Expr *E);
+
+  /// \brief Record invoke function and block literal emitted during normal
+  /// codegen for a block expression. The information is used by
+  /// emitOpenCLEnqueuedBlock to emit wrapper kernel.
+  ///
+  /// \param InvokeF invoke function emitted for the block expression.
+  /// \param Block block literal emitted for the block expression.
+  void recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF,
+   llvm::Value *Block);
 };
 
 }
Index: lib/CodeGen/CGOpenCLRuntime.cpp
===
--- lib/CodeGen/CGOpenCLRuntime.cpp
+++ lib/CodeGen/CGOpenCLRuntime.cpp
@@ -112,37 +112,50 @@