[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-03 Thread Scott Linder via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL338899: [OpenCL] Always emit alloca in entry block for 
enqueue_kernel builtin (authored by scott.linder, committed by ).
Herald added a subscriber: llvm-commits.

Changed prior to commit:
  https://reviews.llvm.org/D50104?vs=158816&id=159021#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D50104

Files:
  cfe/trunk/lib/CodeGen/CGBuiltin.cpp
  cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
  cfe/trunk/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl

Index: cfe/trunk/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
===
--- cfe/trunk/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
+++ cfe/trunk/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG
+
+// Check that the enqueue_kernel array temporary is in the entry block to avoid
+// a dynamic alloca
+
+typedef struct {int a;} ndrange_t;
+
+kernel void test(int i) {
+// COMMON-LABEL: define {{.*}} void @test
+// COMMON-LABEL: entry:
+// AMDGPU: %block_sizes = alloca [1 x i64]
+// SPIR32: %block_sizes = alloca [1 x i32]
+// SPIR64: %block_sizes = alloca [1 x i64]
+// COMMON-LABEL: if.then:
+// COMMON-NOT: alloca
+// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34
+// COMMON-LABEL: if.end
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  if (i)
+enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32);
+}
+
+// Check that the temporary is scoped to the `if`
+
+// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24)
+// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32)
Index: cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES
 
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 
@@ -46,8 +47,31 @@
   // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*]
   clk_event_t event_wait_list2[] = {clk_event};
 
-  // Emits block literal on stack and block kernel [[INVLK1]].
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
+
+  // B32: %[[BLOCK_SIZES1:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES2:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES3:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES4:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES5:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES6:.*]] = alloca [3 x i32]
+  // B64: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64]
+  // B32: %[[BLOCK_SIZES7:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64]
+
+  // Emits block literal on stack and block kernel [[INVLK1]].
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
@@ -73,48 +97,54 @@
   // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, 

[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-02 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl accepted this revision.
yaxunl added a comment.

LGTM. Thanks!


https://reviews.llvm.org/D50104



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-02 Thread Scott Linder via Phabricator via cfe-commits
scott.linder updated this revision to Diff 158816.
scott.linder added a comment.

Emit lifetime intrinsics for the sizes temp, and update test


https://reviews.llvm.org/D50104

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl
  test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl

Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
===
--- /dev/null
+++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG
+
+// Check that the enqueue_kernel array temporary is in the entry block to avoid
+// a dynamic alloca
+
+typedef struct {int a;} ndrange_t;
+
+kernel void test(int i) {
+// COMMON-LABEL: define {{.*}} void @test
+// COMMON-LABEL: entry:
+// AMDGPU: %block_sizes = alloca [1 x i64]
+// SPIR32: %block_sizes = alloca [1 x i32]
+// SPIR64: %block_sizes = alloca [1 x i64]
+// COMMON-LABEL: if.then:
+// COMMON-NOT: alloca
+// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34
+// COMMON-LABEL: if.end
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  if (i)
+enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32);
+}
+
+// Check that the temporary is scoped to the `if`
+
+// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24)
+// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32)
Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES
 
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 
@@ -46,8 +47,31 @@
   // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*]
   clk_event_t event_wait_list2[] = {clk_event};
 
-  // Emits block literal on stack and block kernel [[INVLK1]].
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
+
+  // B32: %[[BLOCK_SIZES1:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES2:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES3:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES4:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES5:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES6:.*]] = alloca [3 x i32]
+  // B64: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64]
+  // B32: %[[BLOCK_SIZES7:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64]
+  // CHECK-LIFETIMES: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64]
+
+  // Emits block literal on stack and block kernel [[INVLK1]].
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
@@ -73,48 +97,54 @@
   // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]],
   // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
   // COMMON-SAME: i8 addrspace(4)* [[BL_I8]])
-
   enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event,
  

[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-02 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In https://reviews.llvm.org/D50104#1185920, @scott.linder wrote:

> I still don't quite see what you describe; with that change all of the 
> lifetime.end calls pile up just before the enclosing function returns, not 
> after each call to enqueue_kernel. Looking at 
> https://clang.llvm.org/doxygen/EHScopeStack_8h_source.html#l00078 I don't see 
> any option which isn't based on scope. The lifetime.start calls do occur 
> where I would expect, though, so I will update the patch.


Sorry, my mistake. In this case, the full expression seems to be the calling 
function, so using pushFullExprCleanup to emit lifetime.end does not work well 
here.

You need to call EmitLifetimeEnd explicitly after emitting the function call.


https://reviews.llvm.org/D50104



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-02 Thread Scott Linder via Phabricator via cfe-commits
scott.linder added a comment.

I still don't quite see what you describe; with that change all of the 
lifetime.end calls pile up just before the enclosing function returns, not 
after each call to enqueue_kernel. Looking at 
https://clang.llvm.org/doxygen/EHScopeStack_8h_source.html#l00078 I don't see 
any option which isn't based on scope. The lifetime.start calls do occur where 
I would expect, though, so I will update the patch.


https://reviews.llvm.org/D50104



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-02 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added a comment.

In https://reviews.llvm.org/D50104#1184362, @scott.linder wrote:

> Address feedback; I hope I understood correctly what debug info to check for.
>
> I don't see where in CreateMemTemp and friends EmitLifetimeStart gets called, 
> and I don't see any lifetime intrinsics in the IR even at -O1.


Emitting lifetime intrinsics is optional. In this case, since the lifetime of 
the temp var extends only from just before to just after the function call, 
emitting lifetime intrinsics can help optimizers.

It can be done by code like this:

   if (auto *Size = EmitLifetimeStart(
           CGM.getDataLayout().getTypeAllocSize(Alloca.getElementType()),
           Alloca.getPointer())) {
     pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, Alloca,
                                          Size);
   }

Then the lifetime.start should be emitted before the function call and 
lifetime.end should be emitted just after the function call.


https://reviews.llvm.org/D50104



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-02 Thread Sven van Haastregt via Phabricator via cfe-commits
svenvh accepted this revision.
svenvh added a comment.
This revision is now accepted and ready to land.

LGTM, thanks!


https://reviews.llvm.org/D50104



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-01 Thread Scott Linder via Phabricator via cfe-commits
scott.linder updated this revision to Diff 158618.
scott.linder added a comment.

Update test


https://reviews.llvm.org/D50104

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGenOpenCL/cl20-device-side-enqueue.cl
  test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl

Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
===
--- /dev/null
+++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG
+
+// Check that the enqueue_kernel array temporary is in the entry block to avoid
+// a dynamic alloca
+
+typedef struct {int a;} ndrange_t;
+
+kernel void test(int i) {
+// COMMON-LABEL: define {{.*}} void @test
+// COMMON-LABEL: entry:
+// AMDGPU: %block_sizes = alloca [1 x i64]
+// SPIR32: %block_sizes = alloca [1 x i32]
+// SPIR64: %block_sizes = alloca [1 x i64]
+// COMMON-LABEL: if.then:
+// COMMON-NOT: alloca
+// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34
+// COMMON-LABEL: if.end
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  if (i)
+enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32);
+}
+
+// Check that the temporary is scoped to the `if`
+
+// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24)
+// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32)
Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -46,8 +46,24 @@
   // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*]
   clk_event_t event_wait_list2[] = {clk_event};
 
-  // Emits block literal on stack and block kernel [[INVLK1]].
   // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
+
+  // B32: %[[BLOCK_SIZES1:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES2:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES3:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES4:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES5:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64]
+  // B32: %[[BLOCK_SIZES6:.*]] = alloca [3 x i32]
+  // B64: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64]
+  // B32: %[[BLOCK_SIZES7:.*]] = alloca [1 x i32]
+  // B64: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64]
+
+  // Emits block literal on stack and block kernel [[INVLK1]].
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
   // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
@@ -73,48 +89,44 @@
   // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]],  %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]],
   // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*),
   // COMMON-SAME: i8 addrspace(4)* [[BL_I8]])
-
   enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event,
  ^(void) {
a[i] = b[i];
  });
 
   // Emits global block literal [[BLG1]] and block kernel [[INVGK1]].
   // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
   // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
-  // B32: %[[TMP:.*]] = alloca [1 x i32]
-  // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
-  // B32: store i32 256, i32* %[[TMP1]], align 4
-  // B64: %[[TMP:.*]] = alloca [1 x i64]
-  // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
-  // B64: store i64 256, i64* %[[TMP1]], align 8
+  // B32: %[[TMP:.*]] = getelementptr [1 x i32], [1 x i32]* %[[BLOCK_SIZES1]], i32 0, i32 0
+  // B32: store i32 256, i32* %[[TMP]], align 4
+  // B64: %[[TMP:.*]] = getelementptr [1 x i64], [1 x i64]* %[[BLOCK_SIZES1]], i32 0, i32 0
+  // B64: store i64 256, i64* %[[TMP]], align 8
   // COMMON-LABEL: call i32 @__enqueue_kernel_varargs(
   // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], 

[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-01 Thread Sven van Haastregt via Phabricator via cfe-commits
svenvh added a comment.

You'll probably also need to update 
`test/CodeGenOpenCL/cl20-device-side-enqueue.cl`; please verify with make/ninja 
`check-clang`.


https://reviews.llvm.org/D50104



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-01 Thread Scott Linder via Phabricator via cfe-commits
scott.linder updated this revision to Diff 158545.
scott.linder added a comment.

Address feedback; I hope I understood correctly what debug info to check for.

I don't see where in CreateMemTemp and friends EmitLifetimeStart gets called, 
and I don't see any lifetime intrinsics in the IR even at -O1.


https://reviews.llvm.org/D50104

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl


Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
===
--- /dev/null
+++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG
+
+// Check that the enqueue_kernel array temporary is in the entry block to avoid
+// a dynamic alloca
+
+typedef struct {int a;} ndrange_t;
+
+kernel void test(int i) {
+// COMMON-LABEL: define {{.*}} void @test
+// COMMON-LABEL: entry:
+// AMDGPU: %block_sizes = alloca [1 x i64]
+// SPIR32: %block_sizes = alloca [1 x i32]
+// SPIR64: %block_sizes = alloca [1 x i64]
+// COMMON-LABEL: if.then:
+// COMMON-NOT: alloca
+// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34
+// COMMON-LABEL: if.end
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  if (i)
+enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32);
+}
+
+// Check that the temporary is scoped to the `if`
+
+// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24)
+// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32)
Index: lib/CodeGen/CGBuiltin.cpp
===
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -3338,15 +3338,18 @@
 // Create a temporary array to hold the sizes of local pointer arguments
 // for the block. \p First is the position of the first size argument.
 auto CreateArrayForSizeVar = [=](unsigned First) {
-  auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First);
-  auto *Arr = Builder.CreateAlloca(AT);
+  llvm::APInt ArraySize(32, NumArgs - First);
+  QualType SizeArrayTy = getContext().getConstantArrayType(
+  getContext().getSizeType(), ArraySize, ArrayType::Normal,
+  /*IndexTypeQuals=*/0);
+  auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
   llvm::Value *Ptr;
   // Each of the following arguments specifies the size of the corresponding
   // argument passed to the enqueued block.
   auto *Zero = llvm::ConstantInt::get(IntTy, 0);
   for (unsigned I = First; I < NumArgs; ++I) {
 auto *Index = llvm::ConstantInt::get(IntTy, I - First);
-auto *GEP = Builder.CreateGEP(Arr, {Zero, Index});
+auto *GEP = Builder.CreateGEP(Tmp.getPointer(), {Zero, Index});
 if (I == First)
   Ptr = GEP;
 auto *V =


Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
===
--- /dev/null
+++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG
+
+// Check that the enqueue_kernel array temporary is in the entry block to avoid
+// a dynamic alloca
+
+typedef struct {int a;} ndrange_t;
+
+kernel void test(int i) {
+// COMMON-LABEL: define {{.*}} void @test
+// COMMON-LABEL: entry:
+// AMDGPU: %block_sizes = alloca [1 x i64]
+// SPIR32: %block_sizes = alloca [1 x i32]
+// SPIR64: %block_sizes = alloca [1 x i64]
+// COMMON-LABEL: if.then:
+// COMMON-NOT: alloca
+// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34
+// COMMON-LABEL: if.end
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  if (i)
+enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32);
+}
+
+// Check that the temporary is scoped to the `if`
+
+// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24)
+// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32)
Index: 

[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-08-01 Thread Matt Arsenault via Phabricator via cfe-commits
arsenm added a comment.

Should this also test for lifetime markers?


Repository:
  rC Clang

https://reviews.llvm.org/D50104



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-07-31 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl added inline comments.



Comment at: lib/CodeGen/CGBuiltin.cpp:3342
   auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First);
+  // Always insert the alloca in the entry block so it remains static in
+  // the SelectionDAG.
+  BasicBlock *Begin = nullptr;
+  if (Instruction *Entry = CurFn->getEntryBlock().getTerminator()) {
+Begin = Builder.GetInsertBlock();
+Builder.SetInsertPoint(Entry);
+  }
   auto *Arr = Builder.CreateAlloca(AT);
   llvm::Value *Ptr;

You may try CreateMemTemp. It should handle the insert position and also debug 
info.



Comment at: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl:2
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefixes=COMMON,AMDGPU
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR32
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR64

Can we have a run line for debug info?


Repository:
  rC Clang

https://reviews.llvm.org/D50104



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50104: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin

2018-07-31 Thread Scott Linder via Phabricator via cfe-commits
scott.linder created this revision.
scott.linder added reviewers: yaxunl, Anastasia, arsenm.
Herald added subscribers: cfe-commits, wdng.

Ensures the statically sized alloca is not converted to DYNAMIC_STACKALLOC 
later because it is not in the entry block.

I believe it is valid to insert the alloca in the entry block, but I'm not 
confident the way I accomplish it is correct.


Repository:
  rC Clang

https://reviews.llvm.org/D50104

Files:
  lib/CodeGen/CGBuiltin.cpp
  test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl


Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
===
--- /dev/null
+++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefixes=COMMON,AMDGPU
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR32
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR64
+
+typedef struct {int a;} ndrange_t;
+
+kernel void test(int i) {
+// COMMON-LABEL: define {{.*}} void @test
+// COMMON-LABEL: entry:
+// AMDGPU: alloca [1 x i64]
+// SPIR32: alloca [1 x i32]
+// SPIR64: alloca [1 x i64]
+// COMMON-LABEL: if.then:
+// COMMON-NOT: alloca
+// COMMON-LABEL: if.end
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  if (i)
+enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32);
+}
Index: lib/CodeGen/CGBuiltin.cpp
===
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -3339,7 +3339,16 @@
 // for the block. \p First is the position of the first size argument.
 auto CreateArrayForSizeVar = [=](unsigned First) {
   auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First);
+  // Always insert the alloca in the entry block so it remains static in
+  // the SelectionDAG.
+  BasicBlock *Begin = nullptr;
+  if (Instruction *Entry = CurFn->getEntryBlock().getTerminator()) {
+Begin = Builder.GetInsertBlock();
+Builder.SetInsertPoint(Entry);
+  }
   auto *Arr = Builder.CreateAlloca(AT);
+  if (Begin)
+Builder.SetInsertPoint(Begin);
   llvm::Value *Ptr;
   // Each of the following arguments specifies the size of the corresponding
   // argument passed to the enqueued block.


Index: test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
===
--- /dev/null
+++ test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefixes=COMMON,AMDGPU
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR32
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,SPIR64
+
+typedef struct {int a;} ndrange_t;
+
+kernel void test(int i) {
+// COMMON-LABEL: define {{.*}} void @test
+// COMMON-LABEL: entry:
+// AMDGPU: alloca [1 x i64]
+// SPIR32: alloca [1 x i32]
+// SPIR64: alloca [1 x i64]
+// COMMON-LABEL: if.then:
+// COMMON-NOT: alloca
+// COMMON-LABEL: if.end
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  if (i)
+enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32);
+}
Index: lib/CodeGen/CGBuiltin.cpp
===
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -3339,7 +3339,16 @@
 // for the block. \p First is the position of the first size argument.
 auto CreateArrayForSizeVar = [=](unsigned First) {
   auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First);
+  // Always insert the alloca in the entry block so it remains static in
+  // the SelectionDAG.
+  BasicBlock *Begin = nullptr;
+  if (Instruction *Entry = CurFn->getEntryBlock().getTerminator()) {
+Begin = Builder.GetInsertBlock();
+Builder.SetInsertPoint(Entry);
+  }
   auto *Arr = Builder.CreateAlloca(AT);
+  if (Begin)
+Builder.SetInsertPoint(Begin);
   llvm::Value *Ptr;
   // Each of the following arguments specifies the size of the corresponding
   // argument passed to the enqueued block.
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits