[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-26 Thread Lewis Baker via Phabricator via cfe-commits
lewissbaker added a comment.

>   There are still room for improvements, in particular, GCC has a 4K frame 
> for this function.

I think GCC having a smaller coroutine frame is probably because it does not 
yet implement the allocation-elision optimisation which inlines the nested 
coroutine frame (which is 4K) into the parent coroutine frame.
I have not looked yet, but I suspect that you'll see within the 
get_big_object2() code it will perform another heap allocation for the 
get_big_object() frame.

Until we have more general support for async RVO, I think 8K is probably as 
good as we're going to be able to get (4K for this coroutine's promise and 4K 
for child coroutine-frame and its promise).


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-24 Thread Xun Li via Phabricator via cfe-commits
lxfind updated this revision to Diff 273173.
lxfind added a comment.

Remove unused variable


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314

Files:
  llvm/lib/Transforms/Coroutines/CoroSplit.cpp
  llvm/test/Transforms/Coroutines/coro-split-02.ll
  llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll

Index: llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
===
--- llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
+++ llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
@@ -1,6 +1,5 @@
-; Tests that coro-split can handle the case when a code after coro.suspend uses
-; a value produces between coro.save and coro.suspend (%Result.i19)
-; and checks whether stray coro.saves are properly removed
+; Tests that coro-split will optimize the lifetime.start maker of each local variable,
+; sink them to the places closest to the actual use.
 ; RUN: opt < %s -coro-split -S | FileCheck %s
 ; RUN: opt < %s -passes=coro-split -S | FileCheck %s
 
@@ -15,6 +14,9 @@
 entry:
   %ref.tmp7 = alloca %"struct.lean_future::Awaiter", align 8
   %testval = alloca i32
+  %cast = bitcast i32* %testval to i8*
+  ; lifetime of %testval starts here, but not used until await.ready.
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
   %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
   %alloc = call i8* @malloc(i64 16) #3
   %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc)
@@ -29,8 +31,8 @@
 await.ready:
   %StrayCoroSave = call token @llvm.coro.save(i8* null)
   %val = load i32, i32* %Result.i19
-  %cast = bitcast i32* %testval to i8*
-  call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+  %test = load i32, i32* %testval
+  call void @print(i32 %test)
   call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
   call void @print(i32 %val)
   br label %exit
@@ -40,14 +42,15 @@
 }
 
 ; CHECK-LABEL: @a.resume(
-; CHECK: %testval = alloca i32
-; CHECK: getelementptr inbounds %a.Frame
+; CHECK: %testval = alloca i32, align 4
+; CHECK-NEXT:getelementptr inbounds %a.Frame
 ; CHECK-NEXT:getelementptr inbounds %"struct.lean_future::Awaiter"
-; CHECK-NOT: call token @llvm.coro.save(i8* null)
+; CHECK-NEXT:%cast1 = bitcast i32* %testval to i8*
 ; CHECK-NEXT:%val = load i32, i32* %Result
-; CHECK-NEXT:%cast = bitcast i32* %testval to i8*
-; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
-; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast)
+; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast1)
+; CHECK-NEXT:%test = load i32, i32* %testval
+; CHECK-NEXT:call void @print(i32 %test)
+; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast1)
 ; CHECK-NEXT:call void @print(i32 %val)
 ; CHECK-NEXT:ret void
 
Index: llvm/test/Transforms/Coroutines/coro-split-02.ll
===
--- llvm/test/Transforms/Coroutines/coro-split-02.ll
+++ llvm/test/Transforms/Coroutines/coro-split-02.ll
@@ -31,6 +31,8 @@
   %val = load i32, i32* %Result.i19
   %cast = bitcast i32* %testval to i8*
   call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+  %test = load i32, i32* %testval
+  call void @print(i32 %test)
   call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
   call void @print(i32 %val)
   br label %exit
@@ -47,6 +49,8 @@
 ; CHECK-NEXT:%val = load i32, i32* %Result
 ; CHECK-NEXT:%cast = bitcast i32* %testval to i8*
 ; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+; CHECK-NEXT:%test = load i32, i32* %testval
+; CHECK-NEXT:call void @print(i32 %test)
 ; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast)
 ; CHECK-NEXT:call void @print(i32 %val)
 ; CHECK-NEXT:ret void
Index: llvm/lib/Transforms/Coroutines/CoroSplit.cpp
===
--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -75,7 +75,7 @@
 
 namespace {
 
-/// A little helper class for building 
+/// A little helper class for building
 class CoroCloner {
 public:
   enum class Kind {
@@ -563,7 +563,7 @@
   // In the original function, the AllocaSpillBlock is a block immediately
   // following the allocation of the frame object which defines GEPs for
   // all the allocas that have been moved into the frame, and it ends by
-  // branching to the original beginning of the coroutine.  Make this 
+  // branching to the original beginning of the coroutine.  Make this
   // the entry block of the cloned function.
   auto *Entry = cast(VMap[Shape.AllocaSpillBlock]);
   auto *OldEntry = &NewF->getEntryBlock();
@@ -1239,6 +1239,103 @@
   S.resize(N);
 }
 
+/// For every local variable that has lifetime intrinsics markers, we sink

[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-24 Thread Xun Li via Phabricator via cfe-commits
lxfind updated this revision to Diff 273171.
lxfind added a comment.

A few unintended changes


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314

Files:
  llvm/lib/Transforms/Coroutines/CoroSplit.cpp
  llvm/test/Transforms/Coroutines/coro-split-02.ll
  llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll

Index: llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
===
--- llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
+++ llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
@@ -1,6 +1,5 @@
-; Tests that coro-split can handle the case when a code after coro.suspend uses
-; a value produces between coro.save and coro.suspend (%Result.i19)
-; and checks whether stray coro.saves are properly removed
+; Tests that coro-split will optimize the lifetime.start maker of each local variable,
+; sink them to the places closest to the actual use.
 ; RUN: opt < %s -coro-split -S | FileCheck %s
 ; RUN: opt < %s -passes=coro-split -S | FileCheck %s
 
@@ -15,6 +14,9 @@
 entry:
   %ref.tmp7 = alloca %"struct.lean_future::Awaiter", align 8
   %testval = alloca i32
+  %cast = bitcast i32* %testval to i8*
+  ; lifetime of %testval starts here, but not used until await.ready.
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
   %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
   %alloc = call i8* @malloc(i64 16) #3
   %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc)
@@ -29,8 +31,8 @@
 await.ready:
   %StrayCoroSave = call token @llvm.coro.save(i8* null)
   %val = load i32, i32* %Result.i19
-  %cast = bitcast i32* %testval to i8*
-  call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+  %test = load i32, i32* %testval
+  call void @print(i32 %test)
   call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
   call void @print(i32 %val)
   br label %exit
@@ -40,14 +42,15 @@
 }
 
 ; CHECK-LABEL: @a.resume(
-; CHECK: %testval = alloca i32
-; CHECK: getelementptr inbounds %a.Frame
+; CHECK: %testval = alloca i32, align 4
+; CHECK-NEXT:getelementptr inbounds %a.Frame
 ; CHECK-NEXT:getelementptr inbounds %"struct.lean_future::Awaiter"
-; CHECK-NOT: call token @llvm.coro.save(i8* null)
+; CHECK-NEXT:%cast1 = bitcast i32* %testval to i8*
 ; CHECK-NEXT:%val = load i32, i32* %Result
-; CHECK-NEXT:%cast = bitcast i32* %testval to i8*
-; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
-; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast)
+; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast1)
+; CHECK-NEXT:%test = load i32, i32* %testval
+; CHECK-NEXT:call void @print(i32 %test)
+; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast1)
 ; CHECK-NEXT:call void @print(i32 %val)
 ; CHECK-NEXT:ret void
 
Index: llvm/test/Transforms/Coroutines/coro-split-02.ll
===
--- llvm/test/Transforms/Coroutines/coro-split-02.ll
+++ llvm/test/Transforms/Coroutines/coro-split-02.ll
@@ -31,6 +31,8 @@
   %val = load i32, i32* %Result.i19
   %cast = bitcast i32* %testval to i8*
   call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+  %test = load i32, i32* %testval
+  call void @print(i32 %test)
   call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
   call void @print(i32 %val)
   br label %exit
@@ -47,6 +49,8 @@
 ; CHECK-NEXT:%val = load i32, i32* %Result
 ; CHECK-NEXT:%cast = bitcast i32* %testval to i8*
 ; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+; CHECK-NEXT:%test = load i32, i32* %testval
+; CHECK-NEXT:call void @print(i32 %test)
 ; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast)
 ; CHECK-NEXT:call void @print(i32 %val)
 ; CHECK-NEXT:ret void
Index: llvm/lib/Transforms/Coroutines/CoroSplit.cpp
===
--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -75,7 +75,7 @@
 
 namespace {
 
-/// A little helper class for building 
+/// A little helper class for building
 class CoroCloner {
 public:
   enum class Kind {
@@ -563,7 +563,7 @@
   // In the original function, the AllocaSpillBlock is a block immediately
   // following the allocation of the frame object which defines GEPs for
   // all the allocas that have been moved into the frame, and it ends by
-  // branching to the original beginning of the coroutine.  Make this 
+  // branching to the original beginning of the coroutine.  Make this
   // the entry block of the cloned function.
   auto *Entry = cast(VMap[Shape.AllocaSpillBlock]);
   auto *OldEntry = &NewF->getEntryBlock();
@@ -1239,6 +1239,106 @@
   S.resize(N);
 }
 
+/// For every local variable that has lifetime intrinsics markers, we sin

[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-24 Thread Xun Li via Phabricator via cfe-commits
lxfind updated this revision to Diff 273169.
lxfind added a comment.

Actually it seems pretty easy to handle the case of multiple BitCastInst, so 
did it here. Also added a test.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314

Files:
  llvm/lib/Transforms/Coroutines/CoroSplit.cpp
  llvm/test/Transforms/Coroutines/coro-split-02.ll
  llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll

Index: llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
===
--- llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
+++ llvm/test/Transforms/Coroutines/coro-split-sink-lifetime.ll
@@ -1,6 +1,5 @@
-; Tests that coro-split can handle the case when a code after coro.suspend uses
-; a value produces between coro.save and coro.suspend (%Result.i19)
-; and checks whether stray coro.saves are properly removed
+; Tests that coro-split will optimize the lifetime.start maker of each local variable,
+; sink them to the places closest to the actual use.
 ; RUN: opt < %s -coro-split -S | FileCheck %s
 ; RUN: opt < %s -passes=coro-split -S | FileCheck %s
 
@@ -15,6 +14,9 @@
 entry:
   %ref.tmp7 = alloca %"struct.lean_future::Awaiter", align 8
   %testval = alloca i32
+  %cast = bitcast i32* %testval to i8*
+  ; lifetime of %testval starts here, but not used until await.ready.
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
   %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
   %alloc = call i8* @malloc(i64 16) #3
   %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc)
@@ -29,8 +31,8 @@
 await.ready:
   %StrayCoroSave = call token @llvm.coro.save(i8* null)
   %val = load i32, i32* %Result.i19
-  %cast = bitcast i32* %testval to i8*
-  call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+  %test = load i32, i32* %testval
+  call void @print(i32 %test)
   call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
   call void @print(i32 %val)
   br label %exit
@@ -40,14 +42,15 @@
 }
 
 ; CHECK-LABEL: @a.resume(
-; CHECK: %testval = alloca i32
-; CHECK: getelementptr inbounds %a.Frame
+; CHECK: %testval = alloca i32, align 4
+; CHECK-NEXT:getelementptr inbounds %a.Frame
 ; CHECK-NEXT:getelementptr inbounds %"struct.lean_future::Awaiter"
-; CHECK-NOT: call token @llvm.coro.save(i8* null)
+; CHECK-NEXT:%cast1 = bitcast i32* %testval to i8*
 ; CHECK-NEXT:%val = load i32, i32* %Result
-; CHECK-NEXT:%cast = bitcast i32* %testval to i8*
-; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
-; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast)
+; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast1)
+; CHECK-NEXT:%test = load i32, i32* %testval
+; CHECK-NEXT:call void @print(i32 %test)
+; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast1)
 ; CHECK-NEXT:call void @print(i32 %val)
 ; CHECK-NEXT:ret void
 
Index: llvm/test/Transforms/Coroutines/coro-split-02.ll
===
--- llvm/test/Transforms/Coroutines/coro-split-02.ll
+++ llvm/test/Transforms/Coroutines/coro-split-02.ll
@@ -31,6 +31,8 @@
   %val = load i32, i32* %Result.i19
   %cast = bitcast i32* %testval to i8*
   call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
+  %test = load i32, i32* %testval
+  call void @print(i32 %test)
   call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
   call void @print(i32 %val)
   br label %exit
@@ -40,14 +42,15 @@
 }
 
 ; CHECK-LABEL: @a.resume(
-; CHECK: %testval = alloca i32
 ; CHECK: getelementptr inbounds %a.Frame
 ; CHECK-NEXT:getelementptr inbounds %"struct.lean_future::Awaiter"
 ; CHECK-NOT: call token @llvm.coro.save(i8* null)
 ; CHECK-NEXT:%val = load i32, i32* %Result
 ; CHECK-NEXT:%cast = bitcast i32* %testval to i8*
 ; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
-; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast)
+; CHECK-NEXT:%test = load i32, i32* %testval
+; CHECK-NEXT:call void @print(i32 %test)
+; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8*  %cast)
 ; CHECK-NEXT:call void @print(i32 %val)
 ; CHECK-NEXT:ret void
 
Index: llvm/lib/Transforms/Coroutines/CoroSplit.cpp
===
--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -75,7 +75,7 @@
 
 namespace {
 
-/// A little helper class for building 
+/// A little helper class for building
 class CoroCloner {
 public:
   enum class Kind {
@@ -563,7 +563,7 @@
   // In the original function, the AllocaSpillBlock is a block immediately
   // following the allocation of the frame object which defines GEPs for
   // all the allocas that have been moved into the frame, and it end

[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-24 Thread Xun Li via Phabricator via cfe-commits
lxfind marked an inline comment as done.
lxfind added inline comments.



Comment at: llvm/lib/Transforms/Coroutines/CoroSplit.cpp:1286
+continue;
+  if (CastInst) {
+// If we have multiple cast instructions for the alloca, don't

junparser wrote:
> It is possible to handle multiple cast instructions as long as they are only 
> used by lifetime marker intrinsic. 
It's certainly possible. I didn't do it here because a reasonable compiler 
frontend should never emit multiple cast instructions for the same variable in 
order to mark lifetime. If there are, they must be used for something else.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-24 Thread JunMa via Phabricator via cfe-commits
junparser added a comment.

@lxfind, Thank you!  And could you please add some testcases?




Comment at: llvm/lib/Transforms/Coroutines/CoroSplit.cpp:1286
+continue;
+  if (CastInst) {
+// If we have multiple cast instructions for the alloca, don't

It is possible to handle multiple cast instructions as long as they are only 
used by lifetime marker intrinsic. 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-24 Thread Xun Li via Phabricator via cfe-commits
lxfind updated this revision to Diff 272916.
lxfind added a comment.

Address test failures


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314

Files:
  llvm/lib/Transforms/Coroutines/CoroSplit.cpp
  llvm/test/Transforms/Coroutines/coro-split-02.ll

Index: llvm/test/Transforms/Coroutines/coro-split-02.ll
===
--- llvm/test/Transforms/Coroutines/coro-split-02.ll
+++ llvm/test/Transforms/Coroutines/coro-split-02.ll
@@ -40,14 +40,9 @@
 }
 
 ; CHECK-LABEL: @a.resume(
-; CHECK: %testval = alloca i32
 ; CHECK: getelementptr inbounds %a.Frame
 ; CHECK-NEXT:getelementptr inbounds %"struct.lean_future::Awaiter"
-; CHECK-NOT: call token @llvm.coro.save(i8* null)
 ; CHECK-NEXT:%val = load i32, i32* %Result
-; CHECK-NEXT:%cast = bitcast i32* %testval to i8*
-; CHECK-NEXT:call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast)
-; CHECK-NEXT:call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast)
 ; CHECK-NEXT:call void @print(i32 %val)
 ; CHECK-NEXT:ret void
 
Index: llvm/lib/Transforms/Coroutines/CoroSplit.cpp
===
--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -75,7 +75,7 @@
 
 namespace {
 
-/// A little helper class for building 
+/// A little helper class for building
 class CoroCloner {
 public:
   enum class Kind {
@@ -563,7 +563,7 @@
   // In the original function, the AllocaSpillBlock is a block immediately
   // following the allocation of the frame object which defines GEPs for
   // all the allocas that have been moved into the frame, and it ends by
-  // branching to the original beginning of the coroutine.  Make this 
+  // branching to the original beginning of the coroutine.  Make this
   // the entry block of the cloned function.
   auto *Entry = cast(VMap[Shape.AllocaSpillBlock]);
   auto *OldEntry = &NewF->getEntryBlock();
@@ -1239,6 +1239,106 @@
   S.resize(N);
 }
 
+/// For every local variable that has lifetime intrinsics markers, we sink
+/// their lifetime.start marker to the places where the variable is being
+/// used for the first time. Doing so minimizes the lifetime of each variable,
+/// hence minimizing the amount of data we end up putting on the frame.
+static void sinkLifetimeStartMarkers(Function &F) {
+  DominatorTree Dom(F);
+  for (Instruction &I : instructions(F)) {
+// We look for this particular pattern:
+//   %tmpX = alloca %.., align ...
+//   %0 = bitcast %...* %tmpX to i8*
+//   call void @llvm.lifetime.start.p0i8(i64 ..., i8* nonnull %0) #2
+if (!isa(&I))
+  continue;
+BitCastInst *CastInst = nullptr;
+// There can be multiple lifetime start markers for the same variable.
+SmallPtrSet LifetimeStartInsts;
+// SinkBarriers stores all instructions that use this local variable.
+// When sinking the lifetime start intrinsics, we can never sink past
+// these barriers.
+SmallPtrSet SinkBarriers;
+bool Valid = true;
+auto addSinkBarrier = [&](Instruction *I) {
+  // When adding a new barrier to SinkBarriers, we maintain the case
+  // that no instruction in SinkBarriers dominates another instruction.
+  bool FoundDom = false;
+  SmallPtrSet ToRemove;
+  for (auto *S : SinkBarriers) {
+if (Dom.dominates(S, I)) {
+  FoundDom = true;
+  break;
+} else if (Dom.dominates(I, S)) {
+  ToRemove.insert(S);
+}
+  }
+  if (!FoundDom) {
+SinkBarriers.insert(I);
+for (auto *R : ToRemove) {
+  SinkBarriers.erase(R);
+}
+  }
+};
+for (User *U : I.users()) {
+  if (!isa(U))
+continue;
+  if (CastInst) {
+// If we have multiple cast instructions for the alloca, don't
+// deal with it beause it's too complex.
+Valid = false;
+break;
+  }
+  CastInst = cast(U);
+  for (User *CU : CastInst->users()) {
+// If we see any user of CastInst that's not lifetime start/end
+// intrinsics, give up because it's too complex.
+if (auto *CUI = dyn_cast(CU)) {
+  if (CUI->getIntrinsicID() == Intrinsic::lifetime_start)
+LifetimeStartInsts.insert(CUI);
+  else if (CUI->getIntrinsicID() == Intrinsic::lifetime_end)
+addSinkBarrier(CUI);
+  else
+Valid = false;
+} else {
+  Valid = false;
+}
+  }
+}
+if (!Valid || LifetimeStartInsts.empty())
+  continue;
+
+for (User *U : I.users()) {
+  if (U == CastInst)
+continue;
+  // Every user of the variable is also a sink barrier.
+  addSinkBarrier(cast(U));
+}
+
+// For each sink barrier, we insert a lifetime start marker right
+// before it.
+const auto *LifetimeStartInst = *

[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-23 Thread Xun Li via Phabricator via cfe-commits
lxfind updated this revision to Diff 272904.
lxfind added a comment.
Herald added subscribers: llvm-commits, hiraditya.
Herald added a project: LLVM.

Tackle this problem inside CoroSplit as an optimization. Instead of only 
handling one particular case, we now look at every local variable in the 
coroutine, and sink their lifetime start markers when possible. This will bring 
in more benefits than doing so during IR emit. Confirmed that it works.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314

Files:
  llvm/lib/Transforms/Coroutines/CoroSplit.cpp

Index: llvm/lib/Transforms/Coroutines/CoroSplit.cpp
===
--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -75,7 +75,7 @@
 
 namespace {
 
-/// A little helper class for building 
+/// A little helper class for building
 class CoroCloner {
 public:
   enum class Kind {
@@ -563,7 +563,7 @@
   // In the original function, the AllocaSpillBlock is a block immediately
   // following the allocation of the frame object which defines GEPs for
   // all the allocas that have been moved into the frame, and it ends by
-  // branching to the original beginning of the coroutine.  Make this 
+  // branching to the original beginning of the coroutine.  Make this
   // the entry block of the cloned function.
   auto *Entry = cast(VMap[Shape.AllocaSpillBlock]);
   auto *OldEntry = &NewF->getEntryBlock();
@@ -1239,6 +1239,106 @@
   S.resize(N);
 }
 
+/// For every local variable that has lifetime intrinsics markers, we sink
+/// their lifetime.start marker to the places where the variable is being
+/// used for the first time. Doing so minimizes the lifetime of each variable,
+/// hence minimizing the amount of data we end up putting on the frame.
+static void sinkLifetimeStartMarkers(Function &F) {
+  DominatorTree Dom(F);
+  for (Instruction &I : instructions(F)) {
+// We look for this particular pattern:
+//   %tmpX = alloca %.., align ...
+//   %0 = bitcast %...* %tmpX to i8*
+//   call void @llvm.lifetime.start.p0i8(i64 ..., i8* nonnull %0) #2
+if (!isa(&I))
+  continue;
+BitCastInst *CastInst = nullptr;
+// There can be multiple lifetime start markers for the same variable.
+SmallPtrSet LifetimeStartInsts;
+// SinkBarriers stores all instructions that use this local variable.
+// When sinking the lifetime start intrinsics, we can never sink past
+// these barriers.
+SmallPtrSet SinkBarriers;
+bool Valid = true;
+auto addSinkBarrier = [&](Instruction *I) {
+  // When adding a new barrier to SinkBarriers, we maintain the case
+  // that no instruction in SinkBarriers dominates another instruction.
+  bool FoundDom = false;
+  SmallPtrSet ToRemove;
+  for (auto *S : SinkBarriers) {
+if (Dom.dominates(S, I)) {
+  FoundDom = true;
+  break;
+} else if (Dom.dominates(I, S)) {
+  ToRemove.insert(S);
+}
+  }
+  if (!FoundDom) {
+SinkBarriers.insert(I);
+for (auto *R : ToRemove) {
+  SinkBarriers.erase(R);
+}
+  }
+};
+for (User *U : I.users()) {
+  if (!isa(U))
+continue;
+  if (CastInst) {
+// If we have multiple cast instructions for the alloca, don't
+// deal with it beause it's too complex.
+Valid = false;
+break;
+  }
+  CastInst = cast(U);
+  for (User *CU : CastInst->users()) {
+// If we see any user of CastInst that's not lifetime start/end
+// intrinsics, give up because it's too complex.
+if (auto *CUI = dyn_cast(CU)) {
+  if (CUI->getIntrinsicID() == Intrinsic::lifetime_start)
+LifetimeStartInsts.insert(CUI);
+  else if (CUI->getIntrinsicID() == Intrinsic::lifetime_end)
+addSinkBarrier(CUI);
+  else
+Valid = false;
+} else {
+  Valid = false;
+}
+  }
+}
+if (!Valid || LifetimeStartInsts.empty())
+  continue;
+
+for (User *U : I.users()) {
+  if (U == CastInst)
+continue;
+  // Every user of the variable is also a sink barrier.
+  addSinkBarrier(cast(U));
+}
+
+// For each sink barrier, we insert a lifetime start marker right
+// before it.
+const auto *LifetimeStartInst = *LifetimeStartInsts.begin();
+for (auto *S : SinkBarriers) {
+  if (auto *IS = dyn_cast(S)) {
+if (IS->getIntrinsicID() == Intrinsic::lifetime_end) {
+  // If we have a lifetime end marker in SinkBarriers, meaning it's
+  // not dominated by any other users, we can safely delete it.
+  IS->eraseFromParent();
+  continue;
+}
+  }
+  LifetimeStartInst->clone()->insertBefore(S);
+}
+// All the old markers are no longer necessary.
+f

[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-23 Thread JunMa via Phabricator via cfe-commits
junparser added a comment.

In D82314#2109437 , @lxfind wrote:

> In D82314#2107910 , @junparser wrote:
>
> > Rather than doing it here, can we build await_resume call expression with 
> > MaterializedTemporaryExpr when expand the coawait expression. That's how 
> > gcc does.
>
>
> There doesn't appear to be a way to do that in Clang. It goes from the AST to 
> IR directly, and there needs to be a MaterializedTemporaryExpr to wrap the 
> result of co_await. Could you elaborate on how this might be done in Clang?


For now, we only wrap coawait expression with MaterializedTemporaryExpr when 
the kind of result is VK_RValue, We can wrap await_resume call instead in such 
case when build coawait expression. so in emitSuspendExpression, we can 
directly emit await_call expression with MaterializedTemporaryExpr.

I think this should work, although i'm not so sure.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-23 Thread JunMa via Phabricator via cfe-commits
junparser added a comment.

In D82314#2109893 , @rsmith wrote:

> In D82314#2109728 , @lxfind wrote:
>
> > @rsmith Thanks. That's a good point. Do you know if there already exists 
> > optimization passes in LLVM that attempts to shrink the range of lifetime 
> > intrinsics? If so, I am curious why that does not help in this case. Or is 
> > it generally unsafe to move the lifetime intrinsics, and we could only do 
> > it here with specific context knowledge about coroutines.
>
>
> I don't know for sure, but I would expect someone to have implemented such a 
> pass already. Moving a lifetime start intrinsic later, past instructions that 
> can't possibly reference the object in question, seems like it should always 
> be safe and (presumably) should always be a good thing to do, and similarly 
> for moving lifetime end markers earlier. It could be that such a pass exists 
> but it is run too late in the pass pipeline, so the coroutine split pass 
> doesn't get to take advantage of it.


@lxfind,  Also lifetime marker of variable are much complex because of the 
existing of exceptional path(multiple lifetime start & multiple lifetime end) , 
so it is hard to optimize such cases.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-23 Thread Richard Smith - zygoloid via Phabricator via cfe-commits
rsmith added a comment.

In D82314#2109728 , @lxfind wrote:

> @rsmith Thanks. That's a good point. Do you know if there already exists 
> optimization passes in LLVM that attempts to shrink the range of lifetime 
> intrinsics? If so, I am curious why that does not help in this case. Or is it 
> generally unsafe to move the lifetime intrinsics, and we could only do it 
> here with specific context knowledge about coroutines.


I don't know for sure, but I would expect someone to have implemented such a 
pass already. Moving a lifetime start intrinsic later, past instructions that 
can't possibly reference the object in question, seems like it should always be 
safe and (presumably) should always be a good thing to do, and similarly for 
moving lifetime end markers earlier. It could be that such a pass exists but it 
is run too late in the pass pipeline, so the coroutine split pass doesn't get 
to take advantage of it.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-23 Thread Xun Li via Phabricator via cfe-commits
lxfind added a comment.

@rsmith Thanks. That's a good point. Do you know if there already exists 
optimization passes in LLVM that attempts to shrink the range of lifetime 
intrinsics? If so, I am curious why that does not help in this case. Or is it 
generally unsafe to move the lifetime intrinsics, and we could only do it here 
with specific context knowledge about coroutines.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-23 Thread Richard Smith - zygoloid via Phabricator via cfe-commits
rsmith added a comment.

In D82314#2109437 , @lxfind wrote:

> In D82314#2107910 , @junparser wrote:
>
> > Rather than doing it here, can we build await_resume call expression with 
> > MaterializedTemporaryExpr when expand the coawait expression. That's how 
> > gcc does.
>
>
> There doesn't appear to be a way to do that in Clang. It goes from the AST to 
> IR directly, and there needs to be a MaterializedTemporaryExpr to wrap the 
> result of co_await. Could you elaborate on how this might be done in Clang?


For a call such as:

  coro f() { awaitable a; (co_await a).g(); }

we produce an AST like:

  |   |   `-CXXMemberCallExpr  'void'
  |   | `-MemberExpr  '' .g 
0x55d38948ca98
  |   |   `-MaterializeTemporaryExpr  'huge' xvalue
  |   | `-ParenExpr  'huge'
  |   |   `-CoawaitExpr  'huge'
  |   | |-DeclRefExpr  'awaitable' lvalue Var 
0x55d38948d4c8 'a' 'awaitable'
  |   | |-CXXMemberCallExpr  'bool'
  |   | | `-MemberExpr  '' 
.await_ready 0x55d38948cee8
  |   | |   `-OpaqueValueExpr  'awaitable' lvalue
  |   | | `-DeclRefExpr  'awaitable' lvalue Var 
0x55d38948d4c8 'a' 'awaitable'
  |   | |-CXXMemberCallExpr  'void'
  |   | | |-MemberExpr  '' 
.await_suspend 0x55d38948d298
  |   | | | `-OpaqueValueExpr  'awaitable' lvalue
  |   | | |   `-DeclRefExpr  'awaitable' lvalue Var 
0x55d38948d4c8 'a' 'awaitable'
  |   | | `-CXXConstructExpr  
'std::coroutine_handle<>':'std::experimental::coroutines_v1::coroutine_handle'
 'void (std::experimental::coroutines_v1::coroutine_handle &&) noexcept'
  |   | |   `-ImplicitCastExpr  
'std::experimental::coroutines_v1::coroutine_handle' xvalue 

  |   | | `-MaterializeTemporaryExpr  
'std::experimental::coroutines_v1::coroutine_handle' xvalue
  |   | |   `-CallExpr  
'std::experimental::coroutines_v1::coroutine_handle'
  |   | | |-ImplicitCastExpr  
'std::experimental::coroutines_v1::coroutine_handle 
(*)(void *) noexcept' 
  |   | | | `-DeclRefExpr  
'std::experimental::coroutines_v1::coroutine_handle (void 
*) noexcept' lvalue CXXMethod 0x55d38948adb0 'from_address' 
'std::experimental::coroutines_v1::coroutine_handle (void 
*) noexcept'
  |   | | `-CallExpr  'void *'
  |   | |   `-ImplicitCastExpr  'void *(*)() 
noexcept' 
  |   | | `-DeclRefExpr  'void *() noexcept' 
lvalue Function 0x55d38948ef38 '__builtin_coro_frame' 'void *() noexcept'
  |   | `-CXXBindTemporaryExpr  'huge' (CXXTemporary 
0x55d38948ffb8)
  |   |   `-CXXMemberCallExpr  'huge'
  |   | `-MemberExpr  '' 
.await_resume 0x55d38948cff8
  |   |   `-OpaqueValueExpr  'awaitable' lvalue
  |   | `-DeclRefExpr  'awaitable' lvalue Var 
0x55d38948d4c8 'a' 'awaitable'

See https://godbolt.org/z/hVn2u9.

I think the suggestion is to move the `MaterializeTemporaryExpr` from being 
wrapped around the  `CoawaitExpr` to being wrapped around the `.await_resume` 
call within it.

Unfortunately, that's not a correct change. The language rules require us to 
delay materializing the temporary until we see how it is used. For example, in 
a case such as

  g(co_await a);

... no temporary is materialized at all, and the `await_resume` call instead 
directly initializes the parameter slot for the function.



It seems to me that the same issue (of making large objects unnecessarily live 
across suspend points) can arise for other similar cases too. For example, 
consider:

  huge(co_await a).g(); // suppose `co_await a` returns something small

Here again we will start the lifetime of the `huge` temporary before the 
suspend point, and only initialize it after the resume. Your change won't help 
here, because it's not the lifetime markers of the value produced by `co_await` 
that are the problem.

As a result of the above, I think this is the wrong level at which to perform 
this optimization. Instead, I think you should consider whether we can move 
lifetime start markers later (and end markers earlier, for unescaped locals) as 
part of the coroutine splitting pass.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-23 Thread Xun Li via Phabricator via cfe-commits
lxfind added a comment.

In D82314#2107910 , @junparser wrote:

> Rather than doing it here, can we build await_resume call expression with 
> MaterializedTemporaryExpr when expand the coawait expression. That's how gcc 
> does.


There doesn't appear to be a way to do that in Clang. It goes from the AST to 
IR directly, and there needs to be a MaterializedTemporaryExpr to wrap the 
result of co_await. Could you elaborate on how this might be done in Clang?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-22 Thread JunMa via Phabricator via cfe-commits
junparser added a comment.

Rather than doing it here, can we build await_resume call expression with 
MaterializedTemporaryExpr when expand the coawait expression. That's how gcc 
does.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D82314/new/

https://reviews.llvm.org/D82314



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D82314: [RFC][Coroutines] Optimize the lifespan of temporary co_await object

2020-06-22 Thread Xun Li via Phabricator via cfe-commits
lxfind created this revision.
lxfind added reviewers: lewissbaker, modocache, junparser.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

If we ever assign co_await to a temporary variable, such as foo(co_await expr),
we generate AST that looks like this: 
MaterializedTemporaryExpr(CoawaitExpr(...)).
MaterializedTemporaryExpr would emit an intrinsics that marks the lifetime 
start of the
temporary storage. However such temporary storage will not be used until 
co_await is ready
to write the result. Marking the lifetime start way too early causes extra 
storage to be
put in the coroutine frame instead of the stack.
As you can see from https://godbolt.org/z/XsLUiY, the frame generated for 
get_big_object2 is 12K, which contains a big_object object unnecessarily.
After this patch, the frame size for get_big_object2 is now only 8K. There are 
still room for improvements, in particular, GCC has a 4K frame for this 
function. But that's a separate problem and not addressed in this patch.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D82314

Files:
  clang/lib/CodeGen/CGCoroutine.cpp


Index: clang/lib/CodeGen/CGCoroutine.cpp
===
--- clang/lib/CodeGen/CGCoroutine.cpp
+++ clang/lib/CodeGen/CGCoroutine.cpp
@@ -245,10 +245,39 @@
   }
 
   LValueOrRValue Res;
-  if (forLValue)
+  if (forLValue) {
 Res.LV = CGF.EmitLValue(S.getResumeExpr());
-  else
+  } else {
+// If the result of co_await is stored into a memory address, we want to
+// make sure the lifetime of that memory address does not start too early,
+// causing more space to be allocated in the frame rather than on stack.
+// The lifetime of the return value of co_await should start here right
+// before we attempt to assign it.
+auto adjustLifetimeStart = [&]() {
+  // aggSlot points to the instruction that allocated the object. Latter
+  // instructions use it in this pattern:
+  //   %tmpX = alloca %.., align 4
+  //   %0 = bitcast %...* %tmpX to i8*
+  //   call void @llvm.lifetime.start.p0i8(i64 ..., i8* nonnull %0) #2
+  // Hence we trace back through uses to eventually locate the lifetime
+  // start intrinsics marker, and move it down to the current insertion
+  // point.
+  auto *AllocaInst =
+  dyn_cast_or_null(aggSlot.getPointer());
+  if (!AllocaInst || AllocaInst->getNumUses() != 1)
+return;
+  auto *CastInst = dyn_cast(*AllocaInst->users().begin());
+  if (!CastInst || CastInst->getNumUses() != 1)
+return;
+  if (auto *LifetimeInst =
+  dyn_cast(*CastInst->users().begin())) {
+LifetimeInst->removeFromParent();
+CGF.Builder.Insert(LifetimeInst);
+  }
+};
+adjustLifetimeStart();
 Res.RV = CGF.EmitAnyExpr(S.getResumeExpr(), aggSlot, ignoreResult);
+  }
 
   if (TryStmt) {
 Builder.CreateFlagStore(false, Coro.ResumeEHVar);


Index: clang/lib/CodeGen/CGCoroutine.cpp
===
--- clang/lib/CodeGen/CGCoroutine.cpp
+++ clang/lib/CodeGen/CGCoroutine.cpp
@@ -245,10 +245,39 @@
   }
 
   LValueOrRValue Res;
-  if (forLValue)
+  if (forLValue) {
 Res.LV = CGF.EmitLValue(S.getResumeExpr());
-  else
+  } else {
+// If the result of co_await is stored into a memory address, we want to
+// make sure the lifetime of that memory address does not start too early,
+// causing more space to be allocated in the frame rather than on stack.
+// The lifetime of the return value of co_await should start here right
+// before we attempt to assign it.
+auto adjustLifetimeStart = [&]() {
+  // aggSlot points to the instruction that allocated the object. Latter
+  // instructions use it in this pattern:
+  //   %tmpX = alloca %.., align 4
+  //   %0 = bitcast %...* %tmpX to i8*
+  //   call void @llvm.lifetime.start.p0i8(i64 ..., i8* nonnull %0) #2
+  // Hence we trace back through uses to eventually locate the lifetime
+  // start intrinsics marker, and move it down to the current insertion
+  // point.
+  auto *AllocaInst =
+  dyn_cast_or_null(aggSlot.getPointer());
+  if (!AllocaInst || AllocaInst->getNumUses() != 1)
+return;
+  auto *CastInst = dyn_cast(*AllocaInst->users().begin());
+  if (!CastInst || CastInst->getNumUses() != 1)
+return;
+  if (auto *LifetimeInst =
+  dyn_cast(*CastInst->users().begin())) {
+LifetimeInst->removeFromParent();
+CGF.Builder.Insert(LifetimeInst);
+  }
+};
+adjustLifetimeStart();
 Res.RV = CGF.EmitAnyExpr(S.getResumeExpr(), aggSlot, ignoreResult);
+  }
 
   if (TryStmt) {
 Builder.CreateFlagStore(false, Coro.ResumeEHVar);
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits