[PATCH] D129476: [AArch64][SVE] Prefer SIMD variant of clast[ab]

2022-07-13 Thread Cullen Rhodes via Phabricator via cfe-commits
c-rhodes closed this revision.
c-rhodes marked an inline comment as done.
c-rhodes added a comment.

In D129476#3645667, @efriedma wrote:

> Please update the comment, then LGTM

Done, cheers Eli.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D129476/new/

https://reviews.llvm.org/D129476


[PATCH] D129476: [AArch64][SVE] Prefer SIMD variant of clast[ab]

2022-07-12 Thread Eli Friedman via Phabricator via cfe-commits
efriedma accepted this revision.
efriedma added a comment.
This revision is now accepted and ready to land.

Please update the comment, then LGTM




Comment at: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp:801
+                                                      IntrinsicInst &II) {
+  // Replace scalar integer CLAST[AB] intrinsic with optimal SIMD variant.
+  IRBuilder<> Builder(II.getContext());

Maybe put a bit of the explanation you just gave into a comment here, for 
reference.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D129476/new/

https://reviews.llvm.org/D129476


[PATCH] D129476: [AArch64][SVE] Prefer SIMD variant of clast[ab]

2022-07-12 Thread Cullen Rhodes via Phabricator via cfe-commits
c-rhodes added a comment.

In D129476#3643382, @efriedma wrote:

> Are you saying that it's faster to use clasta targeting a float register, 
> then move the result to an integer register, rather than use the integer form 
> directly?  Or is the issue just that we want to split the operations in case 
> we can simplify the resulting bitcast?

The former, for the most part. If clast[ab] is inside a loop and forms a 
loop-carried dependency, the SIMD variant is considerably faster. A straight 
bitcast-to-fp + clast[ab] + bitcast-to-int sequence costs a cycle or two more, 
depending on the micro-architecture, but in slightly more complicated code 
I've observed the SIMD variant with bitcasts to be as fast as the integer 
variant, if not faster.
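
To make the loop-carried case concrete, here is a minimal sketch (not from the 
patch; the function name, data, and build flags are assumptions) of an ACLE 
loop where clast[ab] feeds its own fallback operand on the next iteration:

  #include <arm_sve.h>

  // Hypothetical reduction, built with e.g. -O2 -march=armv8-a+sve: keep the
  // last element below 'limit' seen so far. 'last' is both the fallback
  // operand and the result of svclastb, so the clast[ab] latency sits on the
  // loop-carried critical path.
  int32_t last_below_limit(const int32_t *data, int64_t n, int32_t limit) {
    int32_t last = limit;
    for (int64_t i = 0; i < n; i += svcntw()) {
      svbool_t pg = svwhilelt_b32(i, n);
      svint32_t v = svld1_s32(pg, &data[i]);
      svbool_t below = svcmplt_n_s32(pg, v, limit);
      last = svclastb_n_s32(below, last, v); // loop-carried dependency
    }
    return last;
  }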

> Do we expect SelectionDAG to combine a clasta+bitcast to a single 
> instruction?  Do we have test coverage for that?

No, there'll be a mov to an integer register.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D129476/new/

https://reviews.llvm.org/D129476


[PATCH] D129476: [AArch64][SVE] Prefer SIMD variant of clast[ab]

2022-07-11 Thread Eli Friedman via Phabricator via cfe-commits
efriedma added a comment.

Are you saying that it's faster to use clasta targeting a float register, then 
move the result to an integer register, rather than use the integer form 
directly?  Or is the issue just that we want to split the operations in case we 
can simplify the resulting bitcast?

Do we expect SelectionDAG to combine a clasta+bitcast to a single instruction?  
Do we have test coverage for that?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D129476/new/

https://reviews.llvm.org/D129476


[PATCH] D129476: [AArch64][SVE] Prefer SIMD variant of clast[ab]

2022-07-11 Thread Cullen Rhodes via Phabricator via cfe-commits
c-rhodes updated this revision to Diff 443594.
c-rhodes added a comment.

Add full patch context.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D129476/new/

https://reviews.llvm.org/D129476

Files:
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
  llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll

Index: llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll
===
--- /dev/null
+++ llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+target triple = "aarch64"
+
+define i16 @clastb_n_i16(<vscale x 8 x i1> %pg, i16 %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: @clastb_n_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[A:%.*]] to half
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 8 x i16> [[B:%.*]] to <vscale x 8 x half>
+; CHECK-NEXT:    [[TMP3:%.*]] = call half @llvm.aarch64.sve.clastb.n.nxv8f16(<vscale x 8 x i1> [[PG:%.*]], half [[TMP1]], <vscale x 8 x half> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast half [[TMP3]] to i16
+; CHECK-NEXT:    ret i16 [[TMP4]]
+;
+  %out = call i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1> %pg, i16 %a, <vscale x 8 x i16> %b)
+  ret i16 %out
+}
+
+define i32 @clastb_n_i32(<vscale x 4 x i1> %pg, i32 %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: @clastb_n_i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[A:%.*]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[B:%.*]] to <vscale x 4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.aarch64.sve.clastb.n.nxv4f32(<vscale x 4 x i1> [[PG:%.*]], float [[TMP1]], <vscale x 4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %out = call i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1> %pg, i32 %a, <vscale x 4 x i32> %b)
+  ret i32 %out
+}
+
+define i64 @clastb_n_i64(<vscale x 2 x i1> %pg, i64 %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: @clastb_n_i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[A:%.*]] to double
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[B:%.*]] to <vscale x 2 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = call double @llvm.aarch64.sve.clastb.n.nxv2f64(<vscale x 2 x i1> [[PG:%.*]], double [[TMP1]], <vscale x 2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[TMP3]] to i64
+; CHECK-NEXT:    ret i64 [[TMP4]]
+;
+  %out = call i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1> %pg, i64 %a, <vscale x 2 x i64> %b)
+  ret i64 %out
+}
+
+declare i16 @llvm.aarch64.sve.clastb.n.nxv8i16(<vscale x 8 x i1>, i16, <vscale x 8 x i16>)
+declare i32 @llvm.aarch64.sve.clastb.n.nxv4i32(<vscale x 4 x i1>, i32, <vscale x 4 x i32>)
+declare i64 @llvm.aarch64.sve.clastb.n.nxv2i64(<vscale x 2 x i1>, i64, <vscale x 2 x i64>)
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -796,6 +796,44 @@
   return IC.replaceInstUsesWith(II, Extract);
 }
 
+static Optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
+                                                      IntrinsicInst &II) {
+  // Replace scalar integer CLAST[AB] intrinsic with optimal SIMD variant.
+  IRBuilder<> Builder(II.getContext());
+  Builder.SetInsertPoint(&II);
+  Value *Pg = II.getArgOperand(0);
+  Value *Fallback = II.getArgOperand(1);
+  Value *Vec = II.getArgOperand(2);
+  Type *Ty = II.getType();
+
+  if (!Ty->isIntegerTy())
+    return None;
+
+  Type *FPTy;
+  switch (cast<IntegerType>(Ty)->getBitWidth()) {
+  default:
+    return None;
+  case 16:
+    FPTy = Builder.getHalfTy();
+    break;
+  case 32:
+    FPTy = Builder.getFloatTy();
+    break;
+  case 64:
+    FPTy = Builder.getDoubleTy();
+    break;
+  }
+
+  Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
+  auto *FPVTy = VectorType::get(
+      FPTy, cast<VectorType>(Vec->getType())->getElementCount());
+  Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
+  auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
+                                       {Pg, FPFallBack, FPVec});
+  Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
+  return IC.replaceInstUsesWith(II, FPIItoInt);
+}
+
 static Optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
                                                 IntrinsicInst &II) {
   LLVMContext &Ctx = II.getContext();
@@ -1294,6 +1332,9 @@
   case Intrinsic::aarch64_sve_lasta:
   case Intrinsic::aarch64_sve_lastb:
     return instCombineSVELast(IC, II);
+  case Intrinsic::aarch64_sve_clasta_n:
+  case Intrinsic::aarch64_sve_clastb_n:
+    return instCombineSVECondLast(IC, II);
   case Intrinsic::aarch64_sve_cntd:
     return instCombineSVECntElts(IC, II, 2);
   case Intrinsic::aarch64_sve_cntw:
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
===
--- clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
@@ -215,14 +215,20 @@
 // CHECK-LABEL: @test_svclastb_n_s16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:   

[PATCH] D129476: [AArch64][SVE] Prefer SIMD variant of clast[ab]

2022-07-11 Thread Cullen Rhodes via Phabricator via cfe-commits
c-rhodes created this revision.
c-rhodes added reviewers: paulwalker-arm, bsmith, peterwaller-arm, DavidTruby.
Herald added subscribers: psnobl, hiraditya, kristof.beyls, tschuett.
Herald added a reviewer: efriedma.
Herald added a project: All.
c-rhodes requested review of this revision.
Herald added projects: clang, LLVM.
Herald added a subscriber: cfe-commits.

The scalar variant with GPR source/dest has considerably higher latency
than the SIMD scalar variant across a variety of micro-architectures:

  Core          Scalar   SIMD
  -----------   ------   -----
  Neoverse V1   9 cyc    3 cyc
  Neoverse N2   8 cyc    3 cyc
  Cortex A510   8 cyc    4 cyc
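
For reference, a hypothetical source-level example of an affected call (the 
function name is illustrative, not from the patch); the result is a scalar 
integer, which is what currently selects the GPR form:

  #include <arm_sve.h>

  // Hypothetical example: the scalar integer result previously forced the
  // higher-latency GPR form of CLASTA; with this change the SIMD-scalar form
  // plus a move to a general-purpose register is expected instead.
  int32_t pick_last_active(svbool_t pg, int32_t fallback, svint32_t v) {
    return svclasta_n_s32(pg, fallback, v);
  }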


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D129476

Files:
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clasta.c
  clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_clastb.c
  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
  llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-clast.ll
