github-actions[bot] wrote:
<!--LLVM CODE FORMAT COMMENT: {clang-format}-->
:warning: C/C++ code formatter, clang-format found issues in your code.
:warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
git-clang-format --diff HEAD~1 HEAD --extensions cpp -- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp clang/lib/Sema/SemaAMDGPU.cpp clang/lib/Sema/SemaChecking.cpp
``````````
</details>
<details>
<summary>
View the diff from clang-format here.
</summary>
``````````diff
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 8b7e419a1..fca6fbbf5 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -165,20 +165,22 @@ Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
// Assumptions:
// - Return type equals source type (frontend/Sema should enforce).
// - Semantics are on the object representation (raw bits), including padding.
-// - For payloads > 32 bits, split into 32-bit words, permute each with the same index,
+// - For payloads > 32 bits, split into 32-bit words, permute each with the same
+//   index,
// and reassemble.
-// - First-class scalar/vector values whose total size is a multiple of 32 bits use a
-//   register-only path by bitcasting to <N x i32>. Aggregates or odd sizes use a
-//   memory-backed path.
-// - <= 32-bit scalars (char/short/int/float/half) follow a fast i32 path for performance.
-llvm::Value *
-emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
- const clang::CallExpr *Call) {
- auto &B = CGF.Builder;
+// - First-class scalar/vector values whose total size is a multiple of 32 bits
+// use a
+// register-only path by bitcasting to <N x i32>. Aggregates or odd sizes use
+// a memory-backed path.
+// - <= 32-bit scalars (char/short/int/float/half) follow a fast i32 path for
+// performance.
+llvm::Value *emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
+ const clang::CallExpr *Call) {
+ auto &B = CGF.Builder;
auto &CGM = CGF.CGM;
const llvm::DataLayout &DL = CGM.getDataLayout();
- llvm::Type *I8 = B.getInt8Ty();
+ llvm::Type *I8 = B.getInt8Ty();
llvm::Type *I32 = B.getInt32Ty();
llvm::Type *I64 = B.getInt64Ty();
@@ -194,24 +196,29 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
// - Integers: zext/trunc to i32.
// - Pointers: ptrtoint to intptr, then zext/trunc to i32.
// - Other first-class: bitcast to intN then zext/trunc to i32.
-  auto toI32Index = [&](llvm::Value *IdxVal, clang::QualType IdxQT) -> llvm::Value * {
+ auto toI32Index = [&](llvm::Value *IdxVal,
+ clang::QualType IdxQT) -> llvm::Value * {
(void)IdxQT; // signedness not relevant for index
llvm::Type *Ty = IdxVal->getType();
if (Ty->isIntegerTy())
return B.CreateZExtOrTrunc(IdxVal, I32);
if (Ty->isPointerTy()) {
unsigned PtrBits = DL.getPointerSizeInBits(Ty->getPointerAddressSpace());
-      return B.CreateZExtOrTrunc(B.CreatePtrToInt(IdxVal, B.getIntNTy(PtrBits)), I32);
+      return B.CreateZExtOrTrunc(B.CreatePtrToInt(IdxVal, B.getIntNTy(PtrBits)),
+                                 I32);
}
unsigned Bits = getBitWidth(Ty);
return B.CreateZExtOrTrunc(B.CreateBitCast(IdxVal, B.getIntNTy(Bits)),
I32);
};
   // Coerces an arbitrary <= 32-bit scalar payload to i32.
-  // - Integers: extend to i32 honoring signedness if narrower; zext/trunc otherwise.
+ // - Integers: extend to i32 honoring signedness if narrower; zext/trunc
+ // otherwise.
// - Pointers: ptrtoint to intptr, then zext/trunc to i32.
-  // - Other first-class scalars (e.g., float, half): bitcast to intN then zext/trunc to i32.
-  auto coercePayloadToI32 = [&](llvm::Value *Val, clang::QualType SrcQT) -> llvm::Value * {
+ // - Other first-class scalars (e.g., float, half): bitcast to intN then
+ // zext/trunc to i32.
+ auto coercePayloadToI32 = [&](llvm::Value *Val,
+ clang::QualType SrcQT) -> llvm::Value * {
llvm::Type *Ty = Val->getType();
if (Ty->isIntegerTy()) {
unsigned BW = Ty->getIntegerBitWidth();
@@ -224,20 +231,22 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
}
if (Ty->isPointerTy()) {
unsigned PtrBits = DL.getPointerSizeInBits(Ty->getPointerAddressSpace());
-      return B.CreateZExtOrTrunc(B.CreatePtrToInt(Val, B.getIntNTy(PtrBits)), I32);
+ return B.CreateZExtOrTrunc(B.CreatePtrToInt(Val, B.getIntNTy(PtrBits)),
+ I32);
}
unsigned Bits = getBitWidth(Ty);
return B.CreateZExtOrTrunc(B.CreateBitCast(Val, B.getIntNTy(Bits)), I32);
};
   // Converts an i32 result back to an arbitrary <= 32-bit destination type.
-  // - Integer <= 32 bits: zext/sext/trunc appropriately using source signedness for narrow types.
+  // - Integer <= 32 bits: zext/sext/trunc appropriately using source signedness
+  //   for narrow types.
   // - Pointer <= 32 bits: zext/trunc to pointer width and inttoptr.
// - Other first-class types:
// - If 32 bits: bitcast i32 to destination type.
-  //     - If narrower than 32 bits (e.g., half = 16): first trunc i32 to iN, then bitcast iN to DstTy.
- auto coerceFromI32ToType = [&](llvm::Value *I32Val,
- llvm::Type *DstTy,
+ // - If narrower than 32 bits (e.g., half = 16): first trunc i32 to iN,
then
+ // bitcast iN to DstTy.
+ auto coerceFromI32ToType = [&](llvm::Value *I32Val, llvm::Type *DstTy,
clang::QualType SrcQT) -> llvm::Value * {
if (DstTy->isIntegerTy()) {
unsigned DW = DstTy->getIntegerBitWidth();
@@ -265,12 +274,14 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
if (BW < 32)
Tr = B.CreateTrunc(I32Val, IntBW);
else if (BW > 32)
-        Tr = B.CreateZExt(I32Val, IntBW); // should not happen in the fast 32-bit path
+ Tr = B.CreateZExt(I32Val,
+ IntBW); // should not happen in the fast 32-bit path
return B.CreateBitCast(Tr, DstTy);
};
// Returns {wordCount, tailBytes} for a payload size in bits.
-  auto wordCountAndTail = [&](unsigned totalBits) -> std::pair<unsigned, unsigned> {
+ auto wordCountAndTail =
+ [&](unsigned totalBits) -> std::pair<unsigned, unsigned> {
unsigned bytes = totalBits / 8;
return {bytes / 4, bytes % 4};
};
@@ -297,14 +308,16 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
if (totalBits <= 32) {
llvm::Value *SrcI32 = coercePayloadToI32(SrcVal, SrcQT);
llvm::SmallVector<llvm::Value *, 2> ArgsA{IndexI32, SrcI32};
-      llvm::Value *ResI32 = B.CreateCall(Bperm->getFunctionType(), Bperm, ArgsA);
- llvm::Value *Res = coerceFromI32ToType(ResI32, RetTy, SrcQT);
+ llvm::Value *ResI32 =
+ B.CreateCall(Bperm->getFunctionType(), Bperm, ArgsA);
+ llvm::Value *Res = coerceFromI32ToType(ResI32, RetTy, SrcQT);
return Res;
}
}
-  // Fast path B: First-class scalar/vector whose total size is a multiple of 32 bits.
-  // Bitcast to <N x i32>, permute each lane, bitcast back. Register-only; no memory.
+  // Fast path B: First-class scalar/vector whose total size is a multiple of 32
+  // bits. Bitcast to <N x i32>, permute each lane, bitcast back. Register-only;
+  // no memory.
if (!IsAggregate) {
llvm::Value *SrcVal = CGF.EmitScalarExpr(Call->getArg(1));
unsigned totalBits = getBitWidth(SrcVal->getType());
@@ -315,7 +328,8 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
// Handle pointers by going through intptr first
llvm::Value *AsIntN = SrcVal;
if (SrcVal->getType()->isPointerTy()) {
-      unsigned PW = DL.getPointerSizeInBits(SrcVal->getType()->getPointerAddressSpace());
+ unsigned PW = DL.getPointerSizeInBits(
+ SrcVal->getType()->getPointerAddressSpace());
AsIntN = B.CreatePtrToInt(SrcVal, B.getIntNTy(PW));
}
@@ -324,7 +338,8 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
llvm::Value *ResVec = llvm::UndefValue::get(I32VecTy);
for (unsigned i = 0; i < words; ++i) {
llvm::Value *Lane = B.CreateExtractElement(AsI32Vec, c32(i));
-      llvm::Value *Perm = B.CreateCall(Bperm->getFunctionType(), Bperm, {IndexI32, Lane});
+ llvm::Value *Perm =
+ B.CreateCall(Bperm->getFunctionType(), Bperm, {IndexI32, Lane});
ResVec = B.CreateInsertElement(ResVec, Perm, c32(i));
}
@@ -339,16 +354,20 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
// General aggregate/odd-size path:
// - Works for structs/arrays/complex and any total size.
-  //   - Materialize source to a temp, process 4-byte words (unaligned loads/stores),
-  //     handle tail bytes by packing/unpacking into an i32, and return loaded Value*.
+ // - Materialize source to a temp, process 4-byte words (unaligned
+ // loads/stores),
+ // handle tail bytes by packing/unpacking into an i32, and return loaded
+ // Value*.
auto emitAggregatePath = [&]() -> llvm::Value * {
clang::QualType SrcQTLocal = Call->getArg(1)->getType();
llvm::Type *SrcTy = CGF.ConvertType(SrcQTLocal);
-    clang::CodeGen::Address SrcAddr = CGF.CreateMemTemp(SrcQTLocal, "dsbperm.src");
-    clang::CodeGen::Address DstAddr = CGF.CreateMemTemp(RetQT, "dsbperm.dst");
+ clang::CodeGen::Address SrcAddr =
+ CGF.CreateMemTemp(SrcQTLocal, "dsbperm.src");
+ clang::CodeGen::Address DstAddr = CGF.CreateMemTemp(RetQT, "dsbperm.dst");
-    CGF.EmitAnyExprToMem(Call->getArg(1), SrcAddr, SrcQTLocal.getQualifiers(), /*IsInit*/true);
+ CGF.EmitAnyExprToMem(Call->getArg(1), SrcAddr, SrcQTLocal.getQualifiers(),
+ /*IsInit*/ true);
// i8 views of the buffers (as Address).
clang::CodeGen::Address SrcI8Addr = SrcAddr.withElementType(I8);
@@ -357,8 +376,8 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
auto CU = [&](uint64_t N) { return clang::CharUnits::fromQuantity(N); };
uint64_t sizeBytes = DL.getTypeAllocSize(SrcTy);
- uint64_t words = sizeBytes / 4;
- uint64_t tail = sizeBytes % 4;
+ uint64_t words = sizeBytes / 4;
+ uint64_t tail = sizeBytes % 4;
for (uint64_t i = 0; i < words; ++i) {
uint64_t off = i * 4;
@@ -377,7 +396,8 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
auto *Ld = B.CreateLoad(SrcWordI32Addr);
llvm::SmallVector<llvm::Value *, 2> ArgsWord{IndexI32, Ld};
-      llvm::Value *Perm = B.CreateCall(Bperm->getFunctionType(), Bperm, ArgsWord);
+ llvm::Value *Perm =
+ B.CreateCall(Bperm->getFunctionType(), Bperm, ArgsWord);
(void)B.CreateStore(Perm, DstWordI32Addr);
}
@@ -398,7 +418,8 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
}
llvm::SmallVector<llvm::Value *, 2> ArgsTail{IndexI32, Pack};
-    llvm::Value *Perm = B.CreateCall(Bperm->getFunctionType(), Bperm, ArgsTail);
+ llvm::Value *Perm =
+ B.CreateCall(Bperm->getFunctionType(), Bperm, ArgsTail);
for (uint64_t b = 0; b < tail; ++b) {
llvm::Value *Byte = B.CreateTrunc(B.CreateLShr(Perm, c32(8 * b)), I8);
@@ -408,10 +429,12 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
}
}
-  // Load the final result from the destination temporary and return it as a Value*.
+ // Load the final result from the destination temporary and return it as a
+ // Value*.
llvm::Value *Res = B.CreateLoad(DstAddr);
-  // For aggregates (struct/array/union), ensure determinism by freezing the value.
-  // freeze turns any undef/poison in padding into a fixed but arbitrary value.
+ // For aggregates (struct/array/union), ensure determinism by freezing the
+ // value. freeze turns any undef/poison in padding into a fixed but
+ // arbitrary value.
if (Res->getType()->isAggregateType())
Res = B.CreateFreeze(Res);
return Res;
@@ -420,8 +443,6 @@ emitAMDGCNDsBpermute(clang::CodeGen::CodeGenFunction &CGF,
return emitAggregatePath();
}
-
-
} // namespace
// Generates the IR for __builtin_read_exec_*.
``````````
</details>
https://github.com/llvm/llvm-project/pull/153501
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits