https://github.com/yonghong-song updated 
https://github.com/llvm/llvm-project/pull/206876

>From 5e59d1f039ce1ca4d5df9f0ce20e22129c8a8a4c Mon Sep 17 00:00:00 2001
From: Yonghong Song <[email protected]>
Date: Tue, 30 Jun 2026 17:03:39 -0700
Subject: [PATCH] [BPF] Return small aggregates directly in registers

Previously the BPF ABI always returned aggregate (struct/union) types
indirectly through an sret pointer, regardless of size. This is
inconsistent with how classifyArgumentType() already passes small
aggregates: arguments up to 128 bits are coerced into one or two 64-bit
registers, while only larger aggregates use an indirect reference.

Make classifyReturnType() mirror that convention by factoring the shared
aggregate handling into a classifyAggregateType() helper used by both:

  - empty aggregates (0 bits) are ignored;
  - aggregates up to 64 bits are returned directly, coerced to an
    integer of the padded size;
  - aggregates of 65..128 bits are returned directly as [2 x i64];
  - aggregates larger than 128 bits are returned indirectly via sret.

This keeps each returned value within the backend's two-register return
convention and avoids an unnecessary memory round-trip for small structs.

This also aligns BPF with the general-purpose C ABIs of other targets:
both x86-64 (System V, RAX:RDX) and AArch64 (AAPCS64, X0:X1) return
aggregates up to 16 bytes in a pair of registers and only fall back to an
indirect sret pointer for larger ones.
---
 clang/lib/CodeGen/Targets/BPF.cpp           |  50 +++---
 clang/test/CodeGen/bpf-struct-return-regs.c |  73 ++++++++
 clang/test/CodeGen/bpf-struct-return.c      |  70 ++++++++
 llvm/test/CodeGen/BPF/aggr_ret_regs.ll      | 189 ++++++++++++++++++++
 4 files changed, 359 insertions(+), 23 deletions(-)
 create mode 100644 clang/test/CodeGen/bpf-struct-return-regs.c
 create mode 100644 clang/test/CodeGen/bpf-struct-return.c
 create mode 100644 llvm/test/CodeGen/BPF/aggr_ret_regs.ll

diff --git a/clang/lib/CodeGen/Targets/BPF.cpp b/clang/lib/CodeGen/Targets/BPF.cpp
index 3a7af346f1132..d3318d76703f2 100644
--- a/clang/lib/CodeGen/Targets/BPF.cpp
+++ b/clang/lib/CodeGen/Targets/BPF.cpp
@@ -22,30 +22,35 @@ class BPFABIInfo : public DefaultABIInfo {
 public:
   BPFABIInfo(CodeGenTypes &CGT) : DefaultABIInfo(CGT) {}
 
+  // Classify an aggregate (struct/union) used as an argument or a return
+  // value. Aggregates that fit in 1 or 2 registers are passed/returned
+  // directly, coerced to an integer or a pair of 64-bit integers; larger
+  // ones use an indirect reference.
+  ABIArgInfo classifyAggregateType(QualType Ty) const {
+    uint64_t Bits = getContext().getTypeSize(Ty);
+    if (Bits == 0)
+      return ABIArgInfo::getIgnore();
+
+    // Larger aggregates use an indirect reference.
+    if (Bits > 128)
+      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
+
+    // If the aggregate needs 1 or 2 registers, do not use reference.
+    llvm::Type *CoerceTy;
+    if (Bits <= 64) {
+      CoerceTy = llvm::IntegerType::get(getVMContext(), llvm::alignTo(Bits, 8));
+    } else {
+      llvm::Type *RegTy = llvm::IntegerType::get(getVMContext(), 64);
+      CoerceTy = llvm::ArrayType::get(RegTy, 2);
+    }
+    return ABIArgInfo::getDirect(CoerceTy);
+  }
+
   ABIArgInfo classifyArgumentType(QualType Ty) const {
     Ty = useFirstFieldIfTransparentUnion(Ty);
 
-    if (isAggregateTypeForABI(Ty)) {
-      uint64_t Bits = getContext().getTypeSize(Ty);
-      if (Bits == 0)
-        return ABIArgInfo::getIgnore();
-
-      // If the aggregate needs 1 or 2 registers, do not use reference.
-      if (Bits <= 128) {
-        llvm::Type *CoerceTy;
-        if (Bits <= 64) {
-          CoerceTy =
-              llvm::IntegerType::get(getVMContext(), llvm::alignTo(Bits, 8));
-        } else {
-          llvm::Type *RegTy = llvm::IntegerType::get(getVMContext(), 64);
-          CoerceTy = llvm::ArrayType::get(RegTy, 2);
-        }
-        return ABIArgInfo::getDirect(CoerceTy);
-      } else {
-        return getNaturalAlignIndirect(Ty,
-                                       getDataLayout().getAllocaAddrSpace());
-      }
-    }
+    if (isAggregateTypeForABI(Ty))
+      return classifyAggregateType(Ty);
 
     if (const auto *ED = Ty->getAsEnumDecl())
       Ty = ED->getIntegerType();
@@ -65,8 +70,7 @@ class BPFABIInfo : public DefaultABIInfo {
       return ABIArgInfo::getIgnore();
 
     if (isAggregateTypeForABI(RetTy))
-      return getNaturalAlignIndirect(RetTy,
-                                     getDataLayout().getAllocaAddrSpace());
+      return classifyAggregateType(RetTy);
 
     // Treat an enum type as its underlying type.
     if (const auto *ED = RetTy->getAsEnumDecl())
diff --git a/clang/test/CodeGen/bpf-struct-return-regs.c b/clang/test/CodeGen/bpf-struct-return-regs.c
new file mode 100644
index 0000000000000..b8891613b053b
--- /dev/null
+++ b/clang/test/CodeGen/bpf-struct-return-regs.c
@@ -0,0 +1,73 @@
+// REQUIRES: bpf-registered-target
+// RUN: %clang_cc1 -triple bpf -O2 -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+//
+// Aggregates up to 16 bytes are returned directly in registers: coerced to an
+// integer when they fit in one register (<= 8 bytes), or to [2 x i64] when
+// they need two (9..16 bytes).
+
+struct foo1 {int a;};                             // 4 bytes  -> one register
+struct foo2 {int a; long b;};                     // 16 bytes -> two registers
+struct foo3 {int a; int b; long c;};              // 16 bytes -> two registers
+struct foo4 {int a; int b:20; int c:20; int d:24;}; // 16 bytes -> two registers
+
+#define __noinline __attribute__((noinline))
+
+__noinline struct foo1 bar1(int a) {
+// CHECK-LABEL: define dso_local i32 @bar1(
+// CHECK: ret i32
+  struct foo1 v = {a};
+  return v;
+}
+
+__noinline struct foo2 bar2(int a, int b) {
+// CHECK-LABEL: define dso_local [2 x i64] @bar2(
+// CHECK: ret [2 x i64]
+  struct foo2 v = {a, b};
+  return v;
+}
+
+__noinline struct foo3 bar3(int a, int b, int c) {
+// CHECK-LABEL: define dso_local [2 x i64] @bar3(
+// CHECK: ret [2 x i64]
+  struct foo3 v = {a, b, c};
+  return v;
+}
+
+__noinline struct foo4 bar4(int a, int b, int c, int d) {
+// CHECK-LABEL: define dso_local [2 x i64] @bar4(
+// CHECK: ret [2 x i64]
+  struct foo4 v = {a, b, c, d};
+  return v;
+}
+
+int check1(int a) {
+// CHECK-LABEL: define dso_local i32 @check1(
+// CHECK: %[[C1:.*]] = call i32 @bar1(
+// CHECK: store i32 %[[C1]]
+  struct foo1 v1 = bar1(a);
+  return v1.a;
+}
+
+int check2(int a, int b) {
+// CHECK-LABEL: define dso_local i32 @check2(
+// CHECK: %[[C2:.*]] = call [2 x i64] @bar2(
+// CHECK: store [2 x i64] %[[C2]]
+  struct foo2 v1 = bar2(a, b);
+  return v1.a + v1.b;
+}
+
+int check3(int a, int b, int c) {
+// CHECK-LABEL: define dso_local i32 @check3(
+// CHECK: %[[C3:.*]] = call [2 x i64] @bar3(
+// CHECK: store [2 x i64] %[[C3]]
+  struct foo3 v1 = bar3(a, b, c);
+  return v1.a + v1.b + v1.c;
+}
+
+int check4(int a, int b, int c, int d) {
+// CHECK-LABEL: define dso_local i32 @check4(
+// CHECK: %[[C4:.*]] = call [2 x i64] @bar4(
+// CHECK: store [2 x i64] %[[C4]]
+  struct foo4 v1 = bar4(a, b, c, d);
+  return v1.a + v1.b + v1.c + v1.d;
+}
diff --git a/clang/test/CodeGen/bpf-struct-return.c b/clang/test/CodeGen/bpf-struct-return.c
new file mode 100644
index 0000000000000..d7eab2afda172
--- /dev/null
+++ b/clang/test/CodeGen/bpf-struct-return.c
@@ -0,0 +1,70 @@
+// REQUIRES: bpf-registered-target
+// RUN: %clang_cc1 -triple bpf -O2 -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+
+struct t1 {};
+struct t2 {
+  int a;
+};
+struct t3 {
+  int a;
+  long b;
+};
+struct t4 {
+  long a;
+  long b;
+  long c;
+};
+struct t5 {
+  char a;
+};
+union u1 {
+  int a;
+  long b;
+};
+
+struct t1 foo1(void) {
+// CHECK: define dso_local void @foo1()
+  struct t1 tmp = {};
+  return tmp;
+}
+
+struct t2 foo2(void) {
+// CHECK: define dso_local i32 @foo2()
+  struct t2 tmp = {};
+  return tmp;
+}
+
+struct t3 foo3(void) {
+// CHECK: define dso_local [2 x i64] @foo3()
+  struct t3 tmp = {};
+  return tmp;
+}
+
+struct t4 foo4(void) {
+// CHECK: define dso_local void @foo4(ptr dead_on_unwind noalias writable sret(%struct.t4) align 8 %agg.result)
+  struct t4 tmp = {};
+  return tmp;
+}
+
+struct t5 foo5(void) {
+// CHECK: define dso_local i8 @foo5()
+  struct t5 tmp = {};
+  return tmp;
+}
+
+union u1 foou(void) {
+// CHECK: define dso_local i64 @foou()
+  union u1 tmp = {};
+  return tmp;
+}
+
+int bar(void) {
+// CHECK-LABEL: define dso_local i32 @bar()
+// CHECK: %[[C2:.*]] = call i32 @foo2()
+// CHECK: store i32 %[[C2]]
+// CHECK: %[[C3:.*]] = call [2 x i64] @foo3()
+// CHECK: store [2 x i64] %[[C3]]
+  struct t2 a = foo2();
+  struct t3 b = foo3();
+  return a.a + b.a;
+}
diff --git a/llvm/test/CodeGen/BPF/aggr_ret_regs.ll b/llvm/test/CodeGen/BPF/aggr_ret_regs.ll
new file mode 100644
index 0000000000000..8a7d165f0acb0
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/aggr_ret_regs.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=bpfel -mcpu=v1 | FileCheck %s
+;
+; Generated from clang/test/CodeGen/bpf-struct-return-regs.c to show the
+; machine instructions for returning aggregates in registers:
+;   - foo1 (4 bytes)  is coerced to i32 and returned in one register (r0);
+;   - foo2/foo3/foo4 (16 bytes) are coerced to [2 x i64] and returned in two
+;     registers (r0 and r2).
+
+target triple = "bpf"
+
+define dso_local noundef i32 @bar1(i32 noundef returned %0) local_unnamed_addr #0 {
+; CHECK-LABEL: bar1:
+; CHECK:       .Lbar1$local:
+; CHECK-NEXT:    .type .Lbar1$local,@function
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    r0 = r1
+; CHECK-NEXT:    exit
+  ret i32 %0
+}
+
+define dso_local [2 x i64] @bar2(i32 noundef %0, i32 noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: bar2:
+; CHECK:       .Lbar2$local:
+; CHECK-NEXT:    .type .Lbar2$local,@function
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    r0 = r1
+; CHECK-NEXT:    r0 <<= 32
+; CHECK-NEXT:    r0 >>= 32
+; CHECK-NEXT:    r2 <<= 32
+; CHECK-NEXT:    r2 s>>= 32
+; CHECK-NEXT:    exit
+  %3 = sext i32 %1 to i64
+  %4 = zext i32 %0 to i64
+  %5 = insertvalue [2 x i64] poison, i64 %4, 0
+  %6 = insertvalue [2 x i64] %5, i64 %3, 1
+  ret [2 x i64] %6
+}
+
+define dso_local [2 x i64] @bar3(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr #0 {
+; CHECK-LABEL: bar3:
+; CHECK:       .Lbar3$local:
+; CHECK-NEXT:    .type .Lbar3$local,@function
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    r0 = r2
+; CHECK-NEXT:    r0 <<= 32
+; CHECK-NEXT:    r1 <<= 32
+; CHECK-NEXT:    r1 >>= 32
+; CHECK-NEXT:    r0 |= r1
+; CHECK-NEXT:    r3 <<= 32
+; CHECK-NEXT:    r3 s>>= 32
+; CHECK-NEXT:    r2 = r3
+; CHECK-NEXT:    exit
+  %4 = sext i32 %2 to i64
+  %5 = zext i32 %1 to i64
+  %6 = shl nuw i64 %5, 32
+  %7 = zext i32 %0 to i64
+  %8 = or disjoint i64 %6, %7
+  %9 = insertvalue [2 x i64] poison, i64 %8, 0
+  %10 = insertvalue [2 x i64] %9, i64 %4, 1
+  ret [2 x i64] %10
+}
+
+define dso_local [2 x i64] @bar4(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3) local_unnamed_addr #0 {
+; CHECK-LABEL: bar4:
+; CHECK:       .Lbar4$local:
+; CHECK-NEXT:    .type .Lbar4$local,@function
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    r0 = r2
+; CHECK-NEXT:    r1 <<= 32
+; CHECK-NEXT:    r1 >>= 32
+; CHECK-NEXT:    r0 &= 1048575
+; CHECK-NEXT:    r0 <<= 32
+; CHECK-NEXT:    r0 |= r1
+; CHECK-NEXT:    r3 &= 1048575
+; CHECK-NEXT:    r4 &= 16777215
+; CHECK-NEXT:    r4 <<= 32
+; CHECK-NEXT:    r4 |= r3
+; CHECK-NEXT:    r2 = r4
+; CHECK-NEXT:    exit
+  %5 = and i32 %1, 1048575
+  %6 = and i32 %2, 1048575
+  %7 = and i32 %3, 16777215
+  %8 = zext nneg i32 %5 to i64
+  %9 = shl nuw nsw i64 %8, 32
+  %10 = zext i32 %0 to i64
+  %11 = or disjoint i64 %9, %10
+  %12 = insertvalue [2 x i64] poison, i64 %11, 0
+  %13 = zext nneg i32 %7 to i64
+  %14 = shl nuw nsw i64 %13, 32
+  %15 = zext nneg i32 %6 to i64
+  %16 = or disjoint i64 %14, %15
+  %17 = insertvalue [2 x i64] %12, i64 %16, 1
+  ret [2 x i64] %17
+}
+
+define dso_local noundef i32 @check1(i32 noundef returned %0) local_unnamed_addr #1 {
+; CHECK-LABEL: check1:
+; CHECK:       .Lcheck1$local:
+; CHECK-NEXT:    .type .Lcheck1$local,@function
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    r0 = r1
+; CHECK-NEXT:    exit
+  ret i32 %0
+}
+
+define dso_local i32 @check2(i32 noundef %0, i32 noundef %1) local_unnamed_addr #1 {
+; CHECK-LABEL: check2:
+; CHECK:       .Lcheck2$local:
+; CHECK-NEXT:    .type .Lcheck2$local,@function
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call bar2
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+  %3 = tail call [2 x i64] @bar2(i32 noundef %0, i32 noundef %1)
+  %4 = extractvalue [2 x i64] %3, 0
+  %5 = extractvalue [2 x i64] %3, 1
+  %6 = add i64 %4, %5
+  %7 = trunc i64 %6 to i32
+  ret i32 %7
+}
+
+define dso_local i32 @check3(i32 noundef %0, i32 noundef %1, i32 noundef %2) local_unnamed_addr #1 {
+; CHECK-LABEL: check3:
+; CHECK:       .Lcheck3$local:
+; CHECK-NEXT:    .type .Lcheck3$local,@function
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call bar3
+; CHECK-NEXT:    r1 = 4294967297 ll
+; CHECK-NEXT:    r0 *= r1
+; CHECK-NEXT:    r0 >>= 32
+; CHECK-NEXT:    r0 += r2
+; CHECK-NEXT:    exit
+  %4 = tail call [2 x i64] @bar3(i32 noundef %0, i32 noundef %1, i32 noundef %2)
+  %5 = extractvalue [2 x i64] %4, 0
+  %6 = extractvalue [2 x i64] %4, 1
+  %7 = mul i64 %5, 4294967297
+  %8 = lshr i64 %7, 32
+  %9 = add i64 %8, %6
+  %10 = trunc i64 %9 to i32
+  ret i32 %10
+}
+
+define dso_local i32 @check4(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3) local_unnamed_addr #1 {
+; CHECK-LABEL: check4:
+; CHECK:       .Lcheck4$local:
+; CHECK-NEXT:    .type .Lcheck4$local,@function
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    call bar4
+; CHECK-NEXT:    r1 = r0
+; CHECK-NEXT:    r1 <<= 12
+; CHECK-NEXT:    r1 s>>= 44
+; CHECK-NEXT:    r1 += r0
+; CHECK-NEXT:    r3 = r2
+; CHECK-NEXT:    r3 <<= 44
+; CHECK-NEXT:    r3 s>>= 44
+; CHECK-NEXT:    r1 += r3
+; CHECK-NEXT:    r2 <<= 8
+; CHECK-NEXT:    r2 s>>= 40
+; CHECK-NEXT:    r1 += r2
+; CHECK-NEXT:    r0 = r1
+; CHECK-NEXT:    exit
+  %5 = tail call [2 x i64] @bar4(i32 noundef %0, i32 noundef %1, i32 noundef %2, i32 noundef %3)
+  %6 = extractvalue [2 x i64] %5, 0
+  %7 = extractvalue [2 x i64] %5, 1
+  %8 = trunc i64 %6 to i32
+  %9 = lshr i64 %6, 20
+  %10 = trunc i64 %9 to i32
+  %11 = ashr i32 %10, 12
+  %12 = add nsw i32 %11, %8
+  %13 = trunc i64 %7 to i32
+  %14 = shl i32 %13, 12
+  %15 = ashr exact i32 %14, 12
+  %16 = add nsw i32 %12, %15
+  %17 = lshr i64 %7, 24
+  %18 = trunc i64 %17 to i32
+  %19 = ashr i32 %18, 8
+  %20 = add nsw i32 %16, %19
+  ret i32 %20
+}
+

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to