https://github.com/sebpop updated https://github.com/llvm/llvm-project/pull/159046
>From a3d5ea6a744e10d7aac06b6fb79cae620b026b78 Mon Sep 17 00:00:00 2001 From: Sebastian Pop <s...@nvidia.com> Date: Tue, 16 Sep 2025 06:23:44 -0500 Subject: [PATCH] [clang] add array out-of-bounds access constraints using llvm.assume Following C and C++ standards, generate llvm.assume statements for array subscript bounds to provide optimization hints. For this code: ``` int arr[10]; int example(int i) { return arr[i]; } ``` clang now generates an `assume(i < 10)`: ``` define i32 @example(i32 noundef %i) local_unnamed_addr #0 { entry: %idxprom = zext nneg i32 %i to i64 %bounds.constraint = icmp ult i32 %i, 10 tail call void @llvm.assume(i1 %bounds.constraint) %arrayidx = getelementptr inbounds nuw i32, ptr @arr, i64 %idxprom %0 = load i32, ptr %arrayidx, align 4, !tbaa !2 ret i32 %0 } ``` --- clang/lib/CodeGen/CGExpr.cpp | 112 ++++++++++++++++++ clang/lib/CodeGen/CGExprScalar.cpp | 3 + clang/lib/CodeGen/CodeGenFunction.h | 7 ++ clang/test/CodeGen/array-bounds-constraints.c | 64 ++++++++++ 4 files changed, 186 insertions(+) create mode 100644 clang/test/CodeGen/array-bounds-constraints.c diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index e6e4947882544..d4425d76d10fe 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -4559,6 +4559,97 @@ void CodeGenFunction::EmitCountedByBoundsChecking( } } +/// Emit array bounds constraints using llvm.assume for optimization hints. +/// +/// C Standard (ISO/IEC 9899:2011 - C11) +/// Section J.2 (Undefined behavior): An array subscript is out of range, even +/// if an object is apparently accessible with the given subscript (as in the +/// lvalue expression a[1][7] given the declaration int a[4][5]) (6.5.6). 
+/// +/// Section 6.5.6 (Additive operators): If both the pointer operand and the +/// result point to elements of the same array object, or one past the last +/// element of the array object, the evaluation shall not produce an overflow; +/// otherwise, the behavior is undefined. +/// +/// C++ Standard (ISO/IEC 14882 - 2017) +/// Section 8.7 (Additive operators): +/// 4 When an expression that has integral type is added to or subtracted from a +/// pointer, the result has the type of the pointer operand. If the expression +/// P points to element x[i] of an array object x with n elements,^86 the +/// expressions P + J and J + P (where J has the value j) point to the +/// (possibly-hypothetical) element x[i + j] if 0 ≤ i + j ≤ n; otherwise, the +/// behavior is undefined. Likewise, the expression P - J points to the +/// (possibly-hypothetical) element x[i − j] if 0 ≤ i − j ≤ n; otherwise, the +/// behavior is undefined. +/// ^86 A pointer past the last element of an array x of n elements is +/// considered to be equivalent to a pointer to a hypothetical element x[n] +/// for this purpose; see 6.9.2. +/// +/// This function emits llvm.assume statements to inform the optimizer that +/// array subscripts are within bounds, enabling better optimization without +/// duplicating side effects from the subscript expression. The IndexVal +/// parameter should be the already-emitted index value to avoid re-evaluation. +void CodeGenFunction::EmitArrayBoundsConstraints(const ArraySubscriptExpr *E, + llvm::Value *IndexVal) { + const Expr *Base = E->getBase(); + const Expr *Idx = E->getIdx(); + QualType BaseType = Base->getType(); + + if (const auto *ICE = dyn_cast<ImplicitCastExpr>(Base)) { + if (ICE->getCastKind() == CK_ArrayToPointerDecay) { + BaseType = ICE->getSubExpr()->getType(); + } + } + + // For now: only handle constant array types. 
+ const ConstantArrayType *CAT = getContext().getAsConstantArrayType(BaseType); + if (!CAT) + return; + + llvm::APInt ArraySize = CAT->getSize(); + if (ArraySize == 0) + return; + + QualType IdxType = Idx->getType(); + llvm::Type *IndexType = ConvertType(IdxType); + llvm::Value *Zero = llvm::ConstantInt::get(IndexType, 0); + + uint64_t ArraySizeValue = ArraySize.getLimitedValue(); + llvm::Value *ArraySizeVal = llvm::ConstantInt::get(IndexType, ArraySizeValue); + + // Use the provided IndexVal to avoid duplicating side effects. + // The caller has already emitted the index expression once. + if (!IndexVal) + return; + + // Ensure index value has the same type as our constants. + if (IndexVal->getType() != IndexType) { + bool IsSigned = IdxType->isSignedIntegerOrEnumerationType(); + IndexVal = Builder.CreateIntCast(IndexVal, IndexType, IsSigned, "idx.cast"); + } + + // Create bounds constraint: 0 <= index && index < size. + // C arrays are 0-based, so valid indices are [0, size-1]. + // This encodes the rule that follows from C18 6.5.2.1 and 6.5.6: an + // accessed element's subscript is greater than or equal to zero and less + // than the size of the array; llvm.assume states it, it does not check it. + llvm::Value *LowerBound, *UpperBound; + if (IdxType->isSignedIntegerOrEnumerationType()) { + // For signed indices: index >= 0 && index < size. + LowerBound = Builder.CreateICmpSGE(IndexVal, Zero, "idx.ge.zero"); + UpperBound = Builder.CreateICmpSLT(IndexVal, ArraySizeVal, "idx.lt.size"); + } else { + // For unsigned indices: index < size (>= 0 is implicit). 
+ LowerBound = Builder.getTrue(); + UpperBound = Builder.CreateICmpULT(IndexVal, ArraySizeVal, "idx.lt.size"); + } + + llvm::Value *BoundsConstraint = + Builder.CreateAnd(LowerBound, UpperBound, "bounds.constraint"); + llvm::Function *AssumeIntrinsic = CGM.getIntrinsic(llvm::Intrinsic::assume); + Builder.CreateCall(AssumeIntrinsic, BoundsConstraint); +} + LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, bool Accessed) { // The index must always be an integer, which is not an aggregate. Emit it @@ -4588,6 +4679,9 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, }; IdxPre = nullptr; + // Array bounds constraints will be emitted after index evaluation to avoid + // duplicating side effects from the index expression. + // If the base is a vector type, then we are forming a vector element lvalue // with this subscript. if (E->getBase()->getType()->isSubscriptableVectorType() && @@ -4595,6 +4689,10 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, // Emit the vector as an lvalue to get its address. LValue LHS = EmitLValue(E->getBase()); auto *Idx = EmitIdxAfterBase(/*Promote*/false); + + // Emit array bounds constraints for vector subscripts. + EmitArrayBoundsConstraints(E, Idx); + assert(LHS.isSimple() && "Can only subscript lvalue vectors here!"); return LValue::MakeVectorElt(LHS.getAddress(), Idx, E->getBase()->getType(), LHS.getBaseInfo(), TBAAAccessInfo()); @@ -4635,6 +4733,10 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, Addr = EmitPointerWithAlignment(E->getBase(), &EltBaseInfo, &EltTBAAInfo); auto *Idx = EmitIdxAfterBase(/*Promote*/true); + // Emit array bounds constraints for VLA access (though VLAs typically don't + // have constant bounds). + EmitArrayBoundsConstraints(E, Idx); + // The element count here is the total number of non-VLA elements. 
llvm::Value *numElements = getVLASize(vla).NumElts; @@ -4659,6 +4761,9 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, Addr = EmitPointerWithAlignment(E->getBase(), &EltBaseInfo, &EltTBAAInfo); auto *Idx = EmitIdxAfterBase(/*Promote*/true); + // Emit array bounds constraints for ObjC interface access. + EmitArrayBoundsConstraints(E, Idx); + CharUnits InterfaceSize = getContext().getTypeSizeInChars(OIT); llvm::Value *InterfaceSizeVal = llvm::ConstantInt::get(Idx->getType(), InterfaceSize.getQuantity()); @@ -4694,6 +4799,9 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, ArrayLV = EmitLValue(Array); auto *Idx = EmitIdxAfterBase(/*Promote*/true); + // Emit array bounds constraints for optimization. + EmitArrayBoundsConstraints(E, Idx); + if (SanOpts.has(SanitizerKind::ArrayBounds)) EmitCountedByBoundsChecking(Array, Idx, ArrayLV.getAddress(), E->getIdx()->getType(), Array->getType(), @@ -4737,6 +4845,10 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, Address BaseAddr = EmitPointerWithAlignment(E->getBase(), &EltBaseInfo, &EltTBAAInfo); auto *Idx = EmitIdxAfterBase(/*Promote*/true); + + // Emit array bounds constraints for pointer-based array access. + EmitArrayBoundsConstraints(E, Idx); + QualType ptrType = E->getBase()->getType(); Addr = emitArraySubscriptGEP(*this, BaseAddr, Idx, E->getType(), !getLangOpts().PointerOverflowDefined, diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 4fa25c5d66669..28f702f9237e4 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2100,6 +2100,9 @@ Value *ScalarExprEmitter::VisitArraySubscriptExpr(ArraySubscriptExpr *E) { if (CGF.SanOpts.has(SanitizerKind::ArrayBounds)) CGF.EmitBoundsCheck(E, E->getBase(), Idx, IdxTy, /*Accessed*/true); + // Emit array bounds constraints for vector element access. 
+ CGF.EmitArrayBoundsConstraints(E, Idx); + return Builder.CreateExtractElement(Base, Idx, "vecext"); } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 727487b46054f..6283841b7b170 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3341,6 +3341,13 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *Index, QualType IndexType, QualType IndexedType, bool Accessed); + /// Emit array bounds constraints using llvm.assume for optimization hints. + /// Emits assume statements for array bounds without duplicating side effects. + /// Takes the already-emitted index value to avoid re-evaluating expressions + /// with side effects. Helps optimizer with vectorization and bounds analysis. + void EmitArrayBoundsConstraints(const ArraySubscriptExpr *E, + llvm::Value *IndexVal); + /// Returns debug info, with additional annotation if /// CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo[Ordinal] is enabled for /// any of the ordinals. diff --git a/clang/test/CodeGen/array-bounds-constraints.c b/clang/test/CodeGen/array-bounds-constraints.c new file mode 100644 index 0000000000000..7da78c3697c3d --- /dev/null +++ b/clang/test/CodeGen/array-bounds-constraints.c @@ -0,0 +1,64 @@ +// Test that array bounds constraints generate llvm.assume statements for optimization hints. +// RUN: %clang_cc1 -emit-llvm -O2 %s -o - | FileCheck %s + +// Run with -fsanitize=array-bounds to verify that no assume is generated for accesses without constant bounds (pointer, VLA). +// RUN: %clang_cc1 -emit-llvm -O2 -fsanitize=array-bounds %s -o - | FileCheck %s -check-prefix=SANITIZER + +// C18 standard, section 6.5.2.1 "Array subscripting" defines E1[E2] as +// (*((E1)+(E2))), and section 6.5.6 "Additive operators" leaves that pointer +// arithmetic undefined outside the array. Paraphrased (not a verbatim quote): +// an element of an array with a known constant size may only be accessed when +// the subscript is greater than or equal to zero and less than the size of the array. 
+// +// This test verifies that clang generates llvm.assume statements to inform the +// optimizer that array subscripts are within bounds, enabling better optimization +// while avoiding side effect duplication from subscript expressions. + +// CHECK-LABEL: define {{.*}} @test_simple_array +int test_simple_array(int i) { + int arr[10]; // C arrays are 0-based: valid indices are [0, 9] + // CHECK: %{{.*}} = icmp ult i32 %i, 10 + // CHECK: call void @llvm.assume(i1 %{{.*}}) + return arr[i]; +} + +// CHECK-LABEL: define {{.*}} @test_multidimensional_array +int test_multidimensional_array(int i, int j) { + int arr[5][8]; // Valid indices: i in [0, 4], j in [0, 7] + // CHECK: %{{.*}} = icmp ult i32 %i, 5 + // CHECK: call void @llvm.assume(i1 %{{.*}}) + // CHECK: %{{.*}} = icmp ult i32 %j, 8 + // CHECK: call void @llvm.assume(i1 %{{.*}}) + return arr[i][j]; +} + +// CHECK-LABEL: define {{.*}} @test_unsigned_index +int test_unsigned_index(unsigned int i) { + int arr[10]; + // CHECK: %{{.*}} = icmp ult i32 %i, 10 + // CHECK: call void @llvm.assume(i1 %{{.*}}) + return arr[i]; +} + +// CHECK-LABEL: define {{.*}} @test_store_undef +void test_store_undef(int i, int value) { + int arr[10]; + // CHECK: %{{.*}} = icmp ult i32 %i, 10 + // CHECK: call void @llvm.assume(i1 %{{.*}}) + arr[i] = value; +} + +// SANITIZER-LABEL: define {{.*}} @test_pointer_array +int test_pointer_array(int *ptr) { + // Should not generate assume for pointer access (no known bounds) + // SANITIZER-NOT: call void @llvm.assume + return ptr[5]; +} + +// SANITIZER-LABEL: define {{.*}} @test_variable_length_array +int test_variable_length_array(int n, int i) { + int arr[n]; + // Should not generate assume for VLA (dynamic bounds) + // SANITIZER-NOT: call void @llvm.assume + return arr[i]; +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits