================ @@ -0,0 +1,449 @@ +//===- BoundsChecking.cpp - Bounds checking related APIs --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines APIs for performing a bounds check (i.e. comparing a +// symbolic Offset value to zero and a symbolic Extent value) and composing +// descriptions that explain its results. +// +// This is intended as a replacement for `ProgramState::assumeInBound` to +// avoid its incorrect logic and compensate for deficiencies of other parts of +// the analyzer. +// +//===----------------------------------------------------------------------===// + +#include "clang/StaticAnalyzer/Checkers/BoundsChecking.h" +#include "clang/StaticAnalyzer/Checkers/Taint.h" + +using llvm::formatv; + +namespace clang { +namespace ento { + +const ArraySubscriptExpr * +getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) { + const auto *ASE = dyn_cast<ArraySubscriptExpr>(E); + if (!ASE) + return nullptr; + + const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion(); + if (!SubscriptBaseReg) + return nullptr; + + // The base of the subscript expression is affected by pointer arithmetics, + // so we want to report byte offsets instead of indices and we don't want to + // activate the "index is unsigned -> cannot be negative" shortcut. + if (isa<ElementRegion>(SubscriptBaseReg->StripCasts())) + return nullptr; + + return ASE; +} + +// NOTE: This function is the "heart" of this checker. It simplifies +// inequalities with transformations that are valid (and very elementary) in +// pure mathematics, but become invalid if we use them in C++ number model +// where the calculations may overflow. +// Due to the overflow issues I think it's impossible (or at least not +// practical) to integrate this kind of simplification into the resolution of +// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function +// produces valid results when the calculations are handling memory offsets +// and every value is well below SIZE_MAX. +// TODO: This algorithm should be moved to a central location where it's +// available for other checkers that need to compare memory offsets. +// NOTE: the simplification preserves the order of the two operands in a +// mathematical sense, but it may change the result produced by a C++ +// comparison operator (and the automatic type conversions). +// For example, consider a comparison "X+1 < 0", where the LHS is stored as a +// size_t and the RHS is stored in an int. (As size_t is unsigned, this +// comparison is false for all values of "X".) However, the simplification may +// turn it into "X < -1", which is still always false in a mathematical sense, +// but can produce a true result when evaluated by `evalBinOp` (which follows +// the rules of C++ and casts -1 to SIZE_MAX). +static std::pair<NonLoc, nonloc::ConcreteInt> +getSimplifiedOffsets(NonLoc Offset, nonloc::ConcreteInt Extent, + SValBuilder &SVB) { + const llvm::APSInt &ExtentVal = Extent.getValue(); + std::optional<nonloc::SymbolVal> SymVal = Offset.getAs<nonloc::SymbolVal>(); + if (SymVal && SymVal->isExpression()) { + if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) { + llvm::APSInt Constant = APSIntType(ExtentVal).convert(SIE->getRHS()); + switch (SIE->getOpcode()) { + case BO_Mul: + // The Constant should never be 0 here, becasue multiplication by zero + // is simplified by the engine. + if ((ExtentVal % Constant) != 0) + return std::pair<NonLoc, nonloc::ConcreteInt>(Offset, Extent); + else + return getSimplifiedOffsets(nonloc::SymbolVal(SIE->getLHS()), + SVB.makeIntVal(ExtentVal / Constant), + SVB); + case BO_Add: + return getSimplifiedOffsets(nonloc::SymbolVal(SIE->getLHS()), + SVB.makeIntVal(ExtentVal - Constant), SVB); + default: + break; + } + } + } + + return std::pair<NonLoc, nonloc::ConcreteInt>(Offset, Extent); +} + +static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) { + const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value); + return MaxV && MaxV->isNegative(); +} + +static bool isUnsigned(SValBuilder &SVB, NonLoc Value) { + QualType T = Value.getType(SVB.getContext()); + return T->isUnsignedIntegerType(); +} + +// Evaluate the comparison Value < Threshold with the help of the custom +// simplification algorithm defined for this checker. Return a pair of states, +// where the first one corresponds to "value below threshold" and the second +// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in +// the case when the evaluation fails. +// If the optional argument CheckEquality is true, then use BO_EQ instead of +// the default BO_LT after consistently applying the same simplification steps. +static std::pair<ProgramStateRef, ProgramStateRef> +compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, + SValBuilder &SVB, bool CheckEquality = false) { + if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) { + std::tie(Value, Threshold) = + getSimplifiedOffsets(Value, *ConcreteThreshold, SVB); + } + + // We want to perform a _mathematical_ comparison between the numbers `Value` + // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may + // perform automatic conversions. For example the number -1 is less than the + // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int` + // -1 is converted to ULONGLONG_MAX. + // To avoid automatic conversions, we evaluate the "obvious" cases without + // calling `evalBinOpNN`: + if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) { + if (CheckEquality) { + // negative_value == unsigned_threshold is always false + return {nullptr, State}; + } + // negative_value < unsigned_threshold is always true + return {State, nullptr}; + } + if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) { + // unsigned_value == negative_threshold and + // unsigned_value < negative_threshold are both always false + return {nullptr, State}; + } + // FIXME: These special cases are sufficient for handling real-world + // comparisons, but in theory there could be contrived situations where + // automatic conversion of a symbolic value (which can be negative and can be + // positive) leads to incorrect results. + // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because + // we want to ensure that assumptions coming from this precondition and + // assumptions coming from regular C/C++ operator calls are represented by + // constraints on the same symbolic expression. A solution that would + // evaluate these "mathematical" comparisons through a separate pathway would + // be a step backwards in this sense. + + const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT; + auto BelowThreshold = + SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType()) + .getAs<NonLoc>(); + + if (BelowThreshold) + return State->assume(*BelowThreshold); + + return {nullptr, nullptr}; +} + +std::string getRegionName(const MemSpaceRegion *Space, + const SubRegion *Region) { + if (std::string RegName = Region->getDescriptiveName(); !RegName.empty()) + return RegName; + + // Field regions only have descriptive names when their parent has a + // descriptive name; so we provide a fallback representation for them: + if (const auto *FR = Region->getAs<FieldRegion>()) { + if (StringRef Name = FR->getDecl()->getName(); !Name.empty()) + return formatv("the field '{0}'", Name); + return "the unnamed field"; + } + + if (isa<AllocaRegion>(Region)) + return "the memory returned by 'alloca'"; + + if (isa<SymbolicRegion>(Region) && isa<HeapSpaceRegion>(Space)) + return "the heap area"; + + if (isa<StringRegion>(Region)) + return "the string literal"; + + return "the region"; +} + +static std::optional<int64_t> getConcreteValue(NonLoc SV) { + if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) { + return ConcreteVal->getValue()->tryExtValue(); + } + return std::nullopt; +} + +static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) { + return SV ? getConcreteValue(*SV) : std::nullopt; +} + +/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if +/// it can be performed (`Divisor` is nonzero and there is no remainder). The +/// values `Val1` and `Val2` may be nullopt and in that case the corresponding +/// division is considered to be successful. +bool SizeUnit::tryConvertValuesFromBytes(std::optional<int64_t> &Val1, + std::optional<int64_t> &Val2) const { + if (!AsCharUnits) + return false; + const bool Val1HasRemainder = Val1 && *Val1 % AsCharUnits; + const bool Val2HasRemainder = Val2 && *Val2 % AsCharUnits; + if (Val1HasRemainder || Val2HasRemainder) + return false; + if (Val1) + *Val1 /= AsCharUnits; + if (Val2) + *Val2 /= AsCharUnits; + return true; +} + +Messages getNonTaintMsgs(std::string RegName, SizeUnit SU, NonLoc Offset, + std::optional<NonLoc> Extent, BadOffsetKind Problem) { + + std::optional<int64_t> OffsetN = getConcreteValue(Offset); + std::optional<int64_t> ExtentN = getConcreteValue(Extent); + + if (Problem == BadOffsetKind::Negative) + ExtentN = std::nullopt; + + if (!SU.tryConvertValuesFromBytes(OffsetN, ExtentN)) + SU = SizeUnit::bytes(); + + SmallString<256> Buf; + llvm::raw_svector_ostream Out(Buf); + Out << "Access of "; + if (OffsetN && !ExtentN && !SU.isBytes()) { + // If the offset is reported as an index, then the report must mention the + // element type (because it is not always clear from the code). It's more + // natural to mention the element type later where the extent is described, + // but if the extent is unknown/irrelevant, then the element type can be + // inserted into the message at this point. + Out << SU.asElementName() << " in "; + } + Out << RegName << " at "; + if (OffsetN) { + if (Problem == BadOffsetKind::Negative) + Out << "negative "; + Out << SU.getOffsetName() << " " << *OffsetN; + } else { + Out << asAdjective(Problem) << " " << SU.getOffsetName(); + } + if (ExtentN) { + Out << ", while it holds only "; + if (*ExtentN != 1) + Out << *ExtentN; + else + Out << "a single"; + + Out << ' ' << SU.asElementName(); + + if (*ExtentN > 1) + Out << "s"; + } + + return {formatv("Out of bound access to memory {0} {1}", + asPreposition(Problem), RegName), + std::string(Buf)}; +} + +Messages getTaintMsgs(std::string RegName, const char *OffsetName, + bool AlsoMentionUnderflow) { + return {formatv("Potential out of bound access to {0} with tainted {1}", + RegName, OffsetName), + formatv("Access of {0} with a tainted {1} that may be {2}too large", + RegName, OffsetName, + AlsoMentionUnderflow ? "negative or " : "")}; +} + +std::string BoundsCheckResult::getMessage(PathSensitiveBugReport &BR, + StringRef RegName, + SizeUnit SU) const { + bool ShouldReportNonNegative = AssumedNonNegative; + if (!providesInformationAboutInteresting(Offset, BR)) { + if (AssumedUpperBound && providesInformationAboutInteresting(*Extent, BR)) { + // Even if the byte offset isn't interesting (e.g. it's a constant value), + // the assumption can still be interesting if it provides information + // about an interesting symbolic upper bound. + ShouldReportNonNegative = false; + } else { + // We don't have anything interesting, don't report the assumption. + return ""; + } + } + + std::optional<int64_t> OffsetN = getConcreteValue(Offset); + std::optional<int64_t> ExtentN = getConcreteValue(Extent); + + if (!SU.tryConvertValuesFromBytes(OffsetN, ExtentN)) + SU = SizeUnit::bytes(); + + SmallString<256> Buf; + llvm::raw_svector_ostream Out(Buf); + Out << "Assuming "; + if (!SU.isBytes()) { + Out << "index "; + if (OffsetN) + Out << "'" << OffsetN << "' "; + } else if (Extent) { + Out << "byte offset "; + if (OffsetN) + Out << "'" << OffsetN << "' "; ---------------- steakhal wrote:
Here you guard `"byte offset "` by `Extent` while in the original place it was guarded by `AssumedUpperBound` was this intentional? Wouldn't this bring observable differences, violating the stated NFC nature of this patch? I didn't really go into the details, but claude suggested: > Reachable example: int arr[10]; if (x < 10) return arr[x]; — x signed, sign > unknown, but x*4 < 40 is known, so AssumedNonNegative=true, > AssumedUpperBound=false, Extent=40. The note text diverges. https://github.com/llvm/llvm-project/pull/202372 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
