https://github.com/atrosinenko updated https://github.com/llvm/llvm-project/pull/138655
>From 69b6e028492a787934a4d5a4f23644988cb35af9 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko <atrosine...@accesssoftek.com> Date: Mon, 28 Apr 2025 18:35:48 +0300 Subject: [PATCH] [BOLT] Factor out MCInstReference from gadget scanner (NFC) Move MCInstReference representing a constant reference to an instruction inside a parent entity - either inside a basic block (which has a reference to its parent function) or directly to the function (when CFG information is not available). --- bolt/include/bolt/Core/MCInstUtils.h | 168 +++++++++++++++++ bolt/include/bolt/Passes/PAuthGadgetScanner.h | 178 +----------------- bolt/lib/Core/CMakeLists.txt | 1 + bolt/lib/Core/MCInstUtils.cpp | 57 ++++++ bolt/lib/Passes/PAuthGadgetScanner.cpp | 102 +++++----- 5 files changed, 269 insertions(+), 237 deletions(-) create mode 100644 bolt/include/bolt/Core/MCInstUtils.h create mode 100644 bolt/lib/Core/MCInstUtils.cpp diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h new file mode 100644 index 0000000000000..69bf5e6159b74 --- /dev/null +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -0,0 +1,168 @@ +//===- bolt/Core/MCInstUtils.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_CORE_MCINSTUTILS_H +#define BOLT_CORE_MCINSTUTILS_H + +#include "bolt/Core/BinaryBasicBlock.h" + +#include <map> +#include <tuple> +#include <variant> + +namespace llvm { +namespace bolt { + +class BinaryFunction; + +/// MCInstReference represents a reference to a constant MCInst as stored either +/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock +/// (after a CFG is created). +class MCInstReference { + using nocfg_const_iterator = std::map<uint32_t, MCInst>::const_iterator; + + // Two cases are possible: + // * functions with CFG reconstructed - a function stores a collection of + // basic blocks, each basic block stores a contiguous vector of MCInst + // * functions without CFG - there are no basic blocks created, + // the instructions are directly stored in std::map in BinaryFunction + // + // In both cases, the direct parent of MCInst is stored together with an + // iterator pointing to the instruction. + + // Helper struct: CFG is available, the direct parent is a basic block, + // iterator's type is `MCInst *`. + struct RefInBB { + RefInBB(const BinaryBasicBlock *BB, const MCInst *Inst) + : BB(BB), It(Inst) {} + RefInBB(const RefInBB &Other) = default; + RefInBB &operator=(const RefInBB &Other) = default; + + const BinaryBasicBlock *BB; + BinaryBasicBlock::const_iterator It; + + bool operator<(const RefInBB &Other) const { + return std::tie(BB, It) < std::tie(Other.BB, Other.It); + } + + bool operator==(const RefInBB &Other) const { + return BB == Other.BB && It == Other.It; + } + }; + + // Helper struct: CFG is *not* available, the direct parent is a function, + // iterator's type is std::map<uint32_t, MCInst>::iterator (the mapped value + // is an instruction's offset). + struct RefInBF { + RefInBF(const BinaryFunction *BF, nocfg_const_iterator It) + : BF(BF), It(It) {} + RefInBF(const RefInBF &Other) = default; + RefInBF &operator=(const RefInBF &Other) = default; + + const BinaryFunction *BF; + nocfg_const_iterator It; + + bool operator<(const RefInBF &Other) const { + return std::tie(BF, It->first) < std::tie(Other.BF, Other.It->first); + } + + bool operator==(const RefInBF &Other) const { + return BF == Other.BF && It->first == Other.It->first; + } + }; + + std::variant<RefInBB, RefInBF> Reference; + + // Utility methods to be used like this: + // + // if (auto *Ref = tryGetRefInBB()) + // return Ref->doSomething(...); + // return getRefInBF().doSomethingElse(...); + const RefInBB *tryGetRefInBB() const { + assert(std::get_if<RefInBB>(&Reference) || + std::get_if<RefInBF>(&Reference)); + return std::get_if<RefInBB>(&Reference); + } + const RefInBF &getRefInBF() const { + assert(std::get_if<RefInBF>(&Reference)); + return *std::get_if<RefInBF>(&Reference); + } + +public: + /// Constructs an empty reference. + MCInstReference() : Reference(RefInBB(nullptr, nullptr)) {} + /// Constructs a reference to the instruction inside the basic block. + MCInstReference(const BinaryBasicBlock *BB, const MCInst *Inst) + : Reference(RefInBB(BB, Inst)) { + assert(BB && Inst && "Neither BB nor Inst should be nullptr"); + } + /// Constructs a reference to the instruction inside the basic block. + MCInstReference(const BinaryBasicBlock *BB, unsigned Index) + : Reference(RefInBB(BB, &BB->getInstructionAtIndex(Index))) { + assert(BB && "Basic block should not be nullptr"); + } + /// Constructs a reference to the instruction inside the function without + /// CFG information. + MCInstReference(const BinaryFunction *BF, nocfg_const_iterator It) + : Reference(RefInBF(BF, It)) { + assert(BF && "Function should not be nullptr"); + } + + /// Locates an instruction inside a function and returns a reference. + static MCInstReference get(const MCInst *Inst, const BinaryFunction &BF); + + bool operator<(const MCInstReference &Other) const { + return Reference < Other.Reference; + } + + bool operator==(const MCInstReference &Other) const { + return Reference == Other.Reference; + } + + const MCInst &getMCInst() const { + if (auto *Ref = tryGetRefInBB()) + return *Ref->It; + return getRefInBF().It->second; + } + + operator const MCInst &() const { return getMCInst(); } + + operator bool() const { + if (auto *Ref = tryGetRefInBB()) + return Ref->BB != nullptr; + return getRefInBF().BF != nullptr; + } + + bool hasCFG() const { + return static_cast<bool>(*this) && tryGetRefInBB() != nullptr; + } + + const BinaryFunction *getFunction() const { + if (auto *Ref = tryGetRefInBB()) + return Ref->BB->getFunction(); + return getRefInBF().BF; + } + + const BinaryBasicBlock *getBasicBlock() const { + if (auto *Ref = tryGetRefInBB()) + return Ref->BB; + return nullptr; + } + + raw_ostream &print(raw_ostream &OS) const; +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, + const MCInstReference &Ref) { + return Ref.print(OS); +} + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/include/bolt/Passes/PAuthGadgetScanner.h b/bolt/include/bolt/Passes/PAuthGadgetScanner.h index de5bf25a4e540..fab5fd1016e33 100644 --- a/bolt/include/bolt/Passes/PAuthGadgetScanner.h +++ b/bolt/include/bolt/Passes/PAuthGadgetScanner.h @@ -11,189 +11,13 @@ #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryFunction.h" +#include "bolt/Core/MCInstUtils.h" #include "bolt/Passes/BinaryPasses.h" #include "llvm/Support/raw_ostream.h" #include <memory> namespace llvm { namespace bolt { - -/// @brief MCInstReference represents a reference to an MCInst as stored either -/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock -/// (after a CFG is created). It aims to store the necessary information to be -/// able to find the specific MCInst in either the BinaryFunction or -/// BinaryBasicBlock data structures later, so that e.g. the InputAddress of -/// the corresponding instruction can be computed. - -struct MCInstInBBReference { - BinaryBasicBlock *BB; - int64_t BBIndex; - MCInstInBBReference(BinaryBasicBlock *BB, int64_t BBIndex) - : BB(BB), BBIndex(BBIndex) {} - MCInstInBBReference() : BB(nullptr), BBIndex(0) {} - static MCInstInBBReference get(const MCInst *Inst, BinaryFunction &BF) { - for (BinaryBasicBlock &BB : BF) - for (size_t I = 0; I < BB.size(); ++I) - if (Inst == &BB.getInstructionAtIndex(I)) - return MCInstInBBReference(&BB, I); - return {}; - } - bool operator==(const MCInstInBBReference &RHS) const { - return BB == RHS.BB && BBIndex == RHS.BBIndex; - } - bool operator<(const MCInstInBBReference &RHS) const { - return std::tie(BB, BBIndex) < std::tie(RHS.BB, RHS.BBIndex); - } - operator MCInst &() const { - assert(BB != nullptr); - return BB->getInstructionAtIndex(BBIndex); - } - uint64_t getAddress() const { - // 4 bytes per instruction on AArch64. - // FIXME: the assumption of 4 byte per instruction needs to be fixed before - // this method gets used on any non-AArch64 binaries (but should be fine for - // pac-ret analysis, as that is an AArch64-specific feature). - return BB->getFunction()->getAddress() + BB->getOffset() + BBIndex * 4; - } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &); - -struct MCInstInBFReference { - BinaryFunction *BF; - uint64_t Offset; - MCInstInBFReference(BinaryFunction *BF, uint64_t Offset) - : BF(BF), Offset(Offset) {} - - static MCInstInBFReference get(const MCInst *Inst, BinaryFunction &BF) { - for (auto &I : BF.instrs()) - if (Inst == &I.second) - return MCInstInBFReference(&BF, I.first); - return {}; - } - - MCInstInBFReference() : BF(nullptr), Offset(0) {} - bool operator==(const MCInstInBFReference &RHS) const { - return BF == RHS.BF && Offset == RHS.Offset; - } - bool operator<(const MCInstInBFReference &RHS) const { - if (BF != RHS.BF) - return BF < RHS.BF; - return Offset < RHS.Offset; - } - operator MCInst &() const { - assert(BF != nullptr); - return *BF->getInstructionAtOffset(Offset); - } - - uint64_t getOffset() const { return Offset; } - - uint64_t getAddress() const { return BF->getAddress() + getOffset(); } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &); - -struct MCInstReference { - enum Kind { FunctionParent, BasicBlockParent }; - Kind ParentKind; - union U { - MCInstInBBReference BBRef; - MCInstInBFReference BFRef; - U(MCInstInBBReference BBRef) : BBRef(BBRef) {} - U(MCInstInBFReference BFRef) : BFRef(BFRef) {} - } U; - MCInstReference(MCInstInBBReference BBRef) - : ParentKind(BasicBlockParent), U(BBRef) {} - MCInstReference(MCInstInBFReference BFRef) - : ParentKind(FunctionParent), U(BFRef) {} - MCInstReference(BinaryBasicBlock *BB, int64_t BBIndex) - : MCInstReference(MCInstInBBReference(BB, BBIndex)) {} - MCInstReference(BinaryFunction *BF, uint32_t Offset) - : MCInstReference(MCInstInBFReference(BF, Offset)) {} - - static MCInstReference get(const MCInst *Inst, BinaryFunction &BF) { - if (BF.hasCFG()) - return MCInstInBBReference::get(Inst, BF); - return MCInstInBFReference::get(Inst, BF); - } - - bool operator<(const MCInstReference &RHS) const { - if (ParentKind != RHS.ParentKind) - return ParentKind < RHS.ParentKind; - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef < RHS.U.BBRef; - case FunctionParent: - return U.BFRef < RHS.U.BFRef; - } - llvm_unreachable(""); - } - - bool operator==(const MCInstReference &RHS) const { - if (ParentKind != RHS.ParentKind) - return false; - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef == RHS.U.BBRef; - case FunctionParent: - return U.BFRef == RHS.U.BFRef; - } - llvm_unreachable(""); - } - - operator MCInst &() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef; - case FunctionParent: - return U.BFRef; - } - llvm_unreachable(""); - } - - operator bool() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef.BB != nullptr; - case FunctionParent: - return U.BFRef.BF != nullptr; - } - llvm_unreachable(""); - } - - uint64_t getAddress() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef.getAddress(); - case FunctionParent: - return U.BFRef.getAddress(); - } - llvm_unreachable(""); - } - - BinaryFunction *getFunction() const { - switch (ParentKind) { - case FunctionParent: - return U.BFRef.BF; - case BasicBlockParent: - return U.BBRef.BB->getFunction(); - } - llvm_unreachable(""); - } - - BinaryBasicBlock *getBasicBlock() const { - switch (ParentKind) { - case FunctionParent: - return nullptr; - case BasicBlockParent: - return U.BBRef.BB; - } - llvm_unreachable(""); - } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &); - namespace PAuthGadgetScanner { // The report classes are designed to be used in an immutable manner. diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index 8c1f5d0bb37b5..05afd5b333de5 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -31,6 +31,7 @@ add_llvm_library(LLVMBOLTCore GDBIndex.cpp HashUtilities.cpp JumpTable.cpp + MCInstUtils.cpp MCPlusBuilder.cpp ParallelUtilities.cpp Relocation.cpp diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp new file mode 100644 index 0000000000000..40f6edd59135c --- /dev/null +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -0,0 +1,57 @@ +//===- bolt/Passes/MCInstUtils.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Core/MCInstUtils.h" + +#include "bolt/Core/BinaryBasicBlock.h" +#include "bolt/Core/BinaryFunction.h" + +#include <iterator> + +using namespace llvm; +using namespace llvm::bolt; + +MCInstReference MCInstReference::get(const MCInst *Inst, + const BinaryFunction &BF) { + if (BF.hasCFG()) { + for (BinaryBasicBlock &BB : BF) + for (MCInst &MI : BB) + if (&MI == Inst) + return MCInstReference(&BB, Inst); + return {}; + } + + for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) { + if (&I->second == Inst) + return MCInstReference(&BF, I); + } + return {}; +} + +raw_ostream &MCInstReference::print(raw_ostream &OS) const { + if (const RefInBB *Ref = tryGetRefInBB()) { + OS << "MCInstBBRef<"; + if (Ref->BB == nullptr) { + OS << "BB:(null)"; + } else { + unsigned IndexInBB = std::distance(Ref->BB->begin(), Ref->It); + OS << "BB:" << Ref->BB->getName() << ":" << IndexInBB; + } + OS << ">"; + return OS; + } + + const RefInBF &Ref = getRefInBF(); + OS << "MCInstBFRef<"; + if (Ref.BF == nullptr) + OS << "BF:(null)"; + else + OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.It->first; + OS << ">"; + return OS; +} diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 835ee26aaf08a..5e08ae3fbf767 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -24,39 +24,6 @@ namespace llvm { namespace bolt { - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &Ref) { - OS << "MCInstBBRef<"; - if (Ref.BB == nullptr) - OS << "BB:(null)"; - else - OS << "BB:" << Ref.BB->getName() << ":" << Ref.BBIndex; - OS << ">"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &Ref) { - OS << "MCInstBFRef<"; - if (Ref.BF == nullptr) - OS << "BF:(null)"; - else - OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.getOffset(); - OS << ">"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &Ref) { - switch (Ref.ParentKind) { - case MCInstReference::BasicBlockParent: - OS << Ref.U.BBRef; - return OS; - case MCInstReference::FunctionParent: - OS << Ref.U.BFRef; - return OS; - } - llvm_unreachable(""); -} - namespace PAuthGadgetScanner { [[maybe_unused]] static void traceInst(const BinaryContext &BC, StringRef Label, @@ -93,10 +60,10 @@ template <typename T> static void iterateOverInstrs(BinaryFunction &BF, T Fn) { if (BF.hasCFG()) { for (BinaryBasicBlock &BB : BF) for (int64_t I = 0, E = BB.size(); I < E; ++I) - Fn(MCInstInBBReference(&BB, I)); + Fn(MCInstReference(&BB, I)); } else { - for (auto I : BF.instrs()) - Fn(MCInstInBFReference(&BF, I.first)); + for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) + Fn(MCInstReference(&BF, I)); } } @@ -1303,8 +1270,7 @@ static bool shouldAnalyzeTailCallInst(const BinaryContext &BC, // (such as isBranch at the time of writing this comment), some don't (such // as isCall). For that reason, call MCInstrDesc's methods explicitly when // it is important. - const MCInstrDesc &Desc = - BC.MII->get(static_cast<const MCInst &>(Inst).getOpcode()); + const MCInstrDesc &Desc = BC.MII->get(Inst.getMCInst().getOpcode()); // Tail call should be a branch (but not necessarily an indirect one). if (!Desc.isBranch()) return false; @@ -1499,7 +1465,7 @@ void FunctionAnalysisContext::findUnsafeUses( // Arbitrarily attach the report to the first instruction of BB. Reports.push_back( - make_generic_report(MCInstReference::get(FirstInst, BF), + make_generic_report(MCInstReference(&BB, FirstInst), "Warning: the function has unreachable basic " "blocks (possibly incomplete CFG)")); UnreachableBBReported = true; @@ -1660,31 +1626,49 @@ void Analysis::runOnFunction(BinaryFunction &BF, } } +// Compute the instruction address for printing (may be slow). +static uint64_t getAddress(const MCInstReference &Inst) { + const BinaryFunction *BF = Inst.getFunction(); + + if (Inst.hasCFG()) { + const BinaryBasicBlock *BB = Inst.getBasicBlock(); + + auto It = static_cast<BinaryBasicBlock::const_iterator>(&Inst.getMCInst()); + unsigned IndexInBB = std::distance(BB->begin(), It); + + // FIXME: this assumes all instructions are 4 bytes in size. This is true + // for AArch64, but it might be good to extract this function so it can be + // used elsewhere and for other targets too. + return BF->getAddress() + BB->getOffset() + IndexInBB * 4; + } + + for (auto I = BF->instrs().begin(), E = BF->instrs().end(); I != E; ++I) { + if (&I->second == &Inst.getMCInst()) + return BF->getAddress() + I->first; + } + llvm_unreachable("Instruction not found in function"); +} + static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB, size_t StartIndex = 0, size_t EndIndex = -1) { if (EndIndex == (size_t)-1) EndIndex = BB->size() - 1; const BinaryFunction *BF = BB->getFunction(); for (unsigned I = StartIndex; I <= EndIndex; ++I) { - // FIXME: this assumes all instructions are 4 bytes in size. This is true - // for AArch64, but it might be good to extract this function so it can be - // used elsewhere and for other targets too. - uint64_t Address = BB->getOffset() + BF->getAddress() + 4 * I; - const MCInst &Inst = BB->getInstructionAtIndex(I); + MCInstReference Inst(BB, I); if (BC.MIB->isCFI(Inst)) continue; - BC.printInstruction(outs(), Inst, Address, BF); + BC.printInstruction(outs(), Inst, getAddress(Inst), BF); } } static void reportFoundGadgetInSingleBBSingleRelatedInst( raw_ostream &OS, const BinaryContext &BC, const MCInstReference RelatedInst, const MCInstReference Location) { - BinaryBasicBlock *BB = Location.getBasicBlock(); - assert(RelatedInst.ParentKind == MCInstReference::BasicBlockParent); - assert(Location.ParentKind == MCInstReference::BasicBlockParent); - MCInstInBBReference RelatedInstBB = RelatedInst.U.BBRef; - if (BB == RelatedInstBB.BB) { + const BinaryBasicBlock *BB = Location.getBasicBlock(); + assert(RelatedInst.hasCFG()); + assert(Location.hasCFG()); + if (BB == RelatedInst.getBasicBlock()) { OS << " This happens in the following basic block:\n"; printBB(BC, BB); } @@ -1692,16 +1676,16 @@ static void reportFoundGadgetInSingleBBSingleRelatedInst( void Diagnostic::printBasicInfo(raw_ostream &OS, const BinaryContext &BC, StringRef IssueKind) const { - BinaryFunction *BF = Location.getFunction(); - BinaryBasicBlock *BB = Location.getBasicBlock(); + const BinaryBasicBlock *BB = Location.getBasicBlock(); + const BinaryFunction *BF = Location.getFunction(); OS << "\nGS-PAUTH: " << IssueKind; OS << " in function " << BF->getPrintName(); if (BB) OS << ", basic block " << BB->getName(); - OS << ", at address " << llvm::format("%x", Location.getAddress()) << "\n"; + OS << ", at address " << llvm::format("%x", getAddress(Location)) << "\n"; OS << " The instruction is "; - BC.printInstruction(OS, Location, Location.getAddress(), BF); + BC.printInstruction(OS, Location, getAddress(Location), BF); } void GadgetDiagnostic::generateReport(raw_ostream &OS, @@ -1714,22 +1698,20 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location, const BinaryFunction &BF = *Location.getFunction(); const BinaryContext &BC = BF.getBinaryContext(); - // Sort by address to ensure output is deterministic. + // Sort the references to make output deterministic. SmallVector<MCInstReference> RI(RelatedInstrs); - llvm::sort(RI, [](const MCInstReference &A, const MCInstReference &B) { - return A.getAddress() < B.getAddress(); - }); + llvm::sort(RI); for (unsigned I = 0; I < RI.size(); ++I) { MCInstReference InstRef = RI[I]; OS << " " << (I + 1) << ". "; - BC.printInstruction(OS, InstRef, InstRef.getAddress(), &BF); + BC.printInstruction(OS, InstRef, getAddress(InstRef), &BF); }; if (RelatedInstrs.size() == 1) { const MCInstReference RelatedInst = RelatedInstrs[0]; // Printing the details for the MCInstReference::FunctionParent case // is not implemented not to overcomplicate the code, as most functions // are expected to have CFG information. - if (RelatedInst.ParentKind == MCInstReference::BasicBlockParent) + if (RelatedInst.hasCFG()) reportFoundGadgetInSingleBBSingleRelatedInst(OS, BC, RelatedInst, Location); } _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits