Issue 130509
Summary Illegal Instruction Fault From Improper CPU Feature Detection
Labels new issue
Assignees
Reporter EmmaJaneBonestell
    **LLVM Versions Tested**: 18, 19, 20, current tip

### Issue Summary
LLVM components incorrectly detect supported CPU instructions, resulting in the generation of illegal instructions and, consequently, in binaries that fault at runtime.

### Detailed Description
The compiler infrastructure appears to use static CPU model mappings to determine available instruction sets, instead of querying what the CPU actually supports (via `/proc/cpuinfo` or hwcap on AArch64). This causes particular problems on:

1. Arm-V9 CPUs from Qualcomm SoCs that do not implement SVE despite the Arm-V9 specification requiring it
2. Potentially any system where the Linux kernel is not configured with `CONFIG_ARM64_SVE=Y`

But is not necessarily limited to AArch64 or the above.

This affects instruction selection/codegen/runtime dispatching for all of LLVM like Clang, Flang, OpenMP, ORC JIT, etc.

### Reproduction Steps
Below is a relatively minimal test case using ORC JIT that demonstrates the issue. A Termux environment on Android devices using Qualcomm chips is likely the easiest target for reproduction. This could also be reproduced with a vectorizable loop in C code, with Clang, using the "-march=native" flag.

```cpp
#include <iostream>
#include <vector>
#include <string>

#include "llvm/ExecutionEngine/Orc/LLJIT.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/TargetParser/Host.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::orc;

// Diagnostic handler to suppress remarks
// Diagnostic handler that drops optimization remarks while letting every
// other diagnostic fall through to LLVM's default reporting path.
class SilenceRemarksHandler : public DiagnosticHandler {
public:
  bool handleDiagnostics(const DiagnosticInfo &DI) override {
    // true  -> diagnostic is considered handled (remark is swallowed)
    // false -> defer to the default handler for everything else
    return DI.getSeverity() == DS_Remark;
  }
};

// Builds a module "VecTest" containing a single externally visible function:
//
//   void vector_op(float *A, float *B, float *Result, i32 Length)
//
// whose loop computes Result[i] = A[i]*A[i] + B[i] for i in [0, Length).
// The memory accesses are tagged with llvm.mem.parallel_loop_access and the
// latch branch carries a llvm.loop.vectorize.enable(true) loop ID, pushing
// the loop vectorizer to vectorize regardless of its own profitability
// analysis.
std::unique_ptr<Module> createVectorModule(LLVMContext &Context) {
 auto M = std::make_unique<Module>("VecTest", Context);

  // The only types the function needs: float, float*, i32.
  auto *FloatTy = Type::getFloatTy(Context);
  auto *FloatPtrTy = PointerType::get(FloatTy, 0);
  auto *Int32Ty = Type::getInt32Ty(Context);

  // Signature: void (float*, float*, float*, i32)
  FunctionType *FT = FunctionType::get(
      Type::getVoidTy(Context),
 {FloatPtrTy, FloatPtrTy, FloatPtrTy, Int32Ty},
      false);

 Function *F = Function::Create(FT, Function::ExternalLinkage, "vector_op", M.get());
  F->addFnAttr(Attribute::NoUnwind);

  // Give the incoming arguments readable names for the code below.
  auto Args = F->arg_begin();
  Value *A = &*Args++;
  Value *B = &*Args++;
  Value *Result = &*Args++;
  Value *Length = &*Args++;

  // CFG: entry -> loop (self-latching) -> exit.
  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
  BasicBlock *LoopBB = BasicBlock::Create(Context, "loop", F);
  BasicBlock *ExitBB = BasicBlock::Create(Context, "exit", F);

  IRBuilder<> Builder(Context);

  // entry: i = 0. The induction variable lives in a stack slot; promotion
  // to a register is left to the optimization pipeline.
  Builder.SetInsertPoint(EntryBB);
  Value *IndexAlloca = Builder.CreateAlloca(Int32Ty, nullptr, "i");
 Builder.CreateStore(ConstantInt::get(Int32Ty, 0), IndexAlloca);
 Builder.CreateBr(LoopBB);

  // loop: reload i, compute the exit condition, and form this iteration's
  // element pointers.
  // NOTE(review): the condition is computed here but only branched on at
  // the bottom of the block, so the body (including the loads/store below)
  // executes once even when Length == 0 — acceptable for this repro, but
  // not a general-purpose loop shape.
  Builder.SetInsertPoint(LoopBB);
  Value *Index = Builder.CreateLoad(Int32Ty, IndexAlloca, "idx");
  Value *LoopCond = Builder.CreateICmpSLT(Index, Length, "cond");

  Value *APtr = Builder.CreateGEP(FloatTy, A, Index, "a_ptr");
  Value *BPtr = Builder.CreateGEP(FloatTy, B, Index, "b_ptr");
  Value *ResultPtr = Builder.CreateGEP(FloatTy, Result, Index, "result_ptr");

  // Distinct empty MDNode used to tag the loop's memory operations.
  MDNode *AccessGroup = MDNode::get(Context, {});

  Value *AVal = Builder.CreateLoad(FloatTy, APtr, "a_val");
  Value *BVal = Builder.CreateLoad(FloatTy, BPtr, "b_val");

  // Declare the loads free of loop-carried dependences so the vectorizer
  // does not have to prove independence itself.
 cast<Instruction>(AVal)->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);
 cast<Instruction>(BVal)->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);

  // result = a*a + b
  Value *Square = Builder.CreateFMul(AVal, AVal, "square");
  Value *AddResult = Builder.CreateFAdd(Square, BVal, "add");

  auto *StoreInst = Builder.CreateStore(AddResult, ResultPtr);
  StoreInst->setMetadata("llvm.mem.parallel_loop_access", AccessGroup);

  // i = i + 1
  Value *NextIndex = Builder.CreateAdd(Index, ConstantInt::get(Int32Ty, 1), "next_idx");
 Builder.CreateStore(NextIndex, IndexAlloca);

  // Loop metadata to force vectorization:
  //   !{!"llvm.loop.vectorize.enable", i1 true}
  MDNode *ForcedVec = MDNode::get(Context, {
 MDString::get(Context, "llvm.loop.vectorize.enable"),
 ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 1))
 });

  // Loop IDs must be self-referential: operand 0 points back at the node
  // itself, hence the placeholder-then-replaceOperandWith dance.
  MDNode *LoopID = MDNode::get(Context, {MDNode::get(Context, {}), ForcedVec});
  LoopID->replaceOperandWith(0, LoopID);

  // Attach the loop ID to the latch branch, then emit the trivial exit.
 Builder.CreateCondBr(LoopCond, LoopBB, ExitBB)->setMetadata("llvm.loop", LoopID);
  Builder.SetInsertPoint(ExitBB);
  Builder.CreateRetVoid();

 verifyFunction(*F);
  return M;
}

// Apply optimization passes to force vectorization
void optimizeModule(Module &M, TargetMachine *TM) {
 PassBuilder PB;
  
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  
 FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
  
 PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
 PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
 PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
  
  ModulePassManager MPM = PB.buildPerModuleDefaultPipeline(OptimizationLevel::O3);
  
  MPM.run(M, MAM);
}

int main(int argc, char** argv) {
  // Parse command line arguments
  bool useNoSVE = false;
  for (int i = 1; i < argc; i++) {
 if (std::string(argv[i]) == "--use-nosve") {
      useNoSVE = true;
 }
  }

  InitializeNativeTarget();
 InitializeNativeTargetAsmPrinter();
  InitializeNativeTargetAsmParser();
 
  // Silence remarks
  LLVMContext Context;
 Context.setDiagnosticHandler(std::make_unique<SilenceRemarksHandler>());
 
  auto JTMB = cantFail(JITTargetMachineBuilder::detectHost());
 JTMB.setCodeGenOptLevel(CodeGenOptLevel::Aggressive);
  
  if (useNoSVE) {
    JTMB.addFeatures(std::vector<std::string>{"-sve"});
  }
  
 std::unique_ptr<TargetMachine> TM(cantFail(JTMB.createTargetMachine()));
 auto M = createVectorModule(Context);
 M->setDataLayout(TM->createDataLayout());
  
  // Apply optimization passes to ensure and force vectorization
  optimizeModule(*M, TM.get());
  
  // Set-up JIT compiled function
  auto JIT = cantFail(LLJITBuilder().setJITTargetMachineBuilder(std::move(JTMB)).create());
 cantFail(JIT->addIRModule(ThreadSafeModule(std::move(M), std::make_unique<LLVMContext>())));
  auto VecOpAddr = cantFail(JIT->lookup("vector_op"));
  auto *VectorOp = (void(*)(float*, float*, float*, int))VecOpAddr.getValue();
  const int Length = 1024;
 std::vector<float> A(Length), B(Length), Result(Length);
  for (int i = 0; i < Length; i++) {
    A[i] = i;
    B[i] = i * 2;
  }
  
  // Execute JIT-compiled function
  // It should fault with an illegal instruction on such devices
  VectorOp(A.data(), B.data(), Result.data(), Length);
  
 // Will only reach here if execution succeeds
  std::cout << "Result[10]: " << Result[10] << std::endl;
  
  return 0;
}
```

When executed normally, the program generates illegal instructions on hardware that meets the specified conditions. It also accepts a `--use-nosve` argument, which adds `-sve` to the JIT's feature list and should prevent the crash.

### Additional Context
Attempting to work around this issue locally revealed frustrating inconsistencies in how LLVM CPU features are specified across different LLVM interfaces:
- `-march=`
- `-mcpu=`
- `-Xclang -target-feature`
- `llvm::orc::JITTargetMachineBuilder::addFeatures()`

Each of these accepts a different set of feature flags with inconsistent naming conventions and limited documentation.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to