Issue 164617
Summary [llvm] Miscompilation with SROA (and `fabs`?)
Labels new issue
Assignees
Reporter thewilsonator
    The following code:
```llvm
; ModuleID = './gh4997.d'
source_filename = "./gh4997.d"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macos14"

@.str = private unnamed_addr constant [11 x i8] c"./gh4997.d\00" ; [#uses = 1]

; [#uses = 0]
; Function Attrs: uwtable(sync)
; Unoptimised entry point: runs a loop over %d while d < 2, tracking in %maxip
; the value of %ip with the largest magnitude, and returns (maxip >= 0.0) as i32.
define i32 @main(i32 %__param_0_arg, ptr %__param_1_arg) #0 {
  %__param_0 = alloca i32, align 4                ; [#uses = 1, size/byte = 4]
  %__param_1 = alloca ptr, align 8                ; [#uses = 1, size/byte = 8]
  %v = alloca [2 x double], align 8               ; [#uses = 3, size/byte = 16]
  %maxip = alloca double, align 8                 ; [#uses = 4, size/byte = 8]
  %d = alloca i32, align 4 ; [#uses = 6, size/byte = 4]
  %w = alloca [2 x double], align 8 ; [#uses = 4, size/byte = 16]
  %ip = alloca double, align 8 ; [#uses = 3, size/byte = 8]
  ; Spill both arguments to their stack slots; they are not read again below.
  store i32 %__param_0_arg, ptr %__param_0, align 4
  store ptr %__param_1_arg, ptr %__param_1, align 8
  ; v = [-1.0, 0.0]; maxip = 0.0
 store [2 x double] [double -1.000000e+00, double 0.000000e+00], ptr %v, align 8
  store double 0.000000e+00, ptr %maxip, align 8
  ; d is stored as 0 twice before the loop is entered.
  store i32 0, ptr %d, align 4
  store i32 0, ptr %d, align 4
  br label %forcond

forcond: ; preds = %forinc, %0
  ; Loop condition: continue while d < 2 (signed compare).
  %1 = load i32, ptr %d, align 4                  ; [#uses = 1]
  %2 = icmp slt i32 %1, 2 ; [#uses = 1]
  br i1 %2, label %forbody, label %endfor

forbody:                                          ; preds = %forcond
  ; Re-initialise w = [0.0, 1.0] on every iteration.
  store [2 x double] [double 0.000000e+00, double 1.000000e+00], ptr %w, align 8
  ; Bounds check: d sign-extended to i64 must be (unsigned) less than 2.
  %3 = load i32, ptr %d, align 4                  ; [#uses = 1]
  %4 = sext i32 %3 to i64                         ; [#uses = 3]
 %bounds.cmp = icmp ult i64 %4, 2                ; [#uses = 1]
  br i1 %bounds.cmp, label %bounds.ok, label %bounds.fail

bounds.ok: ; preds = %forbody
  ; w[d] = 1.0
  %5 = getelementptr inbounds [2 x double], ptr %w, i32 0, i64 %4 ; [#uses = 1, type = ptr]
  store double 1.000000e+00, ptr %5, align 8
  ; ip = v[0] * w[0] + v[1] * w[1]
  %6 = getelementptr inbounds [2 x double], ptr %v, i32 0, i64 0 ; [#uses = 1, type = ptr]
  %7 = load double, ptr %6, align 8               ; [#uses = 1]
  %8 = getelementptr inbounds [2 x double], ptr %w, i32 0, i64 0 ; [#uses = 1, type = ptr]
  %9 = load double, ptr %8, align 8               ; [#uses = 1]
  %10 = fmul double %7, %9 ; [#uses = 1]
  %11 = getelementptr inbounds [2 x double], ptr %v, i32 0, i64 1 ; [#uses = 1, type = ptr]
  %12 = load double, ptr %11, align 8             ; [#uses = 1]
  %13 = getelementptr inbounds [2 x double], ptr %w, i32 0, i64 1 ; [#uses = 1, type = ptr]
  %14 = load double, ptr %13, align 8             ; [#uses = 1]
  %15 = fmul double %12, %14 ; [#uses = 1]
  %16 = fadd double %10, %15 ; [#uses = 1]
  store double %16, ptr %ip, align 8
  ; Branch on fabs(ip) > fabs(maxip); note that BOTH operands go through fabs here.
  %17 = load double, ptr %ip, align 8             ; [#uses = 1]
  %18 = call double @llvm.fabs.f64(double %17) #2 ; [#uses = 1]
  %19 = load double, ptr %maxip, align 8          ; [#uses = 1]
  %20 = call double @llvm.fabs.f64(double %19) #2 ; [#uses = 1]
  %21 = fcmp ogt double %18, %20                  ; [#uses = 1]
  br i1 %21, label %if, label %endif

if: ; preds = %bounds.ok
  ; maxip = ip
  %22 = load double, ptr %ip, align 8             ; [#uses = 1]
  store double %22, ptr %maxip, align 8
 br label %endif

endif:                                            ; preds = %if, %bounds.ok
  br label %forinc

bounds.fail: ; preds = %forbody
  ; Out-of-range index: call the bounds-failure handler (declared noreturn via #1).
  %.lcssa = phi i64 [ %4, %forbody ] ; [#uses = 1, type = i64]
  call void @_d_arraybounds_index({ i64, ptr } { i64 10, ptr @.str }, i32 15, i64 %.lcssa, i64 2) #1
 unreachable

forinc:                                           ; preds = %endif
  ; d = d + 1
  %23 = load i32, ptr %d, align 4                 ; [#uses = 1]
 %24 = add i32 %23, 1                            ; [#uses = 1]
  store i32 %24, ptr %d, align 4
  br label %forcond

endfor: ; preds = %forcond
  ; Return value: 1 if maxip >= 0.0 (ordered compare), otherwise 0.
  %25 = load double, ptr %maxip, align 8 ; [#uses = 1]
  %26 = fcmp oge double %25, 0.000000e+00         ; [#uses = 1]
  %27 = zext i1 %26 to i32                        ; [#uses = 1]
  ret i32 %27
}

; [#uses = 1]
; Function Attrs: cold noreturn uwtable(sync)
declare void @_d_arraybounds_index({ i64, ptr }, i32, i64, i64) #1

; [#uses = 2]
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare double @llvm.fabs.f64(double) #2

attributes #0 = { uwtable(sync) "frame-pointer"="all" "target-cpu"="generic" }
attributes #1 = { cold noreturn uwtable(sync) }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }

!llvm.ident = !{!0}

!0 = !{!"ldc version 1.41.0"}
```
It was generated from the following D code by LDC 1.41 on LLVM 20.1.5:
```d
// Body-less template declaration tied by the LDC_intrinsic pragma to the name
// "llvm.fabs.f#"; the template constraint restricts T to floating-point types.
// The IR in this report shows the double case calling @llvm.fabs.f64.
pragma(LDC_intrinsic, "llvm.fabs.f#")
    T llvm_fabs(T)(T val)
        if (__traits(isFloating, T));
pragma(LDC_no_moduleinfo);

// Reproducer: over two iterations, keep in `maxip` the dot product v.w with the
// largest magnitude, then return whether that value is non-negative.
extern(C) int main(int, char**)
{
    double[2] v = [-1.0, 0.0];
    double maxip = 0;
 int d;
    // NOTE(review): `version(none)` disables the statement that follows it, so as
    // written the loop below would not be compiled at all -- yet the IR in this
    // report contains the loop. Confirm whether this line belongs in the reproducer.
    version(none)
    for (d = 0; d < 2; d++)
    {
        // w is rebuilt each iteration, then element d is overwritten with 1.0.
 double[2] w = [0.0, 1.0];
        w[d] = 1.0;
        double ip = v[0] * w[0] + v[1] * w[1];
        // Magnitude comparison: both sides go through fabs.
        if (llvm_fabs(ip) > llvm_fabs(maxip))
 maxip = ip;
    }
    // The bool result converts to the int return value (1 or 0).
    return maxip >= 0;
}
```
With `-O0` it returns `0`; with `-O1` it returns `1`. The corresponding C code compiled by clang
```c
// C counterpart of the D reproducer: over two iterations, keep in `maxip` the
// dot product v.w with the largest magnitude and return whether it is >= 0.
int main(int, char**)
{
    double v[2] = {-1.0, 0.0};
    double maxip = 0;
 for (int d = 0; d < 2; d++)
    {
        // w is rebuilt each iteration, then element d is overwritten with 1.0.
        double w[2] = {0.0, 1.0};
 w[d] = 1.0;
        double ip = v[0] * w[0] + v[1] * w[1];
        // Magnitude comparison: both sides go through fabs.
        if (__builtin_fabs(ip) > __builtin_fabs(maxip))
            maxip = ip;
 }
    // The comparison result (1 or 0) is the return value.
    return maxip >= 0;
}
```
is optimised to `ret i32 0` with `-O2`.
Compiling and running the results of  `--print-module-scope --print-before-pass-number=N`:
```llvm
; *** IR Dump Before 47-SROAPass on main ***
source_filename = "./gh4997.d"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macos14"

; @main as dumped before SROA: the loop body appears twice as straight-line code,
; and %w is the only remaining alloca.
define range(i32 0, 2) i32 @main(i32 %__param_0_arg, ptr nocapture readnone %__param_1_arg) local_unnamed_addr #0 {
  %w = alloca [2 x double], align 8
  ; Address of w[1] (byte offset 8 from %w).
  %.fca.1.gep = getelementptr inbounds nuw i8, ptr %w, i64 8
  br label %forbody

forbody: ; preds = %0
  ; First copy of the body: w = [0.0, 1.0], then w[0] = 1.0.
  store double 0.000000e+00, ptr %w, align 8
  store double 1.000000e+00, ptr %.fca.1.gep, align 8
  store double 1.000000e+00, ptr %w, align 8
  %1 = load double, ptr %w, align 8
 %2 = load double, ptr %.fca.1.gep, align 8
  ; %4 = w[1] * 0.0 - w[0]
  %3 = fmul double %2, 0.000000e+00
  %4 = fsub double %3, %1
  %5 = call double @llvm.fabs.f64(double %4) #2
  %6 = fcmp ogt double %5, 0.000000e+00
 %maxip.1 = select i1 %6, double %4, double 0.000000e+00
  ; Second copy of the body: w = [0.0, 1.0], then w[1] = 1.0.
  store double 0.000000e+00, ptr %w, align 8
  store double 1.000000e+00, ptr %.fca.1.gep, align 8
  %7 = getelementptr inbounds nuw [2 x double], ptr %w, i64 0, i64 1
  store double 1.000000e+00, ptr %7, align 8
  %8 = load double, ptr %w, align 8
  %9 = load double, ptr %.fca.1.gep, align 8
  ; %11 = w[1] * 0.0 - w[0]
  %10 = fmul double %9, 0.000000e+00
  %11 = fsub double %10, %8
  %12 = call double @llvm.fabs.f64(double %11) #2
  ; NOTE(review): fabs(%11) is compared against %maxip.1 itself, with no fabs applied
  ; to %maxip.1, unlike the fabs(ip) > fabs(maxip) form in the unoptimised IR -- confirm.
  %13 = fcmp ogt double %12, %maxip.1
 %maxip.1.1 = select i1 %13, double %11, double %maxip.1
  ; Return (maxip >= 0.0) zero-extended to i32.
  %14 = fcmp oge double %maxip.1.1, 0.000000e+00
  %15 = zext i1 %14 to i32
  ret i32 %15
}
declare double @llvm.fabs.f64(double) #1

attributes #0 = { uwtable(sync) "frame-pointer"="all" "target-cpu"="generic" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
```
returns with exit code 0 (for N <= 47). After running SROA (N >= 48), it is optimised to
```llvm
; *** IR Dump Before 48-MemCpyOptPass on main ***
source_filename = "./gh4997.d"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macos14"

; Function Attrs: uwtable(sync)
; @main as dumped after SROA (before MemCpyOpt): no alloca, load or store remains;
; the arithmetic operates directly on floating-point constants.
define range(i32 0, 2) i32 @main(i32 %__param_0_arg, ptr nocapture readnone %__param_1_arg) local_unnamed_addr #0 {
  br label %forbody

forbody: ; preds = %0
  ; First copy of the body: %2 = 1.0 * 0.0 - 1.0
  %1 = fmul double 1.000000e+00, 0.000000e+00
  %2 = fsub double %1, 1.000000e+00
  %3 = call double @llvm.fabs.f64(double %2) #2
  %4 = fcmp ogt double %3, 0.000000e+00
  %maxip.1 = select i1 %4, double %2, double 0.000000e+00
  ; Second copy of the body: %6 = 1.0 * 0.0 - 0.0
  %5 = fmul double 1.000000e+00, 0.000000e+00
  %6 = fsub double %5, 0.000000e+00
  %7 = call double @llvm.fabs.f64(double %6) #2
  ; NOTE(review): fabs(%6) is compared against %maxip.1 itself, with no fabs applied
  ; to %maxip.1, unlike the fabs(ip) > fabs(maxip) form in the unoptimised IR -- confirm.
  %8 = fcmp ogt double %7, %maxip.1
  %maxip.1.1 = select i1 %8, double %6, double %maxip.1
  ; Return (maxip >= 0.0) zero-extended to i32.
  %9 = fcmp oge double %maxip.1.1, 0.000000e+00
  %10 = zext i1 %9 to i32
  ret i32 %10
}

declare double @llvm.fabs.f64(double) #1

attributes #0 = { uwtable(sync) "frame-pointer"="all" "target-cpu"="generic" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
```
which returns with exit code 1. LDC's optimisation pipeline is: [pipeline-ldc.txt](https://github.com/user-attachments/files/23053556/pipeline-ldc.txt)
When the unoptimised output is optimised with `opt -O1` the program succeeds with exit code 0. `opt` runs the following pipeline:
[pipeline-opt.txt](https://github.com/user-attachments/files/23053557/pipeline-opt.txt)
The diff is:
```diff
diff --git a/pipeline-opt.txt b/pipeline-ldc.txt
index 79ceda9..1a769cc 100644
--- a/pipeline-opt.txt
+++ b/pipeline-ldc.txt
@@ -29,10 +29,21 @@
  Running pass  ReassociatePass on main
  Running pass LoopSimplifyPass on main
  Running pass  LCSSAPass on main
+ Running pass LoopInstSimplifyPass on loop %forcond in function main
+ Running pass LoopSimplifyCFGPass on loop %forcond in function main
+ Running pass LICMPass on loop %forcond in function main
+ Running pass  LoopRotatePass on loop %forcond in function main
+ Running pass  LICMPass on loop %forbody in function main
+ Running pass  SimpleLoopUnswitchPass on loop %forbody in function main
  Running pass  SimplifyCFGPass on main
  Running pass InstCombinePass on main
  Running pass  LoopSimplifyPass on main
  Running pass  LCSSAPass on main
+ Running pass  LoopIdiomRecognizePass on loop %forbody in function main
+ Running pass  IndVarSimplifyPass on loop %forbody in function main
+ Running pass  LoopIdiomVectorizePass on loop %forbody in function main
+ Running pass  LoopDeletionPass on loop %forbody in function main
+ Running pass  LoopFullUnrollPass on loop %forbody in function main
  Running pass  SROAPass on main
  Running pass MemCpyOptPass on main
  Running pass  SCCPPass on main
@@ -80,6 +91,8 @@
 Running pass  DivRemPairsPass on main
  Running pass  TailCallElimPass on main
  Running pass  SimplifyCFGPass on main
+ Running pass  StripExternals on [module]
+ Running pass  GlobalDCEPass on [module]
  Running pass GlobalDCEPass on [module]
  Running pass  ConstantMergePass on [module]
 Running pass  CGProfilePass on [module]
```


_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to