| Issue |
164617
|
| Summary |
[llvm] Miscompilation with SROA (and `fabs`?)
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
thewilsonator
|
The following code:
```llvm
; ModuleID = './gh4997.d'
source_filename = "./gh4997.d"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macos14"
@.str = private unnamed_addr constant [11 x i8] c"./gh4997.d\00" ; [#uses = 1]
; [#uses = 0]
; Function Attrs: uwtable(sync)
; Unoptimised IR for the D reproducer's `main`.
; Each local (%v, %maxip, %d, %w, %ip) has its own alloca and is only accessed through loads and stores.
; Control flow: forcond (d < 2) -> forbody -> bounds check on d -> bounds.ok (loop body) -> if/endif -> forinc -> forcond.
; endfor returns (maxip >= 0.0) zero-extended to i32.
define i32 @main(i32 %__param_0_arg, ptr %__param_1_arg) #0 {
%__param_0 = alloca i32, align 4 ; [#uses = 1, size/byte = 4]
%__param_1 = alloca ptr, align 8 ; [#uses = 1, size/byte = 8]
%v = alloca [2 x double], align 8 ; [#uses = 3, size/byte = 16]
%maxip = alloca double, align 8 ; [#uses = 4, size/byte = 8]
%d = alloca i32, align 4 ; [#uses = 6, size/byte = 4]
%w = alloca [2 x double], align 8 ; [#uses = 4, size/byte = 16]
%ip = alloca double, align 8 ; [#uses = 3, size/byte = 8]
store i32 %__param_0_arg, ptr %__param_0, align 4
store ptr %__param_1_arg, ptr %__param_1, align 8
; v = [-1.0, 0.0], maxip = 0.0
store [2 x double] [double -1.000000e+00, double 0.000000e+00], ptr %v, align 8
store double 0.000000e+00, ptr %maxip, align 8
; %d is zeroed twice; presumably once for the declaration's default initialisation and once for the loop's `d = 0` (confirm against the D source).
store i32 0, ptr %d, align 4
store i32 0, ptr %d, align 4
br label %forcond
forcond: ; preds = %forinc, %0
; loop condition: d < 2 (signed compare)
%1 = load i32, ptr %d, align 4 ; [#uses = 1]
%2 = icmp slt i32 %1, 2 ; [#uses = 1]
br i1 %2, label %forbody, label %endfor
forbody: ; preds = %forcond
; w is re-initialised to [0.0, 1.0] on every iteration
store [2 x double] [double 0.000000e+00, double 1.000000e+00], ptr %w, align 8
; array bounds check: sign-extended d must be unsigned-less-than 2
%3 = load i32, ptr %d, align 4 ; [#uses = 1]
%4 = sext i32 %3 to i64 ; [#uses = 3]
%bounds.cmp = icmp ult i64 %4, 2 ; [#uses = 1]
br i1 %bounds.cmp, label %bounds.ok, label %bounds.fail
bounds.ok: ; preds = %forbody
; w[d] = 1.0
%5 = getelementptr inbounds [2 x double], ptr %w, i32 0, i64 %4 ; [#uses = 1, type = ptr]
store double 1.000000e+00, ptr %5, align 8
; ip = v[0] * w[0] + v[1] * w[1]
%6 = getelementptr inbounds [2 x double], ptr %v, i32 0, i64 0 ; [#uses = 1, type = ptr]
%7 = load double, ptr %6, align 8 ; [#uses = 1]
%8 = getelementptr inbounds [2 x double], ptr %w, i32 0, i64 0 ; [#uses = 1, type = ptr]
%9 = load double, ptr %8, align 8 ; [#uses = 1]
%10 = fmul double %7, %9 ; [#uses = 1]
%11 = getelementptr inbounds [2 x double], ptr %v, i32 0, i64 1 ; [#uses = 1, type = ptr]
%12 = load double, ptr %11, align 8 ; [#uses = 1]
%13 = getelementptr inbounds [2 x double], ptr %w, i32 0, i64 1 ; [#uses = 1, type = ptr]
%14 = load double, ptr %13, align 8 ; [#uses = 1]
%15 = fmul double %12, %14 ; [#uses = 1]
%16 = fadd double %10, %15 ; [#uses = 1]
store double %16, ptr %ip, align 8
; branch on fabs(ip) > fabs(maxip); note that BOTH operands go through llvm.fabs here
%17 = load double, ptr %ip, align 8 ; [#uses = 1]
%18 = call double @llvm.fabs.f64(double %17) #2 ; [#uses = 1]
%19 = load double, ptr %maxip, align 8 ; [#uses = 1]
%20 = call double @llvm.fabs.f64(double %19) #2 ; [#uses = 1]
%21 = fcmp ogt double %18, %20 ; [#uses = 1]
br i1 %21, label %if, label %endif
if: ; preds = %bounds.ok
; maxip = ip
%22 = load double, ptr %ip, align 8 ; [#uses = 1]
store double %22, ptr %maxip, align 8
br label %endif
endif: ; preds = %if, %bounds.ok
br label %forinc
bounds.fail: ; preds = %forbody
; index out of range: call the bounds-failure handler (declared cold noreturn below), hence the `unreachable`
%.lcssa = phi i64 [ %4, %forbody ] ; [#uses = 1, type = i64]
call void @_d_arraybounds_index({ i64, ptr } { i64 10, ptr @.str }, i32 15, i64 %.lcssa, i64 2) #1
unreachable
forinc: ; preds = %endif
; d = d + 1
%23 = load i32, ptr %d, align 4 ; [#uses = 1]
%24 = add i32 %23, 1 ; [#uses = 1]
store i32 %24, ptr %d, align 4
br label %forcond
endfor: ; preds = %forcond
; return (maxip >= 0.0) as 0 or 1
%25 = load double, ptr %maxip, align 8 ; [#uses = 1]
%26 = fcmp oge double %25, 0.000000e+00 ; [#uses = 1]
%27 = zext i1 %26 to i32 ; [#uses = 1]
ret i32 %27
}
; [#uses = 1]
; Function Attrs: cold noreturn uwtable(sync)
declare void @_d_arraybounds_index({ i64, ptr }, i32, i64, i64) #1
; [#uses = 2]
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare double @llvm.fabs.f64(double) #2
attributes #0 = { uwtable(sync) "frame-pointer"="all" "target-cpu"="generic" }
attributes #1 = { cold noreturn uwtable(sync) }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
!llvm.ident = !{!0}
!0 = !{!"ldc version 1.41.0"}
```
was generated from the following D code by LDC 1.41 on LLVM 20.1.5:
```d
// Declares the template llvm_fabs and ties it to the LLVM intrinsic name pattern "llvm.fabs.f#"
// through LDC's LDC_intrinsic pragma. The template constraint restricts T to floating-point types.
// This is a declaration only (no D body); for double it shows up in the IR above as @llvm.fabs.f64.
pragma(LDC_intrinsic, "llvm.fabs.f#")
T llvm_fabs(T)(T val)
if (__traits(isFloating, T));
// LDC-specific pragma; presumably suppresses ModuleInfo emission so the generated IR stays minimal (confirm against the LDC documentation).
pragma(LDC_no_moduleinfo);
// Reduced D reproducer with a C-linkage entry point.
// For d = 0 and d = 1 it builds w = [0.0, 1.0], overwrites w[d] with 1.0,
// computes ip = v[0]*w[0] + v[1]*w[1], and stores ip into maxip whenever
// |ip| is strictly greater than |maxip|. Returns 1 if the final maxip >= 0, else 0.
extern(C) int main(int, char**)
{
double[2] v = [-1.0, 0.0];
double maxip = 0;
int d;
// NOTE(review): `version(none)` normally compiles out the statement that follows it (here the whole
// for loop), yet the IR listing above clearly contains the loop. This line looks like a leftover
// from reducing the test case; confirm whether it was present when the IR was generated.
version(none)
for (d = 0; d < 2; d++)
{
double[2] w = [0.0, 1.0];
w[d] = 1.0; // d == 0 gives w = [1, 1]; d == 1 leaves w = [0, 1]
double ip = v[0] * w[0] + v[1] * w[1];
// strict '>' on magnitudes: a later ip with equal or smaller magnitude does not replace maxip
if (llvm_fabs(ip) > llvm_fabs(maxip))
maxip = ip;
}
// With the loop executed: d == 0 yields ip = -1.0 (kept), d == 1 yields an ip of magnitude 0 (rejected),
// so maxip ends at -1.0 and the expected return value is 0.
return maxip >= 0;
}
```
With `-O0` the program returns `0`; with `-O1` it returns `1`. The corresponding C code compiled by clang
```c
/*
 * Reduced C reproducer, equivalent to the D program above.
 * For d = 0 and d = 1 it builds w = {0.0, 1.0}, overwrites w[d] with 1.0,
 * computes ip = v[0]*w[0] + v[1]*w[1], and stores ip into maxip whenever
 * |ip| is strictly greater than |maxip|.
 * Returns 1 if the final maxip is >= 0, otherwise 0.
 */
int main(int, char**)
{
double v[2] = {-1.0, 0.0};
double maxip = 0;
for (int d = 0; d < 2; d++)
{
double w[2] = {0.0, 1.0};
w[d] = 1.0; /* d == 0 gives w = {1, 1}; d == 1 leaves w = {0, 1} */
double ip = v[0] * w[0] + v[1] * w[1];
/* strict '>' on magnitudes: an ip of equal or smaller magnitude does not replace maxip */
if (__builtin_fabs(ip) > __builtin_fabs(maxip))
maxip = ip;
}
/* d == 0 yields ip = -1.0 (kept); d == 1 yields an ip of magnitude 0 (rejected); expected result is 0 */
return maxip >= 0;
}
```
is optimised to `ret i32 0` with `-O2`.
Compiling and running the results of `--print-module-scope --print-before-pass-number=N`:
```llvm
; *** IR Dump Before 47-SROAPass on main ***
source_filename = "./gh4997.d"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macos14"
; State immediately before SROA. The loop body appears twice in straight-line form (consistent with
; the LoopFullUnrollPass listed in the LDC pipeline), and %w is the only remaining alloca.
; %v, %maxip, %d and %ip are no longer in memory; v[0] = -1.0 and v[1] = 0.0 are folded into the arithmetic
; (v[1]*w[1] is the `fmul ..., 0.0`, and v[0]*w[0] is expressed as subtracting w[0]).
define range(i32 0, 2) i32 @main(i32 %__param_0_arg, ptr nocapture readnone %__param_1_arg) local_unnamed_addr #0 {
%w = alloca [2 x double], align 8
%.fca.1.gep = getelementptr inbounds nuw i8, ptr %w, i64 8
br label %forbody
forbody: ; preds = %0
; first copy (d == 0): w = [0.0, 1.0], then w[0] = 1.0
store double 0.000000e+00, ptr %w, align 8
store double 1.000000e+00, ptr %.fca.1.gep, align 8
store double 1.000000e+00, ptr %w, align 8
%1 = load double, ptr %w, align 8
%2 = load double, ptr %.fca.1.gep, align 8
%3 = fmul double %2, 0.000000e+00
%4 = fsub double %3, %1
; fabs(ip) > fabs(maxip) with maxip == 0.0, so the right-hand side is the constant 0.0
%5 = call double @llvm.fabs.f64(double %4) #2
%6 = fcmp ogt double %5, 0.000000e+00
%maxip.1 = select i1 %6, double %4, double 0.000000e+00
; second copy (d == 1): w = [0.0, 1.0], then w[1] = 1.0
store double 0.000000e+00, ptr %w, align 8
store double 1.000000e+00, ptr %.fca.1.gep, align 8
%7 = getelementptr inbounds nuw [2 x double], ptr %w, i64 0, i64 1
store double 1.000000e+00, ptr %7, align 8
%8 = load double, ptr %w, align 8
%9 = load double, ptr %.fca.1.gep, align 8
%10 = fmul double %9, 0.000000e+00
%11 = fsub double %10, %8
%12 = call double @llvm.fabs.f64(double %11) #2
; NOTE(review): the source compares fabs(ip) against fabs(maxip), but this fcmp uses %maxip.1 directly,
; with no llvm.fabs applied to it. Evaluating this dump by hand gives %maxip.1 = -1.0 and %12 = 0.0,
; so %13 is true, %maxip.1.1 = 0.0 and the function returns 1. That conflicts with the report's statement
; that this dump still exits with 0; confirm which pass actually dropped the fabs.
%13 = fcmp ogt double %12, %maxip.1
%maxip.1.1 = select i1 %13, double %11, double %maxip.1
%14 = fcmp oge double %maxip.1.1, 0.000000e+00
%15 = zext i1 %14 to i32
ret i32 %15
}
declare double @llvm.fabs.f64(double) #1
attributes #0 = { uwtable(sync) "frame-pointer"="all" "target-cpu"="generic" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
```
returns with exit code 0 (for N <= 47); after running SROA (N >= 48), it is optimised to
```llvm
; *** IR Dump Before 48-MemCpyOptPass on main ***
source_filename = "./gh4997.d"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macos14"
; Function Attrs: uwtable(sync)
; State immediately after SROA. The %w alloca and all of its stores and loads are gone;
; each value previously loaded from w is now a floating-point constant.
define range(i32 0, 2) i32 @main(i32 %__param_0_arg, ptr nocapture readnone %__param_1_arg) local_unnamed_addr #0 {
br label %forbody
forbody: ; preds = %0
; first copy (d == 0): w[1] -> 1.0 and w[0] -> 1.0, so %2 = 0.0 - 1.0
%1 = fmul double 1.000000e+00, 0.000000e+00
%2 = fsub double %1, 1.000000e+00
%3 = call double @llvm.fabs.f64(double %2) #2
%4 = fcmp ogt double %3, 0.000000e+00
%maxip.1 = select i1 %4, double %2, double 0.000000e+00
; second copy (d == 1): w[1] -> 1.0 and w[0] -> 0.0, so %6 = 0.0 - 0.0
%5 = fmul double 1.000000e+00, 0.000000e+00
%6 = fsub double %5, 0.000000e+00
%7 = call double @llvm.fabs.f64(double %6) #2
; NOTE(review): as in the pre-SROA dump, the right-hand operand is %maxip.1 itself rather than
; fabs(%maxip.1). With %maxip.1 = -1.0 and %7 = 0.0 the compare is true, which selects 0.0 and makes
; the function return 1. Confirm whether SROA or an earlier pass is responsible.
%8 = fcmp ogt double %7, %maxip.1
%maxip.1.1 = select i1 %8, double %6, double %maxip.1
%9 = fcmp oge double %maxip.1.1, 0.000000e+00
%10 = zext i1 %9 to i32
ret i32 %10
}
declare double @llvm.fabs.f64(double) #1
attributes #0 = { uwtable(sync) "frame-pointer"="all" "target-cpu"="generic" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="generic" }
```
which returns with exit code 1. LDC's optimisation pipeline is: [pipeline-ldc.txt](https://github.com/user-attachments/files/23053556/pipeline-ldc.txt)
When the unoptimised output is optimised with `opt -O1` the program succeeds with exit code 0. `opt` runs the following pipeline:
[pipeline-opt.txt](https://github.com/user-attachments/files/23053557/pipeline-opt.txt)
The diff is:
```diff
diff --git a/pipeline-opt.txt b/pipeline-ldc.txt
index 79ceda9..1a769cc 100644
--- a/pipeline-opt.txt
+++ b/pipeline-ldc.txt
@@ -29,10 +29,21 @@
Running pass ReassociatePass on main
Running pass LoopSimplifyPass on main
Running pass LCSSAPass on main
+ Running pass LoopInstSimplifyPass on loop %forcond in function main
+ Running pass LoopSimplifyCFGPass on loop %forcond in function main
+ Running pass LICMPass on loop %forcond in function main
+ Running pass LoopRotatePass on loop %forcond in function main
+ Running pass LICMPass on loop %forbody in function main
+ Running pass SimpleLoopUnswitchPass on loop %forbody in function main
Running pass SimplifyCFGPass on main
Running pass InstCombinePass on main
Running pass LoopSimplifyPass on main
Running pass LCSSAPass on main
+ Running pass LoopIdiomRecognizePass on loop %forbody in function main
+ Running pass IndVarSimplifyPass on loop %forbody in function main
+ Running pass LoopIdiomVectorizePass on loop %forbody in function main
+ Running pass LoopDeletionPass on loop %forbody in function main
+ Running pass LoopFullUnrollPass on loop %forbody in function main
Running pass SROAPass on main
Running pass MemCpyOptPass on main
Running pass SCCPPass on main
@@ -80,6 +91,8 @@
Running pass DivRemPairsPass on main
Running pass TailCallElimPass on main
Running pass SimplifyCFGPass on main
+ Running pass StripExternals on [module]
+ Running pass GlobalDCEPass on [module]
Running pass GlobalDCEPass on [module]
Running pass ConstantMergePass on [module]
Running pass CGProfilePass on [module]
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs