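Torbjörn, The test (as written today) doesn't really make sense on ILP32 (where sizeof(int) == sizeof(long)): uns_carry widens to unsigned long, which is no wider than unsigned int on such targets, so the "wide" sum wraps exactly like the narrow one and the function always returns 1 -- hence the abort on uns_carry (0xfffffff0u) != 0. We'll look into whether to disable the test there (gate it on LP64) or to explicitly use unsigned long long.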
Thanks for the report, Philipp. On Fri, 3 Jul 2026 at 20:00, Torbjorn SVENSSON <[email protected]> wrote: > > Hi, > > The gcc.dg/pr124545-2.c test does not work for arm-none-eabi. > Is this suppose to work or is it missing some dg-require-effective-target? > > Testing gcc.dg/pr124545-2.c > doing compile > Executing on host: /build/r17-2109-g2b8f4671103159/bin/arm-none-eabi-gcc > /build/gcc_src/gcc/testsuite/gcc.dg/pr124545-2.c -mthumb -march=armv7ve+neon > -mtune=cortex-a7 -mfloat-abi=hard -mfpu=auto -dumpbase "" > -fdiagnostics-plain-output -O2 --specs=rdimon.specs -Wl,--start-group > -lc -lm -Wl,--end-group --specs=nosys.specs -Wl,--allow-multiple-definition > -Wl,-u,_isatty,-u,_fstat -Wl,-wrap,exit -Wl,-wrap,_exit -Wl,-wrap,main > -Wl,-wrap,abort -Wl,gcc_tg.o -lm -o ./pr124545-2.exe (timeout = 800) > spawn -ignore SIGHUP /build/r17-2109-g2b8f4671103159/bin/arm-none-eabi-gcc > /build/gcc_src/gcc/testsuite/gcc.dg/pr124545-2.c -mthumb -march=armv7ve+neon > -mtune=cortex-a7 -mfloat-abi=hard -mfpu=auto -dumpbase > -fdiagnostics-plain-output -O2 --specs=rdimon.specs -Wl,--start-group -lc -lm > -Wl,--end-group --specs=nosys.specs -Wl,--allow-multiple-definition > -Wl,-u,_isatty,-u,_fstat -Wl,-wrap,exit -Wl,-wrap,_exit -Wl,-wrap,main > -Wl,-wrap,abort -Wl,gcc_tg.o -lm -o ./pr124545-2.exe > pid is 165557 -165557 > pid is -1 > output is status 0 > PASS: gcc.dg/pr124545-2.c (test for excess errors) > spawning command qemu-system-arm -nographic -machine virt -cpu cortex-a7 -m > 256 -semihosting -monitor /dev/null -kernel ./pr124545-2.exe > spawn qemu-system-arm -nographic -machine virt -cpu cortex-a7 -m 256 > -semihosting -monitor /dev/null -kernel ./pr124545-2.exe > > *** EXIT code 4242 > > *** EXIT code 1 > pid is -1 > Shell closed. 
> Output is > *** EXIT code 4242 > > *** EXIT code 1 > > FAIL: gcc.dg/pr124545-2.c execution test > > > > This is the assembly: > $ /build/r17-2109-g2b8f4671103159/bin/arm-none-eabi-gcc > /build/gcc_src/gcc/testsuite/gcc.dg/pr124545-2.c -mthumb -march=armv7ve+neon > -mtune=cortex-a7 -mfloat-abi=hard -mfpu=auto -O2 -dp -S -o - > .arch armv7-a > .arch_extension virt > .arch_extension idiv > .arch_extension sec > .arch_extension mp > .fpu neon > .eabi_attribute 28, 1 > .eabi_attribute 20, 1 > .eabi_attribute 21, 1 > .eabi_attribute 23, 3 > .eabi_attribute 24, 1 > .eabi_attribute 25, 1 > .eabi_attribute 26, 1 > .eabi_attribute 30, 2 > .eabi_attribute 34, 1 > .eabi_attribute 18, 4 > .file "pr124545-2.c" > .text > .align 1 > .p2align 2,,3 > .global oor_eq > .syntax unified > .thumb > .thumb_func > .type oor_eq, %function > oor_eq: > @ args = 0, pretend = 0, frame = 0 > @ frame_needed = 0, uses_anonymous_args = 0 > @ link register save eliminated. > movs r0, #0 @ 10 [c=4 l=2] *thumb2_movsi_shortim > bx lr @ 17 [c=8 l=4] *thumb2_return > .size oor_eq, .-oor_eq > .align 1 > .p2align 2,,3 > .global oor_val > .syntax unified > .thumb > .thumb_func > .type oor_val, %function > oor_val: > @ args = 0, pretend = 0, frame = 0 > @ frame_needed = 0, uses_anonymous_args = 0 > @ link register save eliminated. > asrs r1, r0, #31 @ 6 [c=4 l=2] *thumb2_shiftsi3_short/1 > adds r1, r1, #1 @ 21 [c=4 l=2] *thumb2_addsi_short/0 > bx lr @ 27 [c=8 l=4] *thumb2_return > .size oor_val, .-oor_val > .align 1 > .p2align 2,,3 > .global uns_carry > .syntax unified > .thumb > .thumb_func > .type uns_carry, %function > uns_carry: > @ args = 0, pretend = 0, frame = 0 > @ frame_needed = 0, uses_anonymous_args = 0 > @ link register save eliminated. 
> movs r0, #1 @ 10 [c=4 l=2] *thumb2_movsi_shortim > bx lr @ 17 [c=8 l=4] *thumb2_return > .size uns_carry, .-uns_carry > .align 1 > .p2align 2,,3 > .global inrange_eq > .syntax unified > .thumb > .thumb_func > .type inrange_eq, %function > inrange_eq: > @ args = 0, pretend = 0, frame = 0 > @ frame_needed = 0, uses_anonymous_args = 0 > @ link register save eliminated. > movs r0, #1 @ 11 [c=4 l=2] *thumb2_movsi_shortim > bx lr @ 18 [c=8 l=4] *thumb2_return > .size inrange_eq, .-inrange_eq > .section .text.startup,"ax",%progbits > .align 1 > .p2align 2,,3 > .global main > .syntax unified > .thumb > .thumb_func > .type main, %function > main: > @ args = 0, pretend = 0, frame = 16 > @ frame_needed = 0, uses_anonymous_args = 0 > push {r4, lr} @ 108 [c=8 l=2] *push_multi > movs r0, #5 @ 5 [c=4 l=2] *thumb2_movsi_shortim > sub sp, sp, #16 @ 109 [c=4 l=4] *arm_addsi3/11 > bl oor_eq @ 6 [c=4 l=4] *call_value_symbol > cbnz r0, .L8 @ 9 [c=16 l=2] *thumb2_cbnz/0 > mov r0, #-1 @ 15 [c=4 l=4] *thumb2_movsi_vfp/1 > bl oor_eq @ 16 [c=4 l=4] *call_value_symbol > cbnz r0, .L8 @ 20 [c=16 l=2] *thumb2_cbnz/0 > movs r0, #5 @ 22 [c=4 l=2] *thumb2_movsi_shortim > bl oor_val @ 23 [c=4 l=4] *call_value_symbol > cmp r1, #1 @ 26 [c=20 l=6] *cmp_ior/0 > it eq > cmpeq r0, #5 > bne .L8 @ 27 [c=16 l=2] arm_cond_branch > mvn r0, #15 @ 29 [c=4 l=4] *thumb2_movsi_vfp/3 > bl uns_carry @ 30 [c=4 l=4] *call_value_symbol > mov r4, r0 @ 93 [c=4 l=2] *thumb2_movsi_vfp/0 > cbnz r0, .L8 @ 33 [c=16 l=2] *thumb2_cbnz/0 > movs r0, #10 @ 35 [c=4 l=2] *thumb2_movsi_shortim > bl uns_carry @ 36 [c=4 l=4] *call_value_symbol > cmp r0, #1 @ 38 [c=4 l=2] *arm_cmpsi_insn/0 > bne .L8 @ 39 [c=16 l=2] arm_cond_branch > movw r3, #:lower16:.LANCHOR0 @ 106 [c=4 l=4] > *thumb2_movsi_vfp/4 > movt r3, #:upper16:.LANCHOR0 @ 107 [c=4 l=4] *arm_movt/0 > ldm r3, {r0, r1, r2, r3} @ 44 [c=8 l=4] *ldm4_ > stm sp, {r0, r1, r2, r3} @ 45 [c=8 l=4] *stm4_ > movs r1, #2 @ 47 [c=4 l=2] *thumb2_movsi_shortim > mov r0, sp @ 48 [c=4 l=2] 
*thumb2_movsi_vfp/0 > bl inrange_eq @ 49 [c=4 l=4] *call_value_symbol > cmp r0, #1 @ 51 [c=4 l=2] *arm_cmpsi_insn/0 > bne .L8 @ 52 [c=16 l=2] arm_cond_branch > mov r0, r4 @ 58 [c=4 l=2] *thumb2_movsi_vfp/0 > add sp, sp, #16 @ 113 [c=4 l=4] *arm_addsi3/5 > @ sp needed @ 114 [c=8 l=0] force_register_use > pop {r4, pc} @ 115 [c=8 l=2] > *pop_multiple_with_writeback_and_return > .L8: > bl abort @ 11 [c=8 l=4] *call_symbol > .size main, .-main > .section .rodata > .align 2 > .set .LANCHOR0,. + 0 > .LC0: > .word 7 > .word 7 > .word 7 > .word 7 > .ident "GCC: (r17-2109-g2b8f4671103159) 17.0.0 20260703 > (experimental)" > > > Let me know if you need anything else or want me to test some potential fix. > > Kind regards, > Torbjörn > > On 2026-07-02 08:56, Richard Biener wrote: > > On Wed, 1 Jul 2026, Philipp Tomsich wrote: > > > >> visit_nary_op canonicalises (T)(A + C) into (T)A + (T)C for its VN > >> lookup, but not the reverse -- so whether VN discovers (T)A + C == > >> (T)(A + C) depends on which form it sees first. Add a match.pd rule > >> that rewrites (T)A +- CST into (T)(A +- CST') using the op! qualifier, > >> so the fold only fires when the narrow expression already has a value > >> number -- i.e. only inside VN via mprts_hook. > >> > >> Restrict to TYPE_OVERFLOW_UNDEFINED inner types: for unsigned inner the > >> narrow op wraps mod 2^prec (defined) while the widened outer op does > >> not, changing the observed value (bitfld-5.c is the concrete miscompile > >> when the guard is loosened). > >> > >> Use wi::min_precision (CST, SIGNED) rather than int_fits_type_p for the > >> fits-check, so sign-encoded small negatives (e.g. -1 as sizetype's > >> 0xFFFF...FFFF) qualify. > > > > OK. > > > > Thanks, > > Richard. > > > >> PR tree-optimization/124545 > >> > >> gcc/ChangeLog: > >> > >> * match.pd: Add (T)A +- CST -> (T)(A +- CST') for widening > >> conversions from a signed inner type with undefined overflow. 
> >> > >> gcc/testsuite/ChangeLog: > >> > >> * gcc.dg/pr124545.c: New test. > >> * gcc.dg/pr124545-2.c: New test. > >> > >> Signed-off-by: Philipp Tomsich <[email protected]> > >> > >> --- > >> > >> gcc/match.pd | 32 ++++++++++++++++++ > >> gcc/testsuite/gcc.dg/pr124545-2.c | 55 +++++++++++++++++++++++++++++++ > >> gcc/testsuite/gcc.dg/pr124545.c | 29 ++++++++++++++++ > >> 3 files changed, 116 insertions(+) > >> create mode 100644 gcc/testsuite/gcc.dg/pr124545-2.c > >> create mode 100644 gcc/testsuite/gcc.dg/pr124545.c > >> > >> diff --git a/gcc/match.pd b/gcc/match.pd > >> index ddf3b61638ce..817a52499128 100644 > >> --- a/gcc/match.pd > >> +++ b/gcc/match.pd > >> @@ -4067,6 +4067,38 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > >> (plus (convert @0) (op @2 (convert @1)))))) > >> #endif > >> > >> +/* Inverse of the above: (T)(A) +- CST -> (T)(A +- CST') when T is a > >> + widening conversion from a type with undefined overflow and the outer > >> + type wraps. This allows VN to discover that (T)A + (T)C == (T)(A + C) > >> + regardless of which form appears first in program order. PR124545. > >> + The rewrite is unsound for unsigned inner types: the narrow op wraps > >> + mod 2^prec (defined) while the widened op does not, changing the > >> + observed value. Cover the unsigned case separately once ranger can > >> + prove no wrap. */ > >> +#if GIMPLE > >> + (for op (plus minus) > >> + (simplify > >> + (op (convert @0) INTEGER_CST@1) > >> + (if (TREE_CODE (TREE_TYPE (@0)) == INTEGER_TYPE > >> + && TREE_CODE (type) == INTEGER_TYPE > >> + && TYPE_PRECISION (type) > TYPE_PRECISION (TREE_TYPE (@0)) > >> + && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)) > >> + && !TYPE_OVERFLOW_SANITIZED (TREE_TYPE (@0)) > >> + && TYPE_OVERFLOW_WRAPS (type) > >> + /* CST must be the sign-extension of its low inner-precision bits, > >> + otherwise narrowing changes the value. 
Use min_precision (.., > >> + SIGNED) rather than int_fits_type_p so that small negative > >> offsets > >> + encoded as large unsigned constants (e.g. -1 as sizetype) still > >> + qualify. */ > >> + && wi::min_precision (wi::to_wide (@1), SIGNED) > >> + <= TYPE_PRECISION (TREE_TYPE (@0))) > >> + (with { > >> + wide_int c1 = wi::to_wide (@1); > >> + tree inner_cst = wide_int_to_tree (TREE_TYPE (@0), > >> + wi::sext (c1, TYPE_PRECISION (TREE_TYPE (@0)))); > >> } > >> + (convert (op! @0 { inner_cst; })))))) > >> +#endif > >> + > >> /* (T)(A) +- (T)(B) -> (T)(A +- B) only when (A +- B) could be simplified > >> to a simple value. */ > >> (for op (plus minus) > >> diff --git a/gcc/testsuite/gcc.dg/pr124545-2.c > >> b/gcc/testsuite/gcc.dg/pr124545-2.c > >> new file mode 100644 > >> index 000000000000..b4806567acce > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/pr124545-2.c > >> @@ -0,0 +1,55 @@ > >> +/* PR tree-optimization/124545 */ > >> +/* Runtime correctness for the inverse-widening VN rewrite > >> + (T)A +- CST -> (T)(A +- CST'). The rewrite must never change the > >> + computed value. In particular it must NOT fire when CST is not > >> + representable in the inner type (which would silently drop the bits > >> + above the inner precision), and it must stay correct for unsigned > >> + inner types where the narrow operation wraps. */ > >> +/* { dg-do run } */ > >> +/* { dg-options "-O2" } */ > >> + > >> +/* CST = 2^32 does not fit in int: the value must be preserved. > >> + Before the fix this comparison folded to a constant 1. */ > >> +__attribute__((noipa)) int > >> +oor_eq (int a) > >> +{ > >> + return ((unsigned long long) a + 0x100000000ULL) == (unsigned long > >> long) a; > >> +} > >> + > >> +__attribute__((noipa)) unsigned long long > >> +oor_val (int a) > >> +{ > >> + return (unsigned long long) a + 0x100000000ULL; > >> +} > >> + > >> +/* Unsigned inner: narrow add wraps mod 2^32; the widened add does not. 
> >> + The result must match the wide arithmetic for every input. */ > >> +__attribute__((noipa)) int > >> +uns_carry (unsigned int a) > >> +{ > >> + unsigned int t = a + 100u; > >> + unsigned long w = (unsigned long) a + 100; > >> + return w == (unsigned long) t; > >> +} > >> + > >> +/* Legitimate in-range case (matches the PR): k == j - 1, so the two > >> + loads are the same address and the rewrite may fire. */ > >> +__attribute__((noipa)) int > >> +inrange_eq (int *p, int j) > >> +{ > >> + int k = j - 1; > >> + return p[j - 1] == p[k]; > >> +} > >> + > >> +int > >> +main (void) > >> +{ > >> + if (oor_eq (5) != 0) __builtin_abort (); > >> + if (oor_eq (-1) != 0) __builtin_abort (); > >> + if (oor_val (5) != 5ULL + 0x100000000ULL) __builtin_abort (); > >> + if (uns_carry (0xfffffff0u) != 0) __builtin_abort (); > >> + if (uns_carry (10) != 1) __builtin_abort (); > >> + int arr[4] = { 7, 7, 7, 7 }; > >> + if (inrange_eq (arr, 2) != 1) __builtin_abort (); > >> + return 0; > >> +} > >> diff --git a/gcc/testsuite/gcc.dg/pr124545.c > >> b/gcc/testsuite/gcc.dg/pr124545.c > >> new file mode 100644 > >> index 000000000000..a21346b179c7 > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/pr124545.c > >> @@ -0,0 +1,29 @@ > >> +/* PR tree-optimization/124545 */ > >> +/* Verify that VN recognizes (T)A + C == (T)(A + C') regardless of > >> + operand order in the equality comparison. */ > >> +/* { dg-do compile } */ > >> +/* { dg-options "-O2 -fdump-tree-fre1" } */ > >> + > >> +int func1(int *a, int j) { > >> + int k = j - 1; > >> + return a[j - 1] == a[k]; > >> +} > >> + > >> +int func2(int *a, int j) { > >> + int k = j - 1; > >> + return a[k] == a[j - 1]; > >> +} > >> + > >> +int func3(int *a, int j) { > >> + int k = j - 3; > >> + return a[k] == a[j - 3]; > >> +} > >> + > >> +int func4(int *a, int j) { > >> + int k = j + 2; > >> + return a[k] == a[j + 2]; > >> +} > >> + > >> +/* All four functions should fold to return 1 after FRE. 
*/ > >> +/* The pattern is not applied on ilp32 targets (PR116845). */ > >> +/* { dg-final { scan-tree-dump-times "return 1;" 4 "fre1" { xfail { ilp32 > >> } } } } */ > >> > > >
