Hi, I previously sent a patch for PR94071 and was asked to make some fixes.
I’ve prepared a v2 with the requested changes, but it seems it might have
been missed, so I’m sending it again here.

Thanks,
Denis

--
Denis Dolya (Ferki)
GCC contributor
GitHub: https://github.com/Ferki-git-creator
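For context, the approach in v2 is to compare the two load offsets through
their affine decompositions instead of requiring textually identical offset
trees: when the non-constant parts match, only the constant difference
matters (for offsets addr and addr + 1 the shared part is 1 * addr and the
constants are 0 and 1, giving a delta of 1).  A reduced example of the kind
of source pattern this lets the pass handle (my own illustration, in the
same spirit as the testcase added by the patch below; the names buf and
read_le16 are not part of the patch):

  #include <stdint.h>

  extern uint8_t buf[64];

  uint16_t
  read_le16 (int lo)
  {
    /* 'hi' is a separate SSA name, so the two array offsets are distinct
       trees even though they differ only by the constant 1; comparing
       their affine forms lets the pass treat the loads as adjacent and,
       on aarch64, emit a single halfword load instead of two byte loads.  */
    int hi = lo + 1;
    return (uint16_t) buf[lo] | ((uint16_t) buf[hi] << 8);
  }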
From e8cfde9d6ef16ace0c947828442ab9ee5dca7095 Mon Sep 17 00:00:00 2001
From: Denis Dolya <[email protected]>
Date: Wed, 4 Feb 2026 09:45:21 +0000
Subject: [PATCH v2] tree-optimization: handle non-constant SSA offsets in
 store merging (PR94071)

Rework offset handling to reuse GCC's affine decomposition.

Teach the store-merging pass to compare offsets using
tree_to_aff_combination_expand, so adjacent byte loads can be merged
even when the index is computed through temporaries or helper
expressions.  Reusing the generic affine decomposition avoids
duplicating ad-hoc SSA offset walking and naturally supports more
cases than simple +/- constants.

Add an aarch64 testsuite case to verify adjacent byte-load merging.

Tested on aarch64-linux-gnu:
  make check-gcc RUNTESTFLAGS="aarch64.exp=gcc.target/aarch64/adjacent-byte-load-merge.c"

Signed-off-by: Denis Dolya <[email protected]>
---
 gcc/gimple-ssa-store-merging.cc               | 76 ++++++++++++++++++-
 .../aarch64/adjacent-byte-load-merge.c        | 76 +++++++++++++++++++
 2 files changed, 149 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index 2c22ec3baf2..75c33f00b37 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -153,6 +153,7 @@
 #include "fold-const.h"
 #include "print-tree.h"
 #include "tree-hash-traits.h"
+#include "tree-affine.h"
 #include "gimple-iterator.h"
 #include "gimplify.h"
 #include "gimple-fold.h"
@@ -438,6 +439,69 @@ find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
    bitwise XOR or plus on 2 symbolic number N1 and N2 whose source statements
    are respectively SOURCE_STMT1 and SOURCE_STMT2.  CODE is the operation.  */
 
+/* If OFF1 and OFF2 represent the same affine expression (same non-constant
+   part), return in *DELTA the constant difference OFF2 - OFF1.  This allows
+   store-merging to compare adjacent loads even if offsets are computed via
+   SSA temporaries.  */
+
+static bool
+const_offset_delta_p (tree off1, tree off2, HOST_WIDE_INT *delta)
+{
+  if (off1 == NULL_TREE || off2 == NULL_TREE)
+    {
+      if (off1 == off2)
+        {
+          *delta = 0;
+          return true;
+        }
+      return false;
+    }
+
+  hash_map<tree, name_expansion *> *cache = NULL;
+  aff_tree a1, a2;
+  bool ok = false;
+
+  tree_to_aff_combination_expand (off1, TREE_TYPE (off1), &a1, &cache);
+  tree_to_aff_combination_expand (off2, TREE_TYPE (off2), &a2, &cache);
+
+  if (a1.n == a2.n
+      && ((a1.rest == NULL_TREE && a2.rest == NULL_TREE)
+          || (a1.rest && a2.rest && operand_equal_p (a1.rest, a2.rest, 0))))
+    {
+      unsigned i;
+      ok = true;
+
+      for (i = 0; i < a1.n; i++)
+        if (a1.elts[i].coef != a2.elts[i].coef
+            || !operand_equal_p (a1.elts[i].val, a2.elts[i].val, 0))
+          {
+            ok = false;
+            break;
+          }
+
+      if (ok)
+        {
+          widest_int c1, c2;
+
+          if (a1.offset.is_constant (&c1)
+              && a2.offset.is_constant (&c2))
+            {
+              widest_int diff = c2 - c1;
+
+              if (wi::fits_shwi_p (diff))
+                *delta = diff.to_shwi ();
+              else
+                ok = false;
+            }
+          else
+            ok = false;
+        }
+    }
+
+  free_affine_expand_cache (&cache);
+  return ok;
+}
+
 gimple *
 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
                         gimple *source_stmt2, struct symbolic_number *n2,
@@ -470,13 +534,19 @@ perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
           || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
         return NULL;
 
-      if (!n1->offset != !n2->offset
-          || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
-        return NULL;
+      HOST_WIDE_INT offset_delta = 0;
+
+      if (!n1->offset != !n2->offset)
+        return NULL;
+
+      if (n1->offset
+          && !const_offset_delta_p (n1->offset, n2->offset, &offset_delta))
+        return NULL;
 
       start1 = 0;
       if (!(n2->bytepos - n1->bytepos).is_constant (&start2))
         return NULL;
+      start2 += offset_delta;
 
       if (start1 < start2)
         {
diff --git a/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c b/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c
new file mode 100644
index 00000000000..237b2f9896f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c
@@ -0,0 +1,76 @@
+/* { dg-do compile { target aarch64*-*-* } } */
+/* { dg-options "-O3" } */
+/* PR tree-optimization/94071.  */
+
+/* Ensure adjacent byte loads are merged into a halfword load.  */
+/* { dg-final { scan-assembler-times {[[:space:]]ldrh[[:space:]]} 4 } } */
+/* { dg-final { scan-assembler-not {[[:space:]]ldrb[[:space:]]} } } */
+
+#include <stdint.h>
+
+extern uint8_t data[1024];
+
+/* Simple helper for byte index variants.  */
+static inline int
+idx_plus (int base, int add)
+{
+  return base + add;
+}
+
+uint16_t
+getU16_basic (int addr)
+{
+  /* Direct pattern.  */
+  return (uint16_t) data[addr]
+         | ((uint16_t) data[addr + 1] << 8);
+}
+
+uint16_t
+getU16_tmp (int addr)
+{
+  int a1 = addr + 1;
+
+  /* SSA temp for offset.  */
+  return (uint16_t) data[addr]
+         | ((uint16_t) data[a1] << 8);
+}
+
+uint16_t
+getU16_helper (int addr)
+{
+  int a0 = idx_plus (addr, 0);
+  int a1 = idx_plus (addr, 1);
+
+  /* Helper call in the index chain.  */
+  return (uint16_t) data[a0]
+         | ((uint16_t) data[a1] << 8);
+}
+
+uint16_t
+getU16_state (int addr)
+{
+  enum { S_LOAD0 = 0, S_LOAD1 = 1, S_DONE = 2 } state;
+  uint16_t out;
+  int i;
+
+  /* Minimal state machine.  */
+  state = S_LOAD0;
+  out = 0;
+  i = addr;
+
+  while (state != S_DONE)
+    {
+      if (state == S_LOAD0)
+        {
+          out = (uint16_t) data[i];
+          state = S_LOAD1;
+        }
+      else if (state == S_LOAD1)
+        {
+          out |= (uint16_t) data[i + 1] << 8;
+          state = S_DONE;
+        }
+    }
+
+  return out;
+}
--
2.51.0
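P.S. To try the new test by hand outside the DejaGnu harness, something
along these lines should work (an illustrative invocation; substitute the
aarch64 compiler from your own build or cross toolchain):

  aarch64-linux-gnu-gcc -O3 -S gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c -o - | grep -E 'ldr[bh]'

With the patch applied, only ldrh loads should show up, which is what the
scan-assembler directives in the test check for.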
