Hello,

This patch fixes PR tree-optimization/94071 by improving the store-merging
pass to recognize adjacent byte loads even when offsets are computed through
simple SSA expressions.
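To make the motivating case concrete, here is a minimal sketch of the pattern
(the function name read_u16_le is only illustrative; the testcase in the patch
exercises this form plus helper-function and state-machine variants):

#include <stdint.h>

extern uint8_t data[1024];

/* Little-endian 16-bit read where the second index goes through an SSA
   temporary instead of a literal addr + 1 in the array subscript.  */
uint16_t
read_u16_le (int addr)
{
  int a1 = addr + 1;
  return (uint16_t) data[addr] | ((uint16_t) data[a1] << 8);
}

With the change, the two byte loads are expected to be combined into a single
halfword load (ldrh at -O3 on aarch64), which is what the scan-assembler
checks in the new testcase verify.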
The change teaches the pass to decompose offset expressions into base plus
constant, allowing patterns like data[i] and data[i + 1] to be merged into
halfword loads even when temporaries or helper functions are involved. An
AArch64 testsuite case is added to verify the optimization.

Tested on aarch64-linux-gnu:
  make check-gcc RUNTESTFLAGS="aarch64.exp=gcc.target/aarch64/adjacent-byte-load-merge.c"

The patch is attached. Any feedback is welcome.

Best regards,
Denis Dolya

-- 
Denis Dolya (Ferki)
GCC contributor
GitHub: https://github.com/Ferki-git-creator
From a30d41f951b17d0a9b663feba8053d394597d481 Mon Sep 17 00:00:00 2001
From: Denis Dolya <[email protected]>
Date: Wed, 4 Feb 2026 09:45:21 +0000
Subject: [PATCH] tree-optimization: handle non-constant SSA offsets in store
 merging (PR94071)

Teach the store-merging pass to decompose simple SSA offset expressions
into base plus constant. This allows combining adjacent byte loads even
when the index is computed through temporaries or helper functions.

This fixes cases like data[i] and data[i + 1] not being merged into
halfword loads.

Add an aarch64 testsuite case to verify adjacent byte-load merging.
Note: assembler scan patterns are whitespace-agnostic, since formatting
is not stable across assemblers.

Tested on aarch64-linux-gnu:
  make check-gcc RUNTESTFLAGS="aarch64.exp=gcc.target/aarch64/adjacent-byte-load-merge.c"

Signed-off-by: Denis Dolya <[email protected]>
---
 gcc/gimple-ssa-store-merging.cc               | 135 +++++++++++++++++-
 .../aarch64/adjacent-byte-load-merge.c        |  76 ++++++++++
 2 files changed, 209 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index 2c22ec3baf2..54fb0599f22 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -438,6 +438,115 @@ find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
    bitwise XOR or plus on 2 symbolic number N1 and N2 whose source statements
    are respectively SOURCE_STMT1 and SOURCE_STMT2. CODE is the operation. */
 
+/* Decompose OFF into BASE + CST, following simple SSA_NAME chains of
+   PLUS_EXPR/MINUS_EXPR with an integer constant. This lets the bswap pass
+   handle patterns like data[i] and data[i + 1]. */
+
+enum { MAX_OFFSET_DECOMPOSE_DEPTH = 8 };
+
+static bool
+decompose_const_offset (tree off, tree *base, HOST_WIDE_INT *cst)
+{
+  HOST_WIDE_INT acc = 0;
+  int depth = 0;
+
+  if (off == NULL_TREE)
+    {
+      *base = NULL_TREE;
+      *cst = 0;
+      return true;
+    }
+
+  if (TREE_CODE (off) == INTEGER_CST)
+    {
+      if (!tree_fits_shwi_p (off))
+        return false;
+
+      *base = NULL_TREE;
+      *cst = tree_to_shwi (off);
+      return true;
+    }
+
+  /* Limit SSA walk depth to avoid pathological chains. */
+  while (depth++ < MAX_OFFSET_DECOMPOSE_DEPTH)
+    {
+      if (CONVERT_EXPR_P (off))
+        {
+          off = TREE_OPERAND (off, 0);
+          continue;
+        }
+
+      if (TREE_CODE (off) == PLUS_EXPR || TREE_CODE (off) == MINUS_EXPR)
+        {
+          tree op0 = TREE_OPERAND (off, 0);
+          tree op1 = TREE_OPERAND (off, 1);
+
+          if (TREE_CODE (op1) == INTEGER_CST && tree_fits_shwi_p (op1))
+            {
+              HOST_WIDE_INT delta = tree_to_shwi (op1);
+
+              if (TREE_CODE (off) == MINUS_EXPR)
+                delta = -delta;
+
+              acc += delta;
+              off = op0;
+              continue;
+            }
+
+          if (TREE_CODE (off) == PLUS_EXPR
+              && TREE_CODE (op0) == INTEGER_CST
+              && tree_fits_shwi_p (op0))
+            {
+              acc += tree_to_shwi (op0);
+              off = op1;
+              continue;
+            }
+        }
+
+      if (TREE_CODE (off) != SSA_NAME)
+        break;
+
+      gimple *def = SSA_NAME_DEF_STMT (off);
+      if (!is_gimple_assign (def))
+        break;
+
+      enum tree_code code = gimple_assign_rhs_code (def);
+      if (code != PLUS_EXPR && code != MINUS_EXPR)
+        break;
+
+      tree rhs1 = gimple_assign_rhs1 (def);
+      tree rhs2 = gimple_assign_rhs2 (def);
+
+      if (TREE_CODE (rhs2) == INTEGER_CST && tree_fits_shwi_p (rhs2))
+        {
+          HOST_WIDE_INT delta = tree_to_shwi (rhs2);
+
+          if (code == MINUS_EXPR)
+            delta = -delta;
+
+          acc += delta;
+          off = rhs1;
+          continue;
+        }
+
+      if (code == PLUS_EXPR
+          && TREE_CODE (rhs1) == INTEGER_CST
+          && tree_fits_shwi_p (rhs1))
+        {
+          acc += tree_to_shwi (rhs1);
+          off = rhs2;
+          continue;
+        }
+
+      break;
+    }
+
+  *base = off;
+  *cst = acc;
+  return true;
+}
+
+
 gimple *
 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
                         gimple *source_stmt2, struct symbolic_number *n2,
@@ -470,13 +579,35 @@ perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
           || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
         return NULL;
 
-      if (!n1->offset != !n2->offset
-          || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
+      HOST_WIDE_INT offset_delta = 0;
+
+      if (!n1->offset != !n2->offset)
         return NULL;
 
+      if (n1->offset)
+        {
+          tree base1, base2;
+          HOST_WIDE_INT cst1, cst2;
+
+          if (!decompose_const_offset (n1->offset, &base1, &cst1)
+              || !decompose_const_offset (n2->offset, &base2, &cst2))
+            return NULL;
+
+          if (base1 == NULL_TREE || base2 == NULL_TREE)
+            {
+              if (base1 != base2)
+                return NULL;
+            }
+          else if (!operand_equal_p (base1, base2, 0))
+            return NULL;
+
+          offset_delta = cst2 - cst1;
+        }
+
       start1 = 0;
       if (!(n2->bytepos - n1->bytepos).is_constant (&start2))
         return NULL;
+      start2 += offset_delta;
 
       if (start1 < start2)
         {
diff --git a/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c b/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c
new file mode 100644
index 00000000000..237b2f9896f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c
@@ -0,0 +1,76 @@
+/* { dg-do compile { target aarch64*-*-* } } */
+/* { dg-options "-O3" } */
+/* PR tree-optimization/94071. */
+
+/* Ensure adjacent byte loads are merged into a halfword load. */
+/* { dg-final { scan-assembler-times {[[:space:]]ldrh[[:space:]]} 4 } } */
+/* { dg-final { scan-assembler-not {[[:space:]]ldrb[[:space:]]} } } */
+
+#include <stdint.h>
+
+extern uint8_t data[1024];
+
+/* Simple helper for byte index variants. */
+static inline int
+idx_plus (int base, int add)
+{
+  return base + add;
+}
+
+uint16_t
+getU16_basic (int addr)
+{
+  /* Direct pattern. */
+  return (uint16_t) data[addr]
+         | ((uint16_t) data[addr + 1] << 8);
+}
+
+uint16_t
+getU16_tmp (int addr)
+{
+  int a1 = addr + 1;
+
+  /* SSA temp for offset. */
+  return (uint16_t) data[addr]
+         | ((uint16_t) data[a1] << 8);
+}
+
+uint16_t
+getU16_helper (int addr)
+{
+  int a0 = idx_plus (addr, 0);
+  int a1 = idx_plus (addr, 1);
+
+  /* Helper call in the index chain. */
+  return (uint16_t) data[a0]
+         | ((uint16_t) data[a1] << 8);
+}
+
+uint16_t
+getU16_state (int addr)
+{
+  enum { S_LOAD0 = 0, S_LOAD1 = 1, S_DONE = 2 } state;
+  uint16_t out;
+  int i;
+
+  /* Minimal state machine. */
+  state = S_LOAD0;
+  out = 0;
+  i = addr;
+
+  while (state != S_DONE)
+    {
+      if (state == S_LOAD0)
+        {
+          out = (uint16_t) data[i];
+          state = S_LOAD1;
+        }
+      else if (state == S_LOAD1)
+        {
+          out |= (uint16_t) data[i + 1] << 8;
+          state = S_DONE;
+        }
+    }
+
+  return out;
+}
-- 
2.51.0
