Hi,

I previously sent a patch for PR94071 and was asked to make some fixes.

I’ve prepared a v2 with the requested changes, but it seems to have been
missed, so I’m resending it here.

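For context, the shape of code this is about looks roughly like the
sketch below (a minimal example modeled on getU16_tmp from the testcase
in the patch; buf and load_u16 are illustrative names only, not part of
the patch).  The second index goes through a temporary, so the two
offset trees are not operand_equal_p and the pass currently gives up on
merging the pair:

  #include <stdint.h>

  extern uint8_t buf[];

  /* Little-endian 16-bit read built from two adjacent byte loads.
     The "+ 1" is hidden behind a temporary, so the offsets of the
     two loads are different trees even though they differ only by
     a constant.  */
  uint16_t
  load_u16 (int i)
  {
    int j = i + 1;
    return (uint16_t) buf[i] | ((uint16_t) buf[j] << 8);
  }

With the v2 change the offsets are instead compared through their
affine decomposition, and the new testcase expects this kind of pair
to end up as a single ldrh on aarch64.
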
Thanks,
Denis

--
Denis Dolya (Ferki)
GCC contributor
GitHub: https://github.com/Ferki-git-creator
From e8cfde9d6ef16ace0c947828442ab9ee5dca7095 Mon Sep 17 00:00:00 2001
From: Denis Dolya <[email protected]>
Date: Wed, 4 Feb 2026 09:45:21 +0000
Subject: [PATCH v2] tree-optimization: handle non-constant SSA offsets in
 store merging (PR94071)

(v2) Rework offset handling to reuse GCC's affine decomposition.

Teach the store-merging pass to compare offsets using
tree_to_aff_combination_expand, so adjacent byte loads can be merged even
when the index is computed through temporaries or helper expressions.

Reusing the generic affine decomposition avoids duplicating ad-hoc SSA
offset walking and naturally supports more cases than simple +/- constants.

Add an aarch64 testsuite case to verify adjacent byte-load merging.

Tested on aarch64-linux-gnu:
  make check-gcc RUNTESTFLAGS="aarch64.exp=gcc.target/aarch64/adjacent-byte-load-merge.c"

Signed-off-by: Denis Dolya <[email protected]>
---
 gcc/gimple-ssa-store-merging.cc               | 76 ++++++++++++++++++-
 .../aarch64/adjacent-byte-load-merge.c        | 76 +++++++++++++++++++
 2 files changed, 149 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index 2c22ec3baf2..75c33f00b37 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -153,6 +153,7 @@
 #include "fold-const.h"
 #include "print-tree.h"
 #include "tree-hash-traits.h"
+#include "tree-affine.h"
 #include "gimple-iterator.h"
 #include "gimplify.h"
 #include "gimple-fold.h"
@@ -438,6 +439,69 @@ find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
    bitwise XOR or plus on 2 symbolic number N1 and N2 whose source statements
    are respectively SOURCE_STMT1 and SOURCE_STMT2.  CODE is the operation.  */
 
+/* If OFF1 and OFF2 represent the same affine expression (same non-constant
+   part), return in *DELTA the constant difference OFF2 - OFF1.  This allows
+   store-merging to compare adjacent loads even if offsets are computed via
+   SSA temporaries.  */
+
+static bool
+const_offset_delta_p (tree off1, tree off2, HOST_WIDE_INT *delta)
+{
+  if (off1 == NULL_TREE || off2 == NULL_TREE)
+    {
+      if (off1 == off2)
+        {
+          *delta = 0;
+          return true;
+        }
+      return false;
+    }
+
+  hash_map<tree, name_expansion *> *cache = NULL;
+  aff_tree a1, a2;
+  bool ok = false;
+
+  tree_to_aff_combination_expand (off1, TREE_TYPE (off1), &a1, &cache);
+  tree_to_aff_combination_expand (off2, TREE_TYPE (off2), &a2, &cache);
+
+  if (a1.n == a2.n
+      && ((a1.rest == NULL_TREE && a2.rest == NULL_TREE)
+          || (a1.rest && a2.rest && operand_equal_p (a1.rest, a2.rest, 0))))
+    {
+      unsigned i;
+      ok = true;
+
+      for (i = 0; i < a1.n; i++)
+        if (a1.elts[i].coef != a2.elts[i].coef
+            || !operand_equal_p (a1.elts[i].val, a2.elts[i].val, 0))
+          {
+            ok = false;
+            break;
+          }
+
+      if (ok)
+        {
+          widest_int c1, c2;
+
+          if (a1.offset.is_constant (&c1)
+              && a2.offset.is_constant (&c2))
+            {
+              widest_int diff = c2 - c1;
+
+              if (wi::fits_shwi_p (diff))
+                *delta = diff.to_shwi ();
+              else
+                ok = false;
+            }
+          else
+            ok = false;
+        }
+    }
+
+  free_affine_expand_cache (&cache);
+  return ok;
+}
+
 gimple *
 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
 			gimple *source_stmt2, struct symbolic_number *n2,
@@ -470,13 +534,19 @@ perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
 	  || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
 	return NULL;
 
-      if (!n1->offset != !n2->offset
-	  || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
-	return NULL;
+      HOST_WIDE_INT offset_delta = 0;
+
+      if (!n1->offset != !n2->offset)
+        return NULL;
+
+      if (n1->offset
+          && !const_offset_delta_p (n1->offset, n2->offset, &offset_delta))
+        return NULL;
 
       start1 = 0;
       if (!(n2->bytepos - n1->bytepos).is_constant (&start2))
 	return NULL;
+      start2 += offset_delta;
 
       if (start1 < start2)
 	{
diff --git a/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c b/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c
new file mode 100644
index 00000000000..237b2f9896f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/adjacent-byte-load-merge.c
@@ -0,0 +1,76 @@
+/* { dg-do compile { target aarch64*-*-* } } */
+/* { dg-options "-O3" } */
+/* PR tree-optimization/94071.  */
+
+/* Ensure adjacent byte loads are merged into a halfword load.  */
+/* { dg-final { scan-assembler-times {[[:space:]]ldrh[[:space:]]} 4 } } */
+/* { dg-final { scan-assembler-not {[[:space:]]ldrb[[:space:]]} } } */
+
+#include <stdint.h>
+
+extern uint8_t data[1024];
+
+/* Simple helper for byte index variants.  */
+static inline int
+idx_plus (int base, int add)
+{
+  return base + add;
+}
+
+uint16_t
+getU16_basic (int addr)
+{
+  /* Direct pattern.  */
+  return (uint16_t) data[addr]
+    | ((uint16_t) data[addr + 1] << 8);
+}
+
+uint16_t
+getU16_tmp (int addr)
+{
+  int a1 = addr + 1;
+
+  /* SSA temp for offset.  */
+  return (uint16_t) data[addr]
+    | ((uint16_t) data[a1] << 8);
+}
+
+uint16_t
+getU16_helper (int addr)
+{
+  int a0 = idx_plus (addr, 0);
+  int a1 = idx_plus (addr, 1);
+
+  /* Helper call in the index chain.  */
+  return (uint16_t) data[a0]
+    | ((uint16_t) data[a1] << 8);
+}
+
+uint16_t
+getU16_state (int addr)
+{
+  enum { S_LOAD0 = 0, S_LOAD1 = 1, S_DONE = 2 } state;
+  uint16_t out;
+  int i;
+
+  /* Minimal state machine.  */
+  state = S_LOAD0;
+  out = 0;
+  i = addr;
+
+  while (state != S_DONE)
+    {
+      if (state == S_LOAD0)
+        {
+          out = (uint16_t) data[i];
+          state = S_LOAD1;
+        }
+      else if (state == S_LOAD1)
+        {
+          out |= (uint16_t) data[i + 1] << 8;
+          state = S_DONE;
+        }
+    }
+
+  return out;
+}
-- 
2.51.0
