On Tue, 20 Jan 2026 10:01:31 GMT, Eric Fang <[email protected]> wrote:
> Eric Fang has updated the pull request with a new target base due to a merge
> or a rebase. The pull request now contains four commits:
>
> - Rebase commit 56d7b52
> - Merge branch 'master' into JDK-8372980-umin-umax-intrinsic
> - 8372980: [VectorAPI] AArch64: Add intrinsic support for unsigned min/max reduction operations
>
> This patch adds intrinsic support for UMIN and UMAX reduction operations
> in the Vector API on AArch64, enabling direct hardware instruction mapping
> for better performance.
>
> Changes:
> --------
>
> 1. C2 mid-end:
> - Added UMinReductionVNode and UMaxReductionVNode
>
> 2. AArch64 Backend:
> - Added uminp/umaxp/sve_uminv/sve_umaxv instructions
> - Updated match rules for all vector sizes and element types
> - Both NEON and SVE implementations are supported
>
> 3. Test:
> - Added UMIN_REDUCTION_V and UMAX_REDUCTION_V to IRNode.java
> - Added assembly tests in aarch64-asmtest.py for new instructions
> - Added a JTReg test file VectorUMinMaxReductionTest.java
>
> Different configurations were tested on aarch64 and x86 machines, and
> all tests passed.
>
> Test results of JMH benchmarks from the panama-vector project:
> --------
>
> On an Nvidia Grace machine with 128-bit SVE:
> ```
> Benchmark                        Unit     Before    Error     After       Error    Uplift
> Byte128Vector.UMAXLanes          ops/ms   411.60    42.18     25226.51    33.92    61.29
> Byte128Vector.UMAXMaskedLanes    ops/ms   558.56    85.12     25182.90    28.74    45.09
> Byte128Vector.UMINLanes          ops/ms   645.58    780.76    28396.29    103.11   43.99
> Byte128Vector.UMINMaskedLanes    ops/ms   621.09    718.27    26122.62    42.68    42.06
> Byte64Vector.UMAXLanes           ops/ms   296.33    34.44     14357.74    15.95    48.45
> Byte64Vector.UMAXMaskedLanes     ops/ms   376.54    44.01     14269.24    21.41    37.90
> Byte64Vector.UMINLanes           ops/ms   373.45    426.51    15425.36    66.20    41.31
> Byte64Vector.UMINMaskedLanes     ops/ms   353.32    346.87    14201.37    13.79    40.19
> Int128Vector.UMAXLanes           ops/ms   174.79    192.51    9906.07     286.93   56.67
> Int128Vector.UMAXMaskedLanes     ops/ms   157.23    206.68    10246.77    11.44    65.17
> Int64Vector.UMAXLanes            ops/ms   95.30     126.49    4719.30     98.57    49.52
> Int64Vector.UMAXMaskedLanes      ops/ms   88.19     87.44     4693.18     19.76    53.22
> Long128Vector.UMAXLanes          ops/ms   80.62     97.82     5064.01     35.52    62.82
> Long128Vector.UMAXMaskedLanes    ops/ms   78.15     102.91    5028.24     8.74     64.34
> Long64Vector.UMAXLanes           ops/ms   47.56     62.01     46.76       52.28    0.98
> Long64Vector.UMAXMaskedLanes     ops/ms   45.44     46.76     45.79       42.91    1.01
> Short128Vector.UMAXLanes         ops/ms   316.65    410.30    14814.82    23.65    46.79
> ...
> ```
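
As an aside, for anyone reading along: the Java-level operation this patch intrinsifies is an unsigned reduction over a vector, roughly as in the sketch below. The class, method, and data are made up purely for illustration, and it assumes a JDK whose incubator Vector API already exposes `VectorOperators.UMIN`/`UMAX` as associative operators (compile and run with `--add-modules jdk.incubator.vector`).

```java
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class UMinReduceSketch {
    static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_128;

    // Unsigned minimum of one 128-bit vector of ints. The reduceLanes(UMIN)
    // call is what C2 turns into a UMinReductionV node, which this patch maps
    // to uminv (NEON) or sve_uminv (SVE) on AArch64.
    static int uminLanes(int[] a) {
        IntVector v = IntVector.fromArray(SPECIES, a, 0);
        return v.reduceLanes(VectorOperators.UMIN);
    }

    public static void main(String[] args) {
        int[] a = { -1, 7, 42, Integer.MIN_VALUE };   // -1 is 0xFFFFFFFF unsigned
        // A signed MIN reduction would return Integer.MIN_VALUE; unsigned gives 7.
        System.out.println(Integer.toUnsignedString(uminLanes(a)));  // prints 7
    }
}
```
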
I found the selection logic here getting more complex and harder to follow. If you refactor things a little by making signed/unsigned a parameter to the assembly instruction, you can do something like the following. While this approach makes the assembler a bit more fiddly, `neon_reduce_minmax_integral()` ends up simpler. See what you think.

diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
index fc6e58b801c..35f29db1675 100644
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -2625,17 +2625,27 @@ template<typename R, typename... Rx>
 #undef INSN
 
 // Advanced SIMD three different
-#define INSN(NAME, opc, opc2, acceptT2D)                                                \
-  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+
+#define neon3different(U, opc, acceptT2D, Vd, T, Vn, Vm)                                \
+  {                                                                                     \
     guarantee(T != T1Q && T != T1D, "incorrect arrangement");                           \
     if (!acceptT2D) guarantee(T != T2D, "incorrect arrangement");                       \
-    if (opc2 == 0b101101) guarantee(T != T8B && T != T16B, "incorrect arrangement");    \
+    if (opc == 0b101101) guarantee(T != T8B && T != T16B, "incorrect arrangement");     \
     starti;                                                                             \
-    f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24);                        \
-    f((int)T >> 1, 23, 22), f(1, 21), rf(Vm, 16), f(opc2, 15, 10);                      \
+    f(0, 31), f((int)T & 1, 30), f(U, 29), f(0b01110, 28, 24);                          \
+    f((int)T >> 1, 23, 22), f(1, 21), rf(Vm, 16), f(opc, 15, 10);                       \
     rf(Vn, 5), rf(Vd, 0);                                                               \
   }
 
+#define INSN(NAME, U, opc, acceptT2D)                                                   \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    neon3different(U, opc, acceptT2D, Vd, T, Vn, Vm);                                   \
+  }
+#define INSN2(NAME, opc, acceptT2D)                                                     \
+  void NAME(int U, FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+    neon3different(U, opc, acceptT2D, Vd, T, Vn, Vm);                                   \
+  }
+
   INSN(addv,   0, 0b100001, true);  // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
   INSN(subv,   1, 0b100001, true);  // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
   INSN(sqaddv, 0, 0b000011, true);  // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
@@ -2663,21 +2673,33 @@ template<typename R, typename... Rx>
   INSN(sqdmulh,0, 0b101101, false); // accepted arrangements: T4H, T8H, T2S, T4S
   INSN(shsubv, 0, 0b001001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
 
+  INSN2(maxp, 0b101001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
+  INSN2(minp, 0b101011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
+
 #undef INSN
+#undef INSN2
 
 // Advanced SIMD across lanes
-#define INSN(NAME, opc, opc2, accepted)                                                 \
-  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {                   \
+#define neonAcrossLanes(U, opc, accepted, Vd, T, Vn)                                    \
+  {                                                                                     \
     guarantee(T != T1Q && T != T1D, "incorrect arrangement");                           \
     if (accepted < 3) guarantee(T != T2D, "incorrect arrangement");                     \
     if (accepted < 2) guarantee(T != T2S, "incorrect arrangement");                     \
     if (accepted < 1) guarantee(T == T8B || T == T16B, "incorrect arrangement");        \
     starti;                                                                             \
-    f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24);                        \
-    f((int)T >> 1, 23, 22), f(opc2, 21, 10);                                            \
+    f(0, 31), f((int)T & 1, 30), f(U, 29), f(0b01110, 28, 24);                          \
+    f((int)T >> 1, 23, 22), f(opc, 21, 10);                                             \
     rf(Vn, 5), rf(Vd, 0);                                                               \
   }
+#define INSN(NAME, U, opc, accepted)                                                    \
+  void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {                   \
+    neonAcrossLanes(U, opc, accepted, Vd, T, Vn);                                       \
+  }
+#define INSN2(NAME, opc, accepted)                                                      \
+  void NAME(int U, FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {            \
+    neonAcrossLanes(U, opc, accepted, Vd, T, Vn);                                       \
+  }
 
   INSN(absr,   0, 0b100000101110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
   INSN(negr,   1, 0b100000101110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
   INSN(notr,   1, 0b100000010110, 0); // accepted arrangements: T8B, T16B
@@ -2692,7 +2714,11 @@ template<typename R, typename... Rx>
   INSN(uaddlp, 1, 0b100000001010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
   INSN(uaddlv, 1, 0b110000001110, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
 
+  INSN2(maxv, 0b110000101010, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
+  INSN2(minv, 0b110001101010, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
+
 #undef INSN
+#undef INSN2
 
 #define INSN(NAME, opc)                                                                 \
   void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {                   \
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
index 3431b4f700a..b6e5e04e1e8 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
@@ -1973,10 +1973,22 @@ void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, Basic
   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
   assert_different_registers(dst, isrc);
   bool isQ = vector_length_in_bytes == 16;
-  bool is_min = (opc == Op_MinReductionV || opc == Op_UMinReductionV);
-  bool is_unsigned = (opc == Op_UMinReductionV || opc == Op_UMaxReductionV);
-  Assembler::Condition cond = is_min ? (is_unsigned ? Assembler::LO : Assembler::LT)
-                                     : (is_unsigned ? Assembler::HI : Assembler::GT);
+
+  bool is_min;
+  int is_unsigned;
+  Condition cond;
+  switch(opc) {
+  case Op_MinReductionV:
+    is_min = true; is_unsigned = false; cond = LT; break;
+  case Op_MaxReductionV:
+    is_min = false; is_unsigned = false; cond = GT; break;
+  case Op_UMinReductionV:
+    is_min = true; is_unsigned = true; cond = LO; break;
+  case Op_UMaxReductionV:
+    is_min = false; is_unsigned = true; cond = HI; break;
+  default:
+    ShouldNotReachHere();
+  }
 
   BLOCK_COMMENT("neon_reduce_minmax_integral {");
   if (bt == T_LONG) {
@@ -1993,18 +2005,12 @@ void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, Basic
     if (size == T2S) {
       // For T2S (2x32-bit elements), use pairwise instructions because
       // uminv/umaxv/sminv/smaxv don't support arrangement 2S.
-      if (is_unsigned) {
-        is_min ? uminp(vtmp, size, vsrc, vsrc) : umaxp(vtmp, size, vsrc, vsrc);
-      } else {
-        is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
-      }
+      is_min ? minp(is_unsigned, vtmp, size, vsrc, vsrc)
+             : maxp(is_unsigned, vtmp, size, vsrc, vsrc);
     } else {
       // For other sizes, use reduction to scalar instructions.
-      if (is_unsigned) {
-        is_min ? uminv(vtmp, size, vsrc) : umaxv(vtmp, size, vsrc);
-      } else {
-        is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
-      }
+      is_min ? minv(is_unsigned, vtmp, size, vsrc)
+             : maxv(is_unsigned, vtmp, size, vsrc);
     }
     if (bt == T_INT) {
       umov(dst, vtmp, S, 0);
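
One more Java-side aside, since the MaskedLanes rows in the numbers above measure the masked flavour: a masked unsigned reduction in user code looks roughly like the tail handling in this sketch. Names are made up, and it again assumes `VectorOperators.UMAX` is available and that masked-off lanes contribute the operation's identity (0 for unsigned max).

```java
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorMask;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class UMaxMaskedReduceSketch {
    static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_128;

    // Unsigned maximum over a whole array. Full vectors take the plain
    // reduceLanes(UMAX) path; the partial tail goes through the masked
    // reduceLanes(UMAX, mask) path, i.e. the UMaxReductionV-with-mask shape.
    static int umaxAll(int[] a) {
        int acc = 0;                                   // identity for unsigned max
        int i = 0;
        for (; i < SPECIES.loopBound(a.length); i += SPECIES.length()) {
            int r = IntVector.fromArray(SPECIES, a, i).reduceLanes(VectorOperators.UMAX);
            acc = Integer.compareUnsigned(r, acc) > 0 ? r : acc;
        }
        if (i < a.length) {
            VectorMask<Integer> m = SPECIES.indexInRange(i, a.length);
            int r = IntVector.fromArray(SPECIES, a, i, m).reduceLanes(VectorOperators.UMAX, m);
            acc = Integer.compareUnsigned(r, acc) > 0 ? r : acc;
        }
        return acc;
    }
}
```
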
-------------
PR Comment: https://git.openjdk.org/jdk/pull/28693#issuecomment-3773303625