On Fri, 15 May 2026 09:52:20 GMT, Ferenc Rakoczi <[email protected]> wrote:
>> An aarch64 implementation of the MontgomeryIntegerPolynomial256.mult() >> method and IntegerPolynomial.conditionalAssign(). Since 64-bit >> multiplication is not supported on Neon and manually performing this >> operation with 32-bit limbs is slower than with GPRs, a hybrid neon/gpr >> approach is used. Neon instructions are used to compute intermediate values >> used in the last two iterations of the main "loop", while the GPRs compute >> the first few iterations. At the method level this improves performance by >> ~9% and at the API level roughly 5%. >> >> >> >> --------- >> - [x] I confirm that I make this contribution in accordance with the >> [OpenJDK Interim AI Policy](https://openjdk.org/legal/ai). > > Ferenc Rakoczi has updated the pull request incrementally with one additional > commit since the last revision: > > Accepting more suggestions from Andrew Dinn. @ferakocz I have made some suggestions for comments that will aid maintainers in understanding what the Neon-based subset of the implementation is doing. I still need to work through the GPR-based code. src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7863: > 7861: __ sub(sp, sp, 128); > 7862: __ mov(mul_ptr, sp); > 7863: Suggestion: // neon and gpr computations are interleaved to maximize parallelism src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7864: > 7862: __ mov(mul_ptr, sp); > 7863: > 7864: neon_partial_mult_64(A, b_lows, a_vals, 0); Suggestion: // cross-multiply low * low for limbs b0-b3 and a3-a4 in parallel neon_partial_mult_64(A, b_lows, a_vals, 0); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7871: > 7869: __ andr(n, low, limb_mask); > 7870: > 7871: neon_partial_mult_64(B, b_highs, a_vals, 0); Suggestion: // cross-multiply high * low for limbs b0-b3 and a3-a4 in parallel neon_partial_mult_64(B, b_highs, a_vals, 0); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7880: > 7878: __ add(c_i, c_i, high); > 7879: > 7880: neon_partial_mult_64(C, b_lows, a_vals, 1); Suggestion: // cross-multiply low * high for limbs b0-b3 and a3-a4 in parallel neon_partial_mult_64(C, b_lows, a_vals, 1); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7885: > 7883: gpr_partial_mult_52(a_i, b_1, high, low, tmp, limb_mask); > 7884: > 7885: neon_partial_mult_64(D, b_highs, a_vals, 1); Suggestion: // cross-multiply high * high for limbs b0-b3 and a3-a4 in parallel neon_partial_mult_64(D, b_highs, a_vals, 1); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7894: > 7892: __ mov(c_i, high); > 7893: > 7894: vs_addv(B, __ T2D, B, C); // Store (B+C) in B Suggestion: // combine neon 32-bit partial products, regrouping to produce // 8*52-bit low products in A and 8*52-bit high products in D // add low*high/high*low intermediate products before regrouping vs_addv(B, __ T2D, B, C); // Store (B+C) in B src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7902: > 7900: __ mov(c_i, high); > 7901: > 7902: vs_shl(D, __ T2D, D, montMulP256Shift1); Suggestion: // shift high*high (40-bit) product up into 52-bits of output vs_shl(D, __ T2D, D, montMulP256Shift1); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7907: > 7905: gpr_partial_mult_52(a_i, b_3, high, low, tmp, limb_mask); > 7906: > 7907: vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) > >>> 20) Suggestion: // shift high 32 (or 33) bits of intermediate products for addition to D vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >>> 20) src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7916: > 7914: __ mov(c_i, high); > 7915: > 7916: vs_shl(B, __ T2D, B, 32); Suggestion: // shift low 32 bits of intermediate product up for masking and addition to A vs_shl(B, __ T2D, B, 32); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7921: > 7919: gpr_partial_mult_52(a_i, b_4, high, low, tmp, limb_mask); > 7920: > 7921: vs_addv(D, __ T2D, D, C); Suggestion: // add high bits of intermediate product into D vs_addv(D, __ T2D, D, C); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7930: > 7928: __ str(high, Address(c_ptr, 32)); > 7929: > 7930: vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) Suggestion: // top 12 bits of 32*32 bit product in A need adding into high 56-bit output vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7931: > 7929: > 7930: vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) > 7931: vs_andr(B, B, limb_mask_vec); Suggestion: // Only 20 of the 32 bits now in the top of B should be added into A vs_andr(B, B, limb_mask_vec); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7932: > 7930: vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52) > 7931: vs_andr(B, B, limb_mask_vec); > 7932: vs_andr(A, A, limb_mask_vec); Suggestion: // reduce original 64-bit product to 52-bits vs_andr(A, A, limb_mask_vec); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7933: > 7931: vs_andr(B, B, limb_mask_vec); > 7932: vs_andr(A, A, limb_mask_vec); > 7933: vs_addv(D, __ T2D, D, C); Suggestion: // add intermediate products to high 52-bit result in D vs_addv(D, __ T2D, D, C); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7934: > 7932: vs_andr(A, A, limb_mask_vec); > 7933: vs_addv(D, __ T2D, D, C); > 7934: vs_addv(A, __ T2D, A, B); Suggestion: // add 20/21 bits of intermediate product in top of B into low 52-bit result vs_addv(A, __ T2D, A, B); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7936: > 7934: vs_addv(A, __ T2D, A, B); > 7935: vs_ushr(B, __ T2D, A, montMulP256Shift2); > 7936: vs_andr(A, A, limb_mask_vec); Suggestion: // save and then mask off any overflow bit from computing low 52-bit result vs_ushr(B, __ T2D, A, montMulP256Shift2); vs_andr(A, A, limb_mask_vec); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7937: > 7935: vs_ushr(B, __ T2D, A, montMulP256Shift2); > 7936: vs_andr(A, A, limb_mask_vec); > 7937: vs_addv(D, __ T2D, D, B); Suggestion: // add any remaining carry into the high 52-bit result vs_addv(D, __ T2D, D, B); src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7939: > 7937: vs_addv(D, __ T2D, D, B); > 7938: > 7939: vs_st1_interleaved(A, D, mul_ptr); Suggestion: // interleaved write outputs 8 successive (low, high) 56-bit pairs vs_st1_interleaved(A, D, mul_ptr); ------------- PR Review: https://git.openjdk.org/jdk/pull/30941#pullrequestreview-4308782729 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257665772 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257692695 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257707239 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257708650 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257711021 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257755691 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257862389 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257944144 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257950111 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257961046 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257976234 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257998559 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258013794 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258019610 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258131651 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258154087 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258082909 PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258089648
