Re: RFR: 8355216: Accelerate P-256 arithmetic on aarch64 [v5]

Andrew Dinn Mon, 18 May 2026 04:56:12 -0700

On Fri, 15 May 2026 09:52:20 GMT, Ferenc Rakoczi <[email protected]> wrote:


>> An aarch64 implementation of the MontgomeryIntegerPolynomial256.mult() 
>> method and IntegerPolynomial.conditionalAssign(). Since 64-bit 
>> multiplication is not supported on Neon and manually performing this 
>> operation with 32-bit limbs is slower than with GPRs, a hybrid neon/gpr 
>> approach is used. Neon instructions are used to compute intermediate values 
>> used in the last two iterations of the main "loop", while the GPRs compute 
>> the first few iterations. At the method level this improves performance by 
>> ~9% and at the API level roughly 5%.
>> 
>> 
>> 
>> ---------
>> - [x] I confirm that I make this contribution in accordance with the 
>> [OpenJDK Interim AI Policy](https://openjdk.org/legal/ai).
>
> Ferenc Rakoczi has updated the pull request incrementally with one additional 
> commit since the last revision:
> 
>   Accepting more suggestions from Andrew Dinn.

@ferakocz I have made some suggestions for comments that will aid maintainers 
in understanding what the Neon-based subset of the implementation is doing. I 
still need to work through the GPR-based code.

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7863:

> 7861:     __ sub(sp, sp, 128);
> 7862:     __ mov(mul_ptr, sp);
> 7863: 

Suggestion:

    // neon and gpr computations are interleaved to maximize parallelism

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7864:

> 7862:     __ mov(mul_ptr, sp);
> 7863: 
> 7864:     neon_partial_mult_64(A, b_lows, a_vals, 0);

Suggestion:

    // cross-multiply low * low for limbs b0-b3 and a3-a4 in parallel
    neon_partial_mult_64(A, b_lows, a_vals, 0);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7871:

> 7869:     __ andr(n, low, limb_mask);
> 7870: 
> 7871:     neon_partial_mult_64(B, b_highs, a_vals, 0);

Suggestion:

    // cross-multiply high * low for limbs b0-b3 and a3-a4 in parallel
    neon_partial_mult_64(B, b_highs, a_vals, 0);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7880:

> 7878:     __ add(c_i, c_i, high);
> 7879: 
> 7880:     neon_partial_mult_64(C, b_lows, a_vals, 1);

Suggestion:

    // cross-multiply low * high for limbs b0-b3 and a3-a4 in parallel
    neon_partial_mult_64(C, b_lows, a_vals, 1);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7885:

> 7883:     gpr_partial_mult_52(a_i, b_1, high, low, tmp, limb_mask);
> 7884: 
> 7885:     neon_partial_mult_64(D, b_highs, a_vals, 1);

Suggestion:

    // cross-multiply high * high for limbs b0-b3 and a3-a4 in parallel
    neon_partial_mult_64(D, b_highs, a_vals, 1);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7894:

> 7892:     __ mov(c_i, high);
> 7893: 
> 7894:     vs_addv(B, __ T2D, B, C); // Store (B+C) in B

Suggestion:

    // combine neon 32-bit partial products, regrouping to produce
    // 8*52-bit low products in A and 8*52-bit high products in D

    // add low*high/high*low intermediate products before regrouping
    vs_addv(B, __ T2D, B, C); // Store (B+C) in B

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7902:

> 7900:     __ mov(c_i, high);
> 7901: 
> 7902:     vs_shl(D, __ T2D, D, montMulP256Shift1);

Suggestion:

    // shift high*high (40-bit) product up into 52-bits of output
    vs_shl(D, __ T2D, D, montMulP256Shift1);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7907:

> 7905:     gpr_partial_mult_52(a_i, b_3, high, low, tmp, limb_mask);
> 7906: 
> 7907:     vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) 
> >>> 20)

Suggestion:

    // shift high 32 (or 33) bits of intermediate products for addition to D
    vs_ushr(C, __ T2D, B, 32 - montMulP256Shift1); // Use C for ((B+C) >>> 20)

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7916:

> 7914:     __ mov(c_i, high);
> 7915: 
> 7916:     vs_shl(B, __ T2D, B, 32);

Suggestion:

    // shift low 32 bits of intermediate product up for masking and addition to 
A
    vs_shl(B, __ T2D, B, 32);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7921:

> 7919:     gpr_partial_mult_52(a_i, b_4, high, low, tmp, limb_mask);
> 7920: 
> 7921:     vs_addv(D, __ T2D, D, C);

Suggestion:

    // add high bits of intermediate product into D 
    vs_addv(D, __ T2D, D, C);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7930:

> 7928:     __ str(high, Address(c_ptr, 32));
> 7929: 
> 7930:     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)

Suggestion:

    // top 12 bits of 32*32 bit product in A need adding into high 56-bit output
    vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7931:

> 7929: 
> 7930:     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
> 7931:     vs_andr(B, B, limb_mask_vec);

Suggestion:

    // Only 20 of the 32 bits now in the top of B should be added into A
    vs_andr(B, B, limb_mask_vec);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7932:

> 7930:     vs_ushr(C, __ T2D, A, 52); // C now holds (A >>> 52)
> 7931:     vs_andr(B, B, limb_mask_vec);
> 7932:     vs_andr(A, A, limb_mask_vec);

Suggestion:

    // reduce original 64-bit product to 52-bits
    vs_andr(A, A, limb_mask_vec);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7933:

> 7931:     vs_andr(B, B, limb_mask_vec);
> 7932:     vs_andr(A, A, limb_mask_vec);
> 7933:     vs_addv(D, __ T2D, D, C);

Suggestion:

    // add intermediate products to high 52-bit result in D
    vs_addv(D, __ T2D, D, C);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7934:

> 7932:     vs_andr(A, A, limb_mask_vec);
> 7933:     vs_addv(D, __ T2D, D, C);
> 7934:     vs_addv(A, __ T2D, A, B);

Suggestion:

    // add 20/21 bits of intermediate product in top of B into low 52-bit result
    vs_addv(A, __ T2D, A, B);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7936:

> 7934:     vs_addv(A, __ T2D, A, B);
> 7935:     vs_ushr(B, __ T2D, A, montMulP256Shift2);
> 7936:     vs_andr(A, A, limb_mask_vec);

Suggestion:

    // save and then mask off any overflow bit from computing low 52-bit result
    vs_ushr(B, __ T2D, A, montMulP256Shift2);
    vs_andr(A, A, limb_mask_vec);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7937:

> 7935:     vs_ushr(B, __ T2D, A, montMulP256Shift2);
> 7936:     vs_andr(A, A, limb_mask_vec);
> 7937:     vs_addv(D, __ T2D, D, B);

Suggestion:

    // add any remaining carry into the high 52-bit result
    vs_addv(D, __ T2D, D, B);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp line 7939:

> 7937:     vs_addv(D, __ T2D, D, B);
> 7938: 
> 7939:     vs_st1_interleaved(A, D, mul_ptr);

Suggestion:

    // interleaved write outputs 8 successive (low, high) 56-bit pairs
    vs_st1_interleaved(A, D, mul_ptr);

-------------

PR Review: https://git.openjdk.org/jdk/pull/30941#pullrequestreview-4308782729
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257665772
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257692695
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257707239
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257708650
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257711021
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257755691
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257862389
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257944144
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257950111
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257961046
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257976234
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3257998559
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258013794
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258019610
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258131651
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258154087
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258082909
PR Review Comment: https://git.openjdk.org/jdk/pull/30941#discussion_r3258089648

Re: RFR: 8355216: Accelerate P-256 arithmetic on aarch64 [v5]

Reply via email to