On Sat, 13 Dec 2025 01:21:14 GMT, Vladimir Ivanov <[email protected]> wrote:
> Good work, Xiaohong!
>
Thanks so much for your review!
> Can you, please, include samples of machine code generated before/after the
> patch (for AVX2 and AVX512)?
Sure. The generated code is unchanged for cases that need only **1 gather
load**. For cases that need **2 or 4** gather loads, the main differences
come from the **duplicated initialization instructions** emitted before the
iterations of 8B gather loads (which could be optimized in the future), and the
additional code generated for the **vector slice and merge** operations.
The following is an example of loading a `Short256Vector` under `-XX:UseAVX=2`,
which needs 2 gather loads. The corresponding Java code is:
private static final VectorSpecies<Short> S_SPECIES =
ShortVector.SPECIES_PREFERRED;
static void gather_short() {
for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
ShortVector.fromArray(S_SPECIES, sa, i, index, i)
.intoArray(sr, i);
}
}
static void gather_short_masked() {
VectorMask<Short> mask = VectorMask.fromArray(S_SPECIES, m, 0);
for (int i = 0; i < LENGTH; i += S_SPECIES.length()) {
ShortVector.fromArray(S_SPECIES, sa, i, index, i, mask)
.intoArray(sr, i);
}
}
Here is the kernel code generated **without** this patch:
0x00007a0e8c06ecb0: vmovd %r9d,%xmm1
0x00007a0e8c06ecb5: lea 0x10(%rbx,%rsi,2),%r14
0x00007a0e8c06ecba: mov %r13,%r8
0x00007a0e8c06ecbd: mov $0x10,%r9d
0x00007a0e8c06ecc3: vpxor %ymm5,%ymm5,%ymm5
0x00007a0e8c06ecc7: vpxor %ymm4,%ymm4,%ymm4
0x00007a0e8c06eccb: vpcmpeqd %ymm6,%ymm6,%ymm6
0x00007a0e8c06eccf: vpsubd %ymm6,%ymm5,%ymm6
0x00007a0e8c06ecd3: vpslld $0x1,%ymm6,%ymm6
0x00007a0e8c06ecd8: vmovdqu 0x41020(%rip),%ymm5 # Stub::Stub Generator vector_iota_indices_stub+128 0x00007a0e8c0afd00
; {external_word}
0x00007a0e8c06ece0: vpxor %ymm3,%ymm3,%ymm3
0x00007a0e8c06ece4: mov (%r8),%r11d
0x00007a0e8c06ece7: vpinsrw $0x0,(%r14,%r11,2),%xmm3,%xmm3
0x00007a0e8c06ecee: mov 0x4(%r8),%r11d
0x00007a0e8c06ecf2: vpinsrw $0x1,(%r14,%r11,2),%xmm3,%xmm3
0x00007a0e8c06ecf9: mov 0x8(%r8),%r11d
0x00007a0e8c06ecfd: vpinsrw $0x2,(%r14,%r11,2),%xmm3,%xmm3
0x00007a0e8c06ed04: mov 0xc(%r8),%r11d
0x00007a0e8c06ed08: vpinsrw $0x3,(%r14,%r11,2),%xmm3,%xmm3
0x00007a0e8c06ed0f: vpermd %ymm3,%ymm5,%ymm3
0x00007a0e8c06ed14: vpsubd %ymm6,%ymm5,%ymm5
0x00007a0e8c06ed18: vpor %ymm3,%ymm4,%ymm4
0x00007a0e8c06ed1c: add $0x10,%r8
0x00007a0e8c06ed20: sub $0x4,%r9d
0x00007a0e8c06ed24: jne 0x00007a0e8c06ece0
0x00007a0e8c06ed26: vmovdqu %ymm4,0x10(%rbp,%rsi,2)
And here is the kernel code generated **with** this patch:
0x000070118c06a033: vmovd %edi,%xmm5
0x000070118c06a037: vmovq %rbp,%xmm3
0x000070118c06a03c: vmovd %ecx,%xmm2
0x000070118c06a040: mov %r9d,(%rsp)
0x000070118c06a044: lea 0x10(%rsi,%r10,2),%r14 # start of the second gather_load operation
0x000070118c06a049: mov %r11,%rbp
0x000070118c06a04c: mov $0x8,%ecx
0x000070118c06a051: vpxor %xmm4,%xmm4,%xmm4
0x000070118c06a055: vpxor %xmm10,%xmm10,%xmm10
0x000070118c06a05a: vpcmpeqd %xmm11,%xmm11,%xmm11
0x000070118c06a05f: vpsubd %xmm11,%xmm4,%xmm11
0x000070118c06a064: vpslld $0x1,%xmm11,%xmm11
0x000070118c06a06a: vmovdqu 0x45c8e(%rip),%xmm4 # Stub::Stub Generator vector_iota_indices_stub+128 0x000070118c0afd00
; {external_word}
0x000070118c06a072: vpxor %xmm6,%xmm6,%xmm6
0x000070118c06a076: mov 0x0(%rbp),%edi
0x000070118c06a079: vpinsrw $0x0,(%r14,%rdi,2),%xmm6,%xmm6
0x000070118c06a080: mov 0x4(%rbp),%edi
0x000070118c06a083: vpinsrw $0x1,(%r14,%rdi,2),%xmm6,%xmm6
0x000070118c06a08a: mov 0x8(%rbp),%edi
0x000070118c06a08d: vpinsrw $0x2,(%r14,%rdi,2),%xmm6,%xmm6
0x000070118c06a094: mov 0xc(%rbp),%edi
0x000070118c06a097: vpinsrw $0x3,(%r14,%rdi,2),%xmm6,%xmm6
0x000070118c06a09e: vpermd %ymm6,%ymm4,%ymm6
0x000070118c06a0a3: vpsubd %xmm11,%xmm4,%xmm4
0x000070118c06a0a8: vpor %xmm6,%xmm10,%xmm10
0x000070118c06a0ac: add $0x10,%rbp
0x000070118c06a0b0: sub $0x4,%ecx
0x000070118c06a0b3: jne 0x000070118c06a072
0x000070118c06a0b5: vmovdqu %xmm10,%xmm4 # vector reinterpret, the end of the second gather_load
0x000070118c06a0ba: vperm2i128 $0x21,%ymm4,%ymm9,%ymm6 # vector slice
0x000070118c06a0c0: lea 0x10(%rsi,%r10,2),%r11 # start of the first gather_load operation
0x000070118c06a0c5: mov %rax,%rcx
0x000070118c06a0c8: mov $0x8,%r8d
0x000070118c06a0ce: vpxor %xmm10,%xmm10,%xmm10
0x000070118c06a0d3: vpxor %xmm4,%xmm4,%xmm4
0x000070118c06a0d7: vpcmpeqd %xmm13,%xmm13,%xmm13
0x000070118c06a0dc: vpsubd %xmm13,%xmm10,%xmm13
0x000070118c06a0e1: vpslld $0x1,%xmm13,%xmm13
0x000070118c06a0e7: vmovdqu 0x45c11(%rip),%xmm10 # Stub::Stub Generator vector_iota_indices_stub+128 0x000070118c0afd00
; {external_word}
0x000070118c06a0ef: vpxor %xmm12,%xmm12,%xmm12
0x000070118c06a0f4: mov (%rcx),%r9d
0x000070118c06a0f7: vpinsrw $0x0,(%r11,%r9,2),%xmm12,%xmm12
0x000070118c06a0fe: mov 0x4(%rcx),%r9d
0x000070118c06a102: vpinsrw $0x1,(%r11,%r9,2),%xmm12,%xmm12
0x000070118c06a109: mov 0x8(%rcx),%r9d
0x000070118c06a10d: vpinsrw $0x2,(%r11,%r9,2),%xmm12,%xmm12
0x000070118c06a114: mov 0xc(%rcx),%r9d
0x000070118c06a118: vpinsrw $0x3,(%r11,%r9,2),%xmm12,%xmm12
0x000070118c06a11f: vpermd %ymm12,%ymm10,%ymm12
0x000070118c06a124: vpsubd %xmm13,%xmm10,%xmm10
0x000070118c06a129: vpor %xmm12,%xmm4,%xmm4
0x000070118c06a12e: add $0x10,%rcx
0x000070118c06a132: sub $0x4,%r8d
0x000070118c06a136: jne 0x000070118c06a0ef
0x000070118c06a138: vmovdqu %xmm4,%xmm4 ; vector reinterpret, the end of the first gather_load
0x000070118c06a13c: vpor %ymm6,%ymm4,%ymm4 ; final merge
0x000070118c06a140: vmovq %xmm3,%r11
0x000070118c06a145: vmovdqu %ymm4,0x10(%r11,%r10,2) ;*invokestatic store {reexecute=0 rethrow=0 return_oop=0}
; - jdk.incubator.vector.ShortVector::intoArray@44 (line 3514)
; - VectorAPITest::gather_short@38 (line 116)
For the masked cases, besides the additional instructions shown above, more
code is generated for the **mask slice** operations.
I have also attached the full generated code for the various cases. Please
kindly share your feedback. Thanks a lot!
[avx2_short_max_after.txt](https://github.com/user-attachments/files/24160578/avx2_short_max_after.txt)
[avx2_short_max_before.txt](https://github.com/user-attachments/files/24160581/avx2_short_max_before.txt)
[avx3_short_max_before.txt](https://github.com/user-attachments/files/24160582/avx3_short_max_before.txt)
[avx3_short_max_after.txt](https://github.com/user-attachments/files/24160584/avx3_short_max_after.txt)
[avx3_short_max_masked_after.txt](https://github.com/user-attachments/files/24160638/avx3_short_max_masked_after.txt)
[avx3_short_max_masked_before.txt](https://github.com/user-attachments/files/24160644/avx3_short_max_masked_before.txt)
-------------
PR Comment: https://git.openjdk.org/jdk/pull/28520#issuecomment-3654421400