From d5c6ad3270b070c4fe27ae5796e4c7df07e77d2b Mon Sep 17 00:00:00 2001
Message-Id: <d5c6ad3270b070c4fe27ae5796e4c7df07e77d2b.1747738665.git.li.zha...@arm.com>
From: Li Zhang <li.zha...@arm.com>
Date: Mon, 5 May 2025 10:46:58 +0200
Subject: [PATCH] AArch64: Optimize standard bit-depth Neon transpose8x8

Combine 64-bit input vectors into 128-bit vectors before starting the
transpose to make use of the full vector bandwidth.
---
 source/common/aarch64/arm64-utils.cpp | 105 ++++++++++++--------------
 1 file changed, 49 insertions(+), 56 deletions(-)

diff --git a/source/common/aarch64/arm64-utils.cpp b/source/common/aarch64/arm64-utils.cpp
index af93729f1..7293b2e72 100644
--- a/source/common/aarch64/arm64-utils.cpp
+++ b/source/common/aarch64/arm64-utils.cpp
@@ -2,6 +2,7 @@
 #include "x265.h"
 #include "arm64-utils.h"
 #include <arm_neon.h>
+#include "mem-neon.h"
 
 namespace X265_NS
 {
@@ -10,65 +11,57 @@ namespace X265_NS
 
 void transpose8x8(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
 {
-    uint8x8_t a0 = vld1_u8(src + 0 * sstride);
-    uint8x8_t a1 = vld1_u8(src + 1 * sstride);
-    uint8x8_t a2 = vld1_u8(src + 2 * sstride);
-    uint8x8_t a3 = vld1_u8(src + 3 * sstride);
-    uint8x8_t a4 = vld1_u8(src + 4 * sstride);
-    uint8x8_t a5 = vld1_u8(src + 5 * sstride);
-    uint8x8_t a6 = vld1_u8(src + 6 * sstride);
-    uint8x8_t a7 = vld1_u8(src + 7 * sstride);
-
-    uint32x2_t b0 = vtrn1_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
-    uint32x2_t b1 = vtrn1_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
-    uint32x2_t b2 = vtrn1_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
-    uint32x2_t b3 = vtrn1_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
-    uint32x2_t b4 = vtrn2_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
-    uint32x2_t b5 = vtrn2_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
-    uint32x2_t b6 = vtrn2_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
-    uint32x2_t b7 = vtrn2_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
-
-    uint16x4_t c0 = vtrn1_u16(vreinterpret_u16_u32(b0),
-                              vreinterpret_u16_u32(b2));
-    uint16x4_t c1 = vtrn1_u16(vreinterpret_u16_u32(b1),
-                              vreinterpret_u16_u32(b3));
-    uint16x4_t c2 = vtrn2_u16(vreinterpret_u16_u32(b0),
-                              vreinterpret_u16_u32(b2));
-    uint16x4_t c3 = vtrn2_u16(vreinterpret_u16_u32(b1),
-                              vreinterpret_u16_u32(b3));
-    uint16x4_t c4 = vtrn1_u16(vreinterpret_u16_u32(b4),
-                              vreinterpret_u16_u32(b6));
-    uint16x4_t c5 = vtrn1_u16(vreinterpret_u16_u32(b5),
-                              vreinterpret_u16_u32(b7));
-    uint16x4_t c6 = vtrn2_u16(vreinterpret_u16_u32(b4),
-                              vreinterpret_u16_u32(b6));
-    uint16x4_t c7 = vtrn2_u16(vreinterpret_u16_u32(b5),
-                              vreinterpret_u16_u32(b7));
-
-    uint8x8_t d0 = vtrn1_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));
-    uint8x8_t d1 = vtrn2_u8(vreinterpret_u8_u16(c0), vreinterpret_u8_u16(c1));
-    uint8x8_t d2 = vtrn1_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));
-    uint8x8_t d3 = vtrn2_u8(vreinterpret_u8_u16(c2), vreinterpret_u8_u16(c3));
-    uint8x8_t d4 = vtrn1_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));
-    uint8x8_t d5 = vtrn2_u8(vreinterpret_u8_u16(c4), vreinterpret_u8_u16(c5));
-    uint8x8_t d6 = vtrn1_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));
-    uint8x8_t d7 = vtrn2_u8(vreinterpret_u8_u16(c6), vreinterpret_u8_u16(c7));
-
-    vst1_u8(dst + 0 * dstride, d0);
-    vst1_u8(dst + 1 * dstride, d1);
-    vst1_u8(dst + 2 * dstride, d2);
-    vst1_u8(dst + 3 * dstride, d3);
-    vst1_u8(dst + 4 * dstride, d4);
-    vst1_u8(dst + 5 * dstride, d5);
-    vst1_u8(dst + 6 * dstride, d6);
-    vst1_u8(dst + 7 * dstride, d7);
+    // a0: 00 01 02 03 04 05 06 07
+    // a1: 10 11 12 13 14 15 16 17
+    // a2: 20 21 22 23 24 25 26 27
+    // a3: 30 31 32 33 34 35 36 37
+    // a4: 40 41 42 43 44 45 46 47
+    // a5: 50 51 52 53 54 55 56 57
+    // a6: 60 61 62 63 64 65 66 67
+    // a7: 70 71 72 73 74 75 76 77
+    uint8x8_t a[8];
+    load_u8x8xn<8>(src, sstride, a);
+
+    // a04: 00 40 01 41 02 42 03 43 04 44 05 45 06 46 07 47
+    // a15: 10 50 11 51 12 52 13 53 14 54 15 55 16 56 17 57
+    // a26: 20 60 21 61 22 62 23 63 24 64 25 65 26 66 27 67
+    // a37: 30 70 31 71 32 72 33 73 34 74 35 75 36 76 37 77
+    // Combining with a zero vector is optimized away by the compiler,
+    // as the load already zeroes the upper half of the register.
+    uint8x16_t a04 = vzip1q_u8(vcombine_u8(a[0], vdup_n_u8(0)),
+                               vcombine_u8(a[4], vdup_n_u8(0)));
+    uint8x16_t a15 = vzip1q_u8(vcombine_u8(a[1], vdup_n_u8(0)),
+                               vcombine_u8(a[5], vdup_n_u8(0)));
+    uint8x16_t a26 = vzip1q_u8(vcombine_u8(a[2], vdup_n_u8(0)),
+                               vcombine_u8(a[6], vdup_n_u8(0)));
+    uint8x16_t a37 = vzip1q_u8(vcombine_u8(a[3], vdup_n_u8(0)),
+                               vcombine_u8(a[7], vdup_n_u8(0)));
+
+    // a0246[0]: 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
+    // a0246[1]: 04 24 44 64 05 25 45 65 06 26 46 66 07 27 47 67
+    // a1357[0]: 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
+    // a1357[1]: 14 34 54 74 15 35 55 75 16 36 56 76 17 37 57 77
+    uint8x16x2_t a0246 = vzipq_u8(a04, a26);
+    uint8x16x2_t a1357 = vzipq_u8(a15, a37);
+
+    // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+    // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+    // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+    // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+    uint8x16x2_t d0 = vzipq_u8(a0246.val[0], a1357.val[0]);
+    uint8x16x2_t d1 = vzipq_u8(a0246.val[1], a1357.val[1]);
+
+    vst1_u8(dst + 0 * dstride, vget_low_u8(d0.val[0]));
+    vst1_u8(dst + 1 * dstride, vget_high_u8(d0.val[0]));
+    vst1_u8(dst + 2 * dstride, vget_low_u8(d0.val[1]));
+    vst1_u8(dst + 3 * dstride, vget_high_u8(d0.val[1]));
+    vst1_u8(dst + 4 * dstride, vget_low_u8(d1.val[0]));
+    vst1_u8(dst + 5 * dstride, vget_high_u8(d1.val[0]));
+    vst1_u8(dst + 6 * dstride, vget_low_u8(d1.val[1]));
+    vst1_u8(dst + 7 * dstride, vget_high_u8(d1.val[1]));
 }
 
-
-
-
 void transpose16x16(uint8_t *dst, const uint8_t *src, intptr_t dstride, intptr_t sstride)
 {
     uint8x16_t a0 = vld1q_u8(src + 0 * sstride);
--
2.39.5 (Apple Git-154)
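
Not part of the patch, but for anyone who wants to sanity-check the new kernel outside the x265 tree: the standalone harness below compares it against a scalar 8x8 transpose. The kernel body is copied from the hunk above; the x265 helper load_u8x8xn<8>() from mem-neon.h is replaced by plain vld1_u8() loads (assumed equivalent here, i.e. eight strided 64-bit row loads), and the file name and test pattern are made up for illustration. Build on an AArch64 machine, e.g. g++ -O2 check_transpose8x8.cpp && ./a.out.

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// Copy of the zip-based kernel from the patch, with plain vld1_u8 loads
// standing in for the x265 load_u8x8xn<8>() helper.
static void transpose8x8_neon(uint8_t *dst, const uint8_t *src,
                              intptr_t dstride, intptr_t sstride)
{
    uint8x8_t a[8];
    for (int i = 0; i < 8; i++)
        a[i] = vld1_u8(src + i * sstride);

    // Interleave rows i and i+4; the vcombine with zero is free because the
    // 64-bit load already zeroes the upper half of the 128-bit register.
    uint8x16_t a04 = vzip1q_u8(vcombine_u8(a[0], vdup_n_u8(0)),
                               vcombine_u8(a[4], vdup_n_u8(0)));
    uint8x16_t a15 = vzip1q_u8(vcombine_u8(a[1], vdup_n_u8(0)),
                               vcombine_u8(a[5], vdup_n_u8(0)));
    uint8x16_t a26 = vzip1q_u8(vcombine_u8(a[2], vdup_n_u8(0)),
                               vcombine_u8(a[6], vdup_n_u8(0)));
    uint8x16_t a37 = vzip1q_u8(vcombine_u8(a[3], vdup_n_u8(0)),
                               vcombine_u8(a[7], vdup_n_u8(0)));

    // Two more interleave stages complete the byte transpose.
    uint8x16x2_t a0246 = vzipq_u8(a04, a26);
    uint8x16x2_t a1357 = vzipq_u8(a15, a37);

    uint8x16x2_t d0 = vzipq_u8(a0246.val[0], a1357.val[0]);
    uint8x16x2_t d1 = vzipq_u8(a0246.val[1], a1357.val[1]);

    vst1_u8(dst + 0 * dstride, vget_low_u8(d0.val[0]));
    vst1_u8(dst + 1 * dstride, vget_high_u8(d0.val[0]));
    vst1_u8(dst + 2 * dstride, vget_low_u8(d0.val[1]));
    vst1_u8(dst + 3 * dstride, vget_high_u8(d0.val[1]));
    vst1_u8(dst + 4 * dstride, vget_low_u8(d1.val[0]));
    vst1_u8(dst + 5 * dstride, vget_high_u8(d1.val[0]));
    vst1_u8(dst + 6 * dstride, vget_low_u8(d1.val[1]));
    vst1_u8(dst + 7 * dstride, vget_high_u8(d1.val[1]));
}

int main()
{
    uint8_t src[8 * 8], dst[8 * 8], ref[8 * 8];
    for (int i = 0; i < 64; i++)
        src[i] = (uint8_t)(i * 37 + 11);  // arbitrary test pattern

    // Scalar reference: ref[j][i] = src[i][j].
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            ref[j * 8 + i] = src[i * 8 + j];

    transpose8x8_neon(dst, src, 8, 8);

    for (int i = 0; i < 64; i++)
    {
        if (dst[i] != ref[i])
        {
            printf("mismatch at %d: %d != %d\n", i, dst[i], ref[i]);
            return 1;
        }
    }
    printf("transpose8x8: OK\n");
    return 0;
}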