The existing filterPixelToShort_48x64 high bit-depth code for vectors of at least 256 bits uses predication to ensure that only the low 256 bits of each vector are operated on. However, the address arithmetic (the "#1, mul vl" and "#2, mul vl" offsets) is performed relative to the vector length, so for vectors of 512 bits or longer the second and third loads and stores of each row access the wrong addresses. Since we are operating on the fixed low 256 bits of the vectors here, fix the code by hard-coding the address offsets to multiples of 32 bytes. --- source/common/aarch64/p2s-sve.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S index 85bb14b3d..11e63ddab 100644 --- a/source/common/aarch64/p2s-sve.S +++ b/source/common/aarch64/p2s-sve.S @@ -401,10 +401,12 @@ function PFX(filterPixelToShort_48x64_sve) ret .vl_gt_16_filterPixelToShort_high_48x64: ptrue p0.h, vl16 + mov x4, #16 + mov x5, #32 .rept 64 ld1h {z0.h}, p0/z, [x0] - ld1h {z1.h}, p0/z, [x0, #1, mul vl] - ld1h {z2.h}, p0/z, [x0, #2, mul vl] + ld1h {z1.h}, p0/z, [x0, x4, lsl #1] + ld1h {z2.h}, p0/z, [x0, x5, lsl #1] add x0, x0, x1 lsl z0.h, p0/m, z0.h, #P2S_SHIFT lsl z1.h, p0/m, z1.h, #P2S_SHIFT @@ -413,8 +415,8 @@ function PFX(filterPixelToShort_48x64_sve) add z1.h, p0/m, z1.h, z31.h add z2.h, p0/m, z2.h, z31.h st1h {z0.h}, p0, [x2] - st1h {z1.h}, p0, [x2, #1, mul vl] - st1h {z2.h}, p0, [x2, #2, mul vl] + st1h {z1.h}, p0, [x2, x4, lsl #1] + st1h {z2.h}, p0, [x2, x5, lsl #1] add x2, x2, x3 .endr ret -- 2.34.1
From 9b2d75b095b847bca9b0ad315132ff17e96b1506 Mon Sep 17 00:00:00 2001 From: George Steed <george.st...@arm.com> Date: Sun, 9 Mar 2025 18:03:36 +0000 Subject: [PATCH] p2s-sve.S: Fix filterPixelToShort_48x64 for longer SVE vectors The existing filterPixelToShort_48x64 high bit-depth code for vectors of at least 256 bits uses predication to ensure that only the low 256 bits of each vector are operated on. However, the address arithmetic (the "#1, mul vl" and "#2, mul vl" offsets) is performed relative to the vector length, so for vectors of 512 bits or longer the second and third loads and stores of each row access the wrong addresses. Since we are operating on the fixed low 256 bits of the vectors here, fix the code by hard-coding the address offsets to multiples of 32 bytes. --- source/common/aarch64/p2s-sve.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S index 85bb14b3d..11e63ddab 100644 --- a/source/common/aarch64/p2s-sve.S +++ b/source/common/aarch64/p2s-sve.S @@ -401,10 +401,12 @@ function PFX(filterPixelToShort_48x64_sve) ret .vl_gt_16_filterPixelToShort_high_48x64: ptrue p0.h, vl16 + mov x4, #16 + mov x5, #32 .rept 64 ld1h {z0.h}, p0/z, [x0] - ld1h {z1.h}, p0/z, [x0, #1, mul vl] - ld1h {z2.h}, p0/z, [x0, #2, mul vl] + ld1h {z1.h}, p0/z, [x0, x4, lsl #1] + ld1h {z2.h}, p0/z, [x0, x5, lsl #1] add x0, x0, x1 lsl z0.h, p0/m, z0.h, #P2S_SHIFT lsl z1.h, p0/m, z1.h, #P2S_SHIFT @@ -413,8 +415,8 @@ function PFX(filterPixelToShort_48x64_sve) add z1.h, p0/m, z1.h, z31.h add z2.h, p0/m, z2.h, z31.h st1h {z0.h}, p0, [x2] - st1h {z1.h}, p0, [x2, #1, mul vl] - st1h {z2.h}, p0, [x2, #2, mul vl] + st1h {z1.h}, p0, [x2, x4, lsl #1] + st1h {z2.h}, p0, [x2, x5, lsl #1] add x2, x2, x3 .endr ret -- 2.34.1
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel