The existing filterPixelToShort_48x64 high bit-depth code for vectors of
at least 256 bits uses predication to ensure that only the low 256 bits
of each vector are operated on. However, the address arithmetic uses
"mul vl" offsets, which scale with the vector length, so for vectors of
512 bits or longer the second and third loads and stores access the
wrong addresses. Since we are operating on the fixed low 256 bits of
the vectors here, fix the code by hard-coding the address offsets to
multiples of 32 bytes.
---
 source/common/aarch64/p2s-sve.S | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S
index 85bb14b3d..11e63ddab 100644
--- a/source/common/aarch64/p2s-sve.S
+++ b/source/common/aarch64/p2s-sve.S
@@ -401,10 +401,12 @@ function PFX(filterPixelToShort_48x64_sve)
     ret
 .vl_gt_16_filterPixelToShort_high_48x64:
     ptrue           p0.h, vl16
+    mov             x4, #16
+    mov             x5, #32
 .rept 64
     ld1h            {z0.h}, p0/z, [x0]
-    ld1h            {z1.h}, p0/z, [x0, #1, mul vl]
-    ld1h            {z2.h}, p0/z, [x0, #2, mul vl]
+    ld1h            {z1.h}, p0/z, [x0, x4, lsl #1]
+    ld1h            {z2.h}, p0/z, [x0, x5, lsl #1]
     add             x0, x0, x1
     lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
     lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
@@ -413,8 +415,8 @@ function PFX(filterPixelToShort_48x64_sve)
     add             z1.h, p0/m, z1.h, z31.h
     add             z2.h, p0/m, z2.h, z31.h
     st1h            {z0.h}, p0, [x2]
-    st1h            {z1.h}, p0, [x2, #1, mul vl]
-    st1h            {z2.h}, p0, [x2, #2, mul vl]
+    st1h            {z1.h}, p0, [x2, x4, lsl #1]
+    st1h            {z2.h}, p0, [x2, x5, lsl #1]
     add             x2, x2, x3
 .endr
     ret
-- 
2.34.1

From 9b2d75b095b847bca9b0ad315132ff17e96b1506 Mon Sep 17 00:00:00 2001
From: George Steed <george.st...@arm.com>
Date: Sun, 9 Mar 2025 18:03:36 +0000
Subject: [PATCH] p2s-sve.S: Fix filterPixelToShort_48x64 for longer SVE
 vectors

The existing filterPixelToShort_48x64 high bit-depth code for vectors of
at least 256 bits uses predication to ensure that only the low 256 bits
of each vector are operated on. However, the address arithmetic uses
"mul vl" offsets, which scale with the vector length, so for vectors of
512 bits or longer the second and third loads and stores access the
wrong addresses. Since we are operating on the fixed low 256 bits of
the vectors here, fix the code by hard-coding the address offsets to
multiples of 32 bytes.
---
 source/common/aarch64/p2s-sve.S | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S
index 85bb14b3d..11e63ddab 100644
--- a/source/common/aarch64/p2s-sve.S
+++ b/source/common/aarch64/p2s-sve.S
@@ -401,10 +401,12 @@ function PFX(filterPixelToShort_48x64_sve)
     ret
 .vl_gt_16_filterPixelToShort_high_48x64:
     ptrue           p0.h, vl16
+    mov             x4, #16
+    mov             x5, #32
 .rept 64
     ld1h            {z0.h}, p0/z, [x0]
-    ld1h            {z1.h}, p0/z, [x0, #1, mul vl]
-    ld1h            {z2.h}, p0/z, [x0, #2, mul vl]
+    ld1h            {z1.h}, p0/z, [x0, x4, lsl #1]
+    ld1h            {z2.h}, p0/z, [x0, x5, lsl #1]
     add             x0, x0, x1
     lsl             z0.h, p0/m, z0.h, #P2S_SHIFT
     lsl             z1.h, p0/m, z1.h, #P2S_SHIFT
@@ -413,8 +415,8 @@ function PFX(filterPixelToShort_48x64_sve)
     add             z1.h, p0/m, z1.h, z31.h
     add             z2.h, p0/m, z2.h, z31.h
     st1h            {z0.h}, p0, [x2]
-    st1h            {z1.h}, p0, [x2, #1, mul vl]
-    st1h            {z2.h}, p0, [x2, #2, mul vl]
+    st1h            {z1.h}, p0, [x2, x4, lsl #1]
+    st1h            {z2.h}, p0, [x2, x5, lsl #1]
     add             x2, x2, x3
 .endr
     ret
-- 
2.34.1

_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to