Add an implementation of luma_hvpp, using Neon I8MM implementation
for the horizontal part, and Armv8.0 Neon implementation for the
vertical part.
---
 source/common/aarch64/filter-neon-i8mm.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/source/common/aarch64/filter-neon-i8mm.cpp b/source/common/aarch64/filter-neon-i8mm.cpp
index f8334016d..fb42d6672 100644
--- a/source/common/aarch64/filter-neon-i8mm.cpp
+++ b/source/common/aarch64/filter-neon-i8mm.cpp
@@ -755,9 +755,29 @@ void interp4_horiz_pp_i8mm(const uint8_t *src, intptr_t srcStride, uint8_t *dst,
     }
 }
 
+// Declaration for use in interp_hv_pp_i8mm().
+template<int N, int width, int height>
+void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, uint8_t *dst,
+                         intptr_t dstStride, int coeffIdx);
+
+// Implementation of luma_hvpp, using Neon i8mm implementation for the
+// horizontal part, and Armv8.0 Neon implementation for the vertical part.
+template<int width, int height>
+void interp_hv_pp_i8mm(const pixel *src, intptr_t srcStride, pixel *dst,
+                       intptr_t dstStride, int idxX, int idxY)
+{
+    const int N_TAPS = 8;
+    ALIGN_VAR_32(int16_t, immed[width * (height + N_TAPS - 1)]);
+
+    interp8_horiz_ps_i8mm<width, height>(src, srcStride, immed, width, idxX, 1);
+    interp_vert_sp_neon<N_TAPS, width, height>(immed + (N_TAPS / 2 - 1) * width,
+                                               width, dst, dstStride, idxY);
+}
+
 #define LUMA_I8MM(W, H) \
         p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp8_horiz_pp_i8mm<W, H>; \
-        p.pu[LUMA_ ## W ## x ## H].luma_hps = interp8_horiz_ps_i8mm<W, H>;
+        p.pu[LUMA_ ## W ## x ## H].luma_hps = interp8_horiz_ps_i8mm<W, H>; \
+        p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_i8mm<W, H>;
 
 #define CHROMA_420_I8MM(W, H) \
         p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = \
-- 
2.42.1

From 988b25411508117185a4d88bb519838d2c40fd0b Mon Sep 17 00:00:00 2001
Message-ID: <988b25411508117185a4d88bb519838d2c40fd0b.1725629250.git.hari.lim...@arm.com>
In-Reply-To: <cover.1725629250.git.hari.lim...@arm.com>
References: <cover.1725629250.git.hari.lim...@arm.com>
From: Hari Limaye <hari.lim...@arm.com>
Date: Thu, 16 May 2024 00:07:55 +0100
Subject: [PATCH 11/14] AArch64: Add Armv8.6 Neon I8MM implementation of
 interp_hv_pp

Add an implementation of luma_hvpp, using Neon I8MM implementation
for the horizontal part, and Armv8.0 Neon implementation for the
vertical part.
---
 source/common/aarch64/filter-neon-i8mm.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/source/common/aarch64/filter-neon-i8mm.cpp b/source/common/aarch64/filter-neon-i8mm.cpp
index f8334016d..fb42d6672 100644
--- a/source/common/aarch64/filter-neon-i8mm.cpp
+++ b/source/common/aarch64/filter-neon-i8mm.cpp
@@ -755,9 +755,29 @@ void interp4_horiz_pp_i8mm(const uint8_t *src, intptr_t srcStride, uint8_t *dst,
     }
 }
 
+// Declaration for use in interp_hv_pp_i8mm().
+template<int N, int width, int height>
+void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, uint8_t *dst,
+                         intptr_t dstStride, int coeffIdx);
+
+// Implementation of luma_hvpp, using Neon i8mm implementation for the
+// horizontal part, and Armv8.0 Neon implementation for the vertical part.
+template<int width, int height>
+void interp_hv_pp_i8mm(const pixel *src, intptr_t srcStride, pixel *dst,
+                       intptr_t dstStride, int idxX, int idxY)
+{
+    const int N_TAPS = 8;
+    ALIGN_VAR_32(int16_t, immed[width * (height + N_TAPS - 1)]);
+
+    interp8_horiz_ps_i8mm<width, height>(src, srcStride, immed, width, idxX, 1);
+    interp_vert_sp_neon<N_TAPS, width, height>(immed + (N_TAPS / 2 - 1) * width,
+                                               width, dst, dstStride, idxY);
+}
+
 #define LUMA_I8MM(W, H) \
         p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp8_horiz_pp_i8mm<W, H>; \
-        p.pu[LUMA_ ## W ## x ## H].luma_hps = interp8_horiz_ps_i8mm<W, H>;
+        p.pu[LUMA_ ## W ## x ## H].luma_hps = interp8_horiz_ps_i8mm<W, H>; \
+        p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_i8mm<W, H>;
 
 #define CHROMA_420_I8MM(W, H) \
         p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = \
-- 
2.42.1

_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to