Hi Chen,

Thank you for reviewing these patches.
>In my view, compiler option is not good idea, unroll these loop in manual are
>more better, it helpful us find out optimzie point and improve algorithm
>future.
>
>For example, split DCT into deep Even part (EE/EEE, etc) does not get good
>performance

This series is intended to be largely NFC and to simply silence compiler
warnings for the AArch64 intrinsics files. We plan to push optimisation
patches for DCT primitives after this series.

Many thanks,
Hari

From: x265-devel <x265-devel-boun...@videolan.org> on behalf of Hari Limaye <hari.lim...@arm.com>
Date: Tuesday, 20 August 2024 at 18:42
To: x265-devel@videolan.org <x265-devel@videolan.org>
Subject: [x265] [PATCH 2/3] AArch64: Refactor loop unroll pragmas in dct primitives

Make #pragma unroll directives portable for Clang and GCC, as currently
GCC will simply ignore the unsupported directives.
---
 source/common/aarch64/dct-prim.cpp | 36 ++++++++++++++++++------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index 416532e54..acc50d4f4 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -5,6 +5,14 @@

 #include <arm_neon.h>

+#define X265_PRAGMA(text) _Pragma(#text)
+#if defined(__clang__)
+#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(unroll(n))
+#elif defined(__GNUC__)
+#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(GCC unroll (n))
+#else
+#define X265_PRAGMA_UNROLL(n)
+#endif

 namespace
 {
@@ -472,12 +480,12 @@ static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst

     const int add = 1 << (shift - 1);

-#pragma unroll(4)
+X265_PRAGMA_UNROLL(4)
     for (j = 0; j < line; j += 4)
     {
         /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */

-#pragma unroll(2)
+X265_PRAGMA_UNROLL(2)
         for (k = 0; k < 2; k++)
         {
             int32x4_t s;
@@ -496,7 +504,7 @@ static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst

         EE[3] = vsubq_s32(EEE[0] , EEO[0]);

-#pragma unroll(1)
+X265_PRAGMA_UNROLL(1)
         for (k = 0; k < 4; k += 4)
         {
             int32x4_t s[4];
@@ -522,14 +530,14 @@ static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst
         static const int32x4_t max = vdupq_n_s32(32767);
         const int32x4_t minus_shift = vdupq_n_s32(-shift);

-#pragma unroll(4)
+X265_PRAGMA_UNROLL(4)
         for (k = 0; k < 4; k++)
         {
             E[k] = vaddq_s32(EE[k] , EO[k]);
             E[k + 4] = vsubq_s32(EE[3 - k] , EO[3 - k]);
         }

-#pragma unroll(2)
+X265_PRAGMA_UNROLL(2)
         for (k = 0; k < 8; k += 4)
         {
             int32x4_t s[4];
@@ -584,7 +592,7 @@ static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst

         }

-#pragma unroll(2)
+X265_PRAGMA_UNROLL(2)
         for (k = 0; k < 8; k += 4)
         {
             int32x4_t t;
@@ -657,10 +665,10 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
     int16x4_t dst[32];
     int add = 1 << (shift - 1);

-#pragma unroll (8)
+X265_PRAGMA_UNROLL(8)
     for (j = 0; j < line; j += 4)
     {
-#pragma unroll (4)
+X265_PRAGMA_UNROLL(4)
         for (k = 0; k < 16; k += 4)
         {
             int32x4_t s[4];
@@ -681,7 +689,7 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst

         }

-#pragma unroll (2)
+X265_PRAGMA_UNROLL(2)
         for (k = 0; k < 8; k += 4)
         {
             int32x4_t s[4];
@@ -721,7 +729,7 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
             EEO[k + 3] = s[3];
         }

-#pragma unroll (2)
+X265_PRAGMA_UNROLL(2)
         for (k = 0; k < 2; k++)
         {
             int32x4_t s;
@@ -736,14 +744,14 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
         EEE[1] = vaddq_s32(EEEE[1], EEEO[1]);
         EEE[2] = vsubq_s32(EEEE[1], EEEO[1]);

-#pragma unroll (4)
+X265_PRAGMA_UNROLL(4)
         for (k = 0; k < 4; k++)
         {
             EE[k] = vaddq_s32(EEE[k], EEO[k]);
             EE[k + 4] = vsubq_s32((EEE[3 - k]), (EEO[3 - k]));
         }

-#pragma unroll (8)
+X265_PRAGMA_UNROLL(8)
         for (k = 0; k < 8; k++)
         {
             E[k] = vaddq_s32(EE[k], EO[k]);
@@ -755,7 +763,7 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst


-#pragma unroll (16)
+X265_PRAGMA_UNROLL(16)
         for (k = 0; k < 16; k++)
         {
             int32x4_t adde = vaddq_s32(vdupq_n_s32(add), E[k]);
@@ -777,7 +785,7 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst

         }

-#pragma unroll (8)
+X265_PRAGMA_UNROLL(8)
         for (k = 0; k < 32; k += 4)
         {
             int16x4_t x0 = dst[k + 0];
--
2.42.1
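
For anyone wanting to check the macro behaviour outside of the x265 tree, here is
a minimal standalone sketch of the same _Pragma stringification approach. The
macro definitions are copied from the patch; the file name, the sum16() loop and
the compiler invocations are illustrative assumptions, not part of this series.

// unroll_demo.cpp - hypothetical standalone example, not part of the patch.
// _Pragma() takes a string literal, so X265_PRAGMA stringises its argument;
// the same call site then expands to "#pragma unroll(n)" under Clang,
// "#pragma GCC unroll (n)" under GCC, and to nothing for other compilers.
#define X265_PRAGMA(text) _Pragma(#text)
#if defined(__clang__)
#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(unroll(n))
#elif defined(__GNUC__)
#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(GCC unroll (n))
#else
#define X265_PRAGMA_UNROLL(n)
#endif

// Illustrative loop: the unroll hint must directly precede the for statement.
int sum16(const int *v)
{
    int acc = 0;
X265_PRAGMA_UNROLL(4)
    for (int i = 0; i < 16; i++)
    {
        acc += v[i];
    }
    return acc;
}

// Example builds (flags are only a suggestion):
//   g++ -O2 -Wall -c unroll_demo.cpp
//   clang++ -O2 -Wall -c unroll_demo.cpp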