From a1b6bd47bcbad3f188a78616ca8c1b6134a113b5 Mon Sep 17 00:00:00 2001
From: Kieran Kunhya <kierank@obe.tv>
Date: Fri, 21 Oct 2022 04:18:11 +0100
Subject: [PATCH] RFC: v210enc optimisations and initial AVX-512

---
 libavcodec/x86/v210enc.asm    | 59 ++++++++++++++++++++++-------------
 libavcodec/x86/v210enc_init.c |  7 +++++
 2 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/v210enc.asm b/libavcodec/x86/v210enc.asm
index 965f2bea3c..2d0827bbd0 100644
--- a/libavcodec/x86/v210enc.asm
+++ b/libavcodec/x86/v210enc.asm
@@ -38,13 +38,16 @@ cextern pb_1
 cextern pb_FE
 %define v210_enc_max_8 pb_FE
 
-v210_enc_luma_shuf_8: times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
-v210_enc_luma_mult_8: times 2 dw 16,4,64,16,4,64,0,0
+v210_enc_mult_8: times 8 dw 4,64 ; pmullw factors per dword: low word *4 (<<2), high word *64 (<<6)
+v210_enc_shift_8: times 8 dw 2,6 ; the same shifts expressed as vpsllvw counts (used only when cpuflag(avx512))
+v210_enc_mask_8: times 8 db 0x00,0xff,0x00,0x00 ; selects byte 1 of every dword
+v210_enc_mask2_8: times 8 db 0xff,0x03,0xff,0xff ; keeps bits 0-9 and 16-31 of every dword, clears bits 10-15
 
-v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
-v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1
+v210_enc_luma_shuf1_8: times 2 db -1,0,-1,-1,1,-1,2,-1,-1,3,-1,-1,4,-1,5,-1 ; source bytes 0-5 -> dst bytes 1,4,6,9,12,14; -1 makes pshufb write zero
+v210_enc_luma_shuf2_8: times 2 db -1,6,-1,-1,7,-1,8,-1,-1,9,-1,-1,10,-1,11,-1 ; same dst layout for source bytes 6-11
 
-v210_enc_chroma_mult_8: times 2 dw 4,16,64,0,64,4,16,0
+v210_enc_chroma_shuf1_8: times 2 db 0,-1,8,-1,-1,1,-1,-1,9,-1,2,-1,-1,10,-1,-1 ; source bytes 0,8,1,9,2,10 -> dst bytes 0,2,5,8,10,13 (slots the luma shuffles leave zero)
+v210_enc_chroma_shuf2_8: times 2 db 3,-1,11,-1,-1,4,-1,-1,12,-1,5,-1,-1,13,-1,-1 ; same dst layout for source bytes 3,11,4,12,5,13
 
 SECTION .text
 
@@ -115,7 +118,7 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
 
     mova    m4, [v210_enc_min_8]
     mova    m5, [v210_enc_max_8]
-    pxor    m6, m6
+    mova    m6, [v210_enc_mask_8] ; m6 = byte-1-of-each-dword mask, loaded once before .loop
 
 .loop:
     movu        xm1, [yq+widthq*2]
@@ -124,16 +127,6 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
 %endif
     CLIPUB  m1, m4, m5
 
-    punpcklbw m0, m1, m6
-    ; can't unpack high bytes in the same way because we process
-    ; only six bytes at a time
-    pshufb  m1, [v210_enc_luma_shuf_8]
-
-    pmullw  m0, [v210_enc_luma_mult_8]
-    pmullw  m1, [v210_enc_luma_mult_8]
-    pshufb  m0, [v210_enc_luma_shuf_10]
-    pshufb  m1, [v210_enc_luma_shuf_10]
-
     movq         xm3, [uq+widthq]
     movhps       xm3, [vq+widthq]
 %if cpuflag(avx2)
@@ -143,14 +136,33 @@ cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
 %endif
     CLIPUB  m3, m4, m5
 
-    ; shuffle and multiply to get the same packing as in 10-bit
+    ; vpermi2b is the obvious choice but is too slow
+    pshufb  m0, m1, [v210_enc_luma_shuf1_8] ; m0 = luma bytes 0-5 of each 16-byte lane placed in their dword slots, rest zero
+    pshufb  m1, [v210_enc_luma_shuf2_8] ; m1 = luma bytes 6-11 of each lane, same slot layout
+
     pshufb  m2, m3, [v210_enc_chroma_shuf1_8]
     pshufb  m3, [v210_enc_chroma_shuf2_8]
 
-    pmullw  m2, [v210_enc_chroma_mult_8]
-    pmullw  m3, [v210_enc_chroma_mult_8]
-    pshufb  m2, [v210_enc_chroma_shuf_10]
-    pshufb  m3, [v210_enc_chroma_shuf_10]
+    por     m0, m2 ; merge luma and chroma: every dword now holds three samples in bytes 0-2, byte 3 is zero
+    por     m1, m3 ; same for the second half of the output
+
+    ; TODO: avx-512 masked mov?
+    pand    m2, m6, m0 ; m2 = only the middle sample (byte 1) of each dword
+    pand    m3, m6, m1
+
+    pslld   m2, 4 ; byte 1 moves from bit 8 to bit 12, i.e. sample<<2 placed at bit 10
+    pslld   m3, 4
+
+%if cpuflag(avx512)
+    vpsllvw  m0, [v210_enc_shift_8] ; per word: low word <<2, high word <<6 (byte 2 ends up at bit 22 of the dword)
+    vpsllvw  m1, [v210_enc_shift_8]
+%else
+    pmullw  m0, [v210_enc_mult_8] ; same shifts done as multiplies by 4 and 64
+    pmullw  m1, [v210_enc_mult_8]
+%endif
+
+    pand m0, [v210_enc_mask2_8] ; clear bits 10-15, where byte-1 bits spilled; the middle sample is carried in m2 instead
+    pand m1, [v210_enc_mask2_8] ; likewise for m1 / m3
 
     por     m0, m2
     por     m1, m3
@@ -182,3 +194,8 @@ v210_planar_pack_8
 INIT_YMM avx2
 v210_planar_pack_8
 %endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_YMM avx512
+v210_planar_pack_8
+%endif ; HAVE_AVX512_EXTERNAL: ymm-wide avx512 instantiation, takes the cpuflag(avx512) vpsllvw branch of the macro
diff --git a/libavcodec/x86/v210enc_init.c b/libavcodec/x86/v210enc_init.c
index 13a351dd1d..095ed5e913 100644
--- a/libavcodec/x86/v210enc_init.c
+++ b/libavcodec/x86/v210enc_init.c
@@ -27,6 +27,8 @@ void ff_v210_planar_pack_8_avx(const uint8_t *y, const uint8_t *u,
                                const uint8_t *v, uint8_t *dst, ptrdiff_t width);
 void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
                                 const uint8_t *v, uint8_t *dst, ptrdiff_t width);
+void ff_v210_planar_pack_8_avx512(const uint8_t *y, const uint8_t *u,
+                                  const uint8_t *v, uint8_t *dst, ptrdiff_t width);
 void ff_v210_planar_pack_10_ssse3(const uint16_t *y, const uint16_t *u,
                                   const uint16_t *v, uint8_t *dst,
                                   ptrdiff_t width);
@@ -52,4 +54,9 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
         s->sample_factor_10 = 2;
         s->pack_line_10     = ff_v210_planar_pack_10_avx2;
     }
+
+    if (EXTERNAL_AVX512(cpu_flags)) { /* last check in this function, so it overrides any earlier pack_line_8 choice */
+        s->sample_factor_8  = 2; /* NOTE(review): presumably tied to the ymm width of the asm (INIT_YMM avx512) - confirm */
+        s->pack_line_8      = ff_v210_planar_pack_8_avx512;
+    }
 }
-- 
2.24.1.windows.2

