[libav-devel] [PATCH 1/4] h264/aarch64: sign extend int stride in loop filter asm

2019-01-01 Thread Janne Grunau
---
 libavcodec/aarch64/h264dsp_neon.S | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libavcodec/aarch64/h264dsp_neon.S 
b/libavcodec/aarch64/h264dsp_neon.S
index 9b4610a4d4..60ffa24500 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -130,6 +130,7 @@ endfunc
 
 function ff_h264_h_loop_filter_luma_neon, export=1
 h264_loop_filter_start
+sxtw x1,  w1
 
 sub x0,  x0,  #4
 ld1 {v6.8B},  [x0], x1
@@ -210,6 +211,7 @@ endfunc
 
 function ff_h264_v_loop_filter_chroma_neon, export=1
 h264_loop_filter_start
+sxtw x1,  w1
 
 sub x0,  x0,  x1, lsl #1
 ld1 {v18.8B}, [x0], x1
@@ -228,6 +230,7 @@ endfunc
 
 function ff_h264_h_loop_filter_chroma_neon, export=1
 h264_loop_filter_start
+sxtw x1,  w1
 
 sub x0,  x0,  #2
 ld1 {v18.S}[0], [x0], x1
-- 
2.20.1

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 2/4] checkasm/h264: add loop filter tests

2019-01-01 Thread Janne Grunau
---
 tests/checkasm/h264dsp.c | 124 +++
 1 file changed, 124 insertions(+)

diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c
index f355a72a74..706fc79397 100644
--- a/tests/checkasm/h264dsp.c
+++ b/tests/checkasm/h264dsp.c
@@ -28,6 +28,7 @@
 #include "libavutil/intreadwrite.h"
 
 static const uint32_t pixel_mask[3] = { 0xffffffff, 0x01ff01ff, 0x03ff03ff };
+static const uint32_t pixel_mask_lf[3] = { 0xff0fff0f, 0x01ff000f, 0x03ff000f };
 
 #define SIZEOF_PIXEL ((bit_depth + 7) / 8)
 #define SIZEOF_COEF  (2 * ((bit_depth + 7) / 8))
@@ -312,9 +313,132 @@ static void check_idct_multiple(void)
 }
 }
 
+
+static void check_loop_filter(void)
+{
+LOCAL_ALIGNED_16(uint8_t, dst, [32 * 16 * 2]);
+LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 16 * 2]);
+LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 16 * 2]);
+H264DSPContext h;
+int bit_depth;
+int alphas[36], betas[36];
+int8_t tc0[36][4];
+
+declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, int stride,
+  int alpha, int beta, int8_t *tc0);
+
+for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+int i, j, a, c;
+uint32_t mask = pixel_mask_lf[bit_depth - 8];
+ff_h264dsp_init(&h, bit_depth, 1);
+for (i = 35, a = 255, c = 250; i >= 0; i--) {
+alphas[i] = a << (bit_depth - 8);
+betas[i]  = (i + 1) / 2 << (bit_depth - 8);
+tc0[i][0] = tc0[i][3] = (c + 6) / 10;
+tc0[i][1] = (c + 7) / 15;
+tc0[i][2] = (c + 9) / 20;
+a = a*9/10;
+c = c*9/10;
+}
+
+#define CHECK_LOOP_FILTER(name, align, ...) \
+do {\
+if (check_func(h.name, #name "_%dbpp", bit_depth)) {\
+for (j = 0; j < 36; j++) {  \
+intptr_t off = 8 * 32 + (j & 15) * 4 * !align;  \
+for (i = 0; i < 1024; i+=4) {   \
+AV_WN32A(dst + i, rnd() & mask);\
+}   \
+memcpy(dst0, dst, 32 * 16 * 2); \
+memcpy(dst1, dst, 32 * 16 * 2); \
+\
+call_ref(dst0 + off, 32, alphas[j], betas[j], tc0[j]); \
+call_new(dst1 + off, 32, alphas[j], betas[j], tc0[j]); \
+if (memcmp(dst0, dst1, 32 * 16 * SIZEOF_PIXEL)) {   \
+fprintf(stderr, #name ": j:%d, alpha:%d beta:%d " \
+"tc0:{%d,%d,%d,%d}\n", j, alphas[j], betas[j], \
+tc0[j][0], tc0[j][1], tc0[j][2], tc0[j][3]); \
+fail(); \
+}   \
+bench_new(dst1, 32, alphas[j], betas[j], tc0[j]);   \
+}   \
+}   \
+} while (0)
+
+CHECK_LOOP_FILTER(h264_v_loop_filter_luma, 1);
+CHECK_LOOP_FILTER(h264_h_loop_filter_luma, 0);
+CHECK_LOOP_FILTER(h264_h_loop_filter_luma_mbaff, 0);
+CHECK_LOOP_FILTER(h264_v_loop_filter_chroma, 1);
+CHECK_LOOP_FILTER(h264_h_loop_filter_chroma, 0);
+CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_mbaff, 0);
+#undef CHECK_LOOP_FILTER
+}
+}
+
+static void check_loop_filter_intra(void)
+{
+LOCAL_ALIGNED_16(uint8_t, dst, [32 * 16 * 2]);
+LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 16 * 2]);
+LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 16 * 2]);
+H264DSPContext h;
+int bit_depth;
+int alphas[36], betas[36];
+
+declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, int stride,
+  int alpha, int beta);
+
+for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
+int i, j, a;
+uint32_t mask = pixel_mask_lf[bit_depth - 8];
+ff_h264dsp_init(&h, bit_depth, 1);
+for (i = 35, a = 255; i >= 0; i--) {
+alphas[i] = a << (bit_depth - 8);
+betas[i]  = (i + 1) / 2 << (bit_depth - 8);
+a = a*9/10;
+}
+
+#define CHECK_LOOP_FILTER(name, align)  \
+do {\
+if (check_func(h.name, #name "_%dbpp", bit_depth)) {\
+for (j = 0; j < 36; j++) {  \
+intptr_t off = 8 * 32 + (j & 15) * 4 * !align;  \
+for (i = 0; i < 1024; i+=4) {   \
+AV_WN32A(dst + i, rnd() & mask);

[libav-devel] [PATCH 4/4] h264/aarch64: add intra loop filter neon asm

2019-01-01 Thread Janne Grunau
Add my neon asm from x264 relicensed under the LGPL 2.1 or later. Ported
(x264 uses nv12 chroma) and optimized.

Cycle count for checkasm --bench on a Snapdragon 820e:
h264_h_loop_filter_luma_intra_8bpp_c: 60.0
h264_h_loop_filter_luma_intra_8bpp_neon: 54.2
h264_v_loop_filter_luma_intra_8bpp_c: 148.3
h264_v_loop_filter_luma_intra_8bpp_neon: 73.8
h264_h_loop_filter_chroma_intra_8bpp_c: 27.8
h264_h_loop_filter_chroma_intra_8bpp_neon: 21.4
h264_h_loop_filter_chroma_mbaff_intra_8bpp_c: 15.8
h264_h_loop_filter_chroma_mbaff_intra_8bpp_neon: 15.7
h264_v_loop_filter_chroma_intra_8bpp_c: 45.8
h264_v_loop_filter_chroma_intra_8bpp_neon: 17.3
---
 libavcodec/aarch64/h264dsp_init_aarch64.c |  16 ++
 libavcodec/aarch64/h264dsp_neon.S | 297 ++
 2 files changed, 313 insertions(+)

diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c 
b/libavcodec/aarch64/h264dsp_init_aarch64.c
index b106f11134..07bda2ff07 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -29,10 +29,20 @@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int 
stride, int alpha,
  int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
  int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
+   int beta);
+void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
+   int beta);
 void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
+ int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride,
+   int alpha, int beta);
 
 void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
@@ -77,8 +87,14 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, 
const int bit_depth,
 if (have_neon(cpu_flags) && bit_depth == 8) {
 c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
 c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
+c->h264_v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon;
+c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
+
 c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
 c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
+c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
+c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
 
 c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
 c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
diff --git a/libavcodec/aarch64/h264dsp_neon.S 
b/libavcodec/aarch64/h264dsp_neon.S
index b649f1d018..448e575b8c 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2008 Mans Rullgard 
  * Copyright (c) 2013 Janne Grunau 
+ * Copyright (c) 2014 Janne Grunau 
  *
  * This file is part of Libav.
  *
@@ -181,6 +182,203 @@ function ff_h264_h_loop_filter_luma_neon, export=1
 ret
 endfunc
 
+
+.macro h264_loop_filter_start_intra
+orr w4,  w2,  w3
+cbnz w4,  1f
+ret
+1:
+sxtw x1,  w1
+dup v30.16b, w2 // alpha
+dup v31.16b, w3 // beta
+.endm
+
+.macro h264_loop_filter_luma_intra
+uabd v16.16b, v7.16b,  v0.16b // abs(p0 - q0)
+uabd v17.16b, v6.16b,  v7.16b // abs(p1 - p0)
+uabd v18.16b, v1.16b,  v0.16b // abs(q1 - q0)
+cmhi v19.16b, v30.16b, v16.16b   // < alpha
+cmhi v17.16b, v31.16b, v17.16b   // < beta
+cmhi v18.16b, v31.16b, v18.16b   // < beta
+
+movi v29.16b, #2
+ushr v30.16b, v30.16b, #2 // alpha >> 2
+add v30.16b, v30.16b, v29.16b   // (alpha >> 2) + 2
+cmhi v16.16b, v30.16b, v16.16b   // < (alpha >> 2) + 2
+
+and v19.16b, v19.16b, v17.16b
+and v19.16b, 

[libav-devel] [PATCH 3/4] h264/aarch64: optimize neon loop filter

2019-01-01 Thread Janne Grunau
Exit as soon as possible if no filtering will be done.

Improves the checkasm --bench cycle count on a Snapdragon 820e:
h264_h_loop_filter_luma_8bpp_c:  72.4 ->  72.5
h264_h_loop_filter_luma_8bpp_neon:   97.1 ->  56.3
h264_v_loop_filter_luma_8bpp_c: 174.0 -> 173.5
h264_v_loop_filter_luma_8bpp_neon:   62.9 ->  60.9
h264_h_loop_filter_chroma_8bpp_c:30.2 ->  30.3
h264_h_loop_filter_chroma_8bpp_neon: 51.6 ->  25.7
h264_v_loop_filter_chroma_8bpp_c:57.3 ->  57.3
h264_v_loop_filter_chroma_8bpp_neon: 28.0 ->  24.0
---
 libavcodec/aarch64/h264dsp_neon.S | 33 ++-
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/libavcodec/aarch64/h264dsp_neon.S 
b/libavcodec/aarch64/h264dsp_neon.S
index 60ffa24500..b649f1d018 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -54,9 +54,12 @@
 uabd v17.16B, v20.16B, v16.16B   // abs(p2 - p0)
 and v21.16B, v21.16B, v28.16B
 uabd v19.16B,  v4.16B,  v0.16B   // abs(q2 - q0)
+and v21.16B, v21.16B, v30.16B  // < beta
+shrn v30.8b,  v21.8h,  #4
+mov x7, v30.d[0]
 cmhi v17.16B, v22.16B, v17.16B   // < beta
-and v21.16B, v21.16B, v30.16B
 cmhi v19.16B, v22.16B, v19.16B   // < beta
+cbz x7,  9f
 and v17.16B, v17.16B, v21.16B
 and v19.16B, v19.16B, v21.16B
 and v24.16B, v24.16B, v21.16B
@@ -124,7 +127,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
 st1 {v16.16B}, [x0], x1
 st1 {v0.16B},  [x0], x1
 st1 {v19.16B}, [x0]
-
+9:
 ret
 endfunc
 
@@ -174,32 +177,34 @@ function ff_h264_h_loop_filter_luma_neon, export=1
 st1 {v16.S}[3], [x0], x1
 st1 {v0.S}[3],  [x0], x1
 st1 {v19.S}[3], [x0], x1
-
+9:
 ret
 endfunc
 
 .macro  h264_loop_filter_chroma
 dup v22.8B, w2  // alpha
+dup v23.8B, w3  // beta
 uxtl v24.8H, v24.8B
 uabd v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
-uxtl v4.8H,  v0.8B
 uabd v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
+uabd v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
+cmhi v26.8B, v22.8B, v26.8B  // < alpha
+cmhi v28.8B, v23.8B, v28.8B  // < beta
+cmhi v30.8B, v23.8B, v30.8B  // < beta
+uxtl v4.8H,  v0.8B
+and v26.8B, v26.8B, v28.8B
 usubw   v4.8H,  v4.8H,  v16.8B
-sli v24.8H, v24.8H, #8
+and v26.8B, v26.8B, v30.8B
 shl v4.8H,  v4.8H,  #2
-uabd v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
+mov x2,  v26.d[0]
+sli v24.8H, v24.8H, #8
 uaddw   v4.8H,  v4.8H,  v18.8B
-cmhi v26.8B, v22.8B, v26.8B  // < alpha
+cbz x2,  9f
 usubw   v4.8H,  v4.8H,  v2.8B
-dup v22.8B, w3  // beta
 rshrn   v4.8B,  v4.8H,  #3
-cmhi v28.8B, v22.8B, v28.8B  // < beta
-cmhi v30.8B, v22.8B, v30.8B  // < beta
 smin v4.8B,  v4.8B,  v24.8B
 neg v25.8B, v24.8B
-and v26.8B, v26.8B, v28.8B
 smax v4.8B,  v4.8B,  v25.8B
-and v26.8B, v26.8B, v30.8B
 uxtl v22.8H, v0.8B
 and v4.8B,  v4.8B,  v26.8B
 uxtl v28.8H, v16.8B
@@ -224,7 +229,7 @@ function ff_h264_v_loop_filter_chroma_neon, export=1
 sub x0,  x0,  x1, lsl #1
 st1 {v16.8B}, [x0], x1
 st1 {v0.8B},  [x0], x1
-
+9:
 ret
 endfunc
 
@@ -257,7 +262,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
 st1 {v16.S}[1], [x0], x1
 st1 {v0.S}[1],  [x0], x1
 st1 {v2.S}[1],  [x0], x1
-
+9:
 ret
 endfunc
 
-- 
2.20.1

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel