[libav-devel] [PATCH 1/4] h264/aarch64: sign extend int stride in loop filter asm
--- libavcodec/aarch64/h264dsp_neon.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index 9b4610a4d4..60ffa24500 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -130,6 +130,7 @@ endfunc function ff_h264_h_loop_filter_luma_neon, export=1 h264_loop_filter_start +sxtw x1, w1 sub x0, x0, #4 ld1 {v6.8B}, [x0], x1 @@ -210,6 +211,7 @@ endfunc function ff_h264_v_loop_filter_chroma_neon, export=1 h264_loop_filter_start +sxtw x1, w1 sub x0, x0, x1, lsl #1 ld1 {v18.8B}, [x0], x1 @@ -228,6 +230,7 @@ endfunc function ff_h264_h_loop_filter_chroma_neon, export=1 h264_loop_filter_start +sxtw x1, w1 sub x0, x0, #2 ld1 {v18.S}[0], [x0], x1 -- 2.20.1 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 2/4] checkasm/h264: add loop filter tests
--- tests/checkasm/h264dsp.c | 124 +++ 1 file changed, 124 insertions(+) diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c index f355a72a74..706fc79397 100644 --- a/tests/checkasm/h264dsp.c +++ b/tests/checkasm/h264dsp.c @@ -28,6 +28,7 @@ #include "libavutil/intreadwrite.h" static const uint32_t pixel_mask[3] = { 0xffffffff, 0x01ff01ff, 0x03ff03ff }; +static const uint32_t pixel_mask_lf[3] = { 0xff0fff0f, 0x01ff000f, 0x03ff000f }; #define SIZEOF_PIXEL ((bit_depth + 7) / 8) #define SIZEOF_COEF (2 * ((bit_depth + 7) / 8)) @@ -312,9 +313,132 @@ static void check_idct_multiple(void) } } + +static void check_loop_filter(void) +{ +LOCAL_ALIGNED_16(uint8_t, dst, [32 * 16 * 2]); +LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 16 * 2]); +LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 16 * 2]); +H264DSPContext h; +int bit_depth; +int alphas[36], betas[36]; +int8_t tc0[36][4]; + +declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, int stride, + int alpha, int beta, int8_t *tc0); + +for (bit_depth = 8; bit_depth <= 10; bit_depth++) { +int i, j, a, c; +uint32_t mask = pixel_mask_lf[bit_depth - 8]; +ff_h264dsp_init(&h, bit_depth, 1); +for (i = 35, a = 255, c = 250; i >= 0; i--) { +alphas[i] = a << (bit_depth - 8); +betas[i] = (i + 1) / 2 << (bit_depth - 8); +tc0[i][0] = tc0[i][3] = (c + 6) / 10; +tc0[i][1] = (c + 7) / 15; +tc0[i][2] = (c + 9) / 20; +a = a*9/10; +c = c*9/10; +} + +#define CHECK_LOOP_FILTER(name, align, ...) 
\ +do {\ +if (check_func(h.name, #name "_%dbpp", bit_depth)) {\ +for (j = 0; j < 36; j++) { \ +intptr_t off = 8 * 32 + (j & 15) * 4 * !align; \ +for (i = 0; i < 1024; i+=4) { \ +AV_WN32A(dst + i, rnd() & mask);\ +} \ +memcpy(dst0, dst, 32 * 16 * 2); \ +memcpy(dst1, dst, 32 * 16 * 2); \ +\ +call_ref(dst0 + off, 32, alphas[j], betas[j], tc0[j]); \ +call_new(dst1 + off, 32, alphas[j], betas[j], tc0[j]); \ +if (memcmp(dst0, dst1, 32 * 16 * SIZEOF_PIXEL)) { \ +fprintf(stderr, #name ": j:%d, alpha:%d beta:%d " \ +"tc0:{%d,%d,%d,%d}\n", j, alphas[j], betas[j], \ +tc0[j][0], tc0[j][1], tc0[j][2], tc0[j][3]); \ +fail(); \ +} \ +bench_new(dst1, 32, alphas[j], betas[j], tc0[j]); \ +} \ +} \ +} while (0) + +CHECK_LOOP_FILTER(h264_v_loop_filter_luma, 1); +CHECK_LOOP_FILTER(h264_h_loop_filter_luma, 0); +CHECK_LOOP_FILTER(h264_h_loop_filter_luma_mbaff, 0); +CHECK_LOOP_FILTER(h264_v_loop_filter_chroma, 1); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma, 0); +CHECK_LOOP_FILTER(h264_h_loop_filter_chroma_mbaff, 0); +#undef CHECK_LOOP_FILTER +} +} + +static void check_loop_filter_intra(void) +{ +LOCAL_ALIGNED_16(uint8_t, dst, [32 * 16 * 2]); +LOCAL_ALIGNED_16(uint8_t, dst0, [32 * 16 * 2]); +LOCAL_ALIGNED_16(uint8_t, dst1, [32 * 16 * 2]); +H264DSPContext h; +int bit_depth; +int alphas[36], betas[36]; + +declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *pix, int stride, + int alpha, int beta); + +for (bit_depth = 8; bit_depth <= 10; bit_depth++) { +int i, j, a; +uint32_t mask = pixel_mask_lf[bit_depth - 8]; +ff_h264dsp_init(&h, bit_depth, 1); +for (i = 35, a = 255; i >= 0; i--) { +alphas[i] = a << (bit_depth - 8); +betas[i] = (i + 1) / 2 << (bit_depth - 8); +a = a*9/10; +} + +#define CHECK_LOOP_FILTER(name, align) \ +do {\ +if (check_func(h.name, #name "_%dbpp", bit_depth)) {\ +for (j = 0; j < 36; j++) { \ +intptr_t off = 8 * 32 + (j & 15) * 4 * !align; \ +for (i = 0; i < 1024; i+=4) { \ +AV_WN32A(dst + i, rnd() & mask);
[libav-devel] [PATCH 4/4] h264/aarch64: add intra loop filter neon asm
Add my neon asm from x264 relicensed under the LGPL 2.1 or later. Ported (x264 uses nv12 chroma) and optimized. Cycle count for checkasm --bench on a Snapdragon 820e: h264_h_loop_filter_luma_intra_8bpp_c: 60.0 h264_h_loop_filter_luma_intra_8bpp_neon: 54.2 h264_v_loop_filter_luma_intra_8bpp_c: 148.3 h264_v_loop_filter_luma_intra_8bpp_neon: 73.8 h264_h_loop_filter_chroma_intra_8bpp_c: 27.8 h264_h_loop_filter_chroma_intra_8bpp_neon: 21.4 h264_h_loop_filter_chroma_mbaff_intra_8bpp_c: 15.8 h264_h_loop_filter_chroma_mbaff_intra_8bpp_neon: 15.7 h264_v_loop_filter_chroma_intra_8bpp_c: 45.8 h264_v_loop_filter_chroma_intra_8bpp_neon: 17.3 --- libavcodec/aarch64/h264dsp_init_aarch64.c | 16 ++ libavcodec/aarch64/h264dsp_neon.S | 297 ++ 2 files changed, 313 insertions(+) diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c index b106f11134..07bda2ff07 100644 --- a/libavcodec/aarch64/h264dsp_init_aarch64.c +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -29,10 +29,20 @@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha, + int beta); +void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha, + int beta); void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, int stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride, + int alpha, int beta); +void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride, + int alpha, int beta); void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, int log2_den, int weight, 
int offset); @@ -77,8 +87,14 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, if (have_neon(cpu_flags) && bit_depth == 8) { c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; +c->h264_v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon; +c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon; + c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; +c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon; +c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon; +c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon; c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index b649f1d018..448e575b8c 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -1,6 +1,7 @@ /* * Copyright (c) 2008 Mans Rullgard * Copyright (c) 2013 Janne Grunau + * Copyright (c) 2014 Janne Grunau * * This file is part of Libav. 
* @@ -181,6 +182,203 @@ function ff_h264_h_loop_filter_luma_neon, export=1 ret endfunc + +.macro h264_loop_filter_start_intra +orr w4, w2, w3 +cbnz w4, 1f +ret +1: +sxtw x1, w1 +dup v30.16b, w2 // alpha +dup v31.16b, w3 // beta +.endm + +.macro h264_loop_filter_luma_intra +uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0) +uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0) +uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0) +cmhi v19.16b, v30.16b, v16.16b // < alpha +cmhi v17.16b, v31.16b, v17.16b // < beta +cmhi v18.16b, v31.16b, v18.16b // < beta + +movi v29.16b, #2 +ushr v30.16b, v30.16b, #2 // alpha >> 2 +add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 +cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 + +and v19.16b, v19.16b, v17.16b +and v19.16b,
[libav-devel] [PATCH 3/4] h264/aarch64: optimize neon loop filter
Exit as soon as possible if no filtering will be done. Improves the checkasm --bench cycle count on a Snapdragon 820e: h264_h_loop_filter_luma_8bpp_c: 72.4 -> 72.5 h264_h_loop_filter_luma_8bpp_neon: 97.1 -> 56.3 h264_v_loop_filter_luma_8bpp_c: 174.0 -> 173.5 h264_v_loop_filter_luma_8bpp_neon: 62.9 -> 60.9 h264_h_loop_filter_chroma_8bpp_c: 30.2 -> 30.3 h264_h_loop_filter_chroma_8bpp_neon: 51.6 -> 25.7 h264_v_loop_filter_chroma_8bpp_c: 57.3 -> 57.3 h264_v_loop_filter_chroma_8bpp_neon: 28.0 -> 24.0 --- libavcodec/aarch64/h264dsp_neon.S | 33 ++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S index 60ffa24500..b649f1d018 100644 --- a/libavcodec/aarch64/h264dsp_neon.S +++ b/libavcodec/aarch64/h264dsp_neon.S @@ -54,9 +54,12 @@ uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0) and v21.16B, v21.16B, v28.16B uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0) +and v21.16B, v21.16B, v30.16B // < beta +shrn v30.8b, v21.8h, #4 +mov x7, v30.d[0] cmhi v17.16B, v22.16B, v17.16B // < beta -and v21.16B, v21.16B, v30.16B cmhi v19.16B, v22.16B, v19.16B // < beta +cbz x7, 9f and v17.16B, v17.16B, v21.16B and v19.16B, v19.16B, v21.16B and v24.16B, v24.16B, v21.16B @@ -124,7 +127,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1 st1 {v16.16B}, [x0], x1 st1 {v0.16B}, [x0], x1 st1 {v19.16B}, [x0] - +9: ret endfunc @@ -174,32 +177,34 @@ function ff_h264_h_loop_filter_luma_neon, export=1 st1 {v16.S}[3], [x0], x1 st1 {v0.S}[3], [x0], x1 st1 {v19.S}[3], [x0], x1 - +9: ret endfunc .macro h264_loop_filter_chroma dup v22.8B, w2 // alpha +dup v23.8B, w3 // beta uxtl v24.8H, v24.8B uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0) -uxtl v4.8H, v0.8B uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0) +uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0) +cmhi v26.8B, v22.8B, v26.8B // < alpha +cmhi v28.8B, v23.8B, v28.8B // < beta +cmhi v30.8B, v23.8B, v30.8B // < beta +uxtl v4.8H, v0.8B +and v26.8B, v26.8B, v28.8B usubw v4.8H, v4.8H, v16.8B -sli v24.8H, 
v24.8H, #8 +and v26.8B, v26.8B, v30.8B shl v4.8H, v4.8H, #2 -uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0) +mov x2, v26.d[0] +sli v24.8H, v24.8H, #8 uaddw v4.8H, v4.8H, v18.8B -cmhi v26.8B, v22.8B, v26.8B // < alpha +cbz x2, 9f usubw v4.8H, v4.8H, v2.8B -dup v22.8B, w3 // beta rshrn v4.8B, v4.8H, #3 -cmhi v28.8B, v22.8B, v28.8B // < beta -cmhi v30.8B, v22.8B, v30.8B // < beta smin v4.8B, v4.8B, v24.8B neg v25.8B, v24.8B -and v26.8B, v26.8B, v28.8B smax v4.8B, v4.8B, v25.8B -and v26.8B, v26.8B, v30.8B uxtl v22.8H, v0.8B and v4.8B, v4.8B, v26.8B uxtl v28.8H, v16.8B @@ -224,7 +229,7 @@ function ff_h264_v_loop_filter_chroma_neon, export=1 sub x0, x0, x1, lsl #1 st1 {v16.8B}, [x0], x1 st1 {v0.8B}, [x0], x1 - +9: ret endfunc @@ -257,7 +262,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1 st1 {v16.S}[1], [x0], x1 st1 {v0.S}[1], [x0], x1 st1 {v2.S}[1], [x0], x1 - +9: ret endfunc -- 2.20.1 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel