[FFmpeg-devel] [PATCH v2] avcodec/mips: msa optimizations for vc1dsp
WMV3 decoding performance has improved from 3.66x to 5.23x (tested on 3A4000). --- libavcodec/mips/Makefile| 1 + libavcodec/mips/vc1dsp_init_mips.c | 30 ++- libavcodec/mips/vc1dsp_mips.h | 23 ++ libavcodec/mips/vc1dsp_msa.c| 461 libavutil/mips/generic_macros_msa.h | 3 + 5 files changed, 514 insertions(+), 4 deletions(-) create mode 100644 libavcodec/mips/vc1dsp_msa.c diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index c5b54d5..b4993f6 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -89,3 +89,4 @@ MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o +MSA-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_msa.o diff --git a/libavcodec/mips/vc1dsp_init_mips.c b/libavcodec/mips/vc1dsp_init_mips.c index 4adc9e1..c0007ff 100644 --- a/libavcodec/mips/vc1dsp_init_mips.c +++ b/libavcodec/mips/vc1dsp_init_mips.c @@ -23,6 +23,10 @@ #include "vc1dsp_mips.h" #include "config.h" +#define FN_ASSIGN(OP, X, Y, INSN) \ +dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##INSN; \ +dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##_16##INSN + #if HAVE_MMI static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) { @@ -49,10 +53,6 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_mmi; dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_mmi; -#define FN_ASSIGN(OP, X, Y, INSN) \ -dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##INSN; \ -dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##_16##INSN - FN_ASSIGN(put_, 0, 0, _mmi); FN_ASSIGN(put_, 0, 1, _mmi); FN_ASSIGN(put_, 0, 2, _mmi); @@ -100,9 +100,31 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) } #endif /* HAVE_MMI */ +#if HAVE_MSA +static av_cold void vc1dsp_init_msa(VC1DSPContext *dsp) +{ 
+dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_msa; +dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_msa; +dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_msa; + +FN_ASSIGN(put_, 1, 1, _msa); +FN_ASSIGN(put_, 1, 2, _msa); +FN_ASSIGN(put_, 1, 3, _msa); +FN_ASSIGN(put_, 2, 1, _msa); +FN_ASSIGN(put_, 2, 2, _msa); +FN_ASSIGN(put_, 2, 3, _msa); +FN_ASSIGN(put_, 3, 1, _msa); +FN_ASSIGN(put_, 3, 2, _msa); +FN_ASSIGN(put_, 3, 3, _msa); +} +#endif /* HAVE_MSA */ + av_cold void ff_vc1dsp_init_mips(VC1DSPContext *dsp) { #if HAVE_MMI vc1dsp_init_mmi(dsp); #endif /* HAVE_MMI */ +#if HAVE_MSA +vc1dsp_init_msa(dsp); +#endif /* HAVE_MSA */ } diff --git a/libavcodec/mips/vc1dsp_mips.h b/libavcodec/mips/vc1dsp_mips.h index 0db85fa..5f72e60 100644 --- a/libavcodec/mips/vc1dsp_mips.h +++ b/libavcodec/mips/vc1dsp_mips.h @@ -191,4 +191,27 @@ void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, int stride, int h, int x, int y); +void ff_vc1_inv_trans_8x8_msa(int16_t block[64]); +void ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block); +void ff_vc1_inv_trans_4x8_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block); + +#define FF_PUT_VC1_MSPEL_MC_MSA(hmode, vmode) \ +void ff_put_vc1_mspel_mc ## hmode ## vmode ## _msa(uint8_t *dst, \ + const uint8_t *src, \ + ptrdiff_t stride, int rnd); \ +void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa(uint8_t *dst, \ + const uint8_t *src, \ + ptrdiff_t stride, int rnd); + +FF_PUT_VC1_MSPEL_MC_MSA(1, 1); +FF_PUT_VC1_MSPEL_MC_MSA(1, 2); +FF_PUT_VC1_MSPEL_MC_MSA(1, 3); + +FF_PUT_VC1_MSPEL_MC_MSA(2, 1); +FF_PUT_VC1_MSPEL_MC_MSA(2, 2); +FF_PUT_VC1_MSPEL_MC_MSA(2, 3); + +FF_PUT_VC1_MSPEL_MC_MSA(3, 1); +FF_PUT_VC1_MSPEL_MC_MSA(3, 2); +FF_PUT_VC1_MSPEL_MC_MSA(3, 3); #endif /* AVCODEC_MIPS_VC1DSP_MIPS_H */ diff --git a/libavcodec/mips/vc1dsp_msa.c b/libavcodec/mips/vc1dsp_msa.c new file mode 100644 index 000..6e588e8 --- /dev/null +++ b/libavcodec/mips/vc1dsp_msa.c @@ -0,0 +1,461 @@ +/* + * 
Loongson SIMD optimized vc1dsp + * + * Copyright (c) 2019 Loongson Technology Corporation Limited + *gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Fo
Re: [FFmpeg-devel] [PATCH] avcodec/mips: msa optimizations for vc1dsp
>>+TRANSPOSE4x4_SW_SW(in_l0, in_l1, in_l2, in_l3, t_l1, t_l2, t_l3, t_l4); >>+TRANSPOSE4x4_SW_SW(in_r4, in_r5, in_r6, in_r7, in_l0, in_l1, in_l2, in_l3); >>+TRANSPOSE4x4_SW_SW(in_l4, in_l5, in_l6, in_l7, in_l4, in_l5, in_l6, in_l7); >>+in_r4 = t_l1, in_r5 = t_l2, in_r6 = t_l3, in_r7 = t_l4; > >It's better to transpose 'in_l0, in_l1, in_l2, in_l3' directly into themselves, and likewise for 'in_r4, in_r5, in_r6, in_r7'. >>+PUT_VC1_MSPEL_MC_MSA(2, 1); >>+PUT_VC1_MSPEL_MC_MSA(2, 2); >>+PUT_VC1_MSPEL_MC_MSA(2, 3); >>+ >>+PUT_VC1_MSPEL_MC_MSA(3, 1); >>+PUT_VC1_MSPEL_MC_MSA(3, 2); >>+PUT_VC1_MSPEL_MC_MSA(3, 3); > >Regarding the 'cnst_para*' values used in put_vc1_mspel_mc_h_v_msa, maybe you can follow the way 'shift_value' is used. >It may remove some if clauses, especially in 'ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa'. Thx, will fix in v2. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] avcodec/mips: Fixed four warnings in vc1dsp
Change the stride argument to ptrdiff_t in the following functions: ff_put_no_rnd_vc1_chroma_mc8_mmi, ff_put_no_rnd_vc1_chroma_mc4_mmi, ff_avg_no_rnd_vc1_chroma_mc8_mmi, ff_avg_no_rnd_vc1_chroma_mc4_mmi. --- libavcodec/mips/vc1dsp_mips.h | 8 libavcodec/mips/vc1dsp_mmi.c | 8 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libavcodec/mips/vc1dsp_mips.h b/libavcodec/mips/vc1dsp_mips.h index 5f72e60..5897dae 100644 --- a/libavcodec/mips/vc1dsp_mips.h +++ b/libavcodec/mips/vc1dsp_mips.h @@ -180,16 +180,16 @@ void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq); void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, - int stride, int h, int x, int y); + ptrdiff_t stride, int h, int x, int y); void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, - int stride, int h, int x, int y); + ptrdiff_t stride, int h, int x, int y); void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, - int stride, int h, int x, int y); + ptrdiff_t stride, int h, int x, int y); void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, - int stride, int h, int x, int y); + ptrdiff_t stride, int h, int x, int y); void ff_vc1_inv_trans_8x8_msa(int16_t block[64]); void ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block); diff --git a/libavcodec/mips/vc1dsp_mmi.c b/libavcodec/mips/vc1dsp_mmi.c index db314de..9837868 100644 --- a/libavcodec/mips/vc1dsp_mmi.c +++ b/libavcodec/mips/vc1dsp_mmi.c @@ -2241,7 +2241,7 @@ DECLARE_FUNCTION(3, 3) void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, - int stride, int h, int x, int y) + ptrdiff_t stride, int h, int x, int y) { const int A = (8 - x) * (8 - y); const int B = (x) * (8 - y); @@ -2296,7 +2296,7 @@ void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */, void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t 
*dst /* align 8 */, uint8_t *src /* align 1 */, - int stride, int h, int x, int y) + ptrdiff_t stride, int h, int x, int y) { const int A = (8 - x) * (8 - y); const int B = (x) * (8 - y); @@ -2349,7 +2349,7 @@ void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, - int stride, int h, int x, int y) + ptrdiff_t stride, int h, int x, int y) { const int A = (8 - x) * (8 - y); const int B = (x) * (8 - y); @@ -2407,7 +2407,7 @@ void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */, void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, - int stride, int h, int x, int y) + ptrdiff_t stride, int h, int x, int y) { const int A = (8 - x) * (8 - y); const int B = (x) * (8 - y); -- 2.1.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] avcodec/mips: msa optimizations for vc1dsp
WMV3 decoding performance has improved from 3.66x to 5.23x (tested on 3A4000). --- libavcodec/mips/Makefile| 1 + libavcodec/mips/vc1dsp_init_mips.c | 30 ++- libavcodec/mips/vc1dsp_mips.h | 23 ++ libavcodec/mips/vc1dsp_msa.c| 483 libavutil/mips/generic_macros_msa.h | 3 + 5 files changed, 536 insertions(+), 4 deletions(-) create mode 100644 libavcodec/mips/vc1dsp_msa.c diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index c5b54d5..b4993f6 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -89,3 +89,4 @@ MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o +MSA-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_msa.o diff --git a/libavcodec/mips/vc1dsp_init_mips.c b/libavcodec/mips/vc1dsp_init_mips.c index 4adc9e1..c0007ff 100644 --- a/libavcodec/mips/vc1dsp_init_mips.c +++ b/libavcodec/mips/vc1dsp_init_mips.c @@ -23,6 +23,10 @@ #include "vc1dsp_mips.h" #include "config.h" +#define FN_ASSIGN(OP, X, Y, INSN) \ +dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##INSN; \ +dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##_16##INSN + #if HAVE_MMI static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) { @@ -49,10 +53,6 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_mmi; dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_mmi; -#define FN_ASSIGN(OP, X, Y, INSN) \ -dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##INSN; \ -dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##_16##INSN - FN_ASSIGN(put_, 0, 0, _mmi); FN_ASSIGN(put_, 0, 1, _mmi); FN_ASSIGN(put_, 0, 2, _mmi); @@ -100,9 +100,31 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) } #endif /* HAVE_MMI */ +#if HAVE_MSA +static av_cold void vc1dsp_init_msa(VC1DSPContext *dsp) +{ 
+dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_msa; +dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_msa; +dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_msa; + +FN_ASSIGN(put_, 1, 1, _msa); +FN_ASSIGN(put_, 1, 2, _msa); +FN_ASSIGN(put_, 1, 3, _msa); +FN_ASSIGN(put_, 2, 1, _msa); +FN_ASSIGN(put_, 2, 2, _msa); +FN_ASSIGN(put_, 2, 3, _msa); +FN_ASSIGN(put_, 3, 1, _msa); +FN_ASSIGN(put_, 3, 2, _msa); +FN_ASSIGN(put_, 3, 3, _msa); +} +#endif /* HAVE_MSA */ + av_cold void ff_vc1dsp_init_mips(VC1DSPContext *dsp) { #if HAVE_MMI vc1dsp_init_mmi(dsp); #endif /* HAVE_MMI */ +#if HAVE_MSA +vc1dsp_init_msa(dsp); +#endif /* HAVE_MSA */ } diff --git a/libavcodec/mips/vc1dsp_mips.h b/libavcodec/mips/vc1dsp_mips.h index 0db85fa..5f72e60 100644 --- a/libavcodec/mips/vc1dsp_mips.h +++ b/libavcodec/mips/vc1dsp_mips.h @@ -191,4 +191,27 @@ void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, uint8_t *src /* align 1 */, int stride, int h, int x, int y); +void ff_vc1_inv_trans_8x8_msa(int16_t block[64]); +void ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block); +void ff_vc1_inv_trans_4x8_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block); + +#define FF_PUT_VC1_MSPEL_MC_MSA(hmode, vmode) \ +void ff_put_vc1_mspel_mc ## hmode ## vmode ## _msa(uint8_t *dst, \ + const uint8_t *src, \ + ptrdiff_t stride, int rnd); \ +void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa(uint8_t *dst, \ + const uint8_t *src, \ + ptrdiff_t stride, int rnd); + +FF_PUT_VC1_MSPEL_MC_MSA(1, 1); +FF_PUT_VC1_MSPEL_MC_MSA(1, 2); +FF_PUT_VC1_MSPEL_MC_MSA(1, 3); + +FF_PUT_VC1_MSPEL_MC_MSA(2, 1); +FF_PUT_VC1_MSPEL_MC_MSA(2, 2); +FF_PUT_VC1_MSPEL_MC_MSA(2, 3); + +FF_PUT_VC1_MSPEL_MC_MSA(3, 1); +FF_PUT_VC1_MSPEL_MC_MSA(3, 2); +FF_PUT_VC1_MSPEL_MC_MSA(3, 3); #endif /* AVCODEC_MIPS_VC1DSP_MIPS_H */ diff --git a/libavcodec/mips/vc1dsp_msa.c b/libavcodec/mips/vc1dsp_msa.c new file mode 100644 index 000..1619ea4 --- /dev/null +++ b/libavcodec/mips/vc1dsp_msa.c @@ -0,0 +1,483 @@ +/* + * 
Loongson SIMD optimized vc1dsp + * + * Copyright (c) 2019 Loongson Technology Corporation Limited + *gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Fo
[FFmpeg-devel] [PATCH] avcodec/mips: simplified code in vp3dsp_idct_msa.c.
Use the ADD8 macro to replace runs of consecutive addition operations. --- libavcodec/mips/vp3dsp_idct_msa.c | 80 - libavutil/mips/generic_macros_msa.h | 6 +++ 2 files changed, 22 insertions(+), 64 deletions(-) diff --git a/libavcodec/mips/vp3dsp_idct_msa.c b/libavcodec/mips/vp3dsp_idct_msa.c index 90c578f..e4cd377 100644 --- a/libavcodec/mips/vp3dsp_idct_msa.c +++ b/libavcodec/mips/vp3dsp_idct_msa.c @@ -178,14 +178,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) c0, c1, c2, c3); ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7, c4, c5, c6, c7); -A += c0; -B += c7; -C += c1; -D += c2; -E += c3; -F += c4; -G += c5; -H += c6; +ADD8(A, c0, B, c7, C, c1, D, c2, E, c3, F, c4, G, c5, H, c6, + A, B, C, D, E, F, G, H); } CLIP_SW8_0_255(A, B, C, D, E, F, G, H); sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r); @@ -208,14 +202,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) Gd = Bdd; Hd = Bdd; } else { -Ad = Add + c0; -Bd = Add + c1; -Cd = Add + c2; -Dd = Add + c3; -Ed = Add + c4; -Fd = Add + c5; -Gd = Add + c6; -Hd = Add + c7; +ADD8(Add, c0, Add, c1, Add, c2, Add, c3, Add, c4, Add, c5, Add, c6, + Add, c7, Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd); CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd); } Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); @@ -235,14 +223,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t); G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t); H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t); -r0_r = Ad + A; -r1_r = Bd + C; -r2_r = Cd + D; -r3_r = Dd + E; -r0_l = Ed + F; -r1_l = Fd + G; -r2_l = Gd + H; -r3_l = Hd + B; +ADD8(Ad, A, Bd, C, Cd, D, Dd, E, Ed, F, Fd, G, Gd, H, Hd, B, + r0_r, r1_r, r2_r, r3_r, r0_l, r1_l, r2_l, r3_l); /* Row 4 to 7 */ TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r, @@ -286,14 +268,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) c0, c1, c2, c3); ILVL_H4_SW(zero, f4, zero, f5, zero, f6, 
zero, f7, c4, c5, c6, c7); -A += c0; -B += c7; -C += c1; -D += c2; -E += c3; -F += c4; -G += c5; -H += c6; +ADD8(A, c0, B, c7, C, c1, D, c2, E, c3, F, c4, G, c5, H, c6, + A, B, C, D, E, F, G, H); } CLIP_SW8_0_255(A, B, C, D, E, F, G, H); sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r); @@ -316,14 +292,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) Gd = Bdd; Hd = Bdd; } else { -Ad = Add + c0; -Bd = Add + c1; -Cd = Add + c2; -Dd = Add + c3; -Ed = Add + c4; -Fd = Add + c5; -Gd = Add + c6; -Hd = Add + c7; +ADD8(Add, c0, Add, c1, Add, c2, Add, c3, Add, c4, Add, c5, Add, c6, + Add, c7, Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd); CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd); } Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t); @@ -343,14 +313,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t); G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t); H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t); -r4_r = Ad + A; -r5_r = Bd + C; -r6_r = Cd + D; -r7_r = Dd + E; -r4_l = Ed + F; -r5_l = Fd + G; -r6_l = Gd + H; -r7_l = Hd + B; +ADD8(Ad, A, Bd, C, Cd, D, Dd, E, Ed, F, Fd, G, Gd, H, Hd, B, + r4_r, r5_r, r6_r, r7_r, r4_l, r5_l, r6_l, r7_l); VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1); VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3); VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5); @@ -400,14 +364,8 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) e0, e1, e2, e3); ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7, e4, e5, e6, e7); -e0 += dc; -e1 += dc; -e2 += dc; -e3 += dc; -e4 += dc; -e5 += dc; -e6 += dc; -e7 += dc; +ADD8(e0, dc, e1, dc, e2, dc, e3, dc, e4, dc, e5, dc, e6, dc, e7, dc, + e0, e1, e2, e3, e4, e5, e6, e7); CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7); /* Left part */ @@ -415,14 +373,8 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block) r0, r1, r2, r3); ILVL_H4_SW(zero, c4, zero, c5, zero, 
c6, zero, c7, r4, r5, r6, r7); -r0 += dc; -r1 += dc; -r2 += dc; -r3 += dc; -r4 += dc; -r5 += dc; -r6
[FFmpeg-devel] [PATCH v4] avutil/mips: refine msa macros CLIP_*.
The changes are as follows: 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in the source vector. 2. Refine the implementation of the macros 'CLIP_SH_0_255' and 'CLIP_SW_0_255'. VP8 decoding performance has improved by about 1.1% (from 7.03x to 7.11x). H264 decoding performance has improved by about 0.5% (from 4.35x to 4.37x). Theora decoding performance has improved by about 0.7% (from 5.79x to 5.83x). 3. Remove the redundant macros 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255' instead, because there is no difference in the effect of these two macros. --- libavcodec/mips/h264dsp_msa.c | 39 +-- libavcodec/mips/h264idct_msa.c | 7 +- libavcodec/mips/hevc_idct_msa.c | 21 +++--- libavcodec/mips/hevc_lpf_sao_msa.c | 132 ++-- libavcodec/mips/hevc_mc_bi_msa.c| 44 ++-- libavcodec/mips/hevc_mc_biw_msa.c | 56 +++ libavcodec/mips/hevc_mc_uniw_msa.c | 40 +-- libavcodec/mips/hevcpred_msa.c | 8 +-- libavcodec/mips/idctdsp_msa.c | 9 +-- libavcodec/mips/qpeldsp_msa.c | 4 +- libavcodec/mips/simple_idct_msa.c | 98 +++--- libavcodec/mips/vp3dsp_idct_msa.c | 68 +++ libavcodec/mips/vp8_idct_msa.c | 5 +- libavcodec/mips/vp9_idct_msa.c | 10 ++- libavutil/mips/generic_macros_msa.h | 119 +--- 15 files changed, 280 insertions(+), 380 deletions(-) diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c index c4ba8c4..dd05982 100644 --- a/libavcodec/mips/h264dsp_msa.c +++ b/libavcodec/mips/h264dsp_msa.c @@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); -CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); -CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7); +CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3); ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); @@ -475,8 +474,7 @@ static void 
avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, SRA_4V(temp0, temp1, temp2, temp3, denom); SRA_4V(temp4, temp5, temp6, temp7, denom); -CLIP_SH4_0_255(temp0, temp1, temp2, temp3); -CLIP_SH4_0_255(temp4, temp5, temp6, temp7); +CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, dst0, dst1, dst2, dst3); ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); @@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, temp = p1_or_q1_org_in << 1; \ clip3 = clip3 - temp; \ clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);\ -clip3 = CLIP_SH(clip3, negate_tc_in, tc_in); \ +CLIP_SH(clip3, negate_tc_in, tc_in); \ p1_or_q1_out = p1_or_q1_org_in + clip3; \ } @@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, delta = q0_sub_p0 + p1_sub_q1; \ delta >>= 3;\ \ -delta = CLIP_SH(delta, negate_threshold_in, threshold_in); \ +CLIP_SH(delta, negate_threshold_in, threshold_in); \ \ p0_or_q0_out = p0_or_q0_org_in + delta; \ q0_or_p0_out = q0_or_p0_org_in - delta; \ @@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, delta = q0_sub_p0 + p1_sub_q1; \ delta = __msa_srari_h(delta, 3); \ \ -delta = CLIP_SH(delta, -tc, tc); \ +CLIP_SH(delta, -tc, tc); \ \ ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);\ \ @@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, q0_sub_p0 <<= 2; \ delta = q0_sub_p0 + p1_sub_q1; \ delta = __msa_srari_h(delta, 3); \ -
[FFmpeg-devel] [PATCH v3] avutil/mips: refine msa macros CLIP_*.
The changes are as follows: 1. Remove the local variable 'out_m' in 'CLIP_SH' and store the result in the source vector. 2. Refine the implementation of the macros 'CLIP_SH_0_255' and 'CLIP_SW_0_255'. VP8 decoding performance has improved by about 1.1% (from 7.03x to 7.11x). 3. Remove the redundant macros 'CLIP_SH/Wn_0_255_MAX_SATU' and use 'CLIP_SH/Wn_0_255' instead, because there is no difference in the effect of these two macros. --- libavcodec/mips/h264dsp_msa.c | 39 +-- libavcodec/mips/h264idct_msa.c | 7 +- libavcodec/mips/hevc_idct_msa.c | 21 +++--- libavcodec/mips/hevc_lpf_sao_msa.c | 132 ++-- libavcodec/mips/hevc_mc_bi_msa.c| 44 ++-- libavcodec/mips/hevc_mc_biw_msa.c | 56 +++ libavcodec/mips/hevc_mc_uniw_msa.c | 40 +-- libavcodec/mips/hevcpred_msa.c | 8 +-- libavcodec/mips/idctdsp_msa.c | 9 +-- libavcodec/mips/qpeldsp_msa.c | 4 +- libavcodec/mips/simple_idct_msa.c | 98 +++--- libavcodec/mips/vp3dsp_idct_msa.c | 68 +++ libavcodec/mips/vp8_idct_msa.c | 5 +- libavcodec/mips/vp9_idct_msa.c | 10 ++- libavutil/mips/generic_macros_msa.h | 119 +--- 15 files changed, 280 insertions(+), 380 deletions(-) diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c index c4ba8c4..dd05982 100644 --- a/libavcodec/mips/h264dsp_msa.c +++ b/libavcodec/mips/h264dsp_msa.c @@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); -CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); -CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7); +CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3); ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); @@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, SRA_4V(temp0, temp1, temp2, temp3, denom); SRA_4V(temp4, temp5, temp6, temp7, denom); 
-CLIP_SH4_0_255(temp0, temp1, temp2, temp3); -CLIP_SH4_0_255(temp4, temp5, temp6, temp7); +CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, dst0, dst1, dst2, dst3); ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); @@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, temp = p1_or_q1_org_in << 1; \ clip3 = clip3 - temp; \ clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);\ -clip3 = CLIP_SH(clip3, negate_tc_in, tc_in); \ +CLIP_SH(clip3, negate_tc_in, tc_in); \ p1_or_q1_out = p1_or_q1_org_in + clip3; \ } @@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, delta = q0_sub_p0 + p1_sub_q1; \ delta >>= 3;\ \ -delta = CLIP_SH(delta, negate_threshold_in, threshold_in); \ +CLIP_SH(delta, negate_threshold_in, threshold_in); \ \ p0_or_q0_out = p0_or_q0_org_in + delta; \ q0_or_p0_out = q0_or_p0_org_in - delta; \ @@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, delta = q0_sub_p0 + p1_sub_q1; \ delta = __msa_srari_h(delta, 3); \ \ -delta = CLIP_SH(delta, -tc, tc); \ +CLIP_SH(delta, -tc, tc); \ \ ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);\ \ @@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, q0_sub_p0 <<= 2; \ delta = q0_sub_p0 + p1_sub_q1; \ delta = __msa_srari_h(delta, 3); \ -delta = CLIP_SH(delta, -tc, tc); \ +CLIP_SH(delta, -tc, tc); \
[FFmpeg-devel] [PATCH v2] avutil/mips: refine msa macros CLIP_*.
The changes are as follows: 1. Remove the local variable out_m in CLIP_SH. Results are assigned to the input vector, which reduces data copying. 2. Reimplement the macros CLIP_SH/Wn_0_255. VP8 decoding performance has improved by 1.1% (7.03x to 7.11x, tested on Loongson 3A4000). 3. Remove CLIP_SH/Wn_0_255_MAX_SATU. CLIP_SH/Wn_0_255_MAX_SATU and CLIP_SH/Wn_0_255 have the same function, so it is not necessary to keep both; use CLIP_SH/Wn_0_255 instead. --- libavcodec/mips/h264dsp_msa.c | 39 +-- libavcodec/mips/h264idct_msa.c | 7 +- libavcodec/mips/hevc_idct_msa.c | 21 +++--- libavcodec/mips/hevc_lpf_sao_msa.c | 132 ++-- libavcodec/mips/hevc_mc_bi_msa.c| 44 ++-- libavcodec/mips/hevc_mc_biw_msa.c | 56 +++ libavcodec/mips/hevc_mc_uniw_msa.c | 40 +-- libavcodec/mips/hevcpred_msa.c | 8 +-- libavcodec/mips/idctdsp_msa.c | 9 +-- libavcodec/mips/qpeldsp_msa.c | 4 +- libavcodec/mips/simple_idct_msa.c | 98 +++--- libavcodec/mips/vp3dsp_idct_msa.c | 68 +++ libavcodec/mips/vp8_idct_msa.c | 5 +- libavcodec/mips/vp9_idct_msa.c | 10 ++- libavutil/mips/generic_macros_msa.h | 119 +--- 15 files changed, 280 insertions(+), 380 deletions(-) diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c index c4ba8c4..dd05982 100644 --- a/libavcodec/mips/h264dsp_msa.c +++ b/libavcodec/mips/h264dsp_msa.c @@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); -CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); -CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7); +CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3); ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); @@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, SRA_4V(temp0, temp1, temp2, temp3, denom); SRA_4V(temp4, 
temp5, temp6, temp7, denom); -CLIP_SH4_0_255(temp0, temp1, temp2, temp3); -CLIP_SH4_0_255(temp4, temp5, temp6, temp7); +CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, dst0, dst1, dst2, dst3); ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); @@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, temp = p1_or_q1_org_in << 1; \ clip3 = clip3 - temp; \ clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);\ -clip3 = CLIP_SH(clip3, negate_tc_in, tc_in); \ +CLIP_SH(clip3, negate_tc_in, tc_in); \ p1_or_q1_out = p1_or_q1_org_in + clip3; \ } @@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, delta = q0_sub_p0 + p1_sub_q1; \ delta >>= 3;\ \ -delta = CLIP_SH(delta, negate_threshold_in, threshold_in); \ +CLIP_SH(delta, negate_threshold_in, threshold_in); \ \ p0_or_q0_out = p0_or_q0_org_in + delta; \ q0_or_p0_out = q0_or_p0_org_in - delta; \ @@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, delta = q0_sub_p0 + p1_sub_q1; \ delta = __msa_srari_h(delta, 3); \ \ -delta = CLIP_SH(delta, -tc, tc); \ +CLIP_SH(delta, -tc, tc); \ \ ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);\ \ @@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, q0_sub_p0 <<= 2; \ delta = q0_sub_p0 + p1_sub_q1; \ delta = __msa_srari_h(delta, 3); \ -delta = CLIP_SH(delta, -tc, tc); \ +CLIP_SH(delta, -tc, tc);
[FFmpeg-devel] [PATCH] avutil/mips: refine msa macros CLIP_*.
The changes are as follows: 1. Refine CLIP_SH so that results are stored in place in the input vectors. 2. Reimplement the macros CLIP_SH/Wn_0_255. The new macros are more efficient than before. 3. Remove CLIP_SH/Wn_0_255_MAX_SATU. CLIP_SH/Wn_0_255_MAX_SATU and CLIP_SH/Wn_0_255 have the same function, so it is not necessary to keep both; use CLIP_SH/Wn_0_255 instead. --- libavcodec/mips/h264dsp_msa.c | 39 +-- libavcodec/mips/h264idct_msa.c | 7 +- libavcodec/mips/hevc_idct_msa.c | 21 +++--- libavcodec/mips/hevc_lpf_sao_msa.c | 132 ++-- libavcodec/mips/hevc_mc_bi_msa.c| 44 ++-- libavcodec/mips/hevc_mc_biw_msa.c | 56 +++ libavcodec/mips/hevc_mc_uniw_msa.c | 40 +-- libavcodec/mips/hevcpred_msa.c | 8 +-- libavcodec/mips/idctdsp_msa.c | 9 +-- libavcodec/mips/qpeldsp_msa.c | 4 +- libavcodec/mips/simple_idct_msa.c | 98 +++--- libavcodec/mips/vp3dsp_idct_msa.c | 68 +++ libavcodec/mips/vp8_idct_msa.c | 5 +- libavcodec/mips/vp9_idct_msa.c | 10 ++- libavutil/mips/generic_macros_msa.h | 119 +--- 15 files changed, 280 insertions(+), 380 deletions(-) diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c index c4ba8c4..dd05982 100644 --- a/libavcodec/mips/h264dsp_msa.c +++ b/libavcodec/mips/h264dsp_msa.c @@ -413,8 +413,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride, tmp7 = __msa_dpadd_s_h(offset, wgt, vec7); SRA_4V(tmp0, tmp1, tmp2, tmp3, denom); SRA_4V(tmp4, tmp5, tmp6, tmp7, denom); -CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3); -CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7); +CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1); PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3); ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); @@ -475,8 +474,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, SRA_4V(temp0, temp1, temp2, temp3, denom); SRA_4V(temp4, temp5, temp6, temp7, denom); -CLIP_SH4_0_255(temp0, temp1, temp2, temp3); -CLIP_SH4_0_255(temp4, temp5, temp6, 
temp7); +CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7); PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, dst0, dst1, dst2, dst3); ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride); @@ -531,7 +529,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, temp = p1_or_q1_org_in << 1; \ clip3 = clip3 - temp; \ clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);\ -clip3 = CLIP_SH(clip3, negate_tc_in, tc_in); \ +CLIP_SH(clip3, negate_tc_in, tc_in); \ p1_or_q1_out = p1_or_q1_org_in + clip3; \ } @@ -549,7 +547,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, delta = q0_sub_p0 + p1_sub_q1; \ delta >>= 3;\ \ -delta = CLIP_SH(delta, negate_threshold_in, threshold_in); \ +CLIP_SH(delta, negate_threshold_in, threshold_in); \ \ p0_or_q0_out = p0_or_q0_org_in + delta; \ q0_or_p0_out = q0_or_p0_org_in - delta; \ @@ -598,7 +596,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, delta = q0_sub_p0 + p1_sub_q1; \ delta = __msa_srari_h(delta, 3); \ \ -delta = CLIP_SH(delta, -tc, tc); \ +CLIP_SH(delta, -tc, tc); \ \ ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);\ \ @@ -662,7 +660,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, q0_sub_p0 <<= 2; \ delta = q0_sub_p0 + p1_sub_q1; \ delta = __msa_srari_h(delta, 3); \ -delta = CLIP_SH(delta, -tc, tc); \ +CLIP_SH(delta, -tc, tc); \ \
[FFmpeg-devel] [PATCH v2] avutil/mips: refactor msa SLDI_Bn_0 and SLDI_Bn macros.
The changes are as follows: 1. The previous order of the parameters is irregular and difficult to understand. Adjust the order of the parameters according to the rule: (RTYPE, input registers, input mask/input index/..., output registers). Most of the existing MSA macros follow this rule. 2. Remove the redundant macro SLDI_Bn_0 and use SLDI_Bn instead. --- libavcodec/mips/h264dsp_msa.c | 9 ++-- libavcodec/mips/h264qpel_msa.c | 64 ++-- libavcodec/mips/hevc_lpf_sao_msa.c | 70 --- libavcodec/mips/hevcpred_msa.c | 30 ++--- libavcodec/mips/hpeldsp_msa.c | 66 ++--- libavcodec/mips/me_cmp_msa.c| 8 ++-- libavcodec/mips/qpeldsp_msa.c | 84 ++--- libavcodec/mips/vp8_mc_msa.c| 4 +- libavcodec/mips/vp9_idct_msa.c | 3 +- libavcodec/mips/vp9_lpf_msa.c | 3 +- libavcodec/mips/vp9_mc_msa.c| 16 +++ libavutil/mips/generic_macros_msa.h | 80 ++- 12 files changed, 222 insertions(+), 215 deletions(-) diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c index 89fe399..c4ba8c4 100644 --- a/libavcodec/mips/h264dsp_msa.c +++ b/libavcodec/mips/h264dsp_msa.c @@ -620,7 +620,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, \ out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \ out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);\ -SLDI_B2_0_UB(out1, out2, out2, out3, 2); \ +SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \ } #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \ @@ -1025,7 +1025,8 @@ static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride, ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3); ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5); -SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8); +SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5, + 8, src0, src2, src4, src7); p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3); p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2); @@ -1116,10 +1117,10 @@ static void 
avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride, ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3); ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4); -SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8); +SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5); dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0); dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1); -SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8); +SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y); out0 = __msa_copy_u_w((v4i32) dst0, 0); out1 = __msa_copy_u_h((v8i16) dst0, 2); diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c index df7e3e2..e435c18 100644 --- a/libavcodec/mips/h264qpel_msa.c +++ b/libavcodec/mips/h264qpel_msa.c @@ -790,8 +790,8 @@ void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, minus5b, res4, res5, res6, res7); DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, plus20b, res4, res5, res6, res7); -SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2); -SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2); +SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2, + src0, src2, src4, src6); SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res4, res5, res6, res7, 5); SAT_SH4_SH(res0, res1, res2, res3, 7); @@ -858,8 +858,8 @@ void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, minus5b, res4, res5, res6, res7); DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, plus20b, res4, res5, res6, res7); -SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3); -SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3); +SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3, + src0, src2, src4, src6); SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res4, res5, res6, res7, 5); SAT_SH4_SH(res0, res1, res2, res3, 7); @@ -911,10 +911,10 @@ void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); DPADD_SB4_SH(vec8, vec9, vec10, 
vec11, plus20b, plus20b, plus20b, plus20b, res4, res5, res6, res7); -SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2); -SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2); -SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2); -SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2); +SLDI_B4_SB(src0,
[FFmpeg-devel] [PATCH] avutil/mips: refactor msa SLDI_Bn_0 and SLDI_Bn macros.
Details of the changes are as follows: 1. Modify the parameter order of SLDI_Bn. The previous order of parameters was difficult to understand. 2. Remove the redundant macro SLDI_Bn_0 and use SLDI_Bn instead. --- libavcodec/mips/h264dsp_msa.c | 9 ++-- libavcodec/mips/h264qpel_msa.c | 64 ++-- libavcodec/mips/hevc_lpf_sao_msa.c | 70 --- libavcodec/mips/hevcpred_msa.c | 30 ++--- libavcodec/mips/hpeldsp_msa.c | 66 ++--- libavcodec/mips/me_cmp_msa.c| 8 ++-- libavcodec/mips/qpeldsp_msa.c | 84 ++--- libavcodec/mips/vp8_mc_msa.c| 4 +- libavcodec/mips/vp9_idct_msa.c | 3 +- libavcodec/mips/vp9_lpf_msa.c | 3 +- libavcodec/mips/vp9_mc_msa.c| 16 +++ libavutil/mips/generic_macros_msa.h | 80 ++- 12 files changed, 222 insertions(+), 215 deletions(-) diff --git a/libavcodec/mips/h264dsp_msa.c b/libavcodec/mips/h264dsp_msa.c index 89fe399..c4ba8c4 100644 --- a/libavcodec/mips/h264dsp_msa.c +++ b/libavcodec/mips/h264dsp_msa.c @@ -620,7 +620,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, \ out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \ out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);\ -SLDI_B2_0_UB(out1, out2, out2, out3, 2); \ +SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \ } #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \ @@ -1025,7 +1025,8 @@ static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride, ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3); ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5); -SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8); +SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5, + 8, src0, src2, src4, src7); p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3); p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2); @@ -1116,10 +1117,10 @@ static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride, ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3); ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4); -SLDI_B2_0_UB(dst0, dst4, 
dst1, dst5, 8); +SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5); dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0); dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1); -SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8); +SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y); out0 = __msa_copy_u_w((v4i32) dst0, 0); out1 = __msa_copy_u_h((v8i16) dst0, 2); diff --git a/libavcodec/mips/h264qpel_msa.c b/libavcodec/mips/h264qpel_msa.c index df7e3e2..e435c18 100644 --- a/libavcodec/mips/h264qpel_msa.c +++ b/libavcodec/mips/h264qpel_msa.c @@ -790,8 +790,8 @@ void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, minus5b, res4, res5, res6, res7); DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, plus20b, res4, res5, res6, res7); -SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2); -SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2); +SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2, + src0, src2, src4, src6); SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res4, res5, res6, res7, 5); SAT_SH4_SH(res0, res1, res2, res3, 7); @@ -858,8 +858,8 @@ void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, minus5b, res4, res5, res6, res7); DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, plus20b, res4, res5, res6, res7); -SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3); -SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3); +SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3, + src0, src2, src4, src6); SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res4, res5, res6, res7, 5); SAT_SH4_SH(res0, res1, res2, res3, 7); @@ -911,10 +911,10 @@ void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, res4, res5, res6, res7); -SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2); -SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2); 
-SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2); -SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2); +SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2, + src0, src1, src2, src3); +SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2, +
Re: [FFmpeg-devel] [PATCH v2] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions
> 在 2019年2月24日,上午10:55,Shiyou Yin 写道: > > > >> -Original Message- >> From: ffmpeg-devel-boun...@ffmpeg.org >> <mailto:ffmpeg-devel-boun...@ffmpeg.org> >> [mailto:ffmpeg-devel-boun...@ffmpeg.org >> <mailto:ffmpeg-devel-boun...@ffmpeg.org>] On Behalf Of gxw >> Sent: Thursday, February 21, 2019 8:39 PM >> To: ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org> >> Cc: gxw >> Subject: [FFmpeg-devel] [PATCH v2] avcodec/mips: [loongson] mmi >> optimizations for VP9 put and avg >> functions >> >> VP9 decoding speed improved about 60.5%(from 38fps to 61fps, tested on >> loongson 3A3000). >> --- >> libavcodec/mips/Makefile | 1 + >> libavcodec/mips/vp9_mc_mmi.c | 692 >> + >> libavcodec/mips/vp9dsp_init_mips.c | 42 +++ >> libavcodec/mips/vp9dsp_mips.h | 50 +++ >> libavutil/mips/mmiutils.h | 15 + >> 5 files changed, 800 insertions(+) >> create mode 100644 libavcodec/mips/vp9_mc_mmi.c >> >> diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile >> index c827649..c5b54d5 100644 >> --- a/libavcodec/mips/Makefile >> +++ b/libavcodec/mips/Makefile >> @@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= >> mips/vc1dsp_mmi.o >> MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o >> MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o >> MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o >> +MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o >> diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c >> new file mode 100644 >> index 000..58a920b >> --- /dev/null >> +++ b/libavcodec/mips/vp9_mc_mmi.c >> @@ -0,0 +1,692 @@ >> +/* >> + * Copyright (c) 2019 gxw >> + * >> + * This file is part of FFmpeg. >> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. 
>> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * Lesser General Public License for more details. >> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >> USA >> + */ >> + >> +#include "libavcodec/vp9dsp.h" >> +#include "libavutil/mips/mmiutils.h" >> +#include "vp9dsp_mips.h" >> + >> +#define GET_DATA_H_MMI \ >> +"pmaddhw%[ftmp4],%[ftmp4], %[filter1]\n\t" \ >> +"pmaddhw%[ftmp5],%[ftmp5], %[filter2]\n\t" \ >> +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ >> +"punpckhwd %[ftmp5],%[ftmp4], %[ftmp0] \n\t" \ >> +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ >> +"pmaddhw%[ftmp6],%[ftmp6], %[filter1]\n\t" \ >> +"pmaddhw%[ftmp7],%[ftmp7], %[filter2]\n\t" \ >> +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ >> +"punpckhwd %[ftmp7],%[ftmp6], %[ftmp0] \n\t" \ >> +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ >> +"punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ >> +"pmaddhw%[ftmp8],%[ftmp8], %[filter1]\n\t" \ >> +"pmaddhw%[ftmp9],%[ftmp9], %[filter2]\n\t" \ >> +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ >> +"punpckhwd %[ftmp9],%[ftmp8], %[ftmp0] \n\t" \ >> +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ >> +"pmaddhw%[ftmp10], %[ftmp10], %[filter1]\n\t" \ >> +"pmaddhw%[ftmp11], %[ftmp11], %[filter2]\n\t" \ >> +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ >> +"punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ >> +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ >> +"punpcklwd
[FFmpeg-devel] [PATCH v3] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions
VP9 decoding speed improved about 60.5%(from 38fps to 61fps, tested on loongson 3A3000). --- libavcodec/mips/Makefile | 1 + libavcodec/mips/vp9_mc_mmi.c | 628 + libavcodec/mips/vp9dsp_init_mips.c | 42 +++ libavcodec/mips/vp9dsp_mips.h | 50 +++ libavutil/mips/mmiutils.h | 15 + 5 files changed, 736 insertions(+) create mode 100644 libavcodec/mips/vp9_mc_mmi.c diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index c827649..c5b54d5 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_mmi.o MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o +MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c new file mode 100644 index 000..e7a8387 --- /dev/null +++ b/libavcodec/mips/vp9_mc_mmi.c @@ -0,0 +1,628 @@ +/* + * Copyright (c) 2019 gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp9dsp.h" +#include "libavutil/mips/mmiutils.h" +#include "vp9dsp_mips.h" + +#define GET_DATA_H_MMI \ +"pmaddhw%[ftmp4],%[ftmp4], %[filter1]\n\t" \ +"pmaddhw%[ftmp5],%[ftmp5], %[filter2]\n\t" \ +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ +"punpckhwd %[ftmp5],%[ftmp4], %[ftmp0] \n\t" \ +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[ftmp6],%[ftmp6], %[filter1]\n\t" \ +"pmaddhw%[ftmp7],%[ftmp7], %[filter2]\n\t" \ +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ +"punpckhwd %[ftmp7],%[ftmp6], %[ftmp0] \n\t" \ +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ +"punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ +"pmaddhw%[ftmp8],%[ftmp8], %[filter1]\n\t" \ +"pmaddhw%[ftmp9],%[ftmp9], %[filter2]\n\t" \ +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ +"punpckhwd %[ftmp9],%[ftmp8], %[ftmp0] \n\t" \ +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ +"pmaddhw%[ftmp10], %[ftmp10], %[filter1]\n\t" \ +"pmaddhw%[ftmp11], %[ftmp11], %[filter2]\n\t" \ +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ +"punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ +"punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" + +#define GET_DATA_V_MMI \ +"punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[srcl], %[srcl],%[filter10] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter32] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter54] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter76] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[srch], 
%[srch],%[filter10] \n\t" \ +"punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter32] \n\t" \ +"paddw %[srch], %[srch],%[ftmp12]
[FFmpeg-devel] [PATCH v2] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions
VP9 decoding speed improved about 60.5%(from 38fps to 61fps, tested on loongson 3A3000). --- libavcodec/mips/Makefile | 1 + libavcodec/mips/vp9_mc_mmi.c | 692 + libavcodec/mips/vp9dsp_init_mips.c | 42 +++ libavcodec/mips/vp9dsp_mips.h | 50 +++ libavutil/mips/mmiutils.h | 15 + 5 files changed, 800 insertions(+) create mode 100644 libavcodec/mips/vp9_mc_mmi.c diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index c827649..c5b54d5 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_mmi.o MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o +MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c new file mode 100644 index 000..58a920b --- /dev/null +++ b/libavcodec/mips/vp9_mc_mmi.c @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2019 gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp9dsp.h" +#include "libavutil/mips/mmiutils.h" +#include "vp9dsp_mips.h" + +#define GET_DATA_H_MMI \ +"pmaddhw%[ftmp4],%[ftmp4], %[filter1]\n\t" \ +"pmaddhw%[ftmp5],%[ftmp5], %[filter2]\n\t" \ +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ +"punpckhwd %[ftmp5],%[ftmp4], %[ftmp0] \n\t" \ +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[ftmp6],%[ftmp6], %[filter1]\n\t" \ +"pmaddhw%[ftmp7],%[ftmp7], %[filter2]\n\t" \ +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ +"punpckhwd %[ftmp7],%[ftmp6], %[ftmp0] \n\t" \ +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ +"punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ +"pmaddhw%[ftmp8],%[ftmp8], %[filter1]\n\t" \ +"pmaddhw%[ftmp9],%[ftmp9], %[filter2]\n\t" \ +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ +"punpckhwd %[ftmp9],%[ftmp8], %[ftmp0] \n\t" \ +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ +"pmaddhw%[ftmp10], %[ftmp10], %[filter1]\n\t" \ +"pmaddhw%[ftmp11], %[ftmp11], %[filter2]\n\t" \ +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ +"punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ +"punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" + +#define GET_DATA_V_MMI \ +"punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[srcl], %[srcl],%[filter10] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter32] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter54] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter76] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[srch], 
%[srch],%[filter10] \n\t" \ +"punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter32] \n\t" \ +"paddw %[srch], %[srch],%[ftmp12]
Re: [FFmpeg-devel] [PATCH] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions
> 在 2019年2月21日,上午9:55,Shiyou Yin 写道: > >> -Original Message- >> From: ffmpeg-devel-boun...@ffmpeg.org >> <mailto:ffmpeg-devel-boun...@ffmpeg.org> >> [mailto:ffmpeg-devel-boun...@ffmpeg.org >> <mailto:ffmpeg-devel-boun...@ffmpeg.org>] On Behalf Of gxw >> Sent: Tuesday, February 19, 2019 11:02 AM >> To: ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org> >> Cc: gxw >> Subject: [FFmpeg-devel] [PATCH] avcodec/mips: [loongson] mmi optimizations >> for VP9 put and avg >> functions >> >> VP9 decoding speed improved about 109.3%(from 32fps to 67fps, tested on >> loongson 3A3000). >> --- >> libavcodec/mips/Makefile | 1 + >> libavcodec/mips/vp9_mc_mmi.c | 680 >> + >> libavcodec/mips/vp9dsp_init_mips.c | 42 +++ >> libavcodec/mips/vp9dsp_mips.h | 50 +++ >> 4 files changed, 773 insertions(+) >> create mode 100644 libavcodec/mips/vp9_mc_mmi.c >> >> diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile >> index c827649..c5b54d5 100644 >> --- a/libavcodec/mips/Makefile >> +++ b/libavcodec/mips/Makefile >> @@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= >> mips/vc1dsp_mmi.o >> MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o >> MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o >> MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o >> +MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o >> diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c >> new file mode 100644 >> index 000..145bbff >> --- /dev/null >> +++ b/libavcodec/mips/vp9_mc_mmi.c >> @@ -0,0 +1,680 @@ >> +/* >> + * Copyright (c) 2019 gxw >> + * >> + * This file is part of FFmpeg. >> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. 
>> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * Lesser General Public License for more details. >> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >> USA >> + */ >> + >> +#include "libavcodec/vp9dsp.h" >> +#include "libavutil/mips/mmiutils.h" >> +#include "vp9dsp_mips.h" >> + >> +#define GET_DATA_H_MMI \ >> +"pmaddhw%[ftmp4],%[ftmp4], %[filter1]\n\t" \ >> +"pmaddhw%[ftmp5],%[ftmp5], %[filter2]\n\t" \ >> +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ >> +"punpckhwd %[ftmp5],%[ftmp4], %[ftmp0] \n\t" \ >> +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ >> +"pmaddhw%[ftmp6],%[ftmp6], %[filter1]\n\t" \ >> +"pmaddhw%[ftmp7],%[ftmp7], %[filter2]\n\t" \ >> +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ >> +"punpckhwd %[ftmp7],%[ftmp6], %[ftmp0] \n\t" \ >> +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ >> +"punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ >> +"pmaddhw%[ftmp8],%[ftmp8], %[filter1]\n\t" \ >> +"pmaddhw%[ftmp9],%[ftmp9], %[filter2]\n\t" \ >> +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ >> +"punpckhwd %[ftmp9],%[ftmp8], %[ftmp0] \n\t" \ >> +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ >> +"pmaddhw%[ftmp10], %[ftmp10], %[filter1]\n\t" \ >> +"pmaddhw%[ftmp11], %[ftmp11], %[filter2]\n\t" \ >> +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ >> +"punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ >> +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ >> +"punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" >>
[FFmpeg-devel] [PATCH] avcodec/mips: [loongson] mmi optimizations for VP9 put and avg functions
VP9 decoding speed improved about 109.3%(from 32fps to 67fps, tested on loongson 3A3000). --- libavcodec/mips/Makefile | 1 + libavcodec/mips/vp9_mc_mmi.c | 680 + libavcodec/mips/vp9dsp_init_mips.c | 42 +++ libavcodec/mips/vp9dsp_mips.h | 50 +++ 4 files changed, 773 insertions(+) create mode 100644 libavcodec/mips/vp9_mc_mmi.c diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index c827649..c5b54d5 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -88,3 +88,4 @@ MMI-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_mmi.o MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o +MMI-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_mmi.o diff --git a/libavcodec/mips/vp9_mc_mmi.c b/libavcodec/mips/vp9_mc_mmi.c new file mode 100644 index 000..145bbff --- /dev/null +++ b/libavcodec/mips/vp9_mc_mmi.c @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2019 gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp9dsp.h" +#include "libavutil/mips/mmiutils.h" +#include "vp9dsp_mips.h" + +#define GET_DATA_H_MMI \ +"pmaddhw%[ftmp4],%[ftmp4], %[filter1]\n\t" \ +"pmaddhw%[ftmp5],%[ftmp5], %[filter2]\n\t" \ +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ +"punpckhwd %[ftmp5],%[ftmp4], %[ftmp0] \n\t" \ +"paddw %[ftmp4],%[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[ftmp6],%[ftmp6], %[filter1]\n\t" \ +"pmaddhw%[ftmp7],%[ftmp7], %[filter2]\n\t" \ +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ +"punpckhwd %[ftmp7],%[ftmp6], %[ftmp0] \n\t" \ +"paddw %[ftmp6],%[ftmp6], %[ftmp7] \n\t" \ +"punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ +"pmaddhw%[ftmp8],%[ftmp8], %[filter1]\n\t" \ +"pmaddhw%[ftmp9],%[ftmp9], %[filter2]\n\t" \ +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ +"punpckhwd %[ftmp9],%[ftmp8], %[ftmp0] \n\t" \ +"paddw %[ftmp8],%[ftmp8], %[ftmp9] \n\t" \ +"pmaddhw%[ftmp10], %[ftmp10], %[filter1]\n\t" \ +"pmaddhw%[ftmp11], %[ftmp11], %[filter2]\n\t" \ +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ +"punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ +"paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ +"punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" + +#define GET_DATA_V_MMI \ +"punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[srcl], %[srcl],%[filter10] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter32] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter54] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter76] \n\t" \ +"paddw %[srcl], %[srcl],%[ftmp12] \n\t" \ +"punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \ +"pmaddhw%[srch], 
%[srch],%[filter10] \n\t" \ +"punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ +"pmaddhw%[ftmp12], %[ftmp12], %[filter32] \n\t" \ +"paddw %[srch], %[srch],%[ftmp12] \n\t" \ +
[FFmpeg-devel] [PATCH] avcodec/mips: [loongson] optimize theora decoding with mmi.
Optimize theora decoding with mmi in functions: 1. ff_vp3_idct_add_mmi 2. ff_vp3_idct_put_mmi 3. ff_vp3_idct_dc_add_mmi 4. ff_put_no_rnd_pixels_l2_mmi Theora decoding speed improved about 32%(from 88fps to 116fps, Tested on loongson 3A3000). --- libavcodec/mips/Makefile | 1 + libavcodec/mips/vp3dsp_idct_mmi.c | 769 + libavcodec/mips/vp3dsp_init_mips.c | 14 + libavcodec/mips/vp3dsp_mips.h | 6 + 4 files changed, 790 insertions(+) create mode 100644 libavcodec/mips/vp3dsp_idct_mmi.c diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index 3029872..c827649 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -87,3 +87,4 @@ MMI-OBJS-$(CONFIG_HPELDSP)+= mips/hpeldsp_mmi.o MMI-OBJS-$(CONFIG_VC1_DECODER)+= mips/vc1dsp_mmi.o MMI-OBJS-$(CONFIG_WMV2DSP)+= mips/wmv2dsp_mmi.o MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o +MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o diff --git a/libavcodec/mips/vp3dsp_idct_mmi.c b/libavcodec/mips/vp3dsp_idct_mmi.c new file mode 100644 index 000..c5c4cf3 --- /dev/null +++ b/libavcodec/mips/vp3dsp_idct_mmi.c @@ -0,0 +1,769 @@ +/* + * Copyright (c) 2018 gxw + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vp3dsp_mips.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mips/mmiutils.h" +#include "libavutil/common.h" +#include "libavcodec/rnd_avg.h" + +#define LOAD_CONST(dst, value)\ +"li %[tmp1], "#value" \n\t" \ +"dmtc1 %[tmp1], "#dst"\n\t" \ +"pshufh "#dst", "#dst", %[ftmp10] \n\t" + +static void idct_row_mmi(int16_t *input) +{ +double ftmp[23]; +uint64_t tmp[2]; +__asm__ volatile ( +"xor%[ftmp10], %[ftmp10],%[ftmp10] \n\t" +LOAD_CONST(%[csth_1], 1) +"li %[tmp0],0x02\n\t" +"1: \n\t" +/* Load input */ +"ldc1 %[ftmp0], 0x00(%[input]) \n\t" +"ldc1 %[ftmp1], 0x10(%[input]) \n\t" +"ldc1 %[ftmp2], 0x20(%[input]) \n\t" +"ldc1 %[ftmp3], 0x30(%[input]) \n\t" +"ldc1 %[ftmp4], 0x40(%[input]) \n\t" +"ldc1 %[ftmp5], 0x50(%[input]) \n\t" +"ldc1 %[ftmp6], 0x60(%[input]) \n\t" +"ldc1 %[ftmp7], 0x70(%[input]) \n\t" +LOAD_CONST(%[ftmp8], 64277) +LOAD_CONST(%[ftmp9], 12785) +"pmulhh %[A], %[ftmp9], %[ftmp7] \n\t" +"pcmpgth%[C], %[ftmp10],%[ftmp1] \n\t" +"or %[mask],%[C], %[csth_1] \n\t" +"pmullh %[B], %[ftmp1], %[mask] \n\t" +"pmulhuh%[B], %[ftmp8], %[B] \n\t" +"pmullh %[B], %[B], %[mask] \n\t" +"paddh %[A], %[A], %[B] \n\t" +"paddh %[A], %[A], %[C] \n\t" +"pcmpgth%[D], %[ftmp10],%[ftmp7] \n\t" +"or %[mask],%[D], %[csth_1] \n\t" +"pmullh %[ftmp7], %[ftmp7], %[mask] \n\t" +"pmulhuh%[B], %[ftmp8], %[ftmp7] \n\t" +"pmullh %[B], %[B], %[mask] \n\t" +"pmulhh %[C], %[ftmp9], %[ftmp1] \n\t" +"psubh %[B], %[C], %[B] \n\t" +"psubh %[B], %[B],
[FFmpeg-devel] [PATCH] avcodec/mips: [loongson] optimize theora decoding in vp3dsp.
Optimize theora decoding with msa in functions: 1. ff_vp3_idct_add_msa 2. ff_vp3_idct_put_msa 3. ff_vp3_idct_dc_add_msa 4. ff_vp3_v_loop_filter_msa 5. ff_vp3_h_loop_filter_msa 6. ff_put_no_rnd_pixels_l2_msa Theora decoding speed improved about 36%(from 22fps to 30fps, Tested on loongson 2K1000). --- libavcodec/mips/Makefile | 2 + libavcodec/mips/vp3dsp_idct_msa.c | 662 + libavcodec/mips/vp3dsp_init_mips.c | 46 +++ libavcodec/mips/vp3dsp_mips.h | 37 +++ libavcodec/vp3dsp.c| 2 + libavcodec/vp3dsp.h| 1 + 6 files changed, 750 insertions(+) create mode 100644 libavcodec/mips/vp3dsp_idct_msa.c create mode 100644 libavcodec/mips/vp3dsp_init_mips.c create mode 100644 libavcodec/mips/vp3dsp_mips.h diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index 1f659a0..3571207 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -22,6 +22,7 @@ OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_init_mips.o \ mips/hevcpred_init_mips.o OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9dsp_init_mips.o OBJS-$(CONFIG_VP8_DECODER)+= mips/vp8dsp_init_mips.o +OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_init_mips.o OBJS-$(CONFIG_H264DSP)+= mips/h264dsp_init_mips.o OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_init_mips.o OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_init_mips.o @@ -54,6 +55,7 @@ MSA-OBJS-$(CONFIG_VP9_DECODER)+= mips/vp9_mc_msa.o \ MSA-OBJS-$(CONFIG_VP8_DECODER)+= mips/vp8_mc_msa.o \ mips/vp8_idct_msa.o \ mips/vp8_lpf_msa.o +MSA-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_msa.o MSA-OBJS-$(CONFIG_H264DSP)+= mips/h264dsp_msa.o\ mips/h264idct_msa.o MSA-OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_msa.o diff --git a/libavcodec/mips/vp3dsp_idct_msa.c b/libavcodec/mips/vp3dsp_idct_msa.c new file mode 100644 index 000..5427ac5 --- /dev/null +++ b/libavcodec/mips/vp3dsp_idct_msa.c @@ -0,0 +1,662 @@ +/* + * Copyright (c) 2018 gxw + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "vp3dsp_mips.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "libavutil/intreadwrite.h" +#include "libavcodec/rnd_avg.h" + +static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type) +{ +v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign; +v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l, + r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l; +v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H; +v4i32 Ed, Gd, Add, Bdd, Fd, Hd; +v16u8 sign_l; +v16i8 d0, d1, d2, d3, d4, d5, d6, d7; +v4i32 c0, c1, c2, c3, c4, c5, c6, c7; +v4i32 f0, f1, f2, f3, f4, f5, f6, f7; +v4i32 sign_t; +v16i8 zero = {0}; +v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; +v4i32 cnst64277w = {64277, 64277, 64277, 64277}; +v4i32 cnst60547w = {60547, 60547, 60547, 60547}; +v4i32 cnst54491w = {54491, 54491, 54491, 54491}; +v4i32 cnst46341w = {46341, 46341, 46341, 46341}; +v4i32 cnst36410w = {36410, 36410, 36410, 36410}; +v4i32 cnst25080w = {25080, 25080, 25080, 25080}; +v4i32 cnst12785w = {12785, 12785, 12785, 12785}; +v4i32 cnst8w = {8, 8, 8, 8}; +v4i32 cnst2048w = {2048, 2048, 2048, 2048}; +v4i32 cnst128w = {128, 128, 128, 128}; +int nstride = stride; + +/* Extended input data */ +LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7); 
+sign = __msa_clti_s_h(r0, 0); +r0_r = (v4i32) __msa_ilvr_h(sign, r0); +r0_l = (v4i32) __msa_ilvl_h(sign, r0); +sign = __msa_clti_s_h(r1, 0); +r1_r = (v4i32) __msa_ilvr_h(sign, r1); +r1_l = (v4i32) __msa_ilvl_h(sign, r1); +sign = __msa_clti_s_h(r2, 0); +r2_r = (v4i32) __msa_ilvr_h(sign, r2); +r2_l = (v4i32) __msa_ilvl_h(sign, r2); +sign = __msa_clti_s_h(r3, 0); +r3_r = (v4i32) __
[FFmpeg-devel] [PATCH v3] avcodec/mips: Fix failing case hevc-conformance-AMP_A_Samsung_* when MSA is enabled
The AV_INPUT_BUFFER_PADDING_SIZE has been increased to 64, but the value is still 32 in function ff_hevc_sao_edge_filter_8_msa. So, use AV_INPUT_BUFFER_PADDING_SIZE directly. Also, use MAX_PB_SIZE directly instead of 64. Fate tests passed. --- libavcodec/mips/hevc_lpf_sao_msa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c index 5b5537a..adcafde 100644 --- a/libavcodec/mips/hevc_lpf_sao_msa.c +++ b/libavcodec/mips/hevc_lpf_sao_msa.c @@ -2630,7 +2630,7 @@ void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src, int16_t *sao_offset_val, int eo, int width, int height) { -ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t); +ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(uint8_t); switch (eo) { case 0: -- 2.1.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v2] avcodec/mips: Fix failing case hevc-conformance-AMP_A_Samsung_* when MSA is enabled
The AV_INPUT_BUFFER_PADDING_SIZE has been increased to 64, but the value is still 32 in function ff_hevc_sao_edge_filter_8_msa. So, use AV_INPUT_BUFFER_PADDING_SIZE directly. Fate tests passed. --- libavcodec/mips/hevc_lpf_sao_msa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c index 5b5537a..b146bb1 100644 --- a/libavcodec/mips/hevc_lpf_sao_msa.c +++ b/libavcodec/mips/hevc_lpf_sao_msa.c @@ -2630,7 +2630,7 @@ void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src, int16_t *sao_offset_val, int eo, int width, int height) { -ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t); +ptrdiff_t stride_src = (2 * 64 + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(uint8_t); switch (eo) { case 0: -- 2.1.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/mips: [loongson] fix failed case: hevc-conformance-AMP_A_Samsung_* in loongson2k
The AV_INPUT_BUFFER_PADDING_SIZE has been increased to 64, but the value is still 32 in function ff_hevc_sao_edge_filter_8_msa. So, modify the corresponding value to 64. Fate tests passed. --- libavcodec/mips/hevc_lpf_sao_msa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c index 5b5537a..bb883d0 100644 --- a/libavcodec/mips/hevc_lpf_sao_msa.c +++ b/libavcodec/mips/hevc_lpf_sao_msa.c @@ -2630,7 +2630,7 @@ void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src, int16_t *sao_offset_val, int eo, int width, int height) { -ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t); +ptrdiff_t stride_src = (2 * 64 + 64) / sizeof(uint8_t); switch (eo) { case 0: -- 2.1.0 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel