Re: [FFmpeg-devel] [PATCH] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni hv mc functions
LGTM Thanks, Nedeljko Od: ffmpeg-devel-boun...@ffmpeg.org [ffmpeg-devel-boun...@ffmpeg.org] u ime korisnika Shivraj Patil Poslato: 8. maj 2015 10:02 Za: ffmpeg-devel@ffmpeg.org Cc: Rob Isherwood; Shivraj Patil Tema: [FFmpeg-devel] [PATCH] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni hv mc functions From: Shivraj Patil shivraj.pa...@imgtec.com Signed-off-by: Shivraj Patil shivraj.pa...@imgtec.com --- libavcodec/mips/hevcdsp_init_mips.c | 9 + libavcodec/mips/hevcdsp_mips.h | 9 + libavcodec/mips/hevcdsp_msa.c | 512 3 files changed, 530 insertions(+) diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c index 1e22f35..d2e3c60 100644 --- a/libavcodec/mips/hevcdsp_init_mips.c +++ b/libavcodec/mips/hevcdsp_init_mips.c @@ -87,6 +87,15 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c, c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa; c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa; c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa; + +c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa; +c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa; +c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa; +c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa; +c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa; +c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa; +c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa; +c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa; } } #endif // #if HAVE_MSA diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h index 76a6784..a8c8848 100644 --- a/libavcodec/mips/hevcdsp_mips.h +++ b/libavcodec/mips/hevcdsp_mips.h @@ -106,4 +106,13 @@ UNI_MC(qpel, v, 32); UNI_MC(qpel, v, 48); UNI_MC(qpel, v, 64); +UNI_MC(qpel, hv, 4); +UNI_MC(qpel, hv, 8); +UNI_MC(qpel, hv, 12); +UNI_MC(qpel, hv, 16); 
+UNI_MC(qpel, hv, 24); +UNI_MC(qpel, hv, 32); +UNI_MC(qpel, hv, 48); +UNI_MC(qpel, hv, 64); + #undef UNI_MC diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c index d0e6f64..781264d 100644 --- a/libavcodec/mips/hevcdsp_msa.c +++ b/libavcodec/mips/hevcdsp_msa.c @@ -46,6 +46,24 @@ out; \ } ) +#define HEVC_RND_W_CLIP_UNSIGNED_CHAR_W_VEC2(vec0_r, vec0_l, \ + vec1_r, vec1_l, \ + out0, out1) \ +{ \ +(vec0_r) = __msa_srari_w((vec0_r), 6); \ +(vec0_l) = __msa_srari_w((vec0_l), 6); \ +(vec1_r) = __msa_srari_w((vec1_r), 6); \ +(vec1_l) = __msa_srari_w((vec1_l), 6); \ + \ +(vec0_r) = CLIP_UNSIGNED_CHAR_W((vec0_r)); \ +(vec0_l) = CLIP_UNSIGNED_CHAR_W((vec0_l)); \ +(vec1_r) = CLIP_UNSIGNED_CHAR_W((vec1_r)); \ +(vec1_l) = CLIP_UNSIGNED_CHAR_W((vec1_l)); \ + \ +out0 = (v4i32) __msa_pckev_h((v8i16) (vec0_l), (v8i16) (vec0_r)); \ +out1 = (v4i32) __msa_pckev_h((v8i16) (vec1_l), (v8i16) (vec1_r)); \ +} + static void hevc_copy_4w_msa(uint8_t * __restrict src, int32_t src_stride, int16_t * __restrict dst, int32_t dst_stride, int32_t height) @@ -2270,6 +2288,469 @@ static void hevc_hv_8t_64w_msa(uint8_t * __restrict src, int32_t src_stride, filter_x, filter_y, height, 64); } +static void hevc_hv_uni_8t_4w_msa(uint8_t * __restrict src, + int32_t src_stride, + uint8_t * __restrict dst, + int32_t dst_stride, + const int8_t * __restrict filter_x, + const int8_t * __restrict filter_y, + int32_t height) +{ +uint32_t loop_cnt; +uint32_t out0, out1; +v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; +v8i16 filt0, filt1, filt2, filt3, filter_vec; +v4i32 filt_h0, filt_h1, filt_h2, filt_h3; +v16i8 mask1, mask2, mask3; +v8u16 const_vec; +v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; +v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; +v8i16 dst30, dst41
Re: [FFmpeg-devel] [PATCH] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni hv mc functions
On Wed, May 13, 2015 at 11:46:32AM +0000, Nedeljko Babic wrote: > LGTM applied thanks [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB I am the wisest man alive, for I know one thing, and that is that I know nothing. -- Socrates signature.asc Description: Digital signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni hv mc functions
From: Shivraj Patil shivraj.pa...@imgtec.com Signed-off-by: Shivraj Patil shivraj.pa...@imgtec.com --- libavcodec/mips/hevcdsp_init_mips.c | 9 + libavcodec/mips/hevcdsp_mips.h | 9 + libavcodec/mips/hevcdsp_msa.c | 512 3 files changed, 530 insertions(+) diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c index 1e22f35..d2e3c60 100644 --- a/libavcodec/mips/hevcdsp_init_mips.c +++ b/libavcodec/mips/hevcdsp_init_mips.c @@ -87,6 +87,15 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c, c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa; c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa; c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa; + +c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa; +c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa; +c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa; +c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa; +c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa; +c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa; +c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa; +c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa; } } #endif // #if HAVE_MSA diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h index 76a6784..a8c8848 100644 --- a/libavcodec/mips/hevcdsp_mips.h +++ b/libavcodec/mips/hevcdsp_mips.h @@ -106,4 +106,13 @@ UNI_MC(qpel, v, 32); UNI_MC(qpel, v, 48); UNI_MC(qpel, v, 64); +UNI_MC(qpel, hv, 4); +UNI_MC(qpel, hv, 8); +UNI_MC(qpel, hv, 12); +UNI_MC(qpel, hv, 16); +UNI_MC(qpel, hv, 24); +UNI_MC(qpel, hv, 32); +UNI_MC(qpel, hv, 48); +UNI_MC(qpel, hv, 64); + #undef UNI_MC diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c index d0e6f64..781264d 100644 --- a/libavcodec/mips/hevcdsp_msa.c +++ b/libavcodec/mips/hevcdsp_msa.c @@ -46,6 +46,24 @@ out; \ } ) +#define 
HEVC_RND_W_CLIP_UNSIGNED_CHAR_W_VEC2(vec0_r, vec0_l, \ + vec1_r, vec1_l, \ + out0, out1) \ +{ \ +(vec0_r) = __msa_srari_w((vec0_r), 6); \ +(vec0_l) = __msa_srari_w((vec0_l), 6); \ +(vec1_r) = __msa_srari_w((vec1_r), 6); \ +(vec1_l) = __msa_srari_w((vec1_l), 6); \ + \ +(vec0_r) = CLIP_UNSIGNED_CHAR_W((vec0_r)); \ +(vec0_l) = CLIP_UNSIGNED_CHAR_W((vec0_l)); \ +(vec1_r) = CLIP_UNSIGNED_CHAR_W((vec1_r)); \ +(vec1_l) = CLIP_UNSIGNED_CHAR_W((vec1_l)); \ + \ +out0 = (v4i32) __msa_pckev_h((v8i16) (vec0_l), (v8i16) (vec0_r)); \ +out1 = (v4i32) __msa_pckev_h((v8i16) (vec1_l), (v8i16) (vec1_r)); \ +} + static void hevc_copy_4w_msa(uint8_t * __restrict src, int32_t src_stride, int16_t * __restrict dst, int32_t dst_stride, int32_t height) @@ -2270,6 +2288,469 @@ static void hevc_hv_8t_64w_msa(uint8_t * __restrict src, int32_t src_stride, filter_x, filter_y, height, 64); } +static void hevc_hv_uni_8t_4w_msa(uint8_t * __restrict src, + int32_t src_stride, + uint8_t * __restrict dst, + int32_t dst_stride, + const int8_t * __restrict filter_x, + const int8_t * __restrict filter_y, + int32_t height) +{ +uint32_t loop_cnt; +uint32_t out0, out1; +v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; +v8i16 filt0, filt1, filt2, filt3, filter_vec; +v4i32 filt_h0, filt_h1, filt_h2, filt_h3; +v16i8 mask1, mask2, mask3; +v8u16 const_vec; +v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; +v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; +v8i16 dst30, dst41, dst52, dst63, dst66, dst87; +v4i32 dst0_r, dst1_r; +v8i16 dst10_r, dst32_r, dst54_r, dst76_r; +v8i16 dst21_r, dst43_r, dst65_r, dst87_r; +v16i8 tmp; +v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; +v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 }; + +src -= ((3 * src_stride) + 3); + +filter_vec =