Re: [FFmpeg-devel] [PATCH] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni hv mc functions

2015-05-13 Thread Nedeljko Babic
LGTM

Thanks,
Nedeljko

Od: ffmpeg-devel-boun...@ffmpeg.org [ffmpeg-devel-boun...@ffmpeg.org] u ime 
korisnika Shivraj Patil
Poslato: 8. maj 2015 10:02
Za: ffmpeg-devel@ffmpeg.org
Cc: Rob Isherwood; Shivraj Patil
Tema: [FFmpeg-devel] [PATCH] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations 
for HEVC uni hv mc functions

From: Shivraj Patil shivraj.pa...@imgtec.com

Signed-off-by: Shivraj Patil shivraj.pa...@imgtec.com
---
 libavcodec/mips/hevcdsp_init_mips.c |   9 +
 libavcodec/mips/hevcdsp_mips.h  |   9 +
 libavcodec/mips/hevcdsp_msa.c   | 512 
 3 files changed, 530 insertions(+)

diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c
index 1e22f35..d2e3c60 100644
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -87,6 +87,15 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
 c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa;
 c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa;
 c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa;
+
+c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa;
+c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa;
+c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa;
+c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa;
+c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa;
+c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
+c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
+c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
 }
 }
 #endif  // #if HAVE_MSA
diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h
index 76a6784..a8c8848 100644
--- a/libavcodec/mips/hevcdsp_mips.h
+++ b/libavcodec/mips/hevcdsp_mips.h
@@ -106,4 +106,13 @@ UNI_MC(qpel, v, 32);
 UNI_MC(qpel, v, 48);
 UNI_MC(qpel, v, 64);

+UNI_MC(qpel, hv, 4);
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 12);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
 #undef UNI_MC
diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c
index d0e6f64..781264d 100644
--- a/libavcodec/mips/hevcdsp_msa.c
+++ b/libavcodec/mips/hevcdsp_msa.c
@@ -46,6 +46,24 @@
 out;   \
 } )

+#define HEVC_RND_W_CLIP_UNSIGNED_CHAR_W_VEC2(vec0_r, vec0_l,   \
+ vec1_r, vec1_l,   \
+ out0, out1)   \
+{  \
+(vec0_r) = __msa_srari_w((vec0_r), 6); \
+(vec0_l) = __msa_srari_w((vec0_l), 6); \
+(vec1_r) = __msa_srari_w((vec1_r), 6); \
+(vec1_l) = __msa_srari_w((vec1_l), 6); \
+   \
+(vec0_r) = CLIP_UNSIGNED_CHAR_W((vec0_r)); \
+(vec0_l) = CLIP_UNSIGNED_CHAR_W((vec0_l)); \
+(vec1_r) = CLIP_UNSIGNED_CHAR_W((vec1_r)); \
+(vec1_l) = CLIP_UNSIGNED_CHAR_W((vec1_l)); \
+   \
+out0 = (v4i32) __msa_pckev_h((v8i16) (vec0_l), (v8i16) (vec0_r));  \
+out1 = (v4i32) __msa_pckev_h((v8i16) (vec1_l), (v8i16) (vec1_r));  \
+}
+
 static void hevc_copy_4w_msa(uint8_t * __restrict src, int32_t src_stride,
  int16_t * __restrict dst, int32_t dst_stride,
  int32_t height)
@@ -2270,6 +2288,469 @@ static void hevc_hv_8t_64w_msa(uint8_t * __restrict src, int32_t src_stride,
filter_x, filter_y, height, 64);
 }

+static void hevc_hv_uni_8t_4w_msa(uint8_t * __restrict src,
+  int32_t src_stride,
+  uint8_t * __restrict dst,
+  int32_t dst_stride,
+  const int8_t * __restrict filter_x,
+  const int8_t * __restrict filter_y,
+  int32_t height)
+{
+uint32_t loop_cnt;
+uint32_t out0, out1;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+v8i16 filt0, filt1, filt2, filt3, filter_vec;
+v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+v16i8 mask1, mask2, mask3;
+v8u16 const_vec;
+v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+v8i16 dst30, dst41

Re: [FFmpeg-devel] [PATCH] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni hv mc functions

2015-05-13 Thread Michael Niedermayer
On Wed, May 13, 2015 at 11:46:32AM +0000, Nedeljko Babic wrote:
> LGTM

applied

thanks

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

I am the wisest man alive, for I know one thing, and that is that I know
nothing. -- Socrates


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni hv mc functions

2015-05-08 Thread shivraj.patil
From: Shivraj Patil shivraj.pa...@imgtec.com

Signed-off-by: Shivraj Patil shivraj.pa...@imgtec.com
---
 libavcodec/mips/hevcdsp_init_mips.c |   9 +
 libavcodec/mips/hevcdsp_mips.h  |   9 +
 libavcodec/mips/hevcdsp_msa.c   | 512 
 3 files changed, 530 insertions(+)

diff --git a/libavcodec/mips/hevcdsp_init_mips.c b/libavcodec/mips/hevcdsp_init_mips.c
index 1e22f35..d2e3c60 100644
--- a/libavcodec/mips/hevcdsp_init_mips.c
+++ b/libavcodec/mips/hevcdsp_init_mips.c
@@ -87,6 +87,15 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
 c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_msa;
 c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_msa;
 c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_msa;
+
+c->put_hevc_qpel_uni[1][1][1] = ff_hevc_put_hevc_uni_qpel_hv4_8_msa;
+c->put_hevc_qpel_uni[3][1][1] = ff_hevc_put_hevc_uni_qpel_hv8_8_msa;
+c->put_hevc_qpel_uni[4][1][1] = ff_hevc_put_hevc_uni_qpel_hv12_8_msa;
+c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_8_msa;
+c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_8_msa;
+c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_8_msa;
+c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
+c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
 }
 }
 #endif  // #if HAVE_MSA
diff --git a/libavcodec/mips/hevcdsp_mips.h b/libavcodec/mips/hevcdsp_mips.h
index 76a6784..a8c8848 100644
--- a/libavcodec/mips/hevcdsp_mips.h
+++ b/libavcodec/mips/hevcdsp_mips.h
@@ -106,4 +106,13 @@ UNI_MC(qpel, v, 32);
 UNI_MC(qpel, v, 48);
 UNI_MC(qpel, v, 64);
 
+UNI_MC(qpel, hv, 4);
+UNI_MC(qpel, hv, 8);
+UNI_MC(qpel, hv, 12);
+UNI_MC(qpel, hv, 16);
+UNI_MC(qpel, hv, 24);
+UNI_MC(qpel, hv, 32);
+UNI_MC(qpel, hv, 48);
+UNI_MC(qpel, hv, 64);
+
 #undef UNI_MC
diff --git a/libavcodec/mips/hevcdsp_msa.c b/libavcodec/mips/hevcdsp_msa.c
index d0e6f64..781264d 100644
--- a/libavcodec/mips/hevcdsp_msa.c
+++ b/libavcodec/mips/hevcdsp_msa.c
@@ -46,6 +46,24 @@
 out;   \
 } )
 
+#define HEVC_RND_W_CLIP_UNSIGNED_CHAR_W_VEC2(vec0_r, vec0_l,   \
+ vec1_r, vec1_l,   \
+ out0, out1)   \
+{  \
+(vec0_r) = __msa_srari_w((vec0_r), 6); \
+(vec0_l) = __msa_srari_w((vec0_l), 6); \
+(vec1_r) = __msa_srari_w((vec1_r), 6); \
+(vec1_l) = __msa_srari_w((vec1_l), 6); \
+   \
+(vec0_r) = CLIP_UNSIGNED_CHAR_W((vec0_r)); \
+(vec0_l) = CLIP_UNSIGNED_CHAR_W((vec0_l)); \
+(vec1_r) = CLIP_UNSIGNED_CHAR_W((vec1_r)); \
+(vec1_l) = CLIP_UNSIGNED_CHAR_W((vec1_l)); \
+   \
+out0 = (v4i32) __msa_pckev_h((v8i16) (vec0_l), (v8i16) (vec0_r));  \
+out1 = (v4i32) __msa_pckev_h((v8i16) (vec1_l), (v8i16) (vec1_r));  \
+}
+
 static void hevc_copy_4w_msa(uint8_t * __restrict src, int32_t src_stride,
  int16_t * __restrict dst, int32_t dst_stride,
  int32_t height)
@@ -2270,6 +2288,469 @@ static void hevc_hv_8t_64w_msa(uint8_t * __restrict src, int32_t src_stride,
filter_x, filter_y, height, 64);
 }
 
+static void hevc_hv_uni_8t_4w_msa(uint8_t * __restrict src,
+  int32_t src_stride,
+  uint8_t * __restrict dst,
+  int32_t dst_stride,
+  const int8_t * __restrict filter_x,
+  const int8_t * __restrict filter_y,
+  int32_t height)
+{
+uint32_t loop_cnt;
+uint32_t out0, out1;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+v8i16 filt0, filt1, filt2, filt3, filter_vec;
+v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+v16i8 mask1, mask2, mask3;
+v8u16 const_vec;
+v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
+v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
+v4i32 dst0_r, dst1_r;
+v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
+v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
+v16i8 tmp;
+v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+
+src -= ((3 * src_stride) + 3);
+
+filter_vec =