Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni weighted hv mc msa functions

2017-11-01 Thread Michael Niedermayer
On Wed, Nov 01, 2017 at 09:27:22AM +0000, Manojkumar Bhosale wrote:
> LGTM

will apply

thx

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

If you think the mosad wants you dead since a long time then you are either
wrong or dead since a long time.


signature.asc
Description: Digital signature
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni weighted hv mc msa functions

2017-11-01 Thread Manojkumar Bhosale
LGTM

-----Original Message-----
From: ffmpeg-devel [mailto:ffmpeg-devel-boun...@ffmpeg.org] On Behalf Of 
kaustubh.ra...@imgtec.com
Sent: Tuesday, October 31, 2017 6:31 PM
To: ffmpeg-devel@ffmpeg.org
Cc: Kaustubh Raste
Subject: [FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni weighted hv mc 
msa functions

From: Kaustubh Raste <kaustubh.ra...@imgtec.com>

Use immediate unsigned saturation for clip to max saving one vector register.

Signed-off-by: Kaustubh Raste <kaustubh.ra...@imgtec.com>
---
 libavcodec/mips/hevc_macros_msa.h  |9 +
 libavcodec/mips/hevc_mc_uniw_msa.c | 1598 +---
 2 files changed, 965 insertions(+), 642 deletions(-)

diff --git a/libavcodec/mips/hevc_macros_msa.h 
b/libavcodec/mips/hevc_macros_msa.h
index 7dcfea0..27c69ff 100644
--- a/libavcodec/mips/hevc_macros_msa.h
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -80,6 +80,15 @@
 out_m;   \
 } )
 
+#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)\
+( {  \
+v8i16 out_m; \
+ \
+out_m = __msa_dotp_s_h((v16i8) in0, (v16i8) filt0);  \
+out_m = __msa_dpadd_s_h(out_m, (v16i8) in1, (v16i8) filt1);  \
+out_m;   \
+} )
+
 #define HEVC_FILT_4TAP(in0, in1, filt0, filt1)   \
 ( {  \
 v4i32 out_m; \
diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c 
b/libavcodec/mips/hevc_mc_uniw_msa.c
index 28c7062f..0796b0a 100644
--- a/libavcodec/mips/hevc_mc_uniw_msa.c
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -1801,40 +1801,42 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
  int32_t rnd_val)
 {
 uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+v16u8 out;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 v8i16 filt0, filt1, filt2, filt3;
-v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
 v16i8 mask1, mask2, mask3;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec;
 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
-v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
-v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
-v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 
 src -= ((3 * src_stride) + 3);
 filter_vec = LD_SH(filter_x);
 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 filter_vec = LD_SH(filter_y);
-vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
-filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+UNPCK_R_SB_SH(filter_vec, filter_vec);
 
-SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 mask1 = mask0 + 2;
 mask2 = mask0 + 4;
 mask3 = mask0 + 6;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
-
 weight_vec = __msa_fill_w(weight);
 offset_vec = __msa_fill_w(offset);
 rnd_vec = __msa_fill_w(rnd_val);
+denom_vec = rnd_vec - 6;
+
+const_128 = __msa_ldi_w(128);
+const_128 *= weight_vec;
+offset_vec += __msa_srar_w(const_128, denom_vec);
 
 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 src += (7 * src_stride);
@@ -1847,64 +1849,68 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
vec8, vec9, vec10, vec11);
 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
vec12, vec13, vec14, vec15);
-dst30 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst30, dst30, dst30, dst30);
-dst41 = const_vec;
-DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
- dst41, dst41, dst41, dst41);
-dst52 = const_vec;
-DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
- dst52, dst52, dst52, dst52);
-dst63 = const_vec;
-DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
- dst63, dst63, dst63, dst63);
-
-ILVR_H3_SH(dst41, dst30, dst52, d

[FFmpeg-devel] [PATCH] avcodec/mips: Improve hevc uni weighted hv mc msa functions

2017-10-31 Thread kaustubh.raste
From: Kaustubh Raste 

Use immediate unsigned saturation for clip to max saving one vector register.

Signed-off-by: Kaustubh Raste 
---
 libavcodec/mips/hevc_macros_msa.h  |9 +
 libavcodec/mips/hevc_mc_uniw_msa.c | 1598 +---
 2 files changed, 965 insertions(+), 642 deletions(-)

diff --git a/libavcodec/mips/hevc_macros_msa.h 
b/libavcodec/mips/hevc_macros_msa.h
index 7dcfea0..27c69ff 100644
--- a/libavcodec/mips/hevc_macros_msa.h
+++ b/libavcodec/mips/hevc_macros_msa.h
@@ -80,6 +80,15 @@
 out_m;   \
 } )
 
+#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)\
+( {  \
+v8i16 out_m; \
+ \
+out_m = __msa_dotp_s_h((v16i8) in0, (v16i8) filt0);  \
+out_m = __msa_dpadd_s_h(out_m, (v16i8) in1, (v16i8) filt1);  \
+out_m;   \
+} )
+
 #define HEVC_FILT_4TAP(in0, in1, filt0, filt1)   \
 ( {  \
 v4i32 out_m; \
diff --git a/libavcodec/mips/hevc_mc_uniw_msa.c 
b/libavcodec/mips/hevc_mc_uniw_msa.c
index 28c7062f..0796b0a 100644
--- a/libavcodec/mips/hevc_mc_uniw_msa.c
+++ b/libavcodec/mips/hevc_mc_uniw_msa.c
@@ -1801,40 +1801,42 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
  int32_t rnd_val)
 {
 uint32_t loop_cnt;
-v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+v16u8 out;
+v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
 v8i16 filt0, filt1, filt2, filt3;
-v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
+v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
 v16i8 mask1, mask2, mask3;
-v8i16 filter_vec, const_vec;
+v8i16 filter_vec;
 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
-v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
-v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
-v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
-v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
-v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
+v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
+v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
+v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
+v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
+v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
 
 src -= ((3 * src_stride) + 3);
 filter_vec = LD_SH(filter_x);
 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
 filter_vec = LD_SH(filter_y);
-vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
-filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
+UNPCK_R_SB_SH(filter_vec, filter_vec);
 
-SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
+SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
 
 mask1 = mask0 + 2;
 mask2 = mask0 + 4;
 mask3 = mask0 + 6;
 
-const_vec = __msa_ldi_h(128);
-const_vec <<= 6;
-
 weight_vec = __msa_fill_w(weight);
 offset_vec = __msa_fill_w(offset);
 rnd_vec = __msa_fill_w(rnd_val);
+denom_vec = rnd_vec - 6;
+
+const_128 = __msa_ldi_w(128);
+const_128 *= weight_vec;
+offset_vec += __msa_srar_w(const_128, denom_vec);
 
 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
 src += (7 * src_stride);
@@ -1847,64 +1849,68 @@ static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
vec8, vec9, vec10, vec11);
 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
vec12, vec13, vec14, vec15);
-dst30 = const_vec;
-DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
- dst30, dst30, dst30, dst30);
-dst41 = const_vec;
-DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
- dst41, dst41, dst41, dst41);
-dst52 = const_vec;
-DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
- dst52, dst52, dst52, dst52);
-dst63 = const_vec;
-DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
- dst63, dst63, dst63, dst63);
-
-ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
-   dst10_r, dst21_r, dst32_r);
-
-dst43_r = __msa_ilvl_h(dst41, dst30);
-dst54_r = __msa_ilvl_h(dst52, dst41);
-dst65_r = __msa_ilvl_h(dst63, dst52);
+dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
+  filt3);
+dst41 =