Re: [libav-devel] [PATCH] h264: integrate clear_blocks calls with IDCT.

2013-02-11 Thread Martin Storsjö

On Sat, 9 Feb 2013, Ronald S. Bultje wrote:


From: "Ronald S. Bultje" 

In case of no-transform, integrate it with put_pixels4/8(). Intra PCM
is changed to not use h->mb anymore (saves a memcpy). The one during
update_thread_context() init is removed by removing the memcpy() that
clobbered it in the first place. Together, this makes the H264 decoder
almost-independent of dsputil.

Arm assembly changes untested.
---
libavcodec/arm/h264idct_neon.S |  20 +--
libavcodec/get_bits.h  |   3 +-
libavcodec/h264.c  |   7 ++-
libavcodec/h264.h  |   1 +
libavcodec/h264_cabac.c|   4 +-
libavcodec/h264_cavlc.c|  10 ++--
libavcodec/h264_mb_template.c  |  20 +++
libavcodec/h264idct_template.c |  16 --
libavcodec/h264pred.h  |   8 +--
libavcodec/h264pred_template.c |  28 ++
libavcodec/ppc/h264_altivec.c  |   3 ++
libavcodec/svq3.c  |   2 +
libavcodec/x86/h264_idct.asm   | 108 -
libavcodec/x86/h264_idct_10bit.asm |  53 --
14 files changed, 204 insertions(+), 79 deletions(-)


The patch didn't apply cleanly to master and, after I fixed up the
conflicts, tests failed on both x86 and ARM. I haven't looked further into
it yet, since some x86 tests were failing as well.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] h264: integrate clear_blocks calls with IDCT.

2013-02-09 Thread Ronald S. Bultje
From: "Ronald S. Bultje" 

In case of no-transform, integrate it with put_pixels4/8(). Intra PCM
is changed to not use h->mb anymore (saves a memcpy). The one during
update_thread_context() init is removed by removing the memcpy() that
clobbered it in the first place. Together, this makes the H264 decoder
almost-independent of dsputil.

Arm assembly changes untested.
---
 libavcodec/arm/h264idct_neon.S |  20 +--
 libavcodec/get_bits.h  |   3 +-
 libavcodec/h264.c  |   7 ++-
 libavcodec/h264.h  |   1 +
 libavcodec/h264_cabac.c|   4 +-
 libavcodec/h264_cavlc.c|  10 ++--
 libavcodec/h264_mb_template.c  |  20 +++
 libavcodec/h264idct_template.c |  16 --
 libavcodec/h264pred.h  |   8 +--
 libavcodec/h264pred_template.c |  28 ++
 libavcodec/ppc/h264_altivec.c  |   3 ++
 libavcodec/svq3.c  |   2 +
 libavcodec/x86/h264_idct.asm   | 108 -
 libavcodec/x86/h264_idct_10bit.asm |  53 --
 14 files changed, 204 insertions(+), 79 deletions(-)

diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index 1b349ce..73b2260 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -22,9 +22,12 @@
 
 function ff_h264_idct_add_neon, export=1
 vld1.64 {d0-d3},  [r1,:128]
+vmov.i16q15, #0
 
 vswpd1,  d2
+vst1.16 {q15},[r1,:128]!
 vadd.i16d4,  d0,  d1
+vst1.16 {q15},[r1,:128]!
 vshr.s16q8,  q1,  #1
 vsub.i16d5,  d0,  d1
 vadd.i16d6,  d2,  d17
@@ -69,7 +72,9 @@ function ff_h264_idct_add_neon, export=1
 endfunc
 
 function ff_h264_idct_dc_add_neon, export=1
+mov r3,   #0
 vld1.16 {d2[],d3[]}, [r1,:16]
+strhr3,   [r1]
 vrshr.s16   q1,  q1,  #6
 vld1.32 {d0[0]},  [r0,:32], r2
 vld1.32 {d0[1]},  [r0,:32], r2
@@ -180,7 +185,8 @@ endfunc
 qb  .reqq14
 vshr.s16q2,  q10, #1
 vadd.i16q0,  q8,  q12
-vld1.16 {q14-q15},[r1,:128]!
+vld1.16 {q14-q15},[r1,:128]
+vst1.16 {q3}, [r1,:128]!
 vsub.i16q1,  q8,  q12
 vshr.s16q3,  q14, #1
 vsub.i16q2,  q2,  q14
@@ -259,9 +265,13 @@ endfunc
 .endm
 
 function ff_h264_idct8_add_neon, export=1
-vld1.16 {q8-q9},  [r1,:128]!
-vld1.16 {q10-q11},[r1,:128]!
-vld1.16 {q12-q13},[r1,:128]!
+vmov.i16q7,   #0
+vld1.16 {q8-q9},  [r1,:128]
+vst1.16 {q3}, [r1,:128]!
+vld1.16 {q10-q11},[r1,:128]
+vst1.16 {q3}, [r1,:128]!
+vld1.16 {q12-q13},[r1,:128]
+vst1.16 {q3}, [r1,:128]!
 
 idct8x8_cols0
 idct8x8_cols1
@@ -313,7 +323,9 @@ function ff_h264_idct8_add_neon, export=1
 endfunc
 
 function ff_h264_idct8_dc_add_neon, export=1
+mov r3,   #0
 vld1.16 {d30[],d31[]},[r1,:16]
+strhr3,   [r1]
 vld1.32 {d0}, [r0,:64], r2
 vrshr.s16   q15, q15, #6
 vld1.32 {d1}, [r0,:64], r2
diff --git a/libavcodec/get_bits.h b/libavcodec/get_bits.h
index 7129b17..f16a508 100644
--- a/libavcodec/get_bits.h
+++ b/libavcodec/get_bits.h
@@ -415,11 +415,12 @@ static inline int init_get_bits8(GetBitContext *s, const uint8_t *buffer,
 return init_get_bits(s, buffer, byte_size * 8);
 }
 
-static inline void align_get_bits(GetBitContext *s)
+static inline const uint8_t *align_get_bits(GetBitContext *s)
 {
 int n = -get_bits_count(s) & 7;
 if (n)
 skip_bits(s, n);
+return s->buffer + (s->index >> 3);
 }
 
 #define init_vlc(vlc, nb_bits, nb_codes,\
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index cfcb552..a0bf031 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -1249,7 +1249,9 @@ static int decode_update_thread_context(AVCodecContext *dst,
 
 // copy all fields after MpegEnc
 memcpy(&h->s + 1, &h1->s + 1,
-   sizeof(H264Context) - sizeof(MpegEncContext));
+   offsetof(H264Context, intra_gb) - sizeof(MpegEncContext));
+memcpy(&h->cabac, &h1->cabac,
+   sizeof(H264Context) - offsetof(H264Context, cabac));
 memset(h->sps_buffers, 0, sizeof(h->sps_buffers));
 memset(h->pps_buffers, 0, sizeof(h->pps_buffers));
 
@@ -1269,9 +1271,6 @@ static int decode_update_thread_context(AVCodecContext *dst,
 h->bipred_scratchpad = NULL;
 
 h->thread_context[0] = h;
-
-s->dsp.clear_blocks(h->mb);
-s->dsp.clear_blocks(h->mb + (24 * 16 << h->pixel_shift));
 }