From: "Ronald S. Bultje"
In case of no-transform, integrate the block clearing with
put_pixels4/8(). Intra PCM is changed to not use h->mb anymore (saves a
memcpy). The clear_blocks() call during update_thread_context() init is
removed by removing the memcpy() that clobbered h->mb in the first place.
Together, this makes the H264 decoder almost independent of dsputil.
The ARM assembly changes are untested.
---
libavcodec/arm/h264idct_neon.S | 20 +--
libavcodec/get_bits.h | 3 +-
libavcodec/h264.c | 7 ++-
libavcodec/h264.h | 1 +
libavcodec/h264_cabac.c| 4 +-
libavcodec/h264_cavlc.c| 10 ++--
libavcodec/h264_mb_template.c | 20 +++
libavcodec/h264idct_template.c | 16 --
libavcodec/h264pred.h | 8 +--
libavcodec/h264pred_template.c | 28 ++
libavcodec/ppc/h264_altivec.c | 3 ++
libavcodec/svq3.c | 2 +
libavcodec/x86/h264_idct.asm | 108 -
libavcodec/x86/h264_idct_10bit.asm | 53 --
14 files changed, 204 insertions(+), 79 deletions(-)
diff --git a/libavcodec/arm/h264idct_neon.S b/libavcodec/arm/h264idct_neon.S
index 1b349ce..73b2260 100644
--- a/libavcodec/arm/h264idct_neon.S
+++ b/libavcodec/arm/h264idct_neon.S
@@ -22,9 +22,12 @@
function ff_h264_idct_add_neon, export=1
vld1.64 {d0-d3}, [r1,:128]
+vmov.i16q15, #0
vswpd1, d2
+vst1.16 {q15},[r1,:128]!
vadd.i16d4, d0, d1
+vst1.16 {q15},[r1,:128]!
vshr.s16q8, q1, #1
vsub.i16d5, d0, d1
vadd.i16d6, d2, d17
@@ -69,7 +72,9 @@ function ff_h264_idct_add_neon, export=1
endfunc
function ff_h264_idct_dc_add_neon, export=1
+mov r3, #0
vld1.16 {d2[],d3[]}, [r1,:16]
+strhr3, [r1]
vrshr.s16 q1, q1, #6
vld1.32 {d0[0]}, [r0,:32], r2
vld1.32 {d0[1]}, [r0,:32], r2
@@ -180,7 +185,8 @@ endfunc
qb .reqq14
vshr.s16q2, q10, #1
vadd.i16q0, q8, q12
-vld1.16 {q14-q15},[r1,:128]!
+vld1.16 {q14-q15},[r1,:128]
+vst1.16 {q3}, [r1,:128]!
vsub.i16q1, q8, q12
vshr.s16q3, q14, #1
vsub.i16q2, q2, q14
@@ -259,9 +265,13 @@ endfunc
.endm
function ff_h264_idct8_add_neon, export=1
-vld1.16 {q8-q9}, [r1,:128]!
-vld1.16 {q10-q11},[r1,:128]!
-vld1.16 {q12-q13},[r1,:128]!
+vmov.i16q7, #0
+vld1.16 {q8-q9}, [r1,:128]
+vst1.16 {q3}, [r1,:128]!
+vld1.16 {q10-q11},[r1,:128]
+vst1.16 {q3}, [r1,:128]!
+vld1.16 {q12-q13},[r1,:128]
+vst1.16 {q3}, [r1,:128]!
idct8x8_cols0
idct8x8_cols1
@@ -313,7 +323,9 @@ function ff_h264_idct8_add_neon, export=1
endfunc
function ff_h264_idct8_dc_add_neon, export=1
+mov r3, #0
vld1.16 {d30[],d31[]},[r1,:16]
+strhr3, [r1]
vld1.32 {d0}, [r0,:64], r2
vrshr.s16 q15, q15, #6
vld1.32 {d1}, [r0,:64], r2
diff --git a/libavcodec/get_bits.h b/libavcodec/get_bits.h
index 7129b17..f16a508 100644
--- a/libavcodec/get_bits.h
+++ b/libavcodec/get_bits.h
@@ -415,11 +415,12 @@ static inline int init_get_bits8(GetBitContext *s, const
uint8_t *buffer,
return init_get_bits(s, buffer, byte_size * 8);
}
-static inline void align_get_bits(GetBitContext *s)
+static inline const uint8_t *align_get_bits(GetBitContext *s)
{
int n = -get_bits_count(s) & 7;
if (n)
skip_bits(s, n);
+return s->buffer + (s->index >> 3);
}
#define init_vlc(vlc, nb_bits, nb_codes,\
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index cfcb552..a0bf031 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -1249,7 +1249,9 @@ static int decode_update_thread_context(AVCodecContext
*dst,
// copy all fields after MpegEnc
memcpy(&h->s + 1, &h1->s + 1,
- sizeof(H264Context) - sizeof(MpegEncContext));
+ offsetof(H264Context, intra_gb) - sizeof(MpegEncContext));
+memcpy(&h->cabac, &h1->cabac,
+ sizeof(H264Context) - offsetof(H264Context, cabac));
memset(h->sps_buffers, 0, sizeof(h->sps_buffers));
memset(h->pps_buffers, 0, sizeof(h->pps_buffers));
@@ -1269,9 +1271,6 @@ static int decode_update_thread_context(AVCodecContext
*dst,
h->bipred_scratchpad = NULL;
h->thread_context[0] = h;
-
-s->dsp.clear_blocks(h->mb);
-s->dsp.clear_blocks(h->mb + (24 * 16 << h->pixel_shift));
}