From: Meng Wang <wangmeng.k...@bytedance.com>

Signed-off-by: Meng Wang <wangmeng.k...@bytedance.com>
---
 libavcodec/arm/hevcdsp_idct_neon.S | 241 +++++++++++++++++++++++++++++++++++++
 libavcodec/arm/hevcdsp_init_neon.c |   2 +
 2 files changed, 243 insertions(+)
diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
index e39d00634b..272abf279c 100644
--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++ b/libavcodec/arm/hevcdsp_idct_neon.S
@@ -451,6 +451,247 @@ function ff_hevc_transform_8x8_neon_8, export=1
         bx lr
 endfunc
 
+/* 16x16 even line combine, input: q3-q10 (e_8[0..3] in q4, q5, q6, q3; o_8[0..3] in q7-q10), output: q8-q15 (dst[0]-dst[7]) */
+.macro tr8_combine                      // order matters: q10, q9 and q8 are read as o_8[] before being overwritten with dst[]
+        vsub.s32    q12, q3, q10        // e_8[3] - o_8[3], dst[4]
+        vadd.s32    q11, q3, q10        // e_8[3] + o_8[3], dst[3]
+
+        vsub.s32    q13, q6, q9         // e_8[2] - o_8[2], dst[5]
+        vadd.s32    q10, q6, q9         // e_8[2] + o_8[2], dst[2]
+
+        vsub.s32    q14, q5, q8         // e_8[1] - o_8[1], dst[6]
+        vadd.s32    q9, q5, q8          // e_8[1] + o_8[1], dst[1]
+
+        vsub.s32    q15, q4, q7         // e_8[0] - o_8[0], dst[7]
+        vadd.s32    q8, q4, q7          // e_8[0] + o_8[0], dst[0]
+.endm
+
+.macro tr16_begin in0, in1, in2, in3, in4, in5, in6, in7 // odd part: eight 32-bit sums in q2-q9 from inputs src1, src3, ..., src15; coefficients are read from d2/d3
+        vmull.s16   q2, \in0, d2[1]     // 90 * src1
+        vmull.s16   q3, \in0, d2[0]     // 87 * src1
+        vmull.s16   q4, \in0, d2[3]     // 80 * src1
+        vmull.s16   q5, \in0, d2[2]     // 70 * src1
+        vmull.s16   q6, \in0, d3[1]     // 57 * src1
+        vmull.s16   q7, \in0, d3[0]     // 43 * src1
+        vmull.s16   q8, \in0, d3[3]     // 25 * src1
+        vmull.s16   q9, \in0, d3[2]     //  9 * src1
+
+        vmlal.s16   q2, \in1, d2[0]     // 87 * src3
+        vmlal.s16   q3, \in1, d3[1]     // 57 * src3
+        vmlal.s16   q4, \in1, d3[2]     //  9 * src3
+        vmlsl.s16   q5, \in1, d3[0]     //-43 * src3
+        vmlsl.s16   q6, \in1, d2[3]     //-80 * src3
+        vmlsl.s16   q7, \in1, d2[1]     //-90 * src3
+        vmlsl.s16   q8, \in1, d2[2]     //-70 * src3
+        vmlsl.s16   q9, \in1, d3[3]     //-25 * src3
+
+        vmlal.s16   q2, \in2, d2[3]     // 80 * src5
+        vmlal.s16   q3, \in2, d3[2]     //  9 * src5
+        vmlsl.s16   q4, \in2, d2[2]     //-70 * src5
+        vmlsl.s16   q5, \in2, d2[0]     //-87 * src5
+        vmlsl.s16   q6, \in2, d3[3]     //-25 * src5
+        vmlal.s16   q7, \in2, d3[1]     // 57 * src5
+        vmlal.s16   q8, \in2, d2[1]     // 90 * src5
+        vmlal.s16   q9, \in2, d3[0]     // 43 * src5
+
+        vmlal.s16   q2, \in3, d2[2]     // 70 * src7
+        vmlsl.s16   q3, \in3, d3[0]     //-43 * src7
+        vmlsl.s16   q4, \in3, d2[0]     //-87 * src7
+        vmlal.s16   q5, \in3, d3[2]     //  9 * src7
+        vmlal.s16   q6, \in3, d2[1]     // 90 * src7
+        vmlal.s16   q7, \in3, d3[3]     // 25 * src7
+        vmlsl.s16   q8, \in3, d2[3]     //-80 * src7
+        vmlsl.s16   q9, \in3, d3[1]     //-57 * src7
+
+        vmlal.s16   q2, \in4, d3[1]     // 57 * src9
+        vmlsl.s16   q3, \in4, d2[3]     //-80 * src9
+        vmlsl.s16   q4, \in4, d3[3]     //-25 * src9
+        vmlal.s16   q5, \in4, d2[1]     // 90 * src9
+        vmlsl.s16   q6, \in4, d3[2]     // -9 * src9
+        vmlsl.s16   q7, \in4, d2[0]     //-87 * src9
+        vmlal.s16   q8, \in4, d3[0]     // 43 * src9
+        vmlal.s16   q9, \in4, d2[2]     // 70 * src9
+
+        vmlal.s16   q2, \in5, d3[0]     // 43 * src11
+        vmlsl.s16   q3, \in5, d2[1]     //-90 * src11
+        vmlal.s16   q4, \in5, d3[1]     // 57 * src11
+        vmlal.s16   q5, \in5, d3[3]     // 25 * src11
+        vmlsl.s16   q6, \in5, d2[0]     //-87 * src11
+        vmlal.s16   q7, \in5, d2[2]     // 70 * src11
+        vmlal.s16   q8, \in5, d3[2]     //  9 * src11
+        vmlsl.s16   q9, \in5, d2[3]     //-80 * src11
+
+        vmlal.s16   q2, \in6, d3[3]     // 25 * src13
+        vmlsl.s16   q3, \in6, d2[2]     //-70 * src13
+        vmlal.s16   q4, \in6, d2[1]     // 90 * src13
+        vmlsl.s16   q5, \in6, d2[3]     //-80 * src13
+        vmlal.s16   q6, \in6, d3[0]     // 43 * src13
+        vmlal.s16   q7, \in6, d3[2]     //  9 * src13
+        vmlsl.s16   q8, \in6, d3[1]     //-57 * src13
+        vmlal.s16   q9, \in6, d2[0]     // 87 * src13
+
+
+        vmlal.s16   q2, \in7, d3[2]     //  9 * src15
+        vmlsl.s16   q3, \in7, d3[3]     //-25 * src15
+        vmlal.s16   q4, \in7, d3[0]     // 43 * src15
+        vmlsl.s16   q5, \in7, d3[1]     //-57 * src15
+        vmlal.s16   q6, \in7, d2[2]     // 70 * src15
+        vmlsl.s16   q7, \in7, d2[3]     //-80 * src15
+        vmlal.s16   q8, \in7, d2[0]     // 87 * src15
+        vmlsl.s16   q9, \in7, d2[1]     //-90 * src15
+.endm
+
+.macro tr16_end shift                   // pops the 8 odd-part sums two at a time, adds/subtracts them to/from q8-q15 and narrows to 16 bit; overwrites q2-q5
+        vpop        {q2-q3}             // the callers push q2-q9, so the first pop yields the sums computed in q2 and q3
+        vadd.s32    q4, q8, q2
+        vsub.s32    q5, q8, q2
+        vqrshrn.s32 d12, q4, \shift     // sum, stored by the caller as output 0
+        vqrshrn.s32 d15, q5, \shift     // difference, stored by the caller as output 15
+
+        vadd.s32    q4, q9, q3
+        vsub.s32    q5, q9, q3
+        vqrshrn.s32 d13, q4, \shift     // output 1
+        vqrshrn.s32 d14, q5, \shift     // output 14
+
+        vpop        {q2-q3}
+        vadd.s32    q4, q10, q2
+        vsub.s32    q5, q10, q2
+        vqrshrn.s32 d16, q4, \shift     // output 2
+        vqrshrn.s32 d19, q5, \shift     // output 13
+
+        vadd.s32    q4, q11, q3
+        vsub.s32    q5, q11, q3
+        vqrshrn.s32 d17, q4, \shift     // output 3
+        vqrshrn.s32 d18, q5, \shift     // output 12
+
+        vpop        {q2-q3}
+        vadd.s32    q4, q12, q2
+        vsub.s32    q5, q12, q2
+        vqrshrn.s32 d20, q4, \shift     // output 4
+        vqrshrn.s32 d23, q5, \shift     // output 11
+
+        vadd.s32    q4, q13, q3
+        vsub.s32    q5, q13, q3
+        vqrshrn.s32 d21, q4, \shift     // output 5
+        vqrshrn.s32 d22, q5, \shift     // output 10
+
+        vpop        {q2-q3}
+        vadd.s32    q4, q14, q2
+        vsub.s32    q5, q14, q2
+        vqrshrn.s32 d24, q4, \shift     // output 6
+        vqrshrn.s32 d27, q5, \shift     // output 9
+
+        vadd.s32    q4, q15, q3
+        vsub.s32    q5, q15, q3
+        vqrshrn.s32 d25, q4, \shift     // output 7
+        vqrshrn.s32 d26, q5, \shift     // output 8
+.endm
+
+function ff_hevc_transform_16x16_neon_8, export=1 // in-place 16x16 transform; C prototype: (int16_t *coeffs, int col_limit), used here as r0 = coeffs, r1 = col_limit
+        push        {r4-r8}             // NOTE(review): r4 is not referenced in the code visible in this patch - confirm it needs saving
+        vpush       {d8-d15}            // q4-q7 are overwritten by tr16_begin and tr16_end
+        mov         r5, #64             // load step in bytes: every second 32-byte row
+        mov         r6, #32             // store step in bytes: one row of 16 coefficients
+        mov         r7, #0              // r7 = number of columns handled by the first stage so far
+        adr         r3, tr4f
+        vld1.16     {d0, d1, d2, d3}, [r3] // 16 halfword constants starting at tr4f
+        mov         r8, r0              // keep the start of the block for the second stage
+0:                                      // first stage: 4 columns per iteration, shift 7
+        add         r7, #4
+        add         r0, #32             // start at row 1
+        // odd lines: rows 1, 3, ..., 15 of the current 4 columns
+        vld1.16     {d24}, [r0], r5
+        vld1.16     {d25}, [r0], r5
+        vld1.16     {d26}, [r0], r5
+        vld1.16     {d27}, [r0], r5
+        vld1.16     {d28}, [r0], r5
+        vld1.16     {d29}, [r0], r5
+        vld1.16     {d30}, [r0], r5
+        vld1.16     {d31}, [r0], r5
+        sub         r0, #544            // 32 + 8 * 64: back to row 0
+
+        tr16_begin d24, d25, d26, d27, d28, d29, d30, d31
+        vpush       {q2-q9}             // odd-part sums; popped again by tr16_end
+
+        // even lines: rows 0, 2, ..., 14 of the current 4 columns
+        vld1.16     {d24}, [r0], r5
+        vld1.16     {d25}, [r0], r5
+        vld1.16     {d26}, [r0], r5
+        vld1.16     {d27}, [r0], r5
+        vld1.16     {d28}, [r0], r5
+        vld1.16     {d29}, [r0], r5
+        vld1.16     {d30}, [r0], r5
+        vld1.16     {d31}, [r0], r5
+        sub         r0, #512            // 8 * 64: back to row 0
+
+        tr8_begin d25, d27, d29, d31    // rows 2, 6, 10, 14 (macro defined outside this patch)
+        tr4 d24, d26, d28, d30          // rows 0, 4, 8, 12 (macro defined outside this patch)
+        tr8_combine
+
+        // combine the even part (q8-q15) with the pushed odd part
+        tr16_end #7
+
+        // store outputs 0..15 of the current 4 columns, one row per store
+        vst1.16     {d12}, [r0], r6
+        vst1.16     {d13}, [r0], r6
+        vst1.16     {d16}, [r0], r6
+        vst1.16     {d17}, [r0], r6
+        vst1.16     {d20}, [r0], r6
+        vst1.16     {d21}, [r0], r6
+        vst1.16     {d24}, [r0], r6
+        vst1.16     {d25}, [r0], r6
+        vst1.16     {d26}, [r0], r6
+        vst1.16     {d27}, [r0], r6
+        vst1.16     {d22}, [r0], r6
+        vst1.16     {d23}, [r0], r6
+        vst1.16     {d18}, [r0], r6
+        vst1.16     {d19}, [r0], r6
+        vst1.16     {d14}, [r0], r6
+        vst1.16     {d15}, [r0], r6
+        sub         r0, #504            // 512 - 8: back to row 0, 4 columns (8 bytes) further
+
+        cmp         r1, r7
+        blt         1f                  // col_limit < columns done: leave the first stage early
+
+        cmp         r7, #16
+        blt         0b                  // otherwise continue until all 16 columns are done
+
+1:      mov         r0, r8              // second stage starts again at the top of the block
+        mov         r7, #4              // 4 iterations of 4 rows each, shift 12
+2:      subs        r7, #1              // sets the flags tested by the bne below (assumes the macros in between leave them alone)
+        // load 4 rows (8 q registers = 128 bytes)
+        vldm        r0, {q8-q15}        // coeffs
+        transpose_16b_4x4 d16, d20, d24, d28 // macro defined outside this patch; presumably a 4x4 transpose of 16-bit elements
+        transpose_16b_4x4 d17, d21, d25, d29
+        transpose_16b_4x4 d18, d22, d26, d30
+        transpose_16b_4x4 d19, d23, d27, d31
+        vpush       {q12-q13}           // 16x16 even line (8x8 odd line)
+        vpush       {q8-q9}             // 16x16 even line (8x8 even line)
+        tr16_begin d20, d28, d21, d29, d22, d30, d23, d31 // odd line transform 2n+1
+        vpop        {q12-q15}           // pop even line: q12-q13 = former q8-q9, q14-q15 = former q12-q13
+        vpush       {q2-q9}             // push results of 16x16 odd line
+        tr8_begin d28, d29, d30, d31    // even line transform 2n
+        tr4 d24, d25, d26, d27
+        tr8_combine
+        tr16_end #12
+        transpose_16b_4x4 d12, d13, d16, d17 // outputs 0-3
+        transpose_16b_4x4 d20, d21, d24, d25 // outputs 4-7
+        transpose_16b_4x4 d26, d27, d22, d23 // outputs 8-11
+        transpose_16b_4x4 d18, d19, d14, d15 // outputs 12-15
+        vswp        d13, d20            // the swaps below presumably put the 4 rows into consecutive registers d12-d27 for the vstm
+        vswp        d14, d23
+        vswp        d17, d24
+        vswp        d18, d27
+        vswp        q8, q10
+        vswp        q7, q13
+        vstm        r0!, {q6-q13}       // write 128 bytes back over the rows just loaded and advance r0
+        bne         2b
+
+        vpop        {d8-d15}
+        pop         {r4-r8}
+        bx          lr
+endfunc
+
 .align 4
 tr4f:
 .word 0x00240053 // 36 and d1[0] = 83
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 33cc44ef40..d846d01081 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -36,6 +36,7 @@ void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_t
 void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
 void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
+void ff_hevc_transform_16x16_neon_8(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
 void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
 void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
@@ -550,6 +551,7 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
         c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_neon_wrapper;
         c->idct[0] = ff_hevc_transform_4x4_neon_8;
         c->idct[1] = ff_hevc_transform_8x8_neon_8;
+        c->idct[2] = ff_hevc_transform_16x16_neon_8;
         c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
         c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
         c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
-- 
2.13.6 (Apple Git-96)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel